No OneTemporary
Actions

Size

6 MB

Referenced Files

None

Subscribers

None

View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	Index: head/contrib/compiler-rt/lib/asan/asan_linux.cc
	===================================================================
	--- head/contrib/compiler-rt/lib/asan/asan_linux.cc (revision 329409)
	+++ head/contrib/compiler-rt/lib/asan/asan_linux.cc (revision 329410)
	@@ -1,253 +1,254 @@
	//===-- asan_linux.cc -----------------------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file is a part of AddressSanitizer, an address sanity checker.
	//
	// Linux-specific details.
	//===----------------------------------------------------------------------===//

	#include "sanitizer_common/sanitizer_platform.h"
	#if SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_NETBSD || \
	    SANITIZER_SOLARIS

	#include "asan_interceptors.h"
	#include "asan_internal.h"
	#include "asan_premap_shadow.h"
	#include "asan_thread.h"
	#include "sanitizer_common/sanitizer_flags.h"
	#include "sanitizer_common/sanitizer_freebsd.h"
	#include "sanitizer_common/sanitizer_libc.h"
	#include "sanitizer_common/sanitizer_procmaps.h"

	#include <sys/time.h>
	#include <sys/resource.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <dlfcn.h>
	#include <fcntl.h>
	+#include <limits.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <unwind.h>

	#if SANITIZER_FREEBSD
	#include <sys/link_elf.h>
	#endif

	#if SANITIZER_SOLARIS
	#include <link.h>
	#endif

	#if SANITIZER_ANDROID || SANITIZER_FREEBSD || SANITIZER_SOLARIS
	#include <ucontext.h>
	extern "C" void* _DYNAMIC;
	#elif SANITIZER_NETBSD
	#include <link_elf.h>
	#include <ucontext.h>
	extern Elf_Dyn _DYNAMIC;
	#else
	#include <sys/ucontext.h>
	#include <link.h>
	#endif

	// x86-64 FreeBSD 9.2 and older define 'ucontext_t' incorrectly in
	// 32-bit mode.
	#if SANITIZER_FREEBSD && (SANITIZER_WORDSIZE == 32) && \
	__FreeBSD_version <= 902001 // v9.2
	#define ucontext_t xucontext_t
	#endif

	typedef enum {
	ASAN_RT_VERSION_UNDEFINED = 0,
	ASAN_RT_VERSION_DYNAMIC,
	ASAN_RT_VERSION_STATIC,
	} asan_rt_version_t;

	// FIXME: perhaps also store abi version here?
	extern "C" {
	SANITIZER_INTERFACE_ATTRIBUTE
	asan_rt_version_t __asan_rt_version;
	}

	namespace __asan {

	void InitializePlatformInterceptors() {}
	void InitializePlatformExceptionHandlers() {}
	bool IsSystemHeapAddress (uptr addr) { return false; }

	// Returns the address of the _DYNAMIC symbol. The reference exists so that
	// a fully static link fails, as noted in the body.
	void *AsanDoesNotSupportStaticLinkage() {
	// This will fail to link with -static.
	return &_DYNAMIC; // defined in link.h
	}

	// Unmaps the address range [from, to).
	//
	// `to` must not be below `from` (enforced with CHECK). An empty range is a
	// no-op. If internal_munmap() reports an error, the failure is reported and
	// the process is aborted through a failing CHECK.
	static void UnmapFromTo(uptr from, uptr to) {
	  CHECK(to >= from);
	  if (to == from) return;
	  uptr res = internal_munmap(reinterpret_cast<void *>(from), to - from);
	  if (UNLIKELY(internal_iserror(res))) {
	    // The size is printed twice on purpose: once in hex, once in decimal.
	    Report(
	        "ERROR: AddressSanitizer failed to unmap 0x%zx (%zd) bytes at address "
	        "%p\n",
	        to - from, to - from, from);
	    CHECK("unable to unmap" && 0);
	  }
	}

	#if ASAN_PREMAP_SHADOW
	// Returns the address of __asan_shadow as the shadow start, after releasing
	// the tail of the premapped region that lies beyond the shadow size
	// (kHighShadowEnd rounded up to the mmap granularity).
	uptr FindPremappedShadowStart() {
	uptr granularity = GetMmapGranularity();
	uptr shadow_start = reinterpret_cast<uptr>(&__asan_shadow);
	uptr premap_shadow_size = PremapShadowSize();
	uptr shadow_size = RoundUpTo(kHighShadowEnd, granularity);
	// We may have mapped too much. Release extra memory.
	UnmapFromTo(shadow_start + shadow_size, shadow_start + premap_shadow_size);
	return shadow_start;
	}
	#endif

	// Picks the start address of the shadow region and returns it.
	//
	// With ASAN_PREMAP_SHADOW, the premapped shadow is used unless
	// PremapShadowFailed() reports a failure. Otherwise a larger-than-needed
	// region is mapped, an aligned start is chosen inside it, and the unused
	// head and tail are unmapped again.
	uptr FindDynamicShadowStart() {
	#if ASAN_PREMAP_SHADOW
	if (!PremapShadowFailed())
	return FindPremappedShadowStart();
	#endif

	// The shadow start is aligned to 8x the mmap granularity, and one
	// granule of padding is kept mapped immediately below it.
	uptr granularity = GetMmapGranularity();
	uptr alignment = granularity * 8;
	uptr left_padding = granularity;
	uptr shadow_size = RoundUpTo(kHighShadowEnd, granularity);
	// Over-allocate so that an aligned start with its padding always fits.
	uptr map_size = shadow_size + left_padding + alignment;

	uptr map_start = (uptr)MmapNoAccess(map_size);
	// ~0 is treated as the mapping-failure value here.
	CHECK_NE(map_start, ~(uptr)0);

	uptr shadow_start = RoundUpTo(map_start + left_padding, alignment);
	// Release everything below the left padding and above the shadow end.
	UnmapFromTo(map_start, shadow_start - left_padding);
	UnmapFromTo(shadow_start + shadow_size, map_start + map_size);

	return shadow_start;
	}

	// Not implemented in this file: both arguments are ignored and the call
	// goes straight to UNIMPLEMENTED().
	void AsanApplyToGlobals(globals_op_fptr op, const void *needle) {
	UNIMPLEMENTED();
	}

	#if SANITIZER_ANDROID
	// FIXME: should we do anything for Android?
	void AsanCheckDynamicRTPrereqs() {}
	void AsanCheckIncompatibleRT() {}
	#else
	// dl_iterate_phdr() callback that records the name of the first DSO.
	//
	// `data` must point to a `const char *` slot, initially null, owned by the
	// caller. Entries with an empty name and the vDSO ("linux-" prefix) are
	// skipped. On FreeBSD/NetBSD the first named entry is the main program, so
	// it is skipped too and the slot is temporarily marked with a non-null
	// sentinel. On Solaris the executable (dlpi_addr == 0) is skipped.
	//
	// Returns 0 to continue the iteration, or 1 to stop once the name has been
	// stored through `data`.
	static int FindFirstDSOCallback(struct dl_phdr_info *info, size_t size,
	                                void *data) {
	  VReport(2, "info->dlpi_name = %s\tinfo->dlpi_addr = %p\n",
	          info->dlpi_name, info->dlpi_addr);

	  // Continue until the first dynamic library is found
	  if (!info->dlpi_name || info->dlpi_name[0] == 0)
	    return 0;

	  // Ignore vDSO
	  if (internal_strncmp(info->dlpi_name, "linux-", sizeof("linux-") - 1) == 0)
	    return 0;

	#if SANITIZER_FREEBSD || SANITIZER_NETBSD
	  // Ignore first entry (the main program)
	  char **p = (char **)data;
	  if (!(*p)) {
	    // Mark the slot so the next named entry is not skipped again.
	    *p = (char *)-1;
	    return 0;
	  }
	#endif

	#if SANITIZER_SOLARIS
	  // Ignore executable on Solaris
	  if (info->dlpi_addr == 0)
	    return 0;
	#endif

	  *(const char **)data = info->dlpi_name;
	  return 1;
	}

	// Returns true if `libname` contains "libclang_rt.asan" or "libasan.so",
	// the two substrings this file uses to recognise a dynamic ASan runtime.
	static bool IsDynamicRTName(const char *libname) {
	  return internal_strstr(libname, "libclang_rt.asan") ||
	         internal_strstr(libname, "libasan.so");
	}

	// Reports that incompatible ASan runtimes are linked in, then calls Die().
	static void ReportIncompatibleRT() {
	Report("Your application is linked against incompatible ASan runtimes.\n");
	Die();
	}

	// Verifies that the dynamic ASan runtime is the first DSO in the initial
	// library list. Does nothing unless ASAN_DYNAMIC is set and the
	// verify_asan_link_order flag is enabled. If another library comes first,
	// a diagnostic is reported and Die() is called.
	void AsanCheckDynamicRTPrereqs() {
	  if (!ASAN_DYNAMIC || !flags()->verify_asan_link_order)
	    return;

	  // Ensure that dynamic RT is the first DSO in the list
	  const char *first_dso_name = nullptr;
	  dl_iterate_phdr(FindFirstDSOCallback, &first_dso_name);
	  if (first_dso_name && !IsDynamicRTName(first_dso_name)) {
	    Report("ASan runtime does not come first in initial library list; "
	           "you should either link runtime to your application or "
	           "manually preload it with LD_PRELOAD.\n");
	    Die();
	  }
	}

	void AsanCheckIncompatibleRT() {
	if (ASAN_DYNAMIC) {
	if (__asan_rt_version == ASAN_RT_VERSION_UNDEFINED) {
	__asan_rt_version = ASAN_RT_VERSION_DYNAMIC;
	} else if (__asan_rt_version != ASAN_RT_VERSION_DYNAMIC) {
	ReportIncompatibleRT();
	}
	} else {
	if (__asan_rt_version == ASAN_RT_VERSION_UNDEFINED) {
	// Ensure that dynamic runtime is not present. We should detect it
	// as early as possible, otherwise ASan interceptors could bind to
	// the functions in dynamic ASan runtime instead of the functions in
	// system libraries, causing crashes later in ASan initialization.
	MemoryMappingLayout proc_maps(/*cache_enabled*/true);
	- char filename[128];
	+ char filename[PATH_MAX];
	MemoryMappedSegment segment(filename, sizeof(filename));
	while (proc_maps.Next(&segment)) {
	if (IsDynamicRTName(segment.filename)) {
	Report("Your application is linked against "
	"incompatible ASan runtimes.\n");
	Die();
	}
	}
	__asan_rt_version = ASAN_RT_VERSION_STATIC;
	} else if (__asan_rt_version != ASAN_RT_VERSION_STATIC) {
	ReportIncompatibleRT();
	}
	}
	}
	#endif // SANITIZER_ANDROID

	#if !SANITIZER_ANDROID
	void ReadContextStack(void context, uptr stack, uptr *ssize) {
	ucontext_t ucp = (ucontext_t)context;
	*stack = (uptr)ucp->uc_stack.ss_sp;
	*ssize = ucp->uc_stack.ss_size;
	}
	#else
	void ReadContextStack(void context, uptr stack, uptr *ssize) {
	UNIMPLEMENTED();
	}
	#endif

	// Looks up `sym` with dlsym(RTLD_NEXT, ...) and returns the result
	// unchanged, including a null pointer when dlsym() finds nothing.
	void *AsanDlSymNext(const char *sym) {
	  return dlsym(RTLD_NEXT, sym);
	}

	} // namespace __asan

	#endif // SANITIZER_FREEBSD || SANITIZER_LINUX || SANITIZER_NETBSD ||
	// SANITIZER_SOLARIS
	Index: head/contrib/compiler-rt
	===================================================================
	--- head/contrib/compiler-rt (revision 329409)
	+++ head/contrib/compiler-rt (revision 329410)

	Property changes on: head/contrib/compiler-rt
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/compiler-rt/dist-release_60:r328795-329405
	Index: head/contrib/libc++
	===================================================================
	--- head/contrib/libc++ (revision 329409)
	+++ head/contrib/libc++ (revision 329410)

	Property changes on: head/contrib/libc++
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/libc++/dist-release_60:r328795-329405
	Index: head/contrib/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
	===================================================================
	--- head/contrib/llvm/include/llvm/IR/IntrinsicsAMDGPU.td (revision 329409)
	+++ head/contrib/llvm/include/llvm/IR/IntrinsicsAMDGPU.td (revision 329410)
	@@ -1,874 +1,894 @@
	//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics ------ tablegen --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines all of the R600- and AMDGCN-specific intrinsics.
	//
	//===----------------------------------------------------------------------===//

	class AMDGPUReadPreloadRegisterIntrinsic
	: Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

	class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
	: Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, GCCBuiltin<name>;

	let TargetPrefix = "r600" in {

	multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
	def _x : AMDGPUReadPreloadRegisterIntrinsic;
	def _y : AMDGPUReadPreloadRegisterIntrinsic;
	def _z : AMDGPUReadPreloadRegisterIntrinsic;
	}

	multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
	def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
	def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
	def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
	}

	defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
	<"__builtin_r600_read_global_size">;
	defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
	<"__builtin_r600_read_ngroups">;
	defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
	<"__builtin_r600_read_tgid">;

	defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
	defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;

	def int_r600_group_barrier : GCCBuiltin<"__builtin_r600_group_barrier">,
	Intrinsic<[], [], [IntrConvergent]>;

	// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
	def int_r600_implicitarg_ptr :
	GCCBuiltin<"__builtin_r600_implicitarg_ptr">,
	Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 7>], [],
	[IntrNoMem, IntrSpeculatable]>;

	def int_r600_rat_store_typed :
	// 1st parameter: Data
	// 2nd parameter: Index
	// 3rd parameter: Constant RAT ID
	Intrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>,
	GCCBuiltin<"__builtin_r600_rat_store_typed">;

	def int_r600_recipsqrt_ieee : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;

	def int_r600_recipsqrt_clamped : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;

	def int_r600_cube : Intrinsic<
	[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
	>;

	} // End TargetPrefix = "r600"

	let TargetPrefix = "amdgcn" in {

	//===----------------------------------------------------------------------===//
	// ABI Special Intrinsics
	//===----------------------------------------------------------------------===//

	defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
	defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
	<"__builtin_amdgcn_workgroup_id">;

	def int_amdgcn_dispatch_ptr :
	GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">,
	Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
	[IntrNoMem, IntrSpeculatable]>;

	def int_amdgcn_queue_ptr :
	GCCBuiltin<"__builtin_amdgcn_queue_ptr">,
	Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
	[IntrNoMem, IntrSpeculatable]>;

	def int_amdgcn_kernarg_segment_ptr :
	GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
	Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
	[IntrNoMem, IntrSpeculatable]>;

	def int_amdgcn_implicitarg_ptr :
	GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
	Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
	[IntrNoMem, IntrSpeculatable]>;

	def int_amdgcn_groupstaticsize :
	GCCBuiltin<"__builtin_amdgcn_groupstaticsize">,
	Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

	def int_amdgcn_dispatch_id :
	GCCBuiltin<"__builtin_amdgcn_dispatch_id">,
	Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>;

	def int_amdgcn_implicit_buffer_ptr :
	GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
	Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
	[IntrNoMem, IntrSpeculatable]>;

	// Set EXEC to the 64-bit value given.
	// This is always moved to the beginning of the basic block.
	def int_amdgcn_init_exec : Intrinsic<[],
	[llvm_i64_ty], // 64-bit literal constant
	[IntrConvergent]>;

	// Set EXEC according to a thread count packed in an SGPR input:
	// thread_count = (input >> bitoffset) & 0x7f;
	// This is always moved to the beginning of the basic block.
	def int_amdgcn_init_exec_from_input : Intrinsic<[],
	[llvm_i32_ty, // 32-bit SGPR input
	llvm_i32_ty], // bit offset of the thread count
	[IntrConvergent]>;


	//===----------------------------------------------------------------------===//
	// Instruction Intrinsics
	//===----------------------------------------------------------------------===//

	// The first parameter is s_sendmsg immediate (i16),
	// the second one is copied to m0
	def int_amdgcn_s_sendmsg : GCCBuiltin<"__builtin_amdgcn_s_sendmsg">,
	Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
	def int_amdgcn_s_sendmsghalt : GCCBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
	Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;

	def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
	Intrinsic<[], [], [IntrConvergent]>;

	def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">,
	Intrinsic<[], [], [IntrConvergent]>;

	def int_amdgcn_s_waitcnt : GCCBuiltin<"__builtin_amdgcn_s_waitcnt">,
	Intrinsic<[], [llvm_i32_ty], []>;

	def int_amdgcn_div_scale : Intrinsic<
	// 1st parameter: Numerator
	// 2nd parameter: Denominator
	// 3rd parameter: Constant to select between first and
	// second. (0 = first, 1 = second).
	[llvm_anyfloat_ty, llvm_i1_ty],
	[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_div_fmas : Intrinsic<[llvm_anyfloat_ty],
	[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_div_fixup : Intrinsic<[llvm_anyfloat_ty],
	[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_trig_preop : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_sin : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_cos : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_log_clamp : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_fmul_legacy : GCCBuiltin<"__builtin_amdgcn_fmul_legacy">,
	Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_rcp : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_rcp_legacy : GCCBuiltin<"__builtin_amdgcn_rcp_legacy">,
	Intrinsic<[llvm_float_ty], [llvm_float_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_rsq : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_rsq_legacy : GCCBuiltin<"__builtin_amdgcn_rsq_legacy">,
	Intrinsic<
	[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_rsq_clamp : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;

	def int_amdgcn_ldexp : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_frexp_mant : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_frexp_exp : Intrinsic<
	[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable]
	>;

	// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
	// and always uses rtz, so is not suitable for implementing the OpenCL
	// fract function. It should be ok on VI.
	def int_amdgcn_fract : Intrinsic<
	[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_cvt_pkrtz : Intrinsic<
	[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	+def int_amdgcn_cvt_pknorm_i16 : Intrinsic<
	+ [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
	+ [IntrNoMem, IntrSpeculatable]
	+>;
	+
	+def int_amdgcn_cvt_pknorm_u16 : Intrinsic<
	+ [llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
	+ [IntrNoMem, IntrSpeculatable]
	+>;
	+
	+def int_amdgcn_cvt_pk_i16 : Intrinsic<
	+ [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
	+ [IntrNoMem, IntrSpeculatable]
	+>;
	+
	+def int_amdgcn_cvt_pk_u16 : Intrinsic<
	+ [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
	+ [IntrNoMem, IntrSpeculatable]
	+>;
	+
	def int_amdgcn_class : Intrinsic<
	[llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_fmed3 : GCCBuiltin<"__builtin_amdgcn_fmed3">,
	Intrinsic<[llvm_anyfloat_ty],
	[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_cubeid : GCCBuiltin<"__builtin_amdgcn_cubeid">,
	Intrinsic<[llvm_float_ty],
	[llvm_float_ty, llvm_float_ty, llvm_float_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_cubema : GCCBuiltin<"__builtin_amdgcn_cubema">,
	Intrinsic<[llvm_float_ty],
	[llvm_float_ty, llvm_float_ty, llvm_float_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_cubesc : GCCBuiltin<"__builtin_amdgcn_cubesc">,
	Intrinsic<[llvm_float_ty],
	[llvm_float_ty, llvm_float_ty, llvm_float_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_cubetc : GCCBuiltin<"__builtin_amdgcn_cubetc">,
	Intrinsic<[llvm_float_ty],
	[llvm_float_ty, llvm_float_ty, llvm_float_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
	// should be used.
	def int_amdgcn_sffbh :
	Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
	[IntrNoMem, IntrSpeculatable]
	>;


	// Fields should mirror atomicrmw
	class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
	[llvm_anyptr_ty,
	LLVMMatchType<0>,
	llvm_i32_ty, // ordering
	llvm_i32_ty, // scope
	llvm_i1_ty], // isVolatile
	[IntrArgMemOnly, NoCapture<0>], "",
	[SDNPMemOperand]
	>;

	def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
	def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;

	// Common signature for the image load-style intrinsics defined below.
	// When NoMem is set the intrinsic is marked IntrNoMem and gets no SD node
	// properties; otherwise it is IntrReadMem with SDNPMemOperand.
	class AMDGPUImageLoad<bit NoMem = 0> : Intrinsic <
	[llvm_anyfloat_ty], // vdata(VGPR)
	[llvm_anyint_ty, // vaddr(VGPR)
	llvm_anyint_ty, // rsrc(SGPR)
	llvm_i32_ty, // dmask(imm)
	llvm_i1_ty, // glc(imm)
	llvm_i1_ty, // slc(imm)
	llvm_i1_ty, // lwe(imm)
	llvm_i1_ty], // da(imm)
	!if(NoMem, [IntrNoMem], [IntrReadMem]), "",
	!if(NoMem, [], [SDNPMemOperand])>;

	def int_amdgcn_image_load : AMDGPUImageLoad;
	def int_amdgcn_image_load_mip : AMDGPUImageLoad;
	def int_amdgcn_image_getresinfo : AMDGPUImageLoad<1>;

	class AMDGPUImageStore : Intrinsic <
	[],
	[llvm_anyfloat_ty, // vdata(VGPR)
	llvm_anyint_ty, // vaddr(VGPR)
	llvm_anyint_ty, // rsrc(SGPR)
	llvm_i32_ty, // dmask(imm)
	llvm_i1_ty, // glc(imm)
	llvm_i1_ty, // slc(imm)
	llvm_i1_ty, // lwe(imm)
	llvm_i1_ty], // da(imm)
	[IntrWriteMem], "", [SDNPMemOperand]>;

	def int_amdgcn_image_store : AMDGPUImageStore;
	def int_amdgcn_image_store_mip : AMDGPUImageStore;

	// Common signature for the image sample and gather4 intrinsics defined
	// below. When NoMem is set the intrinsic is marked IntrNoMem and gets no SD
	// node properties; otherwise it is IntrReadMem with SDNPMemOperand.
	class AMDGPUImageSample<bit NoMem = 0> : Intrinsic <
	[llvm_anyfloat_ty], // vdata(VGPR)
	[llvm_anyfloat_ty, // vaddr(VGPR)
	llvm_anyint_ty, // rsrc(SGPR)
	llvm_v4i32_ty, // sampler(SGPR)
	llvm_i32_ty, // dmask(imm)
	llvm_i1_ty, // unorm(imm)
	llvm_i1_ty, // glc(imm)
	llvm_i1_ty, // slc(imm)
	llvm_i1_ty, // lwe(imm)
	llvm_i1_ty], // da(imm)
	!if(NoMem, [IntrNoMem], [IntrReadMem]), "",
	!if(NoMem, [], [SDNPMemOperand])>;

	// Basic sample
	def int_amdgcn_image_sample : AMDGPUImageSample;
	def int_amdgcn_image_sample_cl : AMDGPUImageSample;
	def int_amdgcn_image_sample_d : AMDGPUImageSample;
	def int_amdgcn_image_sample_d_cl : AMDGPUImageSample;
	def int_amdgcn_image_sample_l : AMDGPUImageSample;
	def int_amdgcn_image_sample_b : AMDGPUImageSample;
	def int_amdgcn_image_sample_b_cl : AMDGPUImageSample;
	def int_amdgcn_image_sample_lz : AMDGPUImageSample;
	def int_amdgcn_image_sample_cd : AMDGPUImageSample;
	def int_amdgcn_image_sample_cd_cl : AMDGPUImageSample;

	// Sample with comparison
	def int_amdgcn_image_sample_c : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_cl : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_d : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_d_cl : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_l : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_b : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_b_cl : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_lz : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_cd : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_cd_cl : AMDGPUImageSample;

	// Sample with offsets
	def int_amdgcn_image_sample_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_cl_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_d_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_d_cl_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_l_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_b_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_b_cl_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_lz_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_cd_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_cd_cl_o : AMDGPUImageSample;

	// Sample with comparison and offsets
	def int_amdgcn_image_sample_c_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_cl_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_d_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_d_cl_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_l_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_b_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_b_cl_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_lz_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_cd_o : AMDGPUImageSample;
	def int_amdgcn_image_sample_c_cd_cl_o : AMDGPUImageSample;

	// Basic gather4
	def int_amdgcn_image_gather4 : AMDGPUImageSample;
	def int_amdgcn_image_gather4_cl : AMDGPUImageSample;
	def int_amdgcn_image_gather4_l : AMDGPUImageSample;
	def int_amdgcn_image_gather4_b : AMDGPUImageSample;
	def int_amdgcn_image_gather4_b_cl : AMDGPUImageSample;
	def int_amdgcn_image_gather4_lz : AMDGPUImageSample;

	// Gather4 with comparison
	def int_amdgcn_image_gather4_c : AMDGPUImageSample;
	def int_amdgcn_image_gather4_c_cl : AMDGPUImageSample;
	def int_amdgcn_image_gather4_c_l : AMDGPUImageSample;
	def int_amdgcn_image_gather4_c_b : AMDGPUImageSample;
	def int_amdgcn_image_gather4_c_b_cl : AMDGPUImageSample;
	def int_amdgcn_image_gather4_c_lz : AMDGPUImageSample;

	// Gather4 with offsets
	def int_amdgcn_image_gather4_o : AMDGPUImageSample;
	def int_amdgcn_image_gather4_cl_o : AMDGPUImageSample;
	def int_amdgcn_image_gather4_l_o : AMDGPUImageSample;
	def int_amdgcn_image_gather4_b_o : AMDGPUImageSample;
	def int_amdgcn_image_gather4_b_cl_o : AMDGPUImageSample;
	def int_amdgcn_image_gather4_lz_o : AMDGPUImageSample;

	// Gather4 with comparison and offsets
	def int_amdgcn_image_gather4_c_o : AMDGPUImageSample;
	def int_amdgcn_image_gather4_c_cl_o : AMDGPUImageSample;
	def int_amdgcn_image_gather4_c_l_o : AMDGPUImageSample;
	def int_amdgcn_image_gather4_c_b_o : AMDGPUImageSample;
	def int_amdgcn_image_gather4_c_b_cl_o : AMDGPUImageSample;
	def int_amdgcn_image_gather4_c_lz_o : AMDGPUImageSample;

	def int_amdgcn_image_getlod : AMDGPUImageSample<1>;

	class AMDGPUImageAtomic : Intrinsic <
	[llvm_i32_ty],
	[llvm_i32_ty, // vdata(VGPR)
	llvm_anyint_ty, // vaddr(VGPR)
	llvm_v8i32_ty, // rsrc(SGPR)
	llvm_i1_ty, // r128(imm)
	llvm_i1_ty, // da(imm)
	llvm_i1_ty], // slc(imm)
	[], "", [SDNPMemOperand]>;

	def int_amdgcn_image_atomic_swap : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_add : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_sub : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_smin : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_umin : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_smax : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_umax : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_and : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_or : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_xor : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_inc : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_dec : AMDGPUImageAtomic;
	def int_amdgcn_image_atomic_cmpswap : Intrinsic <
	[llvm_i32_ty],
	[llvm_i32_ty, // src(VGPR)
	llvm_i32_ty, // cmp(VGPR)
	llvm_anyint_ty, // vaddr(VGPR)
	llvm_v8i32_ty, // rsrc(SGPR)
	llvm_i1_ty, // r128(imm)
	llvm_i1_ty, // da(imm)
	llvm_i1_ty], // slc(imm)
	[], "", [SDNPMemOperand]>;

	class AMDGPUBufferLoad : Intrinsic <
	[llvm_anyfloat_ty],
	[llvm_v4i32_ty, // rsrc(SGPR)
	llvm_i32_ty, // vindex(VGPR)
	llvm_i32_ty, // offset(SGPR/VGPR/imm)
	llvm_i1_ty, // glc(imm)
	llvm_i1_ty], // slc(imm)
	[IntrReadMem], "", [SDNPMemOperand]>;
	def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
	def int_amdgcn_buffer_load : AMDGPUBufferLoad;

	class AMDGPUBufferStore : Intrinsic <
	[],
	[llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
	llvm_v4i32_ty, // rsrc(SGPR)
	llvm_i32_ty, // vindex(VGPR)
	llvm_i32_ty, // offset(SGPR/VGPR/imm)
	llvm_i1_ty, // glc(imm)
	llvm_i1_ty], // slc(imm)
	[IntrWriteMem], "", [SDNPMemOperand]>;
	def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
	def int_amdgcn_buffer_store : AMDGPUBufferStore;

	def int_amdgcn_tbuffer_load : Intrinsic <
	[llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
	[llvm_v4i32_ty, // rsrc(SGPR)
	llvm_i32_ty, // vindex(VGPR)
	llvm_i32_ty, // voffset(VGPR)
	llvm_i32_ty, // soffset(SGPR)
	llvm_i32_ty, // offset(imm)
	llvm_i32_ty, // dfmt(imm)
	llvm_i32_ty, // nfmt(imm)
	llvm_i1_ty, // glc(imm)
	llvm_i1_ty], // slc(imm)
	[IntrReadMem], "", [SDNPMemOperand]>;

	def int_amdgcn_tbuffer_store : Intrinsic <
	[],
	[llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
	llvm_v4i32_ty, // rsrc(SGPR)
	llvm_i32_ty, // vindex(VGPR)
	llvm_i32_ty, // voffset(VGPR)
	llvm_i32_ty, // soffset(SGPR)
	llvm_i32_ty, // offset(imm)
	llvm_i32_ty, // dfmt(imm)
	llvm_i32_ty, // nfmt(imm)
	llvm_i1_ty, // glc(imm)
	llvm_i1_ty], // slc(imm)
	[IntrWriteMem], "", [SDNPMemOperand]>;

	class AMDGPUBufferAtomic : Intrinsic <
	[llvm_i32_ty],
	[llvm_i32_ty, // vdata(VGPR)
	llvm_v4i32_ty, // rsrc(SGPR)
	llvm_i32_ty, // vindex(VGPR)
	llvm_i32_ty, // offset(SGPR/VGPR/imm)
	llvm_i1_ty], // slc(imm)
	[], "", [SDNPMemOperand]>;
	def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
	def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
	def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
	def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
	def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
	def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
	def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
	def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
	def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
	def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
	def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
	[llvm_i32_ty],
	[llvm_i32_ty, // src(VGPR)
	llvm_i32_ty, // cmp(VGPR)
	llvm_v4i32_ty, // rsrc(SGPR)
	llvm_i32_ty, // vindex(VGPR)
	llvm_i32_ty, // offset(SGPR/VGPR/imm)
	llvm_i1_ty], // slc(imm)
	[], "", [SDNPMemOperand]>;

	// Uses that do not set the done bit should set IntrWriteMem on the
	// call site.
	def int_amdgcn_exp : Intrinsic <[], [
	llvm_i32_ty, // tgt,
	llvm_i32_ty, // en
	llvm_any_ty, // src0 (f32 or i32)
	LLVMMatchType<0>, // src1
	LLVMMatchType<0>, // src2
	LLVMMatchType<0>, // src3
	llvm_i1_ty, // done
	llvm_i1_ty // vm
	],
	[]
	>;

	// exp with compr bit set.
	def int_amdgcn_exp_compr : Intrinsic <[], [
	llvm_i32_ty, // tgt,
	llvm_i32_ty, // en
	llvm_anyvector_ty, // src0 (v2f16 or v2i16)
	LLVMMatchType<0>, // src1
	llvm_i1_ty, // done
	llvm_i1_ty], // vm
	[]
	>;

	def int_amdgcn_buffer_wbinvl1_sc :
	GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
	Intrinsic<[], [], []>;

	def int_amdgcn_buffer_wbinvl1 :
	GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
	Intrinsic<[], [], []>;

	def int_amdgcn_s_dcache_inv :
	GCCBuiltin<"__builtin_amdgcn_s_dcache_inv">,
	Intrinsic<[], [], []>;

	def int_amdgcn_s_memtime :
	GCCBuiltin<"__builtin_amdgcn_s_memtime">,
	Intrinsic<[llvm_i64_ty], [], [IntrReadMem]>;

	def int_amdgcn_s_sleep :
	GCCBuiltin<"__builtin_amdgcn_s_sleep">,
	Intrinsic<[], [llvm_i32_ty], []> {
	}

	def int_amdgcn_s_incperflevel :
	GCCBuiltin<"__builtin_amdgcn_s_incperflevel">,
	Intrinsic<[], [llvm_i32_ty], []> {
	}

	def int_amdgcn_s_decperflevel :
	GCCBuiltin<"__builtin_amdgcn_s_decperflevel">,
	Intrinsic<[], [llvm_i32_ty], []> {
	}

	def int_amdgcn_s_getreg :
	GCCBuiltin<"__builtin_amdgcn_s_getreg">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
	[IntrReadMem, IntrSpeculatable]
	>;

	// int_amdgcn_s_getpc is provided to allow a specific style of position
	// independent code to determine the high part of its address when it is
	// known (through convention) that the code and any data of interest does
	// not cross a 4Gb address boundary. Use for any other purpose may not
	// produce the desired results as optimizations may cause code movement,
	// especially as we explicitly use IntrNoMem to allow optimizations.
	def int_amdgcn_s_getpc :
	GCCBuiltin<"__builtin_amdgcn_s_getpc">,
	Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>;

	// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
	// param values: 0 = P10, 1 = P20, 2 = P0
	def int_amdgcn_interp_mov :
	GCCBuiltin<"__builtin_amdgcn_interp_mov">,
	Intrinsic<[llvm_float_ty],
	[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]>;

	// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
	// This intrinsic reads from lds, but the memory values are constant,
	// so it behaves like IntrNoMem.
	def int_amdgcn_interp_p1 :
	GCCBuiltin<"__builtin_amdgcn_interp_p1">,
	Intrinsic<[llvm_float_ty],
	[llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]>;

	// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
	def int_amdgcn_interp_p2 :
	GCCBuiltin<"__builtin_amdgcn_interp_p2">,
	Intrinsic<[llvm_float_ty],
	[llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]>;
	// See int_amdgcn_interp_p1 for why this is IntrNoMem.

	// Pixel shaders only: whether the current pixel is live (i.e. not a helper
	// invocation for derivative computation).
	def int_amdgcn_ps_live : Intrinsic <
	[llvm_i1_ty],
	[],
	[IntrNoMem]>;

	def int_amdgcn_mbcnt_lo :
	GCCBuiltin<"__builtin_amdgcn_mbcnt_lo">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;

	def int_amdgcn_mbcnt_hi :
	GCCBuiltin<"__builtin_amdgcn_mbcnt_hi">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;

	// llvm.amdgcn.ds.swizzle src offset
	def int_amdgcn_ds_swizzle :
	GCCBuiltin<"__builtin_amdgcn_ds_swizzle">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

	// ubfe / sbfe are overloaded on an integer type: the result is
	// llvm_anyint_ty and the first operand is tied to it via LLVMMatchType<0>;
	// the two remaining operands are always i32. Both are IntrNoMem and
	// IntrSpeculatable and have no GCC builtin attached.
	// NOTE(review): the names suggest unsigned/signed bitfield extract with
	// (offset, width) operands -- confirm against the backend lowering.
	def int_amdgcn_ubfe : Intrinsic<[llvm_anyint_ty],
	[LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_sbfe : Intrinsic<[llvm_anyint_ty],
	[LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	// Signature: (i32, i32, i32) -> i32. Declared IntrNoMem and
	// IntrSpeculatable; mapped to __builtin_amdgcn_lerp.
	def int_amdgcn_lerp :
	GCCBuiltin<"__builtin_amdgcn_lerp">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	// The four 32-bit SAD-family intrinsics below share one shape:
	// (i32, i32, i32) -> i32, IntrNoMem and IntrSpeculatable, each mapped to a
	// builtin of the same name.
	// NOTE(review): the third operand is presumably an accumulator -- confirm
	// against the ISA documentation.
	def int_amdgcn_sad_u8 :
	GCCBuiltin<"__builtin_amdgcn_sad_u8">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_msad_u8 :
	GCCBuiltin<"__builtin_amdgcn_msad_u8">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_sad_hi_u8 :
	GCCBuiltin<"__builtin_amdgcn_sad_hi_u8">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_sad_u16 :
	GCCBuiltin<"__builtin_amdgcn_sad_u16">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	// Wide SAD-family intrinsics. qsad_pk_u16_u8 and mqsad_pk_u16_u8 are
	// (i64, i32, i64) -> i64; mqsad_u32_u8 is (i64, i32, v4i32) -> v4i32. In all
	// three the last operand has the same type as the result. All are IntrNoMem
	// and IntrSpeculatable, each mapped to a builtin of the same name.
	def int_amdgcn_qsad_pk_u16_u8 :
	GCCBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
	Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_mqsad_pk_u16_u8 :
	GCCBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
	Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_mqsad_u32_u8 :
	GCCBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
	Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	// Signature: (float, i32, i32) -> i32. Declared IntrNoMem and
	// IntrSpeculatable; mapped to __builtin_amdgcn_cvt_pk_u8_f32.
	def int_amdgcn_cvt_pk_u8_f32 :
	GCCBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
	Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	// icmp / fcmp return an i64 and are overloaded on the compared type (any
	// integer for icmp, any floating-point type for fcmp). The second operand
	// is tied to the first via LLVMMatchType<0>; the third is always i32. Both
	// are IntrNoMem and IntrConvergent and have no GCC builtin attached.
	// NOTE(review): the trailing i32 presumably selects the comparison
	// predicate and the i64 is presumably a per-lane mask -- confirm.
	def int_amdgcn_icmp :
	Intrinsic<[llvm_i64_ty], [llvm_anyint_ty, LLVMMatchType<0>, llvm_i32_ty],
	[IntrNoMem, IntrConvergent]>;

	def int_amdgcn_fcmp :
	Intrinsic<[llvm_i64_ty], [llvm_anyfloat_ty, LLVMMatchType<0>, llvm_i32_ty],
	[IntrNoMem, IntrConvergent]>;

	// Signature: (i32) -> i32. Declared IntrNoMem and IntrConvergent; mapped to
	// __builtin_amdgcn_readfirstlane.
	def int_amdgcn_readfirstlane :
	GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

	// The lane argument must be uniform across the currently active threads of the
	// current wave. Otherwise, the result is undefined.
	// Signature: (i32, i32) -> i32. Declared IntrNoMem and IntrConvergent.
	// NOTE(review): presumably the second operand is the lane index -- confirm.
	def int_amdgcn_readlane :
	GCCBuiltin<"__builtin_amdgcn_readlane">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

	// alignbit / alignbyte share one shape: (i32, i32, i32) -> i32, IntrNoMem
	// and IntrSpeculatable. Neither has a GCC builtin attached.
	def int_amdgcn_alignbit : Intrinsic<[llvm_i32_ty],
	[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;

	def int_amdgcn_alignbyte : Intrinsic<[llvm_i32_ty],
	[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem, IntrSpeculatable]
	>;


	// Copies the source value to the destination value, with the guarantee that
	// the source value is computed as if the entire program were executed in WQM.
	// Overloaded on any type: the single operand is tied to the result type via
	// LLVMMatchType<0>. Declared IntrNoMem and IntrSpeculatable.
	def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
	[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;

	// Return true if at least one thread within the pixel quad passes true into
	// the function.
	// Signature: (i1) -> i1. Declared IntrNoMem and IntrConvergent.
	def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty],
	[llvm_i1_ty], [IntrNoMem, IntrConvergent]
	>;

	// If false, set EXEC=0 for the current thread until the end of program.
	// Signature: (i1) -> void. The property list is empty, so none of the
	// memory or speculation relaxations used elsewhere in this file apply.
	def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], []>;

	// Copies the active channels of the source value to the destination value,
	// with the guarantee that the source value is computed as if the entire
	// program were executed in Whole Wavefront Mode, i.e. with all channels
	// enabled, with a few exceptions: - Phi nodes which require WWM return an
	// undefined value.
	// Overloaded on any type: the single operand is tied to the result type via
	// LLVMMatchType<0>. Declared IntrNoMem and IntrSpeculatable.
	def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
	[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;

	// Given a value, copies it while setting all the inactive lanes to a given
	// value. Note that OpenGL helper lanes are considered active, so if the
	// program ever uses WQM, then the instruction and the first source will be
	// computed in WQM.
	// Overloaded on an integer type; both operands are tied to the result type
	// via LLVMMatchType<0>. Declared IntrNoMem and IntrConvergent.
	def int_amdgcn_set_inactive :
	Intrinsic<[llvm_anyint_ty],
	[LLVMMatchType<0>, // value to be copied
	LLVMMatchType<0>], // value for the inactive lanes to take
	[IntrNoMem, IntrConvergent]>;

	//===----------------------------------------------------------------------===//
	// CI+ Intrinsics
	//===----------------------------------------------------------------------===//

	// Both CI+ intrinsics below take no operands, return nothing, and list no
	// properties; each is mapped to a builtin of the same name.
	def int_amdgcn_s_dcache_inv_vol :
	GCCBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
	Intrinsic<[], [], []>;

	def int_amdgcn_buffer_wbinvl1_vol :
	GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
	Intrinsic<[], [], []>;

	//===----------------------------------------------------------------------===//
	// VI Intrinsics
	//===----------------------------------------------------------------------===//

	// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
	// The definition is overloaded on an integer type (llvm_anyint_ty); the
	// .i32 spelling above is one instance. <src> is tied to the result type via
	// LLVMMatchType<0>, the three control/mask operands are i32 and
	// <bound_ctrl> is i1. Declared IntrNoMem and IntrConvergent.
	def int_amdgcn_mov_dpp :
	Intrinsic<[llvm_anyint_ty],
	[LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
	llvm_i1_ty], [IntrNoMem, IntrConvergent]>;

	// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
	// Should be equivalent to:
	// v_mov_b32 <dest> <old>
	// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
	// The definition is overloaded on an integer type (llvm_anyint_ty); <old>
	// and <src> are both tied to the result type via LLVMMatchType<0>, the three
	// control/mask operands are i32 and <bound_ctrl> is i1. Declared IntrNoMem
	// and IntrConvergent.
	def int_amdgcn_update_dpp :
	Intrinsic<[llvm_anyint_ty],
	[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty,
	llvm_i32_ty, llvm_i1_ty], [IntrNoMem, IntrConvergent]>;

	// s_dcache_wb / s_dcache_wb_vol take no operands, return nothing, and list
	// no properties; each is mapped to a builtin of the same name.
	def int_amdgcn_s_dcache_wb :
	GCCBuiltin<"__builtin_amdgcn_s_dcache_wb">,
	Intrinsic<[], [], []>;

	def int_amdgcn_s_dcache_wb_vol :
	GCCBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">,
	Intrinsic<[], [], []>;

	// Signature: () -> i64. Declared IntrReadMem (not IntrNoMem); mapped to
	// __builtin_amdgcn_s_memrealtime.
	def int_amdgcn_s_memrealtime :
	GCCBuiltin<"__builtin_amdgcn_s_memrealtime">,
	Intrinsic<[llvm_i64_ty], [], [IntrReadMem]>;

	// llvm.amdgcn.ds.permute <index> <src>
	// Signature: (i32 index, i32 src) -> i32. Declared IntrNoMem and
	// IntrConvergent.
	def int_amdgcn_ds_permute :
	GCCBuiltin<"__builtin_amdgcn_ds_permute">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

	// llvm.amdgcn.ds.bpermute <index> <src>
	// Same signature and properties as llvm.amdgcn.ds.permute.
	def int_amdgcn_ds_bpermute :
	GCCBuiltin<"__builtin_amdgcn_ds_bpermute">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;


	//===----------------------------------------------------------------------===//
	// Special Intrinsics for backend internal use only. No frontend
	// should emit calls to these.
	//===----------------------------------------------------------------------===//
	// Control-flow intrinsics. None has a GCC builtin attached and all are
	// IntrConvergent. Only break, if_break and else_break additionally list
	// IntrNoMem.
	// NOTE(review): the i64 values are presumably saved/accumulated exec masks
	// -- confirm against the control-flow lowering pass.

	// (i1) -> {i1, i64}
	def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_i64_ty],
	[llvm_i1_ty], [IntrConvergent]
	>;

	// (i64) -> {i1, i64}
	def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_i64_ty],
	[llvm_i64_ty], [IntrConvergent]
	>;

	// (i64) -> i64
	def int_amdgcn_break : Intrinsic<[llvm_i64_ty],
	[llvm_i64_ty], [IntrNoMem, IntrConvergent]
	>;

	// (i1, i64) -> i64
	def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty],
	[llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
	>;

	// (i64, i64) -> i64
	def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty],
	[llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
	>;

	// (i64) -> i1
	def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
	[llvm_i64_ty], [IntrConvergent]
	>;

	// (i64) -> void
	def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;

	// Represent unreachable in a divergent region.
	// Signature: () -> void. Declared IntrConvergent only.
	def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>;

	// Emit 2.5 ulp, no denormal division. Should only be inserted by a
	// pass based on !fpmath metadata.
	// Signature: (float, float) -> float. Declared IntrNoMem and
	// IntrSpeculatable.
	def int_amdgcn_fdiv_fast : Intrinsic<
	[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
	[IntrNoMem, IntrSpeculatable]
	>;
	}
	Index: head/contrib/llvm/include/llvm/IR/IntrinsicsX86.td
	===================================================================
	--- head/contrib/llvm/include/llvm/IR/IntrinsicsX86.td (revision 329409)
	+++ head/contrib/llvm/include/llvm/IR/IntrinsicsX86.td (revision 329410)
	@@ -1,6511 +1,6520 @@
	//===- IntrinsicsX86.td - Defines X86 intrinsics ------------ tablegen --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines all of the X86-specific intrinsics.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Interrupt traps
	// int_x86_int takes a single i8 operand and returns nothing. No property
	// list is given, so the Intrinsic class default applies.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_int : Intrinsic<[], [llvm_i8_ty]>;
	}

	//===----------------------------------------------------------------------===//
	// SEH intrinsics for Windows
	// Four SEH helper intrinsics, all operating on llvm_ptr_ty values and none
	// mapped to a GCC builtin. seh_lsda and seh_recoverfp are IntrNoMem; the
	// two marker intrinsics have an empty property list.
	let TargetPrefix = "x86" in {
	// Signature: (ptr) -> ptr.
	def int_x86_seh_lsda : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty], [IntrNoMem]>;

	// Marks the EH registration node created in LLVM IR prior to code generation.
	def int_x86_seh_ehregnode : Intrinsic<[], [llvm_ptr_ty], []>;

	// Marks the EH guard slot node created in LLVM IR prior to code generation.
	def int_x86_seh_ehguard : Intrinsic<[], [llvm_ptr_ty], []>;

	// Given a pointer to the end of an EH registration object, returns the true
	// parent frame address that can be used with llvm.localrecover.
	// Signature: (ptr, ptr) -> ptr.
	def int_x86_seh_recoverfp : Intrinsic<[llvm_ptr_ty],
	[llvm_ptr_ty, llvm_ptr_ty],
	[IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// FLAGS.
	// EFLAGS accessors. The read variants take no operands and return i32/i64;
	// the write variants take one i32/i64 operand and return nothing. All four
	// have an empty property list.
	let TargetPrefix = "x86" in {
	def int_x86_flags_read_u32 : GCCBuiltin<"__builtin_ia32_readeflags_u32">,
	Intrinsic<[llvm_i32_ty], [], []>;
	def int_x86_flags_read_u64 : GCCBuiltin<"__builtin_ia32_readeflags_u64">,
	Intrinsic<[llvm_i64_ty], [], []>;
	def int_x86_flags_write_u32 : GCCBuiltin<"__builtin_ia32_writeeflags_u32">,
	Intrinsic<[], [llvm_i32_ty], []>;
	def int_x86_flags_write_u64 : GCCBuiltin<"__builtin_ia32_writeeflags_u64">,
	Intrinsic<[], [llvm_i64_ty], []>;
	}

	//===----------------------------------------------------------------------===//
	// Read Time Stamp Counter.
	// rdtsc: () -> i64 with an empty property list.
	// rdtscp: (ptr) -> i64, declared IntrArgMemOnly.
	let TargetPrefix = "x86" in {
	def int_x86_rdtsc : GCCBuiltin<"__builtin_ia32_rdtsc">,
	Intrinsic<[llvm_i64_ty], [], []>;
	def int_x86_rdtscp : GCCBuiltin<"__builtin_ia32_rdtscp">,
	Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], [IntrArgMemOnly]>;
	}

	// Read Performance-Monitoring Counter.
	// rdpmc: (i32) -> i64 with an empty property list.
	let TargetPrefix = "x86" in {
	def int_x86_rdpmc : GCCBuiltin<"__builtin_ia32_rdpmc">,
	Intrinsic<[llvm_i64_ty], [llvm_i32_ty], []>;
	}

	//===----------------------------------------------------------------------===//
	// CET SS
	// Twelve CET shadow-stack intrinsics. Every one has an empty property list
	// and a GCC builtin of the matching name. In the d/q-suffixed pairs the
	// integer operand (and the result, for rdssp*) is i32 for "d" and i64 for
	// "q". Only rdsspd/rdsspq return a value.
	let TargetPrefix = "x86" in {
	def int_x86_incsspd : GCCBuiltin<"__builtin_ia32_incsspd">,
	Intrinsic<[], [llvm_i32_ty], []>;
	def int_x86_incsspq : GCCBuiltin<"__builtin_ia32_incsspq">,
	Intrinsic<[], [llvm_i64_ty], []>;
	def int_x86_rdsspd : GCCBuiltin<"__builtin_ia32_rdsspd">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
	def int_x86_rdsspq : GCCBuiltin<"__builtin_ia32_rdsspq">,
	Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>;
	def int_x86_saveprevssp : GCCBuiltin<"__builtin_ia32_saveprevssp">,
	Intrinsic<[], [], []>;
	def int_x86_rstorssp : GCCBuiltin<"__builtin_ia32_rstorssp">,
	Intrinsic<[], [llvm_ptr_ty], []>;
	def int_x86_wrssd : GCCBuiltin<"__builtin_ia32_wrssd">,
	Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], []>;
	def int_x86_wrssq : GCCBuiltin<"__builtin_ia32_wrssq">,
	Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty], []>;
	def int_x86_wrussd : GCCBuiltin<"__builtin_ia32_wrussd">,
	Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], []>;
	def int_x86_wrussq : GCCBuiltin<"__builtin_ia32_wrussq">,
	Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty], []>;
	def int_x86_setssbsy : GCCBuiltin<"__builtin_ia32_setssbsy">,
	Intrinsic<[], [], []>;
	def int_x86_clrssbsy : GCCBuiltin<"__builtin_ia32_clrssbsy">,
	Intrinsic<[], [llvm_ptr_ty], []>;
	}

	//===----------------------------------------------------------------------===//
	// 3DNow!

	// Every 3DNow! intrinsic in this block returns x86mmx, takes only x86mmx
	// operands, is IntrNoMem, and has a GCC builtin. pf2id, pfrcp, pfrsqrt and
	// pi2fd are unary; all the others take two operands.
	let TargetPrefix = "x86" in {
	def int_x86_3dnow_pavgusb : GCCBuiltin<"__builtin_ia32_pavgusb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pf2id : GCCBuiltin<"__builtin_ia32_pf2id">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_3dnow_pfacc : GCCBuiltin<"__builtin_ia32_pfacc">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfadd : GCCBuiltin<"__builtin_ia32_pfadd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfcmpeq : GCCBuiltin<"__builtin_ia32_pfcmpeq">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfcmpge : GCCBuiltin<"__builtin_ia32_pfcmpge">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfcmpgt : GCCBuiltin<"__builtin_ia32_pfcmpgt">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfmax : GCCBuiltin<"__builtin_ia32_pfmax">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfmin : GCCBuiltin<"__builtin_ia32_pfmin">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfmul : GCCBuiltin<"__builtin_ia32_pfmul">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfrcp : GCCBuiltin<"__builtin_ia32_pfrcp">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_3dnow_pfrcpit1 : GCCBuiltin<"__builtin_ia32_pfrcpit1">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfrcpit2 : GCCBuiltin<"__builtin_ia32_pfrcpit2">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfrsqrt : GCCBuiltin<"__builtin_ia32_pfrsqrt">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_3dnow_pfrsqit1 : GCCBuiltin<"__builtin_ia32_pfrsqit1">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfsub : GCCBuiltin<"__builtin_ia32_pfsub">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pfsubr : GCCBuiltin<"__builtin_ia32_pfsubr">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnow_pi2fd : GCCBuiltin<"__builtin_ia32_pi2fd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_3dnow_pmulhrw : GCCBuiltin<"__builtin_ia32_pmulhrw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// 3DNow! extensions

	// 3DNow! extension intrinsics: all return x86mmx, take only x86mmx operands
	// and are IntrNoMem. pfnacc and pfpnacc are binary; the rest are unary.
	// pswapd is the only one without a GCC builtin.
	let TargetPrefix = "x86" in {
	def int_x86_3dnowa_pf2iw : GCCBuiltin<"__builtin_ia32_pf2iw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_3dnowa_pfnacc : GCCBuiltin<"__builtin_ia32_pfnacc">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnowa_pfpnacc : GCCBuiltin<"__builtin_ia32_pfpnacc">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_3dnowa_pi2fw : GCCBuiltin<"__builtin_ia32_pi2fw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_3dnowa_pswapd :
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// SSE1

	// Arithmetic ops
	// All intrinsics in this block operate on and return v4f32, are IntrNoMem,
	// and have a GCC builtin. sqrt/rcp/rsqrt are unary; min/max are binary.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_sse_sqrt_ps : GCCBuiltin<"__builtin_ia32_sqrtps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_sse_rcp_ss : GCCBuiltin<"__builtin_ia32_rcpss">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_sse_rcp_ps : GCCBuiltin<"__builtin_ia32_rcpps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_sse_rsqrt_ss : GCCBuiltin<"__builtin_ia32_rsqrtss">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_sse_rsqrt_ps : GCCBuiltin<"__builtin_ia32_rsqrtps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_sse_min_ss : GCCBuiltin<"__builtin_ia32_minss">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_min_ps : GCCBuiltin<"__builtin_ia32_minps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_max_ss : GCCBuiltin<"__builtin_ia32_maxss">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_max_ps : GCCBuiltin<"__builtin_ia32_maxps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	}

	// Comparison ops
	// cmp_ss / cmp_ps: (v4f32, v4f32, i8) -> v4f32; cmp_ps has no GCC builtin.
	// The twelve comi*/ucomi* intrinsics: (v4f32, v4f32) -> i32.
	// Everything in this block is IntrNoMem.
	// NOTE(review): the i8 operand of cmp_ss/cmp_ps is presumably the
	// comparison-predicate immediate -- confirm.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse_cmp_ss : GCCBuiltin<"__builtin_ia32_cmpss">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_sse_cmp_ps :
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_sse_comieq_ss : GCCBuiltin<"__builtin_ia32_comieq">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_comilt_ss : GCCBuiltin<"__builtin_ia32_comilt">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_comile_ss : GCCBuiltin<"__builtin_ia32_comile">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_comigt_ss : GCCBuiltin<"__builtin_ia32_comigt">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_comige_ss : GCCBuiltin<"__builtin_ia32_comige">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_comineq_ss : GCCBuiltin<"__builtin_ia32_comineq">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_ucomieq_ss : GCCBuiltin<"__builtin_ia32_ucomieq">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_ucomilt_ss : GCCBuiltin<"__builtin_ia32_ucomilt">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_ucomile_ss : GCCBuiltin<"__builtin_ia32_ucomile">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_ucomigt_ss : GCCBuiltin<"__builtin_ia32_ucomigt">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_ucomige_ss : GCCBuiltin<"__builtin_ia32_ucomige">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_ucomineq_ss : GCCBuiltin<"__builtin_ia32_ucomineq">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	}


	// Conversion ops
	// SSE1 conversions, all IntrNoMem.
	//  - cvt(t)ss2si / cvt(t)ss2si64: (v4f32) -> i32 / i64.
	//  - cvtsi2ss / cvtsi642ss: (v4f32, i32 / i64) -> v4f32; no GCC builtin and
	//    flagged for removal by the TODOs below.
	//  - cvt(t)ps2pi: (v4f32) -> x86mmx; cvtpi2ps: (v4f32, x86mmx) -> v4f32.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse_cvtss2si : GCCBuiltin<"__builtin_ia32_cvtss2si">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_cvtss2si64 : GCCBuiltin<"__builtin_ia32_cvtss2si64">,
	Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_cvttss2si : GCCBuiltin<"__builtin_ia32_cvttss2si">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_cvttss2si64 : GCCBuiltin<"__builtin_ia32_cvttss2si64">,
	Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_cvtsi2ss : // TODO: Remove this intrinsic.
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse_cvtsi642ss : // TODO: Remove this intrinsic.
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_i64_ty], [IntrNoMem]>;

	def int_x86_sse_cvtps2pi : GCCBuiltin<"__builtin_ia32_cvtps2pi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_cvttps2pi: GCCBuiltin<"__builtin_ia32_cvttps2pi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_cvtpi2ps : GCCBuiltin<"__builtin_ia32_cvtpi2ps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	}

	// Cacheability support ops
	// sfence: no operands, no result, empty property list.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse_sfence : GCCBuiltin<"__builtin_ia32_sfence">,
	Intrinsic<[], [], []>;
	}

	// Control register.
	// stmxcsr / ldmxcsr: each takes a single pointer operand, returns nothing,
	// has an empty property list, and has no GCC builtin attached.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse_stmxcsr :
	Intrinsic<[], [llvm_ptr_ty], []>;
	def int_x86_sse_ldmxcsr :
	Intrinsic<[], [llvm_ptr_ty], []>;
	}

	// Misc.
	// movmsk_ps: (v4f32) -> i32, IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse_movmsk_ps : GCCBuiltin<"__builtin_ia32_movmskps">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// SSE2

	// FP arithmetic ops
	// All intrinsics in this block operate on and return v2f64, are IntrNoMem,
	// and have a GCC builtin. sqrt is unary; min/max are binary.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse2_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_sse2_sqrt_pd : GCCBuiltin<"__builtin_ia32_sqrtpd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_sse2_min_sd : GCCBuiltin<"__builtin_ia32_minsd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_min_pd : GCCBuiltin<"__builtin_ia32_minpd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_max_sd : GCCBuiltin<"__builtin_ia32_maxsd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_max_pd : GCCBuiltin<"__builtin_ia32_maxpd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	}

	// FP comparison ops
	// cmp_sd / cmp_pd: (v2f64, v2f64, i8) -> v2f64; cmp_pd has no GCC builtin.
	// The twelve comi*/ucomi* intrinsics: (v2f64, v2f64) -> i32.
	// Everything in this block is IntrNoMem.
	// NOTE(review): the i8 operand of cmp_sd/cmp_pd is presumably the
	// comparison-predicate immediate -- confirm.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse2_cmp_sd : GCCBuiltin<"__builtin_ia32_cmpsd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_sse2_cmp_pd :
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_sse2_comieq_sd : GCCBuiltin<"__builtin_ia32_comisdeq">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_comilt_sd : GCCBuiltin<"__builtin_ia32_comisdlt">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_comile_sd : GCCBuiltin<"__builtin_ia32_comisdle">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_comigt_sd : GCCBuiltin<"__builtin_ia32_comisdgt">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_comige_sd : GCCBuiltin<"__builtin_ia32_comisdge">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_comineq_sd : GCCBuiltin<"__builtin_ia32_comisdneq">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_ucomieq_sd : GCCBuiltin<"__builtin_ia32_ucomisdeq">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_ucomilt_sd : GCCBuiltin<"__builtin_ia32_ucomisdlt">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_ucomile_sd : GCCBuiltin<"__builtin_ia32_ucomisdle">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_ucomigt_sd : GCCBuiltin<"__builtin_ia32_ucomisdgt">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_ucomige_sd : GCCBuiltin<"__builtin_ia32_ucomisdge">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_ucomineq_sd : GCCBuiltin<"__builtin_ia32_ucomisdneq">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	}

	// Integer arithmetic ops.
	// SSE2 integer arithmetic; all binary, all IntrNoMem, all with a GCC
	// builtin.
	//  - padds/paddus, pmulhu_w, pmulh_w, pmulu_dq, pmadd_wd and psad_bw are
	//    additionally marked Commutative; the psubs/psubus family is not.
	//  - padds/paddus/psubs/psubus and pmulh(u)_w return the operand type.
	//  - pmulu_dq: (v4i32, v4i32) -> v2i64; pmadd_wd: (v8i16, v8i16) -> v4i32;
	//    psad_bw: (v16i8, v16i8) -> v2i64.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb128">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty], [IntrNoMem, Commutative]>;
	def int_x86_sse2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem, Commutative]>;
	def int_x86_sse2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb128">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty], [IntrNoMem, Commutative]>;
	def int_x86_sse2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem, Commutative]>;
	def int_x86_sse2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb128">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_sse2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_sse2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb128">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_sse2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_sse2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem, Commutative]>;
	def int_x86_sse2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem, Commutative]>;
	def int_x86_sse2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty,
	llvm_v4i32_ty], [IntrNoMem, Commutative]>;
	def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem, Commutative]>;
	def int_x86_sse2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty], [IntrNoMem, Commutative]>;
	}

	// Integer shift ops.
	// SSE2 integer shifts, all IntrNoMem with a GCC builtin. Two groups:
	//  - psll/psrl/psra: the second operand is a vector of the same type as the
	//    first operand and the result.
	//  - pslli/psrli/psrai: the second operand is a scalar i32.
	// psll/psrl (and their *i forms) exist for w/d/q element widths; psra and
	// psrai are defined only for w and d in this block.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse2_psll_w : GCCBuiltin<"__builtin_ia32_psllw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_sse2_psll_d : GCCBuiltin<"__builtin_ia32_pslld128">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_sse2_psll_q : GCCBuiltin<"__builtin_ia32_psllq128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
	llvm_v2i64_ty], [IntrNoMem]>;
	def int_x86_sse2_psrl_w : GCCBuiltin<"__builtin_ia32_psrlw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_sse2_psrl_d : GCCBuiltin<"__builtin_ia32_psrld128">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_sse2_psrl_q : GCCBuiltin<"__builtin_ia32_psrlq128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
	llvm_v2i64_ty], [IntrNoMem]>;
	def int_x86_sse2_psra_w : GCCBuiltin<"__builtin_ia32_psraw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_sse2_psra_d : GCCBuiltin<"__builtin_ia32_psrad128">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;

	def int_x86_sse2_pslli_w : GCCBuiltin<"__builtin_ia32_psllwi128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse2_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi128">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse2_pslli_q : GCCBuiltin<"__builtin_ia32_psllqi128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse2_psrli_w : GCCBuiltin<"__builtin_ia32_psrlwi128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse2_psrli_d : GCCBuiltin<"__builtin_ia32_psrldi128">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse2_psrli_q : GCCBuiltin<"__builtin_ia32_psrlqi128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse2_psrai_w : GCCBuiltin<"__builtin_ia32_psrawi128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse2_psrai_d : GCCBuiltin<"__builtin_ia32_psradi128">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	}

	// Conversion ops
	// SSE2 conversions, all IntrNoMem.
	//  - cvtsi2sd, cvtsi642sd and cvtss2sd have no GCC builtin and are flagged
	//    for removal by the TODOs below; every other def has a builtin.
	//  - The last three defs (cvtpd2pi, cvttpd2pi, cvtpi2pd) convert between
	//    v2f64 and x86mmx and use the int_x86_sse_ name prefix rather than
	//    int_x86_sse2_.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse2_cvtdq2ps : GCCBuiltin<"__builtin_ia32_cvtdq2ps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_sse2_cvtpd2dq : GCCBuiltin<"__builtin_ia32_cvtpd2dq">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_cvttpd2dq : GCCBuiltin<"__builtin_ia32_cvttpd2dq">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_cvtpd2ps : GCCBuiltin<"__builtin_ia32_cvtpd2ps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">,
	Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_cvttsd2si : GCCBuiltin<"__builtin_ia32_cvttsd2si">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_cvttsd2si64 : GCCBuiltin<"__builtin_ia32_cvttsd2si64">,
	Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_cvtsi2sd : // TODO: Remove this intrinsic.
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse2_cvtsi642sd : // TODO: Remove this intrinsic.
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_i64_ty], [IntrNoMem]>;
	def int_x86_sse2_cvtsd2ss : GCCBuiltin<"__builtin_ia32_cvtsd2ss">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_cvtss2sd : // TODO: Remove this intrinsic.
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse_cvtpd2pi : GCCBuiltin<"__builtin_ia32_cvtpd2pi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse_cvttpd2pi: GCCBuiltin<"__builtin_ia32_cvttpd2pi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse_cvtpi2pd : GCCBuiltin<"__builtin_ia32_cvtpi2pd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
	}

	// Misc.
	// SSE2 miscellaneous intrinsics, all with a GCC builtin.
	//  - pack*: two same-typed vector operands, result has narrower elements
	//    (v8i16 -> v16i8, v4i32 -> v8i16); IntrNoMem.
	//  - movmsk_pd / pmovmskb_128: one vector operand -> i32; IntrNoMem.
	//  - maskmov_dqu (v16i8, v16i8, ptr), clflush (ptr), lfence, mfence and
	//    pause return nothing and have an empty property list.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse2_packsswb_128 : GCCBuiltin<"__builtin_ia32_packsswb128">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_sse2_packssdw_128 : GCCBuiltin<"__builtin_ia32_packssdw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_sse2_packuswb_128 : GCCBuiltin<"__builtin_ia32_packuswb128">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_sse2_movmsk_pd : GCCBuiltin<"__builtin_ia32_movmskpd">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse2_pmovmskb_128 : GCCBuiltin<"__builtin_ia32_pmovmskb128">,
	Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_sse2_maskmov_dqu : GCCBuiltin<"__builtin_ia32_maskmovdqu">,
	Intrinsic<[], [llvm_v16i8_ty,
	llvm_v16i8_ty, llvm_ptr_ty], []>;
	def int_x86_sse2_clflush : GCCBuiltin<"__builtin_ia32_clflush">,
	Intrinsic<[], [llvm_ptr_ty], []>;
	def int_x86_sse2_lfence : GCCBuiltin<"__builtin_ia32_lfence">,
	Intrinsic<[], [], []>;
	def int_x86_sse2_mfence : GCCBuiltin<"__builtin_ia32_mfence">,
	Intrinsic<[], [], []>;
	def int_x86_sse2_pause : GCCBuiltin<"__builtin_ia32_pause">,
	Intrinsic<[], [], []>;
	}

	//===----------------------------------------------------------------------===//
	// SSE3

	// Addition / subtraction ops.
	// addsub_ps: (v4f32, v4f32) -> v4f32; addsub_pd: (v2f64, v2f64) -> v2f64.
	// Both are IntrNoMem and have a GCC builtin.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse3_addsub_ps : GCCBuiltin<"__builtin_ia32_addsubps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse3_addsub_pd : GCCBuiltin<"__builtin_ia32_addsubpd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	}

	// Horizontal ops.
	// SSE3 horizontal add/sub: binary intrinsics whose operands and result
	// share one vector type (v4f32 for _ps, v2f64 for _pd); all IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse3_hadd_ps : GCCBuiltin<"__builtin_ia32_haddps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse3_hadd_pd : GCCBuiltin<"__builtin_ia32_haddpd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_sse3_hsub_ps : GCCBuiltin<"__builtin_ia32_hsubps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_sse3_hsub_pd : GCCBuiltin<"__builtin_ia32_hsubpd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	}

	// Specialized unaligned load.
	// SSE3 lddqu: takes a pointer and yields v16i8. Declared IntrReadMem
	// (not IntrNoMem) because it reads through the pointer operand.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse3_ldu_dq : GCCBuiltin<"__builtin_ia32_lddqu">,
	Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty], [IntrReadMem]>;
	}

	// Thread synchronization ops.
	// SSE3 monitor/mwait: no result and an empty attribute list, so neither is
	// declared IntrNoMem. monitor takes (ptr, i32, i32); mwait takes (i32, i32).
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse3_monitor : GCCBuiltin<"__builtin_ia32_monitor">,
	Intrinsic<[], [llvm_ptr_ty,
	llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_sse3_mwait : GCCBuiltin<"__builtin_ia32_mwait">,
	Intrinsic<[], [llvm_i32_ty,
	llvm_i32_ty], []>;
	}

	//===----------------------------------------------------------------------===//
	// SSSE3

	// Horizontal arithmetic ops
	// SSSE3 horizontal arithmetic. Every operation appears twice: an unsuffixed
	// form on llvm_x86mmx_ty and a _128 form on a 128-bit integer vector.
	// All are binary and IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_ssse3_phadd_w : GCCBuiltin<"__builtin_ia32_phaddw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_ssse3_phadd_w_128 : GCCBuiltin<"__builtin_ia32_phaddw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;

	def int_x86_ssse3_phadd_d : GCCBuiltin<"__builtin_ia32_phaddd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_ssse3_phadd_d_128 : GCCBuiltin<"__builtin_ia32_phaddd128">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;

	def int_x86_ssse3_phadd_sw : GCCBuiltin<"__builtin_ia32_phaddsw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_ssse3_phadd_sw_128 : GCCBuiltin<"__builtin_ia32_phaddsw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;

	def int_x86_ssse3_phsub_w : GCCBuiltin<"__builtin_ia32_phsubw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_ssse3_phsub_w_128 : GCCBuiltin<"__builtin_ia32_phsubw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;

	def int_x86_ssse3_phsub_d : GCCBuiltin<"__builtin_ia32_phsubd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_ssse3_phsub_d_128 : GCCBuiltin<"__builtin_ia32_phsubd128">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;

	def int_x86_ssse3_phsub_sw : GCCBuiltin<"__builtin_ia32_phsubsw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_ssse3_phsub_sw_128 : GCCBuiltin<"__builtin_ia32_phsubsw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;

	// pmadd_ub_sw_128 is the only entry here whose result type (v8i16) differs
	// from its operand type (v16i8).
	def int_x86_ssse3_pmadd_ub_sw : GCCBuiltin<"__builtin_ia32_pmaddubsw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_ssse3_pmadd_ub_sw_128 : GCCBuiltin<"__builtin_ia32_pmaddubsw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty], [IntrNoMem]>;
	}

	// Packed multiply high with round and scale
	// SSSE3 pmulhrsw in MMX and 128-bit forms. Both are flagged Commutative in
	// addition to IntrNoMem, so the two operands may be swapped.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_ssse3_pmul_hr_sw : GCCBuiltin<"__builtin_ia32_pmulhrsw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_ssse3_pmul_hr_sw_128 : GCCBuiltin<"__builtin_ia32_pmulhrsw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem, Commutative]>;
	}

	// Shuffle ops
	// Shuffle intrinsics. pshuf_b takes two vector operands of the same type;
	// pshuf_w takes one MMX operand plus an i8. Note that pshuf_w is named with
	// the "sse" prefix rather than "ssse3" even though it is grouped here.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_ssse3_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_ssse3_pshuf_b_128 : GCCBuiltin<"__builtin_ia32_pshufb128">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_sse_pshuf_w : GCCBuiltin<"__builtin_ia32_pshufw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i8_ty],
	[IntrNoMem]>;
	}

	// Sign ops
	// SSSE3 psign for byte, word and dword elements. Each width has an MMX form
	// (llvm_x86mmx_ty) and a _128 form (v16i8 / v8i16 / v4i32); all are binary
	// and IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_ssse3_psign_b : GCCBuiltin<"__builtin_ia32_psignb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_ssse3_psign_b_128 : GCCBuiltin<"__builtin_ia32_psignb128">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty], [IntrNoMem]>;

	def int_x86_ssse3_psign_w : GCCBuiltin<"__builtin_ia32_psignw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_ssse3_psign_w_128 : GCCBuiltin<"__builtin_ia32_psignw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;

	def int_x86_ssse3_psign_d : GCCBuiltin<"__builtin_ia32_psignd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_ssse3_psign_d_128 : GCCBuiltin<"__builtin_ia32_psignd128">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;
	}

	// Absolute value ops
	// SSSE3 pabs: unary intrinsics on llvm_x86mmx_ty only; no 128-bit forms are
	// defined in this group. All IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_ssse3_pabs_b : GCCBuiltin<"__builtin_ia32_pabsb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;

	def int_x86_ssse3_pabs_w : GCCBuiltin<"__builtin_ia32_pabsw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;

	def int_x86_ssse3_pabs_d : GCCBuiltin<"__builtin_ia32_pabsd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// SSE4.1

	// FP rounding ops
	// SSE4.1 rounding. The scalar forms (_ss, _sd) take two vector operands plus
	// an i32; the packed forms (_ps, _pd) take one vector operand plus an i32.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse41_round_ss : GCCBuiltin<"__builtin_ia32_roundss">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse41_round_ps : GCCBuiltin<"__builtin_ia32_roundps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse41_round_sd : GCCBuiltin<"__builtin_ia32_roundsd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_sse41_round_pd : GCCBuiltin<"__builtin_ia32_roundpd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	}

	// Vector min element
	// SSE4.1 phminposuw: unary, v8i16 in and v8i16 out; IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse41_phminposuw : GCCBuiltin<"__builtin_ia32_phminposuw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty],
	[IntrNoMem]>;
	}

	// Advanced Encryption Standard (AES) Instructions
	// AES intrinsics. aesenc, aesenclast, aesdec and aesdeclast each come in
	// three widths: unsuffixed (v2i64), _256 (v4i64) and _512 (v8i64). aesimc
	// and aeskeygenassist are defined for v2i64 only. All IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_aesni_aesimc : GCCBuiltin<"__builtin_ia32_aesimc128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty],
	[IntrNoMem]>;

	def int_x86_aesni_aesenc : GCCBuiltin<"__builtin_ia32_aesenc128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_aesni_aesenc_256 : GCCBuiltin<"__builtin_ia32_aesenc256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty],
	[IntrNoMem]>;
	def int_x86_aesni_aesenc_512 : GCCBuiltin<"__builtin_ia32_aesenc512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty],
	[IntrNoMem]>;

	def int_x86_aesni_aesenclast : GCCBuiltin<"__builtin_ia32_aesenclast128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_aesni_aesenclast_256 :
	GCCBuiltin<"__builtin_ia32_aesenclast256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty],
	[IntrNoMem]>;
	def int_x86_aesni_aesenclast_512 :
	GCCBuiltin<"__builtin_ia32_aesenclast512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty],
	[IntrNoMem]>;

	def int_x86_aesni_aesdec : GCCBuiltin<"__builtin_ia32_aesdec128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_aesni_aesdec_256 : GCCBuiltin<"__builtin_ia32_aesdec256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty],
	[IntrNoMem]>;
	def int_x86_aesni_aesdec_512 : GCCBuiltin<"__builtin_ia32_aesdec512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty],
	[IntrNoMem]>;

	def int_x86_aesni_aesdeclast : GCCBuiltin<"__builtin_ia32_aesdeclast128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_aesni_aesdeclast_256 :
	GCCBuiltin<"__builtin_ia32_aesdeclast256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty],
	[IntrNoMem]>;
	def int_x86_aesni_aesdeclast_512 :
	GCCBuiltin<"__builtin_ia32_aesdeclast512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty],
	[IntrNoMem]>;

	// Takes a v2i64 plus an i8 second operand.
	def int_x86_aesni_aeskeygenassist :
	GCCBuiltin<"__builtin_ia32_aeskeygenassist128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;
	}

	// PCLMUL instructions
	// PCLMULQDQ in 128-, 256- and 512-bit widths (v2i64 / v4i64 / v8i64). Each
	// takes two vectors of the result type plus a trailing i8; all IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_pclmulqdq : GCCBuiltin<"__builtin_ia32_pclmulqdq128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_pclmulqdq_256 : GCCBuiltin<"__builtin_ia32_pclmulqdq256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_pclmulqdq_512 : GCCBuiltin<"__builtin_ia32_pclmulqdq512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
	[IntrNoMem]>;
	}

	// Vector pack
	// SSE4.1 packusdw: two v4i32 operands in, one v8i16 out; IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse41_packusdw : GCCBuiltin<"__builtin_ia32_packusdw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	}

	// Vector multiply
	// SSE4.1 pmuldq: two v4i32 operands in, one v2i64 out. Flagged Commutative
	// in addition to IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse41_pmuldq : GCCBuiltin<"__builtin_ia32_pmuldq128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrNoMem, Commutative]>;
	}

	// Vector insert
	// SSE4.1 insertps: two v4f32 operands plus a trailing i8; returns v4f32.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse41_insertps : GCCBuiltin<"__builtin_ia32_insertps128">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	}

	// Vector blend
	// SSE4.1 variable blends: three operands, all of the same vector type as
	// the result (v16i8, v2f64 or v4f32); all IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse41_pblendvb : GCCBuiltin<"__builtin_ia32_pblendvb128">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_v16i8_ty],
	[IntrNoMem]>;
	def int_x86_sse41_blendvpd : GCCBuiltin<"__builtin_ia32_blendvpd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_sse41_blendvps : GCCBuiltin<"__builtin_ia32_blendvps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,llvm_v4f32_ty],
	[IntrNoMem]>;
	}

	// Vector dot product
	// SSE4.1 dot products: two vector operands plus a trailing i8. Both are
	// flagged Commutative in addition to IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse41_dppd : GCCBuiltin<"__builtin_ia32_dppd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem, Commutative]>;
	def int_x86_sse41_dpps : GCCBuiltin<"__builtin_ia32_dpps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem, Commutative]>;
	}

	// Vector sum of absolute differences
	// SSE4.1 mpsadbw: two v16i8 operands plus an i8 immediate; returns v8i16.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// Deliberately NOT marked Commutative. MPSADBW treats its vector operands
	// asymmetrically: the immediate selects one fixed 4-byte block from the
	// second source and a run of sliding 4-byte windows from the first source,
	// so swapping the operands changes the result.
	def int_x86_sse41_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw128">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_i8_ty],
	[IntrNoMem]>;
	}

	// Test instruction with bitwise comparison.
	// SSE4.1 ptest variants (z / c / nzc): two v2i64 operands in, a single i32
	// out; all IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse41_ptestz : GCCBuiltin<"__builtin_ia32_ptestz128">,
	Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_sse41_ptestc : GCCBuiltin<"__builtin_ia32_ptestc128">,
	Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_sse41_ptestnzc : GCCBuiltin<"__builtin_ia32_ptestnzc128">,
	Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// SSE4.2

	// Miscellaneous
	// CRC Instruction
	// SSE4.2 CRC32. Names follow crc32_<A>_<B>: the result and first operand are
	// A bits wide (i32 or i64) and the second operand is B bits wide
	// (i8 / i16 / i32 / i64). All IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse42_crc32_32_8 : GCCBuiltin<"__builtin_ia32_crc32qi">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_crc32_32_16 : GCCBuiltin<"__builtin_ia32_crc32hi">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_sse42_crc32_32_32 : GCCBuiltin<"__builtin_ia32_crc32si">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_sse42_crc32_64_64 : GCCBuiltin<"__builtin_ia32_crc32di">,
	Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
	[IntrNoMem]>;
	}

	// String/text processing ops.
	// SSE4.2 string/text comparison intrinsics; all IntrNoMem.
	//  - pcmpistr*: operands are (v16i8, v16i8, i8).
	//  - pcmpestr*: operands are (v16i8, i32, v16i8, i32, i8), i.e. each vector
	//    is followed by an i32.
	//  - The *m128 forms return v16i8; every other form returns i32.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse42_pcmpistrm128 : GCCBuiltin<"__builtin_ia32_pcmpistrm128">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpistri128 : GCCBuiltin<"__builtin_ia32_pcmpistri128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpistria128 : GCCBuiltin<"__builtin_ia32_pcmpistria128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpistric128 : GCCBuiltin<"__builtin_ia32_pcmpistric128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpistrio128 : GCCBuiltin<"__builtin_ia32_pcmpistrio128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpistris128 : GCCBuiltin<"__builtin_ia32_pcmpistris128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpistriz128 : GCCBuiltin<"__builtin_ia32_pcmpistriz128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpestrm128 : GCCBuiltin<"__builtin_ia32_pcmpestrm128">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
	llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpestri128 : GCCBuiltin<"__builtin_ia32_pcmpestri128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
	llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpestria128 : GCCBuiltin<"__builtin_ia32_pcmpestria128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
	llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpestric128 : GCCBuiltin<"__builtin_ia32_pcmpestric128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
	llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpestrio128 : GCCBuiltin<"__builtin_ia32_pcmpestrio128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
	llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpestris128 : GCCBuiltin<"__builtin_ia32_pcmpestris128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
	llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sse42_pcmpestriz128 : GCCBuiltin<"__builtin_ia32_pcmpestriz128">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty,
	llvm_i8_ty],
	[IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// SSE4A

	// SSE4A extract/insert. The "i"-suffixed forms (extrqi, insertqi) take two
	// trailing i8 operands; the unsuffixed forms take only vector operands.
	// All return v2i64 and are IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_sse4a_extrqi : GCCBuiltin<"__builtin_ia32_extrqi">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	// Note the second operand is v16i8, not v2i64.
	def int_x86_sse4a_extrq : GCCBuiltin<"__builtin_ia32_extrq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v16i8_ty], [IntrNoMem]>;

	def int_x86_sse4a_insertqi : GCCBuiltin<"__builtin_ia32_insertqi">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_i8_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_sse4a_insertq : GCCBuiltin<"__builtin_ia32_insertq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// AVX

	// Arithmetic ops
	// AVX 256-bit arithmetic. _pd_256 forms use v4f64 and _ps_256 forms use
	// v8f32; all IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// Binary: two operands of the result type.
	def int_x86_avx_addsub_pd_256 : GCCBuiltin<"__builtin_ia32_addsubpd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
	llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_addsub_ps_256 : GCCBuiltin<"__builtin_ia32_addsubps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
	llvm_v8f32_ty], [IntrNoMem]>;
	def int_x86_avx_max_pd_256 : GCCBuiltin<"__builtin_ia32_maxpd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
	llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_max_ps_256 : GCCBuiltin<"__builtin_ia32_maxps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
	llvm_v8f32_ty], [IntrNoMem]>;
	def int_x86_avx_min_pd_256 : GCCBuiltin<"__builtin_ia32_minpd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
	llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_min_ps_256 : GCCBuiltin<"__builtin_ia32_minps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
	llvm_v8f32_ty], [IntrNoMem]>;

	// Unary: one operand of the result type. rsqrt and rcp are defined for
	// the _ps_256 form only.
	def int_x86_avx_sqrt_pd_256 : GCCBuiltin<"__builtin_ia32_sqrtpd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_sqrt_ps_256 : GCCBuiltin<"__builtin_ia32_sqrtps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;

	def int_x86_avx_rsqrt_ps_256 : GCCBuiltin<"__builtin_ia32_rsqrtps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;

	def int_x86_avx_rcp_ps_256 : GCCBuiltin<"__builtin_ia32_rcpps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;

	// Round: one vector operand plus a trailing i32.
	def int_x86_avx_round_pd_256 : GCCBuiltin<"__builtin_ia32_roundpd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx_round_ps_256 : GCCBuiltin<"__builtin_ia32_roundps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	}

	// Horizontal ops
	// AVX 256-bit horizontal add/sub: binary, operands and result share one
	// type (v4f64 for _pd_256, v8f32 for _ps_256); all IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx_hadd_pd_256 : GCCBuiltin<"__builtin_ia32_haddpd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
	llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_hsub_ps_256 : GCCBuiltin<"__builtin_ia32_hsubps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
	llvm_v8f32_ty], [IntrNoMem]>;
	def int_x86_avx_hsub_pd_256 : GCCBuiltin<"__builtin_ia32_hsubpd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
	llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_hadd_ps_256 : GCCBuiltin<"__builtin_ia32_haddps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
	llvm_v8f32_ty], [IntrNoMem]>;
	}

	// Vector permutation
	// Vector permutation intrinsics; all IntrNoMem. This group mixes AVX
	// (int_x86_avx_*) and AVX-512 (int_x86_avx512_*) definitions:
	//  - vpermilvar: (data vector, integer index vector of the same lane count).
	//  - mask_vpermi2var / mask_vpermt2var / maskz_vpermt2var: three vector
	//    operands followed by a trailing scalar integer (i8/i16/i32/i64),
	//    presumably the write mask given the _mask/_maskz builtin names.
	//  - pshuf_b_512: two v64i8 operands.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// AVX vpermilvar, 128-bit and 256-bit.
	def int_x86_avx_vpermilvar_pd : GCCBuiltin<"__builtin_ia32_vpermilvarpd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_v2i64_ty], [IntrNoMem]>;
	def int_x86_avx_vpermilvar_ps : GCCBuiltin<"__builtin_ia32_vpermilvarps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;

	def int_x86_avx_vpermilvar_pd_256 :
	GCCBuiltin<"__builtin_ia32_vpermilvarpd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4i64_ty], [IntrNoMem]>;
	def int_x86_avx_vpermilvar_ps_256 :
	GCCBuiltin<"__builtin_ia32_vpermilvarps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty], [IntrNoMem]>;

	// AVX-512 mask_vpermi2var. In the floating-point (_pd/_ps) forms the
	// integer vector is the SECOND operand.
	def int_x86_avx512_mask_vpermi2var_d_128 :
	GCCBuiltin<"__builtin_ia32_vpermi2vard128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_d_256 :
	GCCBuiltin<"__builtin_ia32_vpermi2vard256_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_d_512 :
	GCCBuiltin<"__builtin_ia32_vpermi2vard512_mask">,
	Intrinsic<[llvm_v16i32_ty],
	[llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_hi_128 :
	GCCBuiltin<"__builtin_ia32_vpermi2varhi128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_hi_256 :
	GCCBuiltin<"__builtin_ia32_vpermi2varhi256_mask">,
	Intrinsic<[llvm_v16i16_ty],
	[llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_hi_512 :
	GCCBuiltin<"__builtin_ia32_vpermi2varhi512_mask">,
	Intrinsic<[llvm_v32i16_ty],
	[llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_pd_128 :
	GCCBuiltin<"__builtin_ia32_vpermi2varpd128_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_pd_256 :
	GCCBuiltin<"__builtin_ia32_vpermi2varpd256_mask">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_pd_512 :
	GCCBuiltin<"__builtin_ia32_vpermi2varpd512_mask">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8i64_ty, llvm_v8f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_ps_128 :
	GCCBuiltin<"__builtin_ia32_vpermi2varps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_ps_256 :
	GCCBuiltin<"__builtin_ia32_vpermi2varps256_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_ps_512 :
	GCCBuiltin<"__builtin_ia32_vpermi2varps512_mask">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16i32_ty, llvm_v16f32_ty, llvm_i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_q_128 :
	GCCBuiltin<"__builtin_ia32_vpermi2varq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_q_256 :
	GCCBuiltin<"__builtin_ia32_vpermi2varq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_q_512 :
	GCCBuiltin<"__builtin_ia32_vpermi2varq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	// AVX-512 mask_vpermt2var / maskz_vpermt2var. In the floating-point
	// (_pd/_ps) forms the integer vector is the FIRST operand, unlike the
	// vpermi2var definitions above.
	def int_x86_avx512_mask_vpermt2var_d_512:
	GCCBuiltin<"__builtin_ia32_vpermt2vard512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_q_512:
	GCCBuiltin<"__builtin_ia32_vpermt2varq512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_ps_512:
	GCCBuiltin<"__builtin_ia32_vpermt2varps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty,
	llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_pd_512:
	GCCBuiltin<"__builtin_ia32_vpermt2varpd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8i64_ty,
	llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_d_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2vard128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_d_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2vard128_maskz">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_d_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2vard256_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_d_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2vard256_maskz">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_d_512 :
	GCCBuiltin<"__builtin_ia32_vpermt2vard512_maskz">,
	Intrinsic<[llvm_v16i32_ty],
	[llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_hi_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2varhi128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_hi_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2varhi128_maskz">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_hi_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2varhi256_mask">,
	Intrinsic<[llvm_v16i16_ty],
	[llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_hi_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2varhi256_maskz">,
	Intrinsic<[llvm_v16i16_ty],
	[llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_hi_512 :
	GCCBuiltin<"__builtin_ia32_vpermt2varhi512_mask">,
	Intrinsic<[llvm_v32i16_ty],
	[llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_hi_512 :
	GCCBuiltin<"__builtin_ia32_vpermt2varhi512_maskz">,
	Intrinsic<[llvm_v32i16_ty],
	[llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_pd_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2varpd128_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2i64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_pd_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2varpd128_maskz">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2i64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_pd_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2varpd256_mask">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4i64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_pd_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2varpd256_maskz">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4i64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_pd_512 :
	GCCBuiltin<"__builtin_ia32_vpermt2varpd512_maskz">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8i64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_ps_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2varps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4i32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_ps_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2varps128_maskz">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4i32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_ps_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2varps256_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8i32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_ps_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2varps256_maskz">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8i32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_ps_512 :
	GCCBuiltin<"__builtin_ia32_vpermt2varps512_maskz">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_q_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2varq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_q_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2varq128_maskz">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_q_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2varq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_q_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2varq256_maskz">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_q_512 :
	GCCBuiltin<"__builtin_ia32_vpermt2varq512_maskz">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	// Byte-element (qi) vpermi2var / vpermt2var. The trailing integer has one
	// bit per vector element: i16 for v16i8, i32 for v32i8, i64 for v64i8.
	def int_x86_avx512_mask_vpermi2var_qi_128 :
	GCCBuiltin<"__builtin_ia32_vpermi2varqi128_mask">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_qi_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2varqi128_mask">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_qi_128 :
	GCCBuiltin<"__builtin_ia32_vpermt2varqi128_maskz">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_qi_256 :
	GCCBuiltin<"__builtin_ia32_vpermi2varqi256_mask">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_qi_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2varqi256_mask">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_qi_256 :
	GCCBuiltin<"__builtin_ia32_vpermt2varqi256_maskz">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermi2var_qi_512 :
	GCCBuiltin<"__builtin_ia32_vpermi2varqi512_mask">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
	llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vpermt2var_qi_512 :
	GCCBuiltin<"__builtin_ia32_vpermt2varqi512_mask">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
	llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vpermt2var_qi_512 :
	GCCBuiltin<"__builtin_ia32_vpermt2varqi512_maskz">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
	llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
	[IntrNoMem]>;

	// 512-bit vpermilvar and pshufb; these take no trailing scalar operand.
	def int_x86_avx512_vpermilvar_pd_512 :
	GCCBuiltin<"__builtin_ia32_vpermilvarpd512">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8i64_ty],
	[IntrNoMem]>;

	def int_x86_avx512_vpermilvar_ps_512 :
	GCCBuiltin<"__builtin_ia32_vpermilvarps512">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_pshuf_b_512 :
	GCCBuiltin<"__builtin_ia32_pshufb512">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
	[IntrNoMem]>;

	}

	// GFNI Instructions
	// GFNI intrinsics in 128/256/512-bit widths (v16i8 / v32i8 / v64i8); all
	// IntrNoMem. The two affine families take two vectors plus a trailing i8;
	// vgf2p8mulb takes two vectors only.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_vgf2p8affineinvqb_128 :
	GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v16qi">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_vgf2p8affineinvqb_256 :
	GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v32qi">,
	Intrinsic<[llvm_v32i8_ty],
	[llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_vgf2p8affineinvqb_512 :
	GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v64qi">,
	Intrinsic<[llvm_v64i8_ty],
	[llvm_v64i8_ty, llvm_v64i8_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_vgf2p8affineqb_128 :
	GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v16qi">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_vgf2p8affineqb_256 :
	GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v32qi">,
	Intrinsic<[llvm_v32i8_ty],
	[llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_vgf2p8affineqb_512 :
	GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v64qi">,
	Intrinsic<[llvm_v64i8_ty],
	[llvm_v64i8_ty, llvm_v64i8_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_vgf2p8mulb_128 :
	GCCBuiltin<"__builtin_ia32_vgf2p8mulb_v16qi">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty],
	[IntrNoMem]>;
	def int_x86_vgf2p8mulb_256 :
	GCCBuiltin<"__builtin_ia32_vgf2p8mulb_v32qi">,
	Intrinsic<[llvm_v32i8_ty],
	[llvm_v32i8_ty, llvm_v32i8_ty],
	[IntrNoMem]>;
	def int_x86_vgf2p8mulb_512 :
	GCCBuiltin<"__builtin_ia32_vgf2p8mulb_v64qi">,
	Intrinsic<[llvm_v64i8_ty],
	[llvm_v64i8_ty, llvm_v64i8_ty],
	[IntrNoMem]>;
	}

	// Vector blend
	// AVX blendv on v4f64 and v8f32: three operands of the result type, one
	// result, no memory access.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx_blendv_pd_256 : GCCBuiltin<"__builtin_ia32_blendvpd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
	llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_blendv_ps_256 : GCCBuiltin<"__builtin_ia32_blendvps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
	llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
	}

	// Vector dot product
	// Single def: two v8f32 operands plus an i8 operand, v8f32 result. It is
	// marked Commutative in addition to IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx_dp_ps_256 : GCCBuiltin<"__builtin_ia32_dpps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
	llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, Commutative]>;
	}

	// Vector compare
	// 256-bit compares: two vectors plus an i8 operand, result has the operand
	// vector type. Unlike the surrounding defs, these two carry no GCCBuiltin
	// binding.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx_cmp_pd_256 :
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
	llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx_cmp_ps_256 :
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
	llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
	}

	// Vector convert
	// Single-operand AVX conversions between i32, f32 and f64 vectors. The
	// f64-source forms produce a four-element result (v4f32 / v4i32); the
	// others keep eight elements. All are IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx_cvtdq2_ps_256 : GCCBuiltin<"__builtin_ia32_cvtdq2ps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8i32_ty], [IntrNoMem]>;
	def int_x86_avx_cvt_pd2_ps_256 : GCCBuiltin<"__builtin_ia32_cvtpd2ps256">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
	def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
	}

	// Vector bit test
	// vtest{z,c,nzc} and ptest{z,c,nzc}: two same-typed vector operands, i32
	// result, IntrNoMem. The block also hosts the AVX-512 fpclass defs (see the
	// note further down).
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx_vtestz_pd : GCCBuiltin<"__builtin_ia32_vtestzpd">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_avx_vtestc_pd : GCCBuiltin<"__builtin_ia32_vtestcpd">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_avx_vtestnzc_pd : GCCBuiltin<"__builtin_ia32_vtestnzcpd">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty,
	llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_avx_vtestz_ps : GCCBuiltin<"__builtin_ia32_vtestzps">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_avx_vtestc_ps : GCCBuiltin<"__builtin_ia32_vtestcps">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_avx_vtestnzc_ps : GCCBuiltin<"__builtin_ia32_vtestnzcps">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty,
	llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_avx_vtestz_pd_256 : GCCBuiltin<"__builtin_ia32_vtestzpd256">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty,
	llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_vtestc_pd_256 : GCCBuiltin<"__builtin_ia32_vtestcpd256">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty,
	llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_vtestnzc_pd_256 : GCCBuiltin<"__builtin_ia32_vtestnzcpd256">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty,
	llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_vtestz_ps_256 : GCCBuiltin<"__builtin_ia32_vtestzps256">,
	Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty,
	llvm_v8f32_ty], [IntrNoMem]>;
	def int_x86_avx_vtestc_ps_256 : GCCBuiltin<"__builtin_ia32_vtestcps256">,
	Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty,
	llvm_v8f32_ty], [IntrNoMem]>;
	def int_x86_avx_vtestnzc_ps_256 : GCCBuiltin<"__builtin_ia32_vtestnzcps256">,
	Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty,
	llvm_v8f32_ty], [IntrNoMem]>;
	def int_x86_avx_ptestz_256 : GCCBuiltin<"__builtin_ia32_ptestz256">,
	Intrinsic<[llvm_i32_ty], [llvm_v4i64_ty,
	llvm_v4i64_ty], [IntrNoMem]>;
	def int_x86_avx_ptestc_256 : GCCBuiltin<"__builtin_ia32_ptestc256">,
	Intrinsic<[llvm_i32_ty], [llvm_v4i64_ty,
	llvm_v4i64_ty], [IntrNoMem]>;
	def int_x86_avx_ptestnzc_256 : GCCBuiltin<"__builtin_ia32_ptestnzc256">,
	Intrinsic<[llvm_i32_ty], [llvm_v4i64_ty,
	llvm_v4i64_ty], [IntrNoMem]>;

	// AVX-512 fpclass: one floating-point vector, an i32 operand and a trailing
	// integer operand whose type matches the result. Result/trailing type is
	// i8 everywhere except ps_512, which uses i16 for its sixteen elements.
	def int_x86_avx512_mask_fpclass_pd_128 :
	GCCBuiltin<"__builtin_ia32_fpclasspd128_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_fpclass_pd_256 :
	GCCBuiltin<"__builtin_ia32_fpclasspd256_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v4f64_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_fpclass_pd_512 :
	GCCBuiltin<"__builtin_ia32_fpclasspd512_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_fpclass_ps_128 :
	GCCBuiltin<"__builtin_ia32_fpclassps128_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_fpclass_ps_256 :
	GCCBuiltin<"__builtin_ia32_fpclassps256_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_fpclass_ps_512 :
	GCCBuiltin<"__builtin_ia32_fpclassps512_mask">,
	Intrinsic<[llvm_i16_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_fpclass_sd :
	GCCBuiltin<"__builtin_ia32_fpclasssd_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_fpclass_ss :
	GCCBuiltin<"__builtin_ia32_fpclassss_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	}

	// Vector extract sign mask
	// movmsk on v4f64 and v8f32: one vector operand, i32 result, IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx_movmsk_pd_256 : GCCBuiltin<"__builtin_ia32_movmskpd256">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_avx_movmsk_ps_256 : GCCBuiltin<"__builtin_ia32_movmskps256">,
	Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
	}

	// Vector zero
	// vzeroall / vzeroupper have no results and no operands. Their property
	// list is empty, so they are NOT marked IntrNoMem -- presumably so that
	// they are treated as having side effects and are not removed; confirm
	// before adding any property here.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx_vzeroall : GCCBuiltin<"__builtin_ia32_vzeroall">,
	Intrinsic<[], [], []>;
	def int_x86_avx_vzeroupper : GCCBuiltin<"__builtin_ia32_vzeroupper">,
	Intrinsic<[], [], []>;
	}

	// SIMD load ops
	// lddqu256: takes a pointer, returns v32i8, and is marked IntrReadMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx_ldu_dq_256 : GCCBuiltin<"__builtin_ia32_lddqu256">,
	Intrinsic<[llvm_v32i8_ty], [llvm_ptr_ty], [IntrReadMem]>;
	}

	// Conditional load ops
	// AVX maskload: operands are a pointer and an integer vector with the same
	// element count and width as the floating-point result. Marked IntrReadMem
	// and IntrArgMemOnly.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx_maskload_pd : GCCBuiltin<"__builtin_ia32_maskloadpd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2i64_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx_maskload_ps : GCCBuiltin<"__builtin_ia32_maskloadps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx_maskload_pd_256 : GCCBuiltin<"__builtin_ia32_maskloadpd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4i64_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx_maskload_ps_256 : GCCBuiltin<"__builtin_ia32_maskloadps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	}

	// Conditional store ops
	// AVX maskstore: no result; operands are a pointer, an integer vector and
	// a floating-point vector of matching shape. Marked IntrArgMemOnly only
	// (no IntrNoMem / IntrReadMem), consistent with writing through the pointer.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx_maskstore_pd : GCCBuiltin<"__builtin_ia32_maskstorepd">,
	Intrinsic<[], [llvm_ptr_ty,
	llvm_v2i64_ty, llvm_v2f64_ty], [IntrArgMemOnly]>;
	def int_x86_avx_maskstore_ps : GCCBuiltin<"__builtin_ia32_maskstoreps">,
	Intrinsic<[], [llvm_ptr_ty,
	llvm_v4i32_ty, llvm_v4f32_ty], [IntrArgMemOnly]>;
	def int_x86_avx_maskstore_pd_256 :
	GCCBuiltin<"__builtin_ia32_maskstorepd256">,
	Intrinsic<[], [llvm_ptr_ty,
	llvm_v4i64_ty, llvm_v4f64_ty], [IntrArgMemOnly]>;
	def int_x86_avx_maskstore_ps_256 :
	GCCBuiltin<"__builtin_ia32_maskstoreps256">,
	Intrinsic<[], [llvm_ptr_ty,
	llvm_v8i32_ty, llvm_v8f32_ty], [IntrArgMemOnly]>;

	// AVX-512 masked scalar store: pointer, v4f32 and an i8 operand.
	def int_x86_avx512_mask_store_ss :
	GCCBuiltin<"__builtin_ia32_storess_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	}

	// BITALG bits shuffle
	// vpshufbitqmb: two byte vectors plus a trailing integer operand. The result
	// and the trailing operand are integers with one bit per byte element
	// (i16 / i32 / i64 for v16i8 / v32i8 / v64i8).
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx512_mask_vpshufbitqmb_128 :
	GCCBuiltin<"__builtin_ia32_vpshufbitqmb128_mask">,
	Intrinsic<[llvm_i16_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_vpshufbitqmb_256 :
	GCCBuiltin<"__builtin_ia32_vpshufbitqmb256_mask">,
	Intrinsic<[llvm_i32_ty],
	[llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_vpshufbitqmb_512 :
	GCCBuiltin<"__builtin_ia32_vpshufbitqmb512_mask">,
	Intrinsic<[llvm_i64_ty],
	[llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
	[IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// AVX2

	// Integer arithmetic ops.
	// 256-bit padds/paddus/psubs/psubus, pmulh/pmulhu, pmul_dq/pmulu_dq,
	// pmadd_wd and psad_bw. The add, multiply, multiply-add and psad defs are
	// marked Commutative; the psubs/psubus defs are not. pmul(u)_dq, pmadd_wd
	// and psad_bw return wider elements than their operands.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb256">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty], [IntrNoMem, Commutative]>;
	def int_x86_avx2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem, Commutative]>;
	def int_x86_avx2_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb256">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty], [IntrNoMem, Commutative]>;
	def int_x86_avx2_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem, Commutative]>;
	def int_x86_avx2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb256">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty], [IntrNoMem]>;
	def int_x86_avx2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem]>;
	def int_x86_avx2_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb256">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty], [IntrNoMem]>;
	def int_x86_avx2_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem]>;
	def int_x86_avx2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem, Commutative]>;
	def int_x86_avx2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem, Commutative]>;
	def int_x86_avx2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty,
	llvm_v8i32_ty], [IntrNoMem, Commutative]>;
	def int_x86_avx2_pmul_dq : GCCBuiltin<"__builtin_ia32_pmuldq256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty,
	llvm_v8i32_ty], [IntrNoMem, Commutative]>;
	def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem, Commutative]>;
	def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty], [IntrNoMem, Commutative]>;
	}

	// Integer shift ops.
	// Uniform shifts for AVX2 (256-bit) and AVX-512 (128/256/512-bit), plus the
	// masked vpmultishiftqb defs at the end. All are IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// AVX2 psll/psrl/psra: the second operand is a 128-bit vector
	// (v8i16 / v4i32 / v2i64) regardless of the 256-bit data type.
	def int_x86_avx2_psll_w : GCCBuiltin<"__builtin_ia32_psllw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_avx2_psll_d : GCCBuiltin<"__builtin_ia32_pslld256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_avx2_psll_q : GCCBuiltin<"__builtin_ia32_psllq256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
	llvm_v2i64_ty], [IntrNoMem]>;
	def int_x86_avx2_psrl_w : GCCBuiltin<"__builtin_ia32_psrlw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_avx2_psrl_d : GCCBuiltin<"__builtin_ia32_psrld256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_avx2_psrl_q : GCCBuiltin<"__builtin_ia32_psrlq256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
	llvm_v2i64_ty], [IntrNoMem]>;
	def int_x86_avx2_psra_w : GCCBuiltin<"__builtin_ia32_psraw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_avx2_psra_d : GCCBuiltin<"__builtin_ia32_psrad256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;

	// AVX2 pslli/psrli/psrai: the second operand is an i32 scalar.
	def int_x86_avx2_pslli_w : GCCBuiltin<"__builtin_ia32_psllwi256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx2_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx2_pslli_q : GCCBuiltin<"__builtin_ia32_psllqi256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx2_psrli_w : GCCBuiltin<"__builtin_ia32_psrlwi256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx2_psrli_d : GCCBuiltin<"__builtin_ia32_psrldi256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx2_psrli_q : GCCBuiltin<"__builtin_ia32_psrlqi256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx2_psrai_w : GCCBuiltin<"__builtin_ia32_psrawi256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx2_psrai_d : GCCBuiltin<"__builtin_ia32_psradi256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_i32_ty], [IntrNoMem]>;

	// psra on 64-bit elements for 128/256-bit vectors lives under the avx512
	// prefix (the avx2 list above has no psra_q). Second operand is v2i64.
	def int_x86_avx512_psra_q_128 : GCCBuiltin<"__builtin_ia32_psraq128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
	llvm_v2i64_ty], [IntrNoMem]>;
	def int_x86_avx512_psra_q_256 : GCCBuiltin<"__builtin_ia32_psraq256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
	llvm_v2i64_ty], [IntrNoMem]>;

	// Same operation with an i32 scalar as the second operand.
	def int_x86_avx512_psrai_q_128 : GCCBuiltin<"__builtin_ia32_psraqi128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_psrai_q_256 : GCCBuiltin<"__builtin_ia32_psraqi256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
	llvm_i32_ty], [IntrNoMem]>;

	// 512-bit psll/psrl/psra: the second operand is still a 128-bit vector.
	def int_x86_avx512_psll_w_512 : GCCBuiltin<"__builtin_ia32_psllw512">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_avx512_psll_d_512 : GCCBuiltin<"__builtin_ia32_pslld512">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_avx512_psll_q_512 : GCCBuiltin<"__builtin_ia32_psllq512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
	llvm_v2i64_ty], [IntrNoMem]>;
	def int_x86_avx512_psrl_w_512 : GCCBuiltin<"__builtin_ia32_psrlw512">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_avx512_psrl_d_512 : GCCBuiltin<"__builtin_ia32_psrld512">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_avx512_psrl_q_512 : GCCBuiltin<"__builtin_ia32_psrlq512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
	llvm_v2i64_ty], [IntrNoMem]>;
	def int_x86_avx512_psra_w_512 : GCCBuiltin<"__builtin_ia32_psraw512">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
	llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_avx512_psra_d_512 : GCCBuiltin<"__builtin_ia32_psrad512">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
	llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_avx512_psra_q_512 : GCCBuiltin<"__builtin_ia32_psraq512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
	llvm_v2i64_ty], [IntrNoMem]>;

	// 512-bit pslli/psrli/psrai: the second operand is an i32 scalar.
	def int_x86_avx512_pslli_w_512 : GCCBuiltin<"__builtin_ia32_psllwi512">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_pslli_d_512 : GCCBuiltin<"__builtin_ia32_pslldi512">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_pslli_q_512 : GCCBuiltin<"__builtin_ia32_psllqi512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_psrli_w_512 : GCCBuiltin<"__builtin_ia32_psrlwi512">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_psrli_d_512 : GCCBuiltin<"__builtin_ia32_psrldi512">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_psrli_q_512 : GCCBuiltin<"__builtin_ia32_psrlqi512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_psrai_w_512 : GCCBuiltin<"__builtin_ia32_psrawi512">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_psrai_d_512 : GCCBuiltin<"__builtin_ia32_psradi512">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_psrai_q_512 : GCCBuiltin<"__builtin_ia32_psraqi512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
	llvm_i32_ty], [IntrNoMem]>;

	// vpmultishiftqb: three byte vectors plus a trailing integer with one bit
	// per byte element (i16 / i32 / i64).
	def int_x86_avx512_mask_pmultishift_qb_128:
	GCCBuiltin<"__builtin_ia32_vpmultishiftqb128_mask">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pmultishift_qb_256:
	GCCBuiltin<"__builtin_ia32_vpmultishiftqb256_mask">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pmultishift_qb_512:
	GCCBuiltin<"__builtin_ia32_vpmultishiftqb512_mask">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
	llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
	}

	// Pack ops.
	// 256-bit pack intrinsics: two operands of one integer vector type; the
	// result has twice as many elements at half the element width
	// (v16i16 x2 -> v32i8, v8i32 x2 -> v16i16). Not marked Commutative.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx2_packsswb : GCCBuiltin<"__builtin_ia32_packsswb256">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem]>;
	def int_x86_avx2_packssdw : GCCBuiltin<"__builtin_ia32_packssdw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty,
	llvm_v8i32_ty], [IntrNoMem]>;
	def int_x86_avx2_packuswb : GCCBuiltin<"__builtin_ia32_packuswb256">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem]>;
	def int_x86_avx2_packusdw : GCCBuiltin<"__builtin_ia32_packusdw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty,
	llvm_v8i32_ty], [IntrNoMem]>;
	}

	// Horizontal arithmetic ops
	// 256-bit phadd/phsub (w, d, sw): two operands and a result of the same
	// type. pmadd_ub_sw is also kept in this block; it takes two v32i8 operands
	// and returns v16i16. None of these is marked Commutative.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx2_phadd_w : GCCBuiltin<"__builtin_ia32_phaddw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem]>;
	def int_x86_avx2_phadd_d : GCCBuiltin<"__builtin_ia32_phaddd256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_v8i32_ty], [IntrNoMem]>;
	def int_x86_avx2_phadd_sw : GCCBuiltin<"__builtin_ia32_phaddsw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem]>;
	def int_x86_avx2_phsub_w : GCCBuiltin<"__builtin_ia32_phsubw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem]>;
	def int_x86_avx2_phsub_d : GCCBuiltin<"__builtin_ia32_phsubd256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_v8i32_ty], [IntrNoMem]>;
	def int_x86_avx2_phsub_sw : GCCBuiltin<"__builtin_ia32_phsubsw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem]>;
	def int_x86_avx2_pmadd_ub_sw : GCCBuiltin<"__builtin_ia32_pmaddubsw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty], [IntrNoMem]>;
	}

	// Sign ops
	// 256-bit psign for i8, i16 and i32 elements: two operands and a result of
	// the same vector type, IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx2_psign_b : GCCBuiltin<"__builtin_ia32_psignb256">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty], [IntrNoMem]>;
	def int_x86_avx2_psign_w : GCCBuiltin<"__builtin_ia32_psignw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem]>;
	def int_x86_avx2_psign_d : GCCBuiltin<"__builtin_ia32_psignd256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_v8i32_ty], [IntrNoMem]>;
	}

	// Packed multiply high with round and scale
	// The AVX2 def takes two v16i16 operands and is marked Commutative. The
	// three avx512 "mask" defs add a third vector operand and a trailing integer
	// with one bit per i16 element (i8 / i16 / i32); they are IntrNoMem only.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx2_pmul_hr_sw : GCCBuiltin<"__builtin_ia32_pmulhrsw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty], [IntrNoMem, Commutative]>;
	def int_x86_avx512_mask_pmul_hr_sw_128 : GCCBuiltin<"__builtin_ia32_pmulhrsw128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pmul_hr_sw_256 : GCCBuiltin<"__builtin_ia32_pmulhrsw256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pmul_hr_sw_512 : GCCBuiltin<"__builtin_ia32_pmulhrsw512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
	}

	// Vector blend
	// AVX2 pblendvb: three v32i8 operands, v32i8 result, IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx2_pblendvb : GCCBuiltin<"__builtin_ia32_pblendvb256">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
	llvm_v32i8_ty], [IntrNoMem]>;
	}


	// Vector permutation
	// permd / permps: eight-element data vector (v8i32 or v8f32) plus a v8i32
	// second operand in both cases; the result has the data vector's type.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx2_permd : GCCBuiltin<"__builtin_ia32_permvarsi256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty],
	[IntrNoMem]>;
	def int_x86_avx2_permps : GCCBuiltin<"__builtin_ia32_permvarsf256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty],
	[IntrNoMem]>;
	}

	// Conditional load ops
	// AVX2 integer maskload: operands are a pointer and an integer vector of
	// the same type as the result. Marked IntrReadMem and IntrArgMemOnly.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx2_maskload_d : GCCBuiltin<"__builtin_ia32_maskloadd">,
	Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_maskload_q : GCCBuiltin<"__builtin_ia32_maskloadq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_maskload_d_256 : GCCBuiltin<"__builtin_ia32_maskloadd256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_maskload_q_256 : GCCBuiltin<"__builtin_ia32_maskloadq256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	}

	// Conditional store ops
	// AVX2 integer maskstore: no result; operands are a pointer followed by two
	// integer vectors of the same type. Marked IntrArgMemOnly only.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx2_maskstore_d : GCCBuiltin<"__builtin_ia32_maskstored">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx2_maskstore_q : GCCBuiltin<"__builtin_ia32_maskstoreq">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx2_maskstore_d_256 :
	GCCBuiltin<"__builtin_ia32_maskstored256">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx2_maskstore_q_256 :
	GCCBuiltin<"__builtin_ia32_maskstoreq256">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty],
	[IntrArgMemOnly]>;
	}

	// Variable bit shift ops
	// Per-element shifts (psllv/psrlv/psrav) where both operands and the result
	// share one vector type, followed by the AVX-512 masked rotate defs
	// (prol/pror/prolv/prorv). All are IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// psllv on 32/64-bit elements: 128/256-bit under avx2, 512-bit under avx512.
	def int_x86_avx2_psllv_d : GCCBuiltin<"__builtin_ia32_psllv4si">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_avx2_psllv_d_256 : GCCBuiltin<"__builtin_ia32_psllv8si">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty],
	[IntrNoMem]>;
	def int_x86_avx2_psllv_q : GCCBuiltin<"__builtin_ia32_psllv2di">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_avx2_psllv_q_256 : GCCBuiltin<"__builtin_ia32_psllv4di">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty],
	[IntrNoMem]>;

	def int_x86_avx512_psllv_d_512 : GCCBuiltin<"__builtin_ia32_psllv16si">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_psllv_q_512 : GCCBuiltin<"__builtin_ia32_psllv8di">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty],
	[IntrNoMem]>;

	// psrlv on 32/64-bit elements, same layout as psllv above.
	def int_x86_avx2_psrlv_d : GCCBuiltin<"__builtin_ia32_psrlv4si">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_avx2_psrlv_d_256 : GCCBuiltin<"__builtin_ia32_psrlv8si">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty],
	[IntrNoMem]>;
	def int_x86_avx2_psrlv_q : GCCBuiltin<"__builtin_ia32_psrlv2di">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_avx2_psrlv_q_256 : GCCBuiltin<"__builtin_ia32_psrlv4di">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty],
	[IntrNoMem]>;

	def int_x86_avx512_psrlv_d_512 : GCCBuiltin<"__builtin_ia32_psrlv16si">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_psrlv_q_512 : GCCBuiltin<"__builtin_ia32_psrlv8di">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty],
	[IntrNoMem]>;

	// psrav: avx2 provides only the 32-bit element forms; every 64-bit element
	// form (128/256/512-bit) is declared under the avx512 prefix.
	def int_x86_avx2_psrav_d : GCCBuiltin<"__builtin_ia32_psrav4si">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_avx2_psrav_d_256 : GCCBuiltin<"__builtin_ia32_psrav8si">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_psrav_d_512 : GCCBuiltin<"__builtin_ia32_psrav16si">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_psrav_q_128 : GCCBuiltin<"__builtin_ia32_psravq128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_avx512_psrav_q_256 : GCCBuiltin<"__builtin_ia32_psravq256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty],
	[IntrNoMem]>;
	def int_x86_avx512_psrav_q_512 : GCCBuiltin<"__builtin_ia32_psrav8di">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty],
	[IntrNoMem]>;

	// 16-bit element variable shifts (psllv/psrlv/psrav _w) for all three
	// vector widths, all under the avx512 prefix.
	def int_x86_avx512_psllv_w_128 : GCCBuiltin<"__builtin_ia32_psllv8hi">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_psllv_w_256 : GCCBuiltin<"__builtin_ia32_psllv16hi">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_psllv_w_512 : GCCBuiltin<"__builtin_ia32_psllv32hi">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_psrlv_w_128 : GCCBuiltin<"__builtin_ia32_psrlv8hi">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_psrlv_w_256 : GCCBuiltin<"__builtin_ia32_psrlv16hi">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_psrlv_w_512 : GCCBuiltin<"__builtin_ia32_psrlv32hi">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
	[IntrNoMem]>;

	def int_x86_avx512_psrav_w_128 : GCCBuiltin<"__builtin_ia32_psrav8hi">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_psrav_w_256 : GCCBuiltin<"__builtin_ia32_psrav16hi">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_psrav_w_512 : GCCBuiltin<"__builtin_ia32_psrav32hi">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
	[IntrNoMem]>;

	// Masked rotates. prorv/prolv take three vectors of the result type;
	// prol/pror take a vector, an i32, and a second vector. Every def ends with
	// an integer operand: i16 for the d_512 forms, i8 for all others.
	def int_x86_avx512_mask_prorv_d_128 : GCCBuiltin<"__builtin_ia32_prorvd128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prorv_d_256 : GCCBuiltin<"__builtin_ia32_prorvd256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prorv_d_512 : GCCBuiltin<"__builtin_ia32_prorvd512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prorv_q_128 : GCCBuiltin<"__builtin_ia32_prorvq128_mask">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
	llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prorv_q_256 : GCCBuiltin<"__builtin_ia32_prorvq256_mask">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
	llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prorv_q_512 : GCCBuiltin<"__builtin_ia32_prorvq512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_prol_d_128 : GCCBuiltin<"__builtin_ia32_prold128_mask">,
	Intrinsic<[llvm_v4i32_ty] , [llvm_v4i32_ty,
	llvm_i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prol_d_256 : GCCBuiltin<"__builtin_ia32_prold256_mask">,
	Intrinsic<[llvm_v8i32_ty] , [llvm_v8i32_ty,
	llvm_i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prol_d_512 : GCCBuiltin<"__builtin_ia32_prold512_mask">,
	Intrinsic<[llvm_v16i32_ty] , [llvm_v16i32_ty,
	llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prol_q_128 : GCCBuiltin<"__builtin_ia32_prolq128_mask">,
	Intrinsic<[llvm_v2i64_ty] , [llvm_v2i64_ty,
	llvm_i32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prol_q_256 : GCCBuiltin<"__builtin_ia32_prolq256_mask">,
	Intrinsic<[llvm_v4i64_ty] , [llvm_v4i64_ty,
	llvm_i32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prol_q_512 : GCCBuiltin<"__builtin_ia32_prolq512_mask">,
	Intrinsic<[llvm_v8i64_ty] , [llvm_v8i64_ty,
	llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;


	def int_x86_avx512_mask_prolv_d_128 : GCCBuiltin<"__builtin_ia32_prolvd128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prolv_d_256 : GCCBuiltin<"__builtin_ia32_prolvd256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prolv_d_512 : GCCBuiltin<"__builtin_ia32_prolvd512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prolv_q_128 : GCCBuiltin<"__builtin_ia32_prolvq128_mask">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
	llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prolv_q_256 : GCCBuiltin<"__builtin_ia32_prolvq256_mask">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
	llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_prolv_q_512 : GCCBuiltin<"__builtin_ia32_prolvq512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pror_d_128 : GCCBuiltin<"__builtin_ia32_prord128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty,
	llvm_i32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pror_d_256 : GCCBuiltin<"__builtin_ia32_prord256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pror_d_512 : GCCBuiltin<"__builtin_ia32_prord512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
	llvm_i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pror_q_128 : GCCBuiltin<"__builtin_ia32_prorq128_mask">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty,
	llvm_i32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pror_q_256 : GCCBuiltin<"__builtin_ia32_prorq256_mask">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
	llvm_i32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pror_q_512 : GCCBuiltin<"__builtin_ia32_prorq512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
	llvm_i32_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;

	}

	// Gather ops
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// AVX2 gather records. Every def in this block has the same five-operand
	// shape: (vector of the result type, llvm_ptr_ty, integer index vector,
	// vector of the result type, llvm_i8_ty), and is tagged
	// [IntrReadMem, IntrArgMemOnly] rather than IntrNoMem.
	// NOTE(review): the operands are presumably (pass-through, base pointer,
	// indices, mask, scale) -- confirm against the lowering code.
	//
	// Floating-point results. The "_d_" forms use i32 index vectors and the
	// "_q_" forms use i64 index vectors.
	def int_x86_avx2_gather_d_pd : GCCBuiltin<"__builtin_ia32_gatherd_pd">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_d_pd_256 : GCCBuiltin<"__builtin_ia32_gatherd_pd256">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_q_pd : GCCBuiltin<"__builtin_ia32_gatherq_pd">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_q_pd_256 : GCCBuiltin<"__builtin_ia32_gatherq_pd256">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_d_ps : GCCBuiltin<"__builtin_ia32_gatherd_ps">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_d_ps_256 : GCCBuiltin<"__builtin_ia32_gatherd_ps256">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_q_ps : GCCBuiltin<"__builtin_ia32_gatherq_ps">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	// Note: the _256 q_ps form pairs a v4i64 index vector with a v4f32 result,
	// so its result type is the same as the non-256 q_ps record.
	def int_x86_avx2_gather_q_ps_256 : GCCBuiltin<"__builtin_ia32_gatherq_ps256">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	// Integer results, same operand layout as the floating-point records.
	def int_x86_avx2_gather_d_q : GCCBuiltin<"__builtin_ia32_gatherd_q">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_d_q_256 : GCCBuiltin<"__builtin_ia32_gatherd_q256">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_q_q : GCCBuiltin<"__builtin_ia32_gatherq_q">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_q_q_256 : GCCBuiltin<"__builtin_ia32_gatherq_q256">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_d_d : GCCBuiltin<"__builtin_ia32_gatherd_d">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_d_d_256 : GCCBuiltin<"__builtin_ia32_gatherd_d256">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx2_gather_q_d : GCCBuiltin<"__builtin_ia32_gatherq_d">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	// Note: as with q_ps_256, the _256 q_d form keeps a v4i32 result while
	// taking a v4i64 index vector.
	def int_x86_avx2_gather_q_d_256 : GCCBuiltin<"__builtin_ia32_gatherq_d256">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	}

	// Misc.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// Miscellaneous AVX2 256-bit integer intrinsics, each bound to the GCC
	// builtin named in GCCBuiltin<...>. None of them touches memory.

	// v32i8 -> i32.
	def int_x86_avx2_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb256">,
	Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
	// (v32i8, v32i8) -> v32i8.
	def int_x86_avx2_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb256">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty], [IntrNoMem]>;
	// (v32i8, v32i8, i8) -> v16i16.
	// Not marked Commutative on purpose: (V)MPSADBW treats its two vector
	// sources differently (the immediate selects a sliding 11-byte window from
	// the first source and a fixed 4-byte block from the second), so swapping
	// the first two operands changes the result.
	def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
	llvm_i8_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// FMA3 and FMA4

	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// Unmasked vfmadd records that carry a GCC builtin binding. Each takes
	// three operands of the result type and is tagged IntrNoMem.
	// The scalar ss/sd forms exist twice with identical type signatures: the
	// int_x86_fma_* records map to the "...ss3"/"...sd3" builtins and the
	// int_x86_fma4_* records map to the "...ss"/"...sd" builtins.
	def int_x86_fma_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss3">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmadd_sd : GCCBuiltin<"__builtin_ia32_vfmaddsd3">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_fma4_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_fma4_vfmadd_sd : GCCBuiltin<"__builtin_ia32_vfmaddsd">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
	[IntrNoMem]>;
	// Packed 128-bit and 256-bit forms.
	def int_x86_fma_vfmadd_ps : GCCBuiltin<"__builtin_ia32_vfmaddps">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmadd_pd : GCCBuiltin<"__builtin_ia32_vfmaddpd">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddps256">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddpd256">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
	[IntrNoMem]>;

	// vfmsub / vfnmadd / vfnmsub records. None of these has a GCCBuiltin
	// binding, and each is flagged with a TODO for removal. Every record takes
	// three operands of the result type and is tagged IntrNoMem. Each family
	// comes in ss, sd, ps, pd, ps_256 and pd_256 forms.

	// vfmsub family.
	def int_x86_fma_vfmsub_ss : // TODO: remove this intrinsic
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmsub_sd : // TODO: remove this intrinsic
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmsub_ps : // TODO: remove this intrinsic
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmsub_pd : // TODO: remove this intrinsic
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmsub_ps_256 : // TODO: remove this intrinsic
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmsub_pd_256 : // TODO: remove this intrinsic
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
	[IntrNoMem]>;
	// vfnmadd family.
	def int_x86_fma_vfnmadd_ss : // TODO: remove this intrinsic
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfnmadd_sd : // TODO: remove this intrinsic
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfnmadd_ps : // TODO: remove this intrinsic
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfnmadd_pd : // TODO: remove this intrinsic
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfnmadd_ps_256 : // TODO: remove this intrinsic
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfnmadd_pd_256 : // TODO: remove this intrinsic
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
	[IntrNoMem]>;
	// vfnmsub family.
	def int_x86_fma_vfnmsub_ss : // TODO: remove this intrinsic
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfnmsub_sd : // TODO: remove this intrinsic
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfnmsub_ps : // TODO: remove this intrinsic
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfnmsub_pd : // TODO: remove this intrinsic
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfnmsub_ps_256 : // TODO: remove this intrinsic
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfnmsub_pd_256 : // TODO: remove this intrinsic
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
	[IntrNoMem]>;
	// Unmasked vfmaddsub records, each bound to a GCC builtin. Three operands
	// of the result type, IntrNoMem. Only packed forms (ps, pd, ps_256,
	// pd_256) are declared here.
	def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmaddsub_pd : GCCBuiltin<"__builtin_ia32_vfmaddsubpd">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmaddsub_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubps256">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmaddsub_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubpd256">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
	[IntrNoMem]>;
	// Unmasked vfmsubadd records. These have the same type signatures as the
	// vfmaddsub records but no GCCBuiltin binding, and each carries a TODO
	// for removal.
	def int_x86_fma_vfmsubadd_ps : // TODO: remove this intrinsic
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmsubadd_pd : // TODO: remove this intrinsic
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmsubadd_ps_256 : // TODO: remove this intrinsic
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
	[IntrNoMem]>;
	def int_x86_fma_vfmsubadd_pd_256 : // TODO: remove this intrinsic
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
	[IntrNoMem]>;

	// AVX-512 packed vfmadd records in mask / mask3 / maskz flavours.
	// Within each width the three flavours have identical type signatures and
	// differ only in the intrinsic name and the bound GCC builtin.
	//   128/256-bit: (vec, vec, vec, i8)           -> vec
	//   pd_512:      (vec, vec, vec, i8,  i32)     -> vec
	//   ps_512:      (vec, vec, vec, i16, i32)     -> vec
	// All are tagged IntrNoMem.
	// NOTE(review): the i8/i16 operand is presumably the lane mask and the
	// extra i32 on 512-bit forms presumably the rounding mode; how mask, mask3
	// and maskz differ is not visible in this file -- confirm in the lowering.
	def int_x86_avx512_mask_vfmadd_pd_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddpd128_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmadd_pd_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddpd128_mask3">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vfmadd_pd_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddpd128_maskz">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfmadd_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddpd256_mask">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmadd_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddpd256_mask3">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vfmadd_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddpd256_maskz">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfmadd_pd_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddpd512_mask">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfmadd_pd_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddpd512_mask3">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_vfmadd_pd_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddpd512_maskz">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vfmadd_ps_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmadd_ps_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddps128_mask3">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vfmadd_ps_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddps128_maskz">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfmadd_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddps256_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmadd_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddps256_mask3">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vfmadd_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddps256_maskz">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfmadd_ps_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddps512_mask">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfmadd_ps_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddps512_mask3">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_vfmadd_ps_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddps512_maskz">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
	llvm_i32_ty], [IntrNoMem]>;

	// AVX-512 packed vfmaddsub records in mask / mask3 / maskz flavours.
	// Within each width the three flavours share one type signature:
	//   128/256-bit: (vec, vec, vec, i8)           -> vec
	//   pd_512:      (vec, vec, vec, i8,  i32)     -> vec
	//   ps_512:      (vec, vec, vec, i16, i32)     -> vec
	// All are tagged IntrNoMem.
	// NOTE(review): the i8/i16 operand is presumably the lane mask and the
	// trailing i32 on 512-bit forms presumably the rounding mode -- confirm.
	def int_x86_avx512_mask_vfmaddsub_pd_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubpd128_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmaddsub_pd_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubpd128_mask3">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vfmaddsub_pd_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubpd128_maskz">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfmaddsub_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubpd256_mask">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmaddsub_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubpd256_mask3">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vfmaddsub_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubpd256_maskz">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfmaddsub_pd_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubpd512_mask">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfmaddsub_pd_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubpd512_mask3">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_vfmaddsub_pd_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubpd512_maskz">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vfmaddsub_ps_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmaddsub_ps_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubps128_mask3">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vfmaddsub_ps_128 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubps128_maskz">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfmaddsub_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubps256_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmaddsub_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubps256_mask3">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_maskz_vfmaddsub_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubps256_maskz">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfmaddsub_ps_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubps512_mask">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfmaddsub_ps_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubps512_mask3">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_vfmaddsub_ps_512 :
	GCCBuiltin<"__builtin_ia32_vfmaddsubps512_maskz">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
	llvm_i32_ty], [IntrNoMem]>;


	// AVX-512 scalar (sd/ss) masked FMA records. Every record here has the
	// five-operand shape (vec, vec, vec, i8, i32) -> vec, using v2f64 for sd
	// and v4f32 for ss, and is tagged IntrNoMem. vfmadd is declared in mask,
	// maskz and mask3 flavours; vfmsub only in the mask3 flavour.
	// NOTE(review): the i8 is presumably the mask and the i32 presumably the
	// rounding mode -- confirm against the lowering code.
	def int_x86_avx512_mask_vfmadd_sd :
	GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vfmadd_ss :
	GCCBuiltin<"__builtin_ia32_vfmaddss3_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_vfmadd_sd :
	GCCBuiltin<"__builtin_ia32_vfmaddsd3_maskz">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_vfmadd_ss :
	GCCBuiltin<"__builtin_ia32_vfmaddss3_maskz">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfmadd_sd :
	GCCBuiltin<"__builtin_ia32_vfmaddsd3_mask3">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfmadd_ss :
	GCCBuiltin<"__builtin_ia32_vfmaddss3_mask3">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsub_sd :
	GCCBuiltin<"__builtin_ia32_vfmsubsd3_mask3">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsub_ss :
	GCCBuiltin<"__builtin_ia32_vfmsubss3_mask3">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	// AVX-512 packed vfmsub and vfmsubadd records. Only the mask3 flavour is
	// declared for these two operations in this block.
	//   128/256-bit: (vec, vec, vec, i8)           -> vec
	//   pd_512:      (vec, vec, vec, i8,  i32)     -> vec
	//   ps_512:      (vec, vec, vec, i16, i32)     -> vec
	// All are tagged IntrNoMem.
	// NOTE(review): the i8/i16 operand is presumably the lane mask and the
	// trailing i32 on 512-bit forms presumably the rounding mode -- confirm.

	// vfmsub, mask3.
	def int_x86_avx512_mask3_vfmsub_pd_128 :
	GCCBuiltin<"__builtin_ia32_vfmsubpd128_mask3">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsub_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfmsubpd256_mask3">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsub_pd_512 :
	GCCBuiltin<"__builtin_ia32_vfmsubpd512_mask3">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsub_ps_128 :
	GCCBuiltin<"__builtin_ia32_vfmsubps128_mask3">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsub_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfmsubps256_mask3">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsub_ps_512 :
	GCCBuiltin<"__builtin_ia32_vfmsubps512_mask3">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
	llvm_i32_ty], [IntrNoMem]>;

	// vfmsubadd, mask3.
	def int_x86_avx512_mask3_vfmsubadd_pd_128 :
	GCCBuiltin<"__builtin_ia32_vfmsubaddpd128_mask3">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsubadd_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfmsubaddpd256_mask3">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsubadd_pd_512 :
	GCCBuiltin<"__builtin_ia32_vfmsubaddpd512_mask3">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsubadd_ps_128 :
	GCCBuiltin<"__builtin_ia32_vfmsubaddps128_mask3">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsubadd_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfmsubaddps256_mask3">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfmsubadd_ps_512 :
	GCCBuiltin<"__builtin_ia32_vfmsubaddps512_mask3">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
	llvm_i32_ty], [IntrNoMem]>;

	// AVX-512 packed vfnmadd records. Only the mask flavour is declared here.
	//   128/256-bit: (vec, vec, vec, i8)           -> vec
	//   pd_512:      (vec, vec, vec, i8,  i32)     -> vec
	//   ps_512:      (vec, vec, vec, i16, i32)     -> vec
	// All are tagged IntrNoMem.
	// NOTE(review): the i8/i16 operand is presumably the lane mask and the
	// trailing i32 on 512-bit forms presumably the rounding mode -- confirm.
	def int_x86_avx512_mask_vfnmadd_pd_128 :
	GCCBuiltin<"__builtin_ia32_vfnmaddpd128_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfnmadd_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfnmaddpd256_mask">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfnmadd_pd_512 :
	GCCBuiltin<"__builtin_ia32_vfnmaddpd512_mask">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vfnmadd_ps_128 :
	GCCBuiltin<"__builtin_ia32_vfnmaddps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfnmadd_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfnmaddps256_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfnmadd_ps_512 :
	GCCBuiltin<"__builtin_ia32_vfnmaddps512_mask">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
	llvm_i32_ty], [IntrNoMem]>;

	// AVX-512 vfnmsub records. The scalar sd/ss forms are declared only in the
	// mask3 flavour; the packed forms are declared in mask and mask3 flavours,
	// interleaved per width.
	//   sd/ss:       (vec, vec, vec, i8,  i32)     -> vec
	//   128/256-bit: (vec, vec, vec, i8)           -> vec
	//   pd_512:      (vec, vec, vec, i8,  i32)     -> vec
	//   ps_512:      (vec, vec, vec, i16, i32)     -> vec
	// All are tagged IntrNoMem.
	// NOTE(review): the i8/i16 operand is presumably the mask and the trailing
	// i32 presumably the rounding mode -- confirm against the lowering code.
	def int_x86_avx512_mask3_vfnmsub_sd :
	GCCBuiltin<"__builtin_ia32_vfnmsubsd3_mask3">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfnmsub_ss :
	GCCBuiltin<"__builtin_ia32_vfnmsubss3_mask3">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vfnmsub_pd_128 :
	GCCBuiltin<"__builtin_ia32_vfnmsubpd128_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfnmsub_pd_128 :
	GCCBuiltin<"__builtin_ia32_vfnmsubpd128_mask3">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfnmsub_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfnmsubpd256_mask">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfnmsub_pd_256 :
	GCCBuiltin<"__builtin_ia32_vfnmsubpd256_mask3">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfnmsub_pd_512 :
	GCCBuiltin<"__builtin_ia32_vfnmsubpd512_mask">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfnmsub_pd_512 :
	GCCBuiltin<"__builtin_ia32_vfnmsubpd512_mask3">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vfnmsub_ps_128 :
	GCCBuiltin<"__builtin_ia32_vfnmsubps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfnmsub_ps_128 :
	GCCBuiltin<"__builtin_ia32_vfnmsubps128_mask3">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfnmsub_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfnmsubps256_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask3_vfnmsub_ps_256 :
	GCCBuiltin<"__builtin_ia32_vfnmsubps256_mask3">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_vfnmsub_ps_512 :
	GCCBuiltin<"__builtin_ia32_vfnmsubps512_mask">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
	llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask3_vfnmsub_ps_512 :
	GCCBuiltin<"__builtin_ia32_vfnmsubps512_mask3">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
	llvm_i32_ty], [IntrNoMem]>;

	// AVX-512 vpmadd52 records. For each width (128, 256, 512) there are four
	// defs: the "h" and "l" variants, each in mask and maskz flavours. All use
	// 64-bit integer element vectors with the shape (vec, vec, vec, i8) -> vec
	// and are tagged IntrNoMem. The i8 operand is used at every width,
	// including v8i64.
	// NOTE(review): "h"/"l" presumably select the high/low part of the 52-bit
	// product and the i8 is presumably the lane mask -- confirm.
	def int_x86_avx512_mask_vpmadd52h_uq_128 :
	GCCBuiltin<"__builtin_ia32_vpmadd52huq128_mask">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpmadd52h_uq_128 :
	GCCBuiltin<"__builtin_ia32_vpmadd52huq128_maskz">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpmadd52l_uq_128 :
	GCCBuiltin<"__builtin_ia32_vpmadd52luq128_mask">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpmadd52l_uq_128 :
	GCCBuiltin<"__builtin_ia32_vpmadd52luq128_maskz">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpmadd52h_uq_256 :
	GCCBuiltin<"__builtin_ia32_vpmadd52huq256_mask">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
	llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpmadd52h_uq_256 :
	GCCBuiltin<"__builtin_ia32_vpmadd52huq256_maskz">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
	llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpmadd52l_uq_256 :
	GCCBuiltin<"__builtin_ia32_vpmadd52luq256_mask">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
	llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpmadd52l_uq_256 :
	GCCBuiltin<"__builtin_ia32_vpmadd52luq256_maskz">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
	llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpmadd52h_uq_512 :
	GCCBuiltin<"__builtin_ia32_vpmadd52huq512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpmadd52h_uq_512 :
	GCCBuiltin<"__builtin_ia32_vpmadd52huq512_maskz">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpmadd52l_uq_512 :
	GCCBuiltin<"__builtin_ia32_vpmadd52luq512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpmadd52l_uq_512 :
	GCCBuiltin<"__builtin_ia32_vpmadd52luq512_maskz">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
	}

	// VNNI
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// Four operation families are declared below: vpdpbusd, vpdpbusds,
	// vpdpwssd and vpdpwssds. Each family has mask and maskz flavours at 128,
	// 256 and 512 bits. Every record has the shape
	// (vec, vec, vec, integer) -> vec over 32-bit integer element vectors,
	// where the trailing integer is llvm_i8_ty for v4i32/v8i32 and
	// llvm_i16_ty for v16i32. All are tagged IntrNoMem.
	// NOTE(review): the trailing integer is presumably the lane mask, and the
	// "s"-suffixed families are presumably the saturating variants -- confirm.

	// vpdpbusd
	def int_x86_avx512_mask_vpdpbusd_128 :
	GCCBuiltin<"__builtin_ia32_vpdpbusd128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpbusd_128 :
	GCCBuiltin<"__builtin_ia32_vpdpbusd128_maskz">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpdpbusd_256 :
	GCCBuiltin<"__builtin_ia32_vpdpbusd256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpbusd_256 :
	GCCBuiltin<"__builtin_ia32_vpdpbusd256_maskz">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpdpbusd_512 :
	GCCBuiltin<"__builtin_ia32_vpdpbusd512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpbusd_512 :
	GCCBuiltin<"__builtin_ia32_vpdpbusd512_maskz">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;

	// vpdpbusds
	def int_x86_avx512_mask_vpdpbusds_128 :
	GCCBuiltin<"__builtin_ia32_vpdpbusds128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpbusds_128 :
	GCCBuiltin<"__builtin_ia32_vpdpbusds128_maskz">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpdpbusds_256 :
	GCCBuiltin<"__builtin_ia32_vpdpbusds256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpbusds_256 :
	GCCBuiltin<"__builtin_ia32_vpdpbusds256_maskz">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpdpbusds_512 :
	GCCBuiltin<"__builtin_ia32_vpdpbusds512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpbusds_512 :
	GCCBuiltin<"__builtin_ia32_vpdpbusds512_maskz">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;

	// vpdpwssd
	def int_x86_avx512_mask_vpdpwssd_128 :
	GCCBuiltin<"__builtin_ia32_vpdpwssd128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpwssd_128 :
	GCCBuiltin<"__builtin_ia32_vpdpwssd128_maskz">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpdpwssd_256 :
	GCCBuiltin<"__builtin_ia32_vpdpwssd256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpwssd_256 :
	GCCBuiltin<"__builtin_ia32_vpdpwssd256_maskz">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpdpwssd_512 :
	GCCBuiltin<"__builtin_ia32_vpdpwssd512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpwssd_512 :
	GCCBuiltin<"__builtin_ia32_vpdpwssd512_maskz">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;

	// vpdpwssds
	def int_x86_avx512_mask_vpdpwssds_128 :
	GCCBuiltin<"__builtin_ia32_vpdpwssds128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpwssds_128 :
	GCCBuiltin<"__builtin_ia32_vpdpwssds128_maskz">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpdpwssds_256 :
	GCCBuiltin<"__builtin_ia32_vpdpwssds256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpwssds_256 :
	GCCBuiltin<"__builtin_ia32_vpdpwssds256_maskz">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpdpwssds_512 :
	GCCBuiltin<"__builtin_ia32_vpdpwssds512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpdpwssds_512 :
	GCCBuiltin<"__builtin_ia32_vpdpwssds512_maskz">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// XOP

	// XOP intrinsics (int_x86_xop_*). Every def in this block names a GCC builtin
	// through GCCBuiltin<...> and carries the single property IntrNoMem.
	// The three Intrinsic<...> lists are read here as result types, operand
	// types and properties, following the Intrinsic class in Intrinsics.td.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// vpermil2*: two FP vectors, an integer vector with the same element count,
	// and a trailing i8 operand; the result has the FP vector type.
	def int_x86_xop_vpermil2pd : GCCBuiltin<"__builtin_ia32_vpermil2pd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_xop_vpermil2pd_256 :
	GCCBuiltin<"__builtin_ia32_vpermil2pd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
	llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_xop_vpermil2ps : GCCBuiltin<"__builtin_ia32_vpermil2ps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpermil2ps_256 :
	GCCBuiltin<"__builtin_ia32_vpermil2ps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
	llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	// vfrcz*: one FP vector operand, result of the same vector type. The sd/ss
	// forms are also declared on full v2f64/v4f32 vectors.
	def int_x86_xop_vfrcz_pd : GCCBuiltin<"__builtin_ia32_vfrczpd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_xop_vfrcz_ps : GCCBuiltin<"__builtin_ia32_vfrczps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_xop_vfrcz_sd : GCCBuiltin<"__builtin_ia32_vfrczsd">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
	def int_x86_xop_vfrcz_ss : GCCBuiltin<"__builtin_ia32_vfrczss">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
	def int_x86_xop_vfrcz_pd_256 : GCCBuiltin<"__builtin_ia32_vfrczpd256">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>;
	def int_x86_xop_vfrcz_ps_256 : GCCBuiltin<"__builtin_ia32_vfrczps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;

	// vpcom*/vpcomu*: two integer vectors of the result type plus an i8 operand
	// (presumably the comparison selector -- confirm against the XOP spec).
	def int_x86_xop_vpcomb : GCCBuiltin<"__builtin_ia32_vpcomb">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_xop_vpcomw : GCCBuiltin<"__builtin_ia32_vpcomw">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_xop_vpcomd : GCCBuiltin<"__builtin_ia32_vpcomd">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_xop_vpcomq : GCCBuiltin<"__builtin_ia32_vpcomq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_xop_vpcomub : GCCBuiltin<"__builtin_ia32_vpcomub">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_xop_vpcomuw : GCCBuiltin<"__builtin_ia32_vpcomuw">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_xop_vpcomud : GCCBuiltin<"__builtin_ia32_vpcomud">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_xop_vpcomuq : GCCBuiltin<"__builtin_ia32_vpcomuq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_i8_ty], [IntrNoMem]>;

	// vphadd*/vphsub*: one integer vector operand; the result vector has fewer,
	// wider elements than the operand (e.g. v16i8 -> v4i32).
	def int_x86_xop_vphaddbd :
	GCCBuiltin<"__builtin_ia32_vphaddbd">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_xop_vphaddbq :
	GCCBuiltin<"__builtin_ia32_vphaddbq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_xop_vphaddbw :
	GCCBuiltin<"__builtin_ia32_vphaddbw">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_xop_vphadddq :
	GCCBuiltin<"__builtin_ia32_vphadddq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_xop_vphaddubd :
	GCCBuiltin<"__builtin_ia32_vphaddubd">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_xop_vphaddubq :
	GCCBuiltin<"__builtin_ia32_vphaddubq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_xop_vphaddubw :
	GCCBuiltin<"__builtin_ia32_vphaddubw">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_xop_vphaddudq :
	GCCBuiltin<"__builtin_ia32_vphaddudq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_xop_vphadduwd :
	GCCBuiltin<"__builtin_ia32_vphadduwd">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_xop_vphadduwq :
	GCCBuiltin<"__builtin_ia32_vphadduwq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_xop_vphaddwd :
	GCCBuiltin<"__builtin_ia32_vphaddwd">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_xop_vphaddwq :
	GCCBuiltin<"__builtin_ia32_vphaddwq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_xop_vphsubbw :
	GCCBuiltin<"__builtin_ia32_vphsubbw">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_xop_vphsubdq :
	GCCBuiltin<"__builtin_ia32_vphsubdq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_xop_vphsubwd :
	GCCBuiltin<"__builtin_ia32_vphsubwd">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
	// vpmacs*/vpmadcs*/vpperm: three vector operands. The first two share a type
	// and the result type always equals the type of the third operand.
	def int_x86_xop_vpmacsdd :
	GCCBuiltin<"__builtin_ia32_vpmacsdd">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpmacsdqh :
	GCCBuiltin<"__builtin_ia32_vpmacsdqh">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpmacsdql :
	GCCBuiltin<"__builtin_ia32_vpmacsdql">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpmacssdd :
	GCCBuiltin<"__builtin_ia32_vpmacssdd">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpmacssdqh :
	GCCBuiltin<"__builtin_ia32_vpmacssdqh">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpmacssdql :
	GCCBuiltin<"__builtin_ia32_vpmacssdql">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpmacsswd :
	GCCBuiltin<"__builtin_ia32_vpmacsswd">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpmacssww :
	GCCBuiltin<"__builtin_ia32_vpmacssww">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpmacswd :
	GCCBuiltin<"__builtin_ia32_vpmacswd">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpmacsww :
	GCCBuiltin<"__builtin_ia32_vpmacsww">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpmadcsswd :
	GCCBuiltin<"__builtin_ia32_vpmadcsswd">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpmadcswd :
	GCCBuiltin<"__builtin_ia32_vpmadcswd">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpperm :
	GCCBuiltin<"__builtin_ia32_vpperm">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty],
	[IntrNoMem]>;

	// vprot{b,d,q,w}: second operand is a vector of the result type.
	// vprot{b,d,q,w}i: second operand is a scalar i8 instead.
	def int_x86_xop_vprotb : GCCBuiltin<"__builtin_ia32_vprotb">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
	[IntrNoMem]>;
	def int_x86_xop_vprotd : GCCBuiltin<"__builtin_ia32_vprotd">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_xop_vprotq : GCCBuiltin<"__builtin_ia32_vprotq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_xop_vprotw : GCCBuiltin<"__builtin_ia32_vprotw">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
	[IntrNoMem]>;
	def int_x86_xop_vprotbi : GCCBuiltin<"__builtin_ia32_vprotbi">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_xop_vprotdi : GCCBuiltin<"__builtin_ia32_vprotdi">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_xop_vprotqi : GCCBuiltin<"__builtin_ia32_vprotqi">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_xop_vprotwi : GCCBuiltin<"__builtin_ia32_vprotwi">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;

	// vpsha*/vpshl*: two vector operands of the result type.
	def int_x86_xop_vpshab :
	GCCBuiltin<"__builtin_ia32_vpshab">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpshad :
	GCCBuiltin<"__builtin_ia32_vpshad">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpshaq :
	GCCBuiltin<"__builtin_ia32_vpshaq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpshaw :
	GCCBuiltin<"__builtin_ia32_vpshaw">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpshlb :
	GCCBuiltin<"__builtin_ia32_vpshlb">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpshld :
	GCCBuiltin<"__builtin_ia32_vpshld">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpshlq :
	GCCBuiltin<"__builtin_ia32_vpshlq">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
	[IntrNoMem]>;
	def int_x86_xop_vpshlw :
	GCCBuiltin<"__builtin_ia32_vpshlw">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty],
	[IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// LWP
	// LWP intrinsics. Each def names a GCC builtin and has an empty property
	// list (no IntrNoMem), unlike the pure vector intrinsics in this file.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// llwpcb takes a pointer and returns nothing; slwpcb takes nothing and
	// returns a pointer.
	def int_x86_llwpcb :
	GCCBuiltin<"__builtin_ia32_llwpcb">,
	Intrinsic<[], [llvm_ptr_ty], []>;
	def int_x86_slwpcb :
	GCCBuiltin<"__builtin_ia32_slwpcb">,
	Intrinsic<[llvm_ptr_ty], [], []>;
	// lwpins32/64 return an i8; only the first operand differs in width
	// (i32 vs i64), the remaining two operands are i32 in both.
	def int_x86_lwpins32 :
	GCCBuiltin<"__builtin_ia32_lwpins32">,
	Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_lwpins64 :
	GCCBuiltin<"__builtin_ia32_lwpins64">,
	Intrinsic<[llvm_i8_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>;
	// lwpval32/64 take the same operand lists as lwpins32/64 but have no result.
	def int_x86_lwpval32 :
	GCCBuiltin<"__builtin_ia32_lwpval32">,
	Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_lwpval64 :
	GCCBuiltin<"__builtin_ia32_lwpval64">,
	Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], []>;
	}

	//===----------------------------------------------------------------------===//
	// MMX

	// Empty MMX state op.
	// emms/femms: no results, no operands, and an empty property list, so
	// neither is marked IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_mmx_emms : GCCBuiltin<"__builtin_ia32_emms">,
	Intrinsic<[], [], []>;
	def int_x86_mmx_femms : GCCBuiltin<"__builtin_ia32_femms">,
	Intrinsic<[], [], []>;
	}

	// Integer arithmetic ops.
	// Every def in this block takes two llvm_x86mmx_ty operands, returns
	// llvm_x86mmx_ty and is IntrNoMem. Commutative is added on the add,
	// multiply, pand/por/pxor, average, max, min and psad_bw defs; the subtract
	// defs and pandn omit it.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// Addition
	def int_x86_mmx_padd_b : GCCBuiltin<"__builtin_ia32_paddb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem, Commutative]>;
	def int_x86_mmx_padd_w : GCCBuiltin<"__builtin_ia32_paddw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem, Commutative]>;
	def int_x86_mmx_padd_d : GCCBuiltin<"__builtin_ia32_paddd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem, Commutative]>;
	def int_x86_mmx_padd_q : GCCBuiltin<"__builtin_ia32_paddq">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem, Commutative]>;

	def int_x86_mmx_padds_b : GCCBuiltin<"__builtin_ia32_paddsb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_mmx_padds_w : GCCBuiltin<"__builtin_ia32_paddsw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;

	def int_x86_mmx_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_mmx_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;

	// Subtraction (not Commutative)
	def int_x86_mmx_psub_b : GCCBuiltin<"__builtin_ia32_psubb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_mmx_psub_w : GCCBuiltin<"__builtin_ia32_psubw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_mmx_psub_d : GCCBuiltin<"__builtin_ia32_psubd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_mmx_psub_q : GCCBuiltin<"__builtin_ia32_psubq">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;

	def int_x86_mmx_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_mmx_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;

	def int_x86_mmx_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_mmx_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;

	// Multiplication
	def int_x86_mmx_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_mmx_pmull_w : GCCBuiltin<"__builtin_ia32_pmullw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_mmx_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_mmx_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_mmx_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;

	// Bitwise operations (pandn is the only one without Commutative)
	def int_x86_mmx_pand : GCCBuiltin<"__builtin_ia32_pand">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem, Commutative]>;
	def int_x86_mmx_pandn : GCCBuiltin<"__builtin_ia32_pandn">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_mmx_por : GCCBuiltin<"__builtin_ia32_por">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem, Commutative]>;
	def int_x86_mmx_pxor : GCCBuiltin<"__builtin_ia32_pxor">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem, Commutative]>;

	// Averages
	def int_x86_mmx_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_mmx_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;

	// Maximum
	def int_x86_mmx_pmaxu_b : GCCBuiltin<"__builtin_ia32_pmaxub">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_mmx_pmaxs_w : GCCBuiltin<"__builtin_ia32_pmaxsw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;

	// Minimum
	def int_x86_mmx_pminu_b : GCCBuiltin<"__builtin_ia32_pminub">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_mmx_pmins_w : GCCBuiltin<"__builtin_ia32_pminsw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;

	// Packed sum of absolute differences
	def int_x86_mmx_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	}

	// Integer shift ops.
	// All defs return llvm_x86mmx_ty, take an llvm_x86mmx_ty first operand and
	// are IntrNoMem. The psll/psrl/psra forms take a second llvm_x86mmx_ty
	// operand; the pslli/psrli/psrai forms take an i32 instead (presumably the
	// shift count in both cases -- confirm against the ISA reference).
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// Shift left logical
	def int_x86_mmx_psll_w : GCCBuiltin<"__builtin_ia32_psllw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_mmx_psll_d : GCCBuiltin<"__builtin_ia32_pslld">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_mmx_psll_q : GCCBuiltin<"__builtin_ia32_psllq">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;

	// psrl_{w,d,q}: x86mmx second operand.
	def int_x86_mmx_psrl_w : GCCBuiltin<"__builtin_ia32_psrlw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_mmx_psrl_d : GCCBuiltin<"__builtin_ia32_psrld">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_mmx_psrl_q : GCCBuiltin<"__builtin_ia32_psrlq">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;

	// psra_{w,d}: x86mmx second operand; no _q form is defined here.
	def int_x86_mmx_psra_w : GCCBuiltin<"__builtin_ia32_psraw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_mmx_psra_d : GCCBuiltin<"__builtin_ia32_psrad">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;

	// pslli_{w,d,q}: i32 second operand.
	def int_x86_mmx_pslli_w : GCCBuiltin<"__builtin_ia32_psllwi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_mmx_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_mmx_pslli_q : GCCBuiltin<"__builtin_ia32_psllqi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_i32_ty], [IntrNoMem]>;

	// psrli_{w,d,q}: i32 second operand.
	def int_x86_mmx_psrli_w : GCCBuiltin<"__builtin_ia32_psrlwi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_mmx_psrli_d : GCCBuiltin<"__builtin_ia32_psrldi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_mmx_psrli_q : GCCBuiltin<"__builtin_ia32_psrlqi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_i32_ty], [IntrNoMem]>;

	// psrai_{w,d}: i32 second operand; no _q form is defined here.
	def int_x86_mmx_psrai_w : GCCBuiltin<"__builtin_ia32_psrawi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_mmx_psrai_d : GCCBuiltin<"__builtin_ia32_psradi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_i32_ty], [IntrNoMem]>;
	}
	// Permute
	// NOTE: these are int_x86_avx512_mask_permvar_* records even though the
	// block sits between the MMX shift and MMX pack sections.
	// Each def takes four operands: a vector of the result type, an integer
	// vector with the same element count, another vector of the result type,
	// and a scalar integer. The scalar's bit width equals the vector element
	// count, except that the 4-element forms (df_256, di_256) use i8.
	// NOTE(review): the scalar is presumably the write mask and the third
	// operand the pass-through value -- confirm against the AVX-512 lowering.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx512_mask_permvar_df_256 : GCCBuiltin<"__builtin_ia32_permvardf256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,
	llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_df_512 : GCCBuiltin<"__builtin_ia32_permvardf512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty,
	llvm_v8i64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_di_256 : GCCBuiltin<"__builtin_ia32_permvardi256_mask">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty,
	llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_di_512 : GCCBuiltin<"__builtin_ia32_permvardi512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_hi_128 : GCCBuiltin<"__builtin_ia32_permvarhi128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_hi_256 : GCCBuiltin<"__builtin_ia32_permvarhi256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_hi_512 : GCCBuiltin<"__builtin_ia32_permvarhi512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_qi_128 : GCCBuiltin<"__builtin_ia32_permvarqi128_mask">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
	llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_qi_256 : GCCBuiltin<"__builtin_ia32_permvarqi256_mask">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
	llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_qi_512 : GCCBuiltin<"__builtin_ia32_permvarqi512_mask">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
	llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_sf_256 : GCCBuiltin<"__builtin_ia32_permvarsf256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
	llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_sf_512 : GCCBuiltin<"__builtin_ia32_permvarsf512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty,
	llvm_v16i32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_si_256 : GCCBuiltin<"__builtin_ia32_permvarsi256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_permvar_si_512 : GCCBuiltin<"__builtin_ia32_permvarsi512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	}
	// Pack ops.
	// Three MMX pack defs: two llvm_x86mmx_ty operands, llvm_x86mmx_ty result,
	// IntrNoMem and no Commutative flag.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_mmx_packsswb : GCCBuiltin<"__builtin_ia32_packsswb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_mmx_packssdw : GCCBuiltin<"__builtin_ia32_packssdw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_mmx_packuswb : GCCBuiltin<"__builtin_ia32_packuswb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	}

	// Unpacking ops.
	// punpckh*/punpckl* for bw, wd and dq: two llvm_x86mmx_ty operands,
	// llvm_x86mmx_ty result, IntrNoMem and no Commutative flag.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_mmx_punpckhbw : GCCBuiltin<"__builtin_ia32_punpckhbw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_mmx_punpckhwd : GCCBuiltin<"__builtin_ia32_punpckhwd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_mmx_punpckhdq : GCCBuiltin<"__builtin_ia32_punpckhdq">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_mmx_punpcklbw : GCCBuiltin<"__builtin_ia32_punpcklbw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_mmx_punpcklwd : GCCBuiltin<"__builtin_ia32_punpcklwd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	def int_x86_mmx_punpckldq : GCCBuiltin<"__builtin_ia32_punpckldq">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
	[IntrNoMem]>;
	}

	// Integer comparison ops
	// All defs take two llvm_x86mmx_ty operands, return llvm_x86mmx_ty and are
	// IntrNoMem. The pcmpeq_* defs are also Commutative; the pcmpgt_* defs
	// are not.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_mmx_pcmpeq_b : GCCBuiltin<"__builtin_ia32_pcmpeqb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_mmx_pcmpeq_w : GCCBuiltin<"__builtin_ia32_pcmpeqw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;
	def int_x86_mmx_pcmpeq_d : GCCBuiltin<"__builtin_ia32_pcmpeqd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem, Commutative]>;

	def int_x86_mmx_pcmpgt_b : GCCBuiltin<"__builtin_ia32_pcmpgtb">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_mmx_pcmpgt_w : GCCBuiltin<"__builtin_ia32_pcmpgtw">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	def int_x86_mmx_pcmpgt_d : GCCBuiltin<"__builtin_ia32_pcmpgtd">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty], [IntrNoMem]>;
	}

	// Misc.
	// Mixed MMX helpers. The two defs that take a pointer operand (maskmovq,
	// movnt_dq) have no result and an empty property list; the rest are
	// IntrNoMem.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// Operands: two x86mmx values followed by a pointer.
	def int_x86_mmx_maskmovq : GCCBuiltin<"__builtin_ia32_maskmovq">,
	Intrinsic<[], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_ptr_ty], []>;

	// x86mmx in, i32 out.
	def int_x86_mmx_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb">,
	Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty], [IntrNoMem]>;

	// Operands: an llvm_ptrx86mmx_ty pointer followed by an x86mmx value.
	// The def is named movnt_dq while the GCC builtin is movntq.
	def int_x86_mmx_movnt_dq : GCCBuiltin<"__builtin_ia32_movntq">,
	Intrinsic<[], [llvm_ptrx86mmx_ty, llvm_x86mmx_ty], []>;

	// Two x86mmx operands plus a trailing i8.
	def int_x86_mmx_palignr_b : GCCBuiltin<"__builtin_ia32_palignr">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem]>;

	// pextr_w: (x86mmx, i32) -> i32, bound to the vec_ext_v4hi builtin.
	def int_x86_mmx_pextr_w : GCCBuiltin<"__builtin_ia32_vec_ext_v4hi">,
	Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty],
	[IntrNoMem]>;

	// pinsr_w: (x86mmx, i32, i32) -> x86mmx, bound to the vec_set_v4hi builtin.
	def int_x86_mmx_pinsr_w : GCCBuiltin<"__builtin_ia32_vec_set_v4hi">,
	Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty,
	llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// BMI

	// bextr, bzhi, pdep and pext, each in a 32- and a 64-bit form. Every def
	// takes two integer operands of the result width and is IntrNoMem.
	// The builtin suffixes differ: bextr uses _u32/_u64, the others _si/_di.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_bmi_bextr_32 : GCCBuiltin<"__builtin_ia32_bextr_u32">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_bmi_bextr_64 : GCCBuiltin<"__builtin_ia32_bextr_u64">,
	Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
	def int_x86_bmi_bzhi_32 : GCCBuiltin<"__builtin_ia32_bzhi_si">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_bmi_bzhi_64 : GCCBuiltin<"__builtin_ia32_bzhi_di">,
	Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
	def int_x86_bmi_pdep_32 : GCCBuiltin<"__builtin_ia32_pdep_si">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_bmi_pdep_64 : GCCBuiltin<"__builtin_ia32_pdep_di">,
	Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
	def int_x86_bmi_pext_32 : GCCBuiltin<"__builtin_ia32_pext_si">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_bmi_pext_64 : GCCBuiltin<"__builtin_ia32_pext_di">,
	Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// FS/GS Base

	// rd{fs,gs}base_{32,64} return an i32/i64 and take no operands;
	// wr{fs,gs}base_{32,64} take one i32/i64 operand and return nothing.
	// These defs pass only two lists to Intrinsic<...>, so the properties
	// argument is left at whatever default the Intrinsic class declares.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_rdfsbase_32 : GCCBuiltin<"__builtin_ia32_rdfsbase32">,
	Intrinsic<[llvm_i32_ty], []>;
	def int_x86_rdgsbase_32 : GCCBuiltin<"__builtin_ia32_rdgsbase32">,
	Intrinsic<[llvm_i32_ty], []>;
	def int_x86_rdfsbase_64 : GCCBuiltin<"__builtin_ia32_rdfsbase64">,
	Intrinsic<[llvm_i64_ty], []>;
	def int_x86_rdgsbase_64 : GCCBuiltin<"__builtin_ia32_rdgsbase64">,
	Intrinsic<[llvm_i64_ty], []>;
	def int_x86_wrfsbase_32 : GCCBuiltin<"__builtin_ia32_wrfsbase32">,
	Intrinsic<[], [llvm_i32_ty]>;
	def int_x86_wrgsbase_32 : GCCBuiltin<"__builtin_ia32_wrgsbase32">,
	Intrinsic<[], [llvm_i32_ty]>;
	def int_x86_wrfsbase_64 : GCCBuiltin<"__builtin_ia32_wrfsbase64">,
	Intrinsic<[], [llvm_i64_ty]>;
	def int_x86_wrgsbase_64 : GCCBuiltin<"__builtin_ia32_wrgsbase64">,
	Intrinsic<[], [llvm_i64_ty]>;
	}

	//===----------------------------------------------------------------------===//
	// FXSR
	// fxrstor/fxsave and their 64 variants: a single pointer operand, no
	// result, and an empty property list (not IntrNoMem).
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_fxrstor : GCCBuiltin<"__builtin_ia32_fxrstor">,
	Intrinsic<[], [llvm_ptr_ty], []>;
	def int_x86_fxrstor64 : GCCBuiltin<"__builtin_ia32_fxrstor64">,
	Intrinsic<[], [llvm_ptr_ty], []>;
	def int_x86_fxsave : GCCBuiltin<"__builtin_ia32_fxsave">,
	Intrinsic<[], [llvm_ptr_ty], []>;
	def int_x86_fxsave64 : GCCBuiltin<"__builtin_ia32_fxsave64">,
	Intrinsic<[], [llvm_ptr_ty], []>;
	}

	//===----------------------------------------------------------------------===//
	// XSAVE
	// None of the defs in this block has a GCCBuiltin<...> binding, and all
	// have an empty property list.
	// The xsave*/xrstor* defs share one signature: (ptr, i32, i32), no result.
	// NOTE(review): the two i32 operands are presumably the high/low halves of
	// the 64-bit feature mask -- confirm the order against the X86 lowering.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_xsave :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_xsave64 :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_xrstor :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_xrstor64 :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_xsaveopt :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_xsaveopt64 :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_xrstors :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_xrstors64 :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_xsavec :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_xsavec64 :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_xsaves :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	def int_x86_xsaves64 :
	Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
	// xgetbv: one i32 operand, i64 result.
	def int_x86_xgetbv :
	Intrinsic<[llvm_i64_ty], [llvm_i32_ty], []>;
	// xsetbv: three i32 operands, no result.
	def int_x86_xsetbv :
	Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
	}

	//===----------------------------------------------------------------------===//
	// CLFLUSHOPT and CLWB
	// clflushopt and clwb: a single pointer operand, no result, and an empty
	// property list (not IntrNoMem).
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_clflushopt : GCCBuiltin<"__builtin_ia32_clflushopt">,
	Intrinsic<[], [llvm_ptr_ty], []>;

	def int_x86_clwb : GCCBuiltin<"__builtin_ia32_clwb">,
	Intrinsic<[], [llvm_ptr_ty], []>;
	}

	//===----------------------------------------------------------------------===//
	// Support protection key
	// rdpkru returns an i32 and takes no operands; wrpkru takes one i32 and
	// returns nothing. Both have an empty property list.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_rdpkru : GCCBuiltin <"__builtin_ia32_rdpkru">,
	Intrinsic<[llvm_i32_ty], [], []>;
	def int_x86_wrpkru : GCCBuiltin<"__builtin_ia32_wrpkru">,
	Intrinsic<[], [llvm_i32_ty], []>;
	}
	//===----------------------------------------------------------------------===//
	// Half float conversion

	// Half <-> single precision conversions; all defs are IntrNoMem.
	// Half-precision data is carried in i16 vectors (v8i16 / v16i16).
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// Unmasked forms. ph2ps takes only the v8i16 source; ps2ph takes the f32
	// vector plus an i32. Both ps2ph widths return v8i16.
	def int_x86_vcvtph2ps_128 : GCCBuiltin<"__builtin_ia32_vcvtph2ps">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_vcvtph2ps_256 : GCCBuiltin<"__builtin_ia32_vcvtph2ps256">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_vcvtps2ph_128 : GCCBuiltin<"__builtin_ia32_vcvtps2ph">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_vcvtps2ph_256 : GCCBuiltin<"__builtin_ia32_vcvtps2ph256">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty],
	[IntrNoMem]>;
	// avx512_mask ph2ps forms: source, a vector of the result type, and an
	// integer (i8 or i16). Only the 512-bit form has an extra trailing i32.
	// NOTE(review): presumably pass-through, write mask and rounding/SAE
	// control respectively -- confirm.
	def int_x86_avx512_mask_vcvtph2ps_512 : GCCBuiltin<"__builtin_ia32_vcvtph2ps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16i16_ty, llvm_v16f32_ty,
	llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vcvtph2ps_256 : GCCBuiltin<"__builtin_ia32_vcvtph2ps256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8i16_ty, llvm_v8f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vcvtph2ps_128 : GCCBuiltin<"__builtin_ia32_vcvtph2ps_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v8i16_ty, llvm_v4f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	// avx512_mask ps2ph forms: source, an i32, a vector of the result type,
	// and a trailing integer (i8 or i16).
	def int_x86_avx512_mask_vcvtps2ph_512 : GCCBuiltin<"__builtin_ia32_vcvtps2ph512_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty, llvm_i32_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vcvtps2ph_256 : GCCBuiltin<"__builtin_ia32_vcvtps2ph256_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vcvtps2ph_128 : GCCBuiltin<"__builtin_ia32_vcvtps2ph_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_i32_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// TBM

	// bextri in 32- and 64-bit forms: two integer operands of the result width,
	// IntrNoMem. The second operand is declared as a plain i32/i64.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_tbm_bextri_u32 : GCCBuiltin<"__builtin_ia32_bextri_u32">,
	Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_tbm_bextri_u64 : GCCBuiltin<"__builtin_ia32_bextri_u64">,
	Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// RDRAND intrinsics - Return a random value and whether it is valid.
	// RDSEED intrinsics - Return a NIST SP800-90B & C compliant random value and
	// whether it is valid.

	// Each def has two results -- a 16/32/64-bit value followed by an i32 --
	// and no operands. None has a GCCBuiltin<...> binding.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// These are declared side-effecting so they don't get eliminated by CSE or
	// LICM.
	// (Here that means the property list is empty rather than [IntrNoMem].)
	def int_x86_rdrand_16 : Intrinsic<[llvm_i16_ty, llvm_i32_ty], [], []>;
	def int_x86_rdrand_32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [], []>;
	def int_x86_rdrand_64 : Intrinsic<[llvm_i64_ty, llvm_i32_ty], [], []>;
	def int_x86_rdseed_16 : Intrinsic<[llvm_i16_ty, llvm_i32_ty], [], []>;
	def int_x86_rdseed_32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [], []>;
	def int_x86_rdseed_64 : Intrinsic<[llvm_i64_ty, llvm_i32_ty], [], []>;
	}

	//===----------------------------------------------------------------------===//
	// ADX

	// addcarryx / addcarry / subborrow intrinsics, each in a 32-bit and a 64-bit
	// form. All share one signature: an i8 result, and operands consisting of an
	// i8, two integers of the operation width, and a pointer. They are marked
	// IntrArgMemOnly, i.e. memory is only accessed through the pointer operand.
	// NOTE(review): presumably the i8 operand/result are carry-in/carry-out and
	// the arithmetic result is written through the pointer -- confirm against
	// the lowering code.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_addcarryx_u32: GCCBuiltin<"__builtin_ia32_addcarryx_u32">,
	Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty,
	llvm_ptr_ty], [IntrArgMemOnly]>;
	def int_x86_addcarryx_u64: GCCBuiltin<"__builtin_ia32_addcarryx_u64">,
	Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty,
	llvm_ptr_ty], [IntrArgMemOnly]>;
	def int_x86_addcarry_u32: GCCBuiltin<"__builtin_ia32_addcarry_u32">,
	Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty,
	llvm_ptr_ty], [IntrArgMemOnly]>;
	def int_x86_addcarry_u64: GCCBuiltin<"__builtin_ia32_addcarry_u64">,
	Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty,
	llvm_ptr_ty], [IntrArgMemOnly]>;
	def int_x86_subborrow_u32: GCCBuiltin<"__builtin_ia32_subborrow_u32">,
	Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty,
	llvm_ptr_ty], [IntrArgMemOnly]>;
	def int_x86_subborrow_u64: GCCBuiltin<"__builtin_ia32_subborrow_u64">,
	Intrinsic<[llvm_i8_ty], [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty,
	llvm_ptr_ty], [IntrArgMemOnly]>;
	}

	//===----------------------------------------------------------------------===//
	// RTM intrinsics. Transactional Memory support.

	// All four intrinsics are declared with an empty property list (no
	// IntrNoMem), so the optimizer must treat them as having side effects.
	//   xbegin: no operands, i32 result
	//   xend:   no operands, no result
	//   xabort: one i8 operand, no result
	//   xtest:  no operands, i32 result
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_xbegin : GCCBuiltin<"__builtin_ia32_xbegin">,
	Intrinsic<[llvm_i32_ty], [], []>;
	def int_x86_xend : GCCBuiltin<"__builtin_ia32_xend">,
	Intrinsic<[], [], []>;
	def int_x86_xabort : GCCBuiltin<"__builtin_ia32_xabort">,
	Intrinsic<[], [llvm_i8_ty], []>;
	def int_x86_xtest : GCCBuiltin<"__builtin_ia32_xtest">,
	Intrinsic<[llvm_i32_ty], [], []>;
	}

	//===----------------------------------------------------------------------===//
	// AVX512

	// Mask ops
	// Mask intrinsics; every def in this group is IntrNoMem.
	//  - kand/kandn/kor/kxor/kxnor (_w): two i16 operands, i16 result.
	//    knot_w: one i16 operand, i16 result. None of these has a GCC builtin
	//    mapping and each is flagged for removal.
	//  - kunpck_bw/_wd/_dq: two operands of i16/i32/i64 respectively, result of
	//    the same width as the operands.
	//  - kortestz_w/kortestc_w: two i16 operands, i32 result.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// Mask instructions
	// 16-bit mask
	def int_x86_avx512_kand_w : // TODO: remove this intrinsic
	Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_kandn_w : // TODO: remove this intrinsic
	Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_knot_w : // TODO: remove this intrinsic
	Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_kor_w : // TODO: remove this intrinsic
	Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_kxor_w : // TODO: remove this intrinsic
	Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_kxnor_w : // TODO: remove this intrinsic
	Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
	[IntrNoMem]>;
	+ def int_x86_avx512_kunpck_bw : GCCBuiltin<"__builtin_ia32_kunpckhi">,
	+ Intrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty],
	+ [IntrNoMem]>;
	+ def int_x86_avx512_kunpck_wd : GCCBuiltin<"__builtin_ia32_kunpcksi">,
	+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
	+ [IntrNoMem]>;
	+ def int_x86_avx512_kunpck_dq : GCCBuiltin<"__builtin_ia32_kunpckdi">,
	+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
	+ [IntrNoMem]>;
	def int_x86_avx512_kortestz_w : GCCBuiltin<"__builtin_ia32_kortestzhi">,
	Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_kortestc_w : GCCBuiltin<"__builtin_ia32_kortestchi">,
	Intrinsic<[llvm_i32_ty], [llvm_i16_ty, llvm_i16_ty],
	[IntrNoMem]>;
	}

	// Conversion ops
	// Scalar conversions and vector-to-mask conversions; every def is IntrNoMem
	// and maps to a GCC builtin.
	//  - cvtt*2(u)si* / vcvt*2(u)si*: operands are the source vector (v4f32 or
	//    v2f64) and an i32; the result is i32 or i64.
	//  - cvt(u)si*2ss / cvt(u)si*2sd: operands are a vector of the result type,
	//    the integer (i32 or i64), and a trailing i32. cvtusi2sd is the one
	//    exception: it has no trailing i32.
	//  - cvt{b,w,d,q}2mask_*: a single integer-vector operand; the result is an
	//    integer with at least one bit per vector element (never narrower
	//    than i8).
	// NOTE(review): the i32 operand on the scalar conversions is presumably the
	// rounding / SAE control -- confirm against the lowering code.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx512_cvttss2si : GCCBuiltin<"__builtin_ia32_vcvttss2si32">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvttss2si64 : GCCBuiltin<"__builtin_ia32_vcvttss2si64">,
	Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvttss2usi : GCCBuiltin<"__builtin_ia32_vcvttss2usi32">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvttss2usi64 : GCCBuiltin<"__builtin_ia32_vcvttss2usi64">,
	Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtusi2ss : GCCBuiltin<"__builtin_ia32_cvtusi2ss32">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtusi642ss : GCCBuiltin<"__builtin_ia32_cvtusi2ss64">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvttsd2si : GCCBuiltin<"__builtin_ia32_vcvttsd2si32">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvttsd2si64 : GCCBuiltin<"__builtin_ia32_vcvttsd2si64">,
	Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvttsd2usi : GCCBuiltin<"__builtin_ia32_vcvttsd2usi32">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvttsd2usi64 : GCCBuiltin<"__builtin_ia32_vcvttsd2usi64">,
	Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtusi2sd : GCCBuiltin<"__builtin_ia32_cvtusi2sd32">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtusi642sd : GCCBuiltin<"__builtin_ia32_cvtusi2sd64">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_vcvtss2usi32 : GCCBuiltin<"__builtin_ia32_vcvtss2usi32">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_vcvtss2usi64 : GCCBuiltin<"__builtin_ia32_vcvtss2usi64">,
	Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_vcvtss2si32 : GCCBuiltin<"__builtin_ia32_vcvtss2si32">,
	Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_vcvtss2si64 : GCCBuiltin<"__builtin_ia32_vcvtss2si64">,
	Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_vcvtsd2usi32 : GCCBuiltin<"__builtin_ia32_vcvtsd2usi32">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_vcvtsd2usi64 : GCCBuiltin<"__builtin_ia32_vcvtsd2usi64">,
	Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_vcvtsd2si32 : GCCBuiltin<"__builtin_ia32_vcvtsd2si32">,
	Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_vcvtsd2si64 : GCCBuiltin<"__builtin_ia32_vcvtsd2si64">,
	Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtsi2ss32 : GCCBuiltin<"__builtin_ia32_cvtsi2ss32">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtsi2ss64 : GCCBuiltin<"__builtin_ia32_cvtsi2ss64">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
	llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtsi2sd64 : GCCBuiltin<"__builtin_ia32_cvtsi2sd64">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
	llvm_i64_ty, llvm_i32_ty], [IntrNoMem]>;

	// Vector-to-mask conversions, byte elements.
	def int_x86_avx512_cvtb2mask_128 : GCCBuiltin<"__builtin_ia32_cvtb2mask128">,
	Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtb2mask_256 : GCCBuiltin<"__builtin_ia32_cvtb2mask256">,
	Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtb2mask_512 : GCCBuiltin<"__builtin_ia32_cvtb2mask512">,
	Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty], [IntrNoMem]>;

	// Vector-to-mask conversions, 16-bit elements.
	def int_x86_avx512_cvtw2mask_128 : GCCBuiltin<"__builtin_ia32_cvtw2mask128">,
	Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtw2mask_256 : GCCBuiltin<"__builtin_ia32_cvtw2mask256">,
	Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtw2mask_512 : GCCBuiltin<"__builtin_ia32_cvtw2mask512">,
	Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty], [IntrNoMem]>;

	// Vector-to-mask conversions, 32-bit elements.
	def int_x86_avx512_cvtd2mask_128 : GCCBuiltin<"__builtin_ia32_cvtd2mask128">,
	Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtd2mask_256 : GCCBuiltin<"__builtin_ia32_cvtd2mask256">,
	Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtd2mask_512 : GCCBuiltin<"__builtin_ia32_cvtd2mask512">,
	Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty], [IntrNoMem]>;

	// Vector-to-mask conversions, 64-bit elements.
	def int_x86_avx512_cvtq2mask_128 : GCCBuiltin<"__builtin_ia32_cvtq2mask128">,
	Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtq2mask_256 : GCCBuiltin<"__builtin_ia32_cvtq2mask256">,
	Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty], [IntrNoMem]>;
	def int_x86_avx512_cvtq2mask_512 : GCCBuiltin<"__builtin_ia32_cvtq2mask512">,
	Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty], [IntrNoMem]>;

	}

	// Pack ops.
	// Four 512-bit pack intrinsics, all IntrNoMem. Each takes two operands of
	// the same vector type and returns a vector with twice as many elements at
	// half the element width (v32i16 x2 -> v64i8, v16i32 x2 -> v32i16).
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx512_packsswb_512 : GCCBuiltin<"__builtin_ia32_packsswb512">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_packssdw_512 : GCCBuiltin<"__builtin_ia32_packssdw512">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_packuswb_512 : GCCBuiltin<"__builtin_ia32_packuswb512">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_packusdw_512 : GCCBuiltin<"__builtin_ia32_packusdw512">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
	[IntrNoMem]>;
	}

	// Vector convert
	// Masked packed conversions followed by rndscale / reduce / range. Every
	// def in this group is IntrNoMem and maps to a GCC builtin.
	//
	// Operand convention of the mask_cvt* defs: the source vector, a vector of
	// the result type, and an integer mask (i8 or i16). The *_512 forms take an
	// additional trailing i32. mask_cvtsd2ss_round and mask_cvtss2sd_round
	// differ: they take a result-typed vector and the source vector first,
	// then the result-typed vector, the i8 mask, and the i32.
	// NOTE(review): the result-typed vector is presumably the pass-through
	// value for masked-off lanes and the trailing i32 the rounding mode --
	// confirm against the lowering code.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	def int_x86_avx512_mask_cvtdq2ps_128 :
	GCCBuiltin<"__builtin_ia32_cvtdq2ps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtdq2ps_256 :
	GCCBuiltin<"__builtin_ia32_cvtdq2ps256_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtdq2ps_512 :
	GCCBuiltin<"__builtin_ia32_cvtdq2ps512_mask">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16i32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2dq_128 :
	GCCBuiltin<"__builtin_ia32_cvtpd2dq128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2dq_256 :
	GCCBuiltin<"__builtin_ia32_cvtpd2dq256_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4f64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2dq_512 :
	GCCBuiltin<"__builtin_ia32_cvtpd2dq512_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2ps_256 :
	GCCBuiltin<"__builtin_ia32_cvtpd2ps256_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f64_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2ps_512 :
	GCCBuiltin<"__builtin_ia32_cvtpd2ps512_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f64_ty, llvm_v8f32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtsd2ss_round :
	GCCBuiltin<"__builtin_ia32_cvtsd2ss_round_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v2f64_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtss2sd_round :
	GCCBuiltin<"__builtin_ia32_cvtss2sd_round_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v4f32_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2ps :
	GCCBuiltin<"__builtin_ia32_cvtpd2ps_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v2f64_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2qq_128 :
	GCCBuiltin<"__builtin_ia32_cvtpd2qq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2qq_256 :
	GCCBuiltin<"__builtin_ia32_cvtpd2qq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2qq_512 :
	GCCBuiltin<"__builtin_ia32_cvtpd2qq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2udq_128 :
	GCCBuiltin<"__builtin_ia32_cvtpd2udq128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2udq_256 :
	GCCBuiltin<"__builtin_ia32_cvtpd2udq256_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4f64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2udq_512 :
	GCCBuiltin<"__builtin_ia32_cvtpd2udq512_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2uqq_128 :
	GCCBuiltin<"__builtin_ia32_cvtpd2uqq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2uqq_256 :
	GCCBuiltin<"__builtin_ia32_cvtpd2uqq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtpd2uqq_512 :
	GCCBuiltin<"__builtin_ia32_cvtpd2uqq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2dq_128 :
	GCCBuiltin<"__builtin_ia32_cvtps2dq128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2dq_256 :
	GCCBuiltin<"__builtin_ia32_cvtps2dq256_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2dq_512 :
	GCCBuiltin<"__builtin_ia32_cvtps2dq512_mask">,
	Intrinsic<[llvm_v16i32_ty],
	[llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2pd_128 :
	GCCBuiltin<"__builtin_ia32_cvtps2pd128_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v4f32_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2pd_256 :
	GCCBuiltin<"__builtin_ia32_cvtps2pd256_mask">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f32_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2pd_512 :
	GCCBuiltin<"__builtin_ia32_cvtps2pd512_mask">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f32_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2qq_128 :
	GCCBuiltin<"__builtin_ia32_cvtps2qq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2qq_256 :
	GCCBuiltin<"__builtin_ia32_cvtps2qq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2qq_512 :
	GCCBuiltin<"__builtin_ia32_cvtps2qq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2udq_128 :
	GCCBuiltin<"__builtin_ia32_cvtps2udq128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2udq_256 :
	GCCBuiltin<"__builtin_ia32_cvtps2udq256_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2udq_512 :
	GCCBuiltin<"__builtin_ia32_cvtps2udq512_mask">,
	Intrinsic<[llvm_v16i32_ty],
	[llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2uqq_128 :
	GCCBuiltin<"__builtin_ia32_cvtps2uqq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2uqq_256 :
	GCCBuiltin<"__builtin_ia32_cvtps2uqq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtps2uqq_512 :
	GCCBuiltin<"__builtin_ia32_cvtps2uqq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtqq2pd_128 :
	GCCBuiltin<"__builtin_ia32_cvtqq2pd128_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtqq2pd_256 :
	GCCBuiltin<"__builtin_ia32_cvtqq2pd256_mask">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtqq2pd_512 :
	GCCBuiltin<"__builtin_ia32_cvtqq2pd512_mask">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8i64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtqq2ps_128 :
	GCCBuiltin<"__builtin_ia32_cvtqq2ps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v2i64_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtqq2ps_256 :
	GCCBuiltin<"__builtin_ia32_cvtqq2ps256_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4i64_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtqq2ps_512 :
	GCCBuiltin<"__builtin_ia32_cvtqq2ps512_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8i64_ty, llvm_v8f32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2dq_128 :
	GCCBuiltin<"__builtin_ia32_cvttpd2dq128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2dq_256 :
	GCCBuiltin<"__builtin_ia32_cvttpd2dq256_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4f64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2dq_512 :
	GCCBuiltin<"__builtin_ia32_cvttpd2dq512_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2qq_128 :
	GCCBuiltin<"__builtin_ia32_cvttpd2qq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2qq_256 :
	GCCBuiltin<"__builtin_ia32_cvttpd2qq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2qq_512 :
	GCCBuiltin<"__builtin_ia32_cvttpd2qq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2udq_128 :
	GCCBuiltin<"__builtin_ia32_cvttpd2udq128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2udq_256 :
	GCCBuiltin<"__builtin_ia32_cvttpd2udq256_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4f64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2udq_512 :
	GCCBuiltin<"__builtin_ia32_cvttpd2udq512_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2uqq_128 :
	GCCBuiltin<"__builtin_ia32_cvttpd2uqq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2uqq_256 :
	GCCBuiltin<"__builtin_ia32_cvttpd2uqq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttpd2uqq_512 :
	GCCBuiltin<"__builtin_ia32_cvttpd2uqq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2dq_128 :
	GCCBuiltin<"__builtin_ia32_cvttps2dq128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2dq_256 :
	GCCBuiltin<"__builtin_ia32_cvttps2dq256_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2dq_512 :
	GCCBuiltin<"__builtin_ia32_cvttps2dq512_mask">,
	Intrinsic<[llvm_v16i32_ty],
	[llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2qq_128 :
	GCCBuiltin<"__builtin_ia32_cvttps2qq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2qq_256 :
	GCCBuiltin<"__builtin_ia32_cvttps2qq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2qq_512 :
	GCCBuiltin<"__builtin_ia32_cvttps2qq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2udq_128 :
	GCCBuiltin<"__builtin_ia32_cvttps2udq128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2udq_256 :
	GCCBuiltin<"__builtin_ia32_cvttps2udq256_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2udq_512 :
	GCCBuiltin<"__builtin_ia32_cvttps2udq512_mask">,
	Intrinsic<[llvm_v16i32_ty],
	[llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2uqq_128 :
	GCCBuiltin<"__builtin_ia32_cvttps2uqq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2uqq_256 :
	GCCBuiltin<"__builtin_ia32_cvttps2uqq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvttps2uqq_512 :
	GCCBuiltin<"__builtin_ia32_cvttps2uqq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtudq2ps_128 :
	GCCBuiltin<"__builtin_ia32_cvtudq2ps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtudq2ps_256 :
	GCCBuiltin<"__builtin_ia32_cvtudq2ps256_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtudq2ps_512 :
	GCCBuiltin<"__builtin_ia32_cvtudq2ps512_mask">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16i32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtuqq2pd_128 :
	GCCBuiltin<"__builtin_ia32_cvtuqq2pd128_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtuqq2pd_256 :
	GCCBuiltin<"__builtin_ia32_cvtuqq2pd256_mask">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtuqq2pd_512 :
	GCCBuiltin<"__builtin_ia32_cvtuqq2pd512_mask">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8i64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtuqq2ps_128 :
	GCCBuiltin<"__builtin_ia32_cvtuqq2ps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v2i64_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtuqq2ps_256 :
	GCCBuiltin<"__builtin_ia32_cvtuqq2ps256_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4i64_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_cvtuqq2ps_512 :
	GCCBuiltin<"__builtin_ia32_cvtuqq2ps512_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8i64_ty, llvm_v8f32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;

	// rndscale / reduce: operands are the source vector, an i32, a vector of
	// the result type, and the mask; the *_512 forms add a trailing i32.
	// range: same layout, but with two source vectors ahead of the first i32.
	// NOTE(review): the first i32 is presumably the instruction's immediate
	// control byte -- confirm against the lowering code.
	def int_x86_avx512_mask_rndscale_pd_128 : GCCBuiltin<"__builtin_ia32_rndscalepd_128_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty,
	llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_rndscale_pd_256 : GCCBuiltin<"__builtin_ia32_rndscalepd_256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty,
	llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_rndscale_pd_512 : GCCBuiltin<"__builtin_ia32_rndscalepd_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_rndscale_ps_128 : GCCBuiltin<"__builtin_ia32_rndscaleps_128_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty,
	llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_rndscale_ps_256 : GCCBuiltin<"__builtin_ia32_rndscaleps_256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty,
	llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_rndscale_ps_512 : GCCBuiltin<"__builtin_ia32_rndscaleps_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty,
	llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_reduce_pd_128 : GCCBuiltin<"__builtin_ia32_reducepd128_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty,
	llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_reduce_pd_256 : GCCBuiltin<"__builtin_ia32_reducepd256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty,
	llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_reduce_pd_512 : GCCBuiltin<"__builtin_ia32_reducepd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_reduce_ps_128 : GCCBuiltin<"__builtin_ia32_reduceps128_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty,
	llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_reduce_ps_256 : GCCBuiltin<"__builtin_ia32_reduceps256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty,
	llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_reduce_ps_512 : GCCBuiltin<"__builtin_ia32_reduceps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty,
	llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_range_pd_128 : GCCBuiltin<"__builtin_ia32_rangepd128_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty,
	llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_range_pd_256 : GCCBuiltin<"__builtin_ia32_rangepd256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i32_ty,
	llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_range_pd_512 : GCCBuiltin<"__builtin_ia32_rangepd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty,
	llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_range_ps_128 : GCCBuiltin<"__builtin_ia32_rangeps128_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty,
	llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_range_ps_256 : GCCBuiltin<"__builtin_ia32_rangeps256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i32_ty,
	llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_range_ps_512 : GCCBuiltin<"__builtin_ia32_rangeps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty,
	llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	}

	// Vector load with broadcast
	// Two kinds of broadcast intrinsic are declared here:
	//  - vbroadcast_ss_512 / vbroadcast_sd_512: a single pointer operand, a
	//    v16f32 / v8f64 result, marked IntrReadMem + IntrArgMemOnly (reads
	//    memory, and only through the pointer operand). No GCC builtin mapping.
	//  - broadcastmw_* (i16 operand -> v16i32 / v8i32 / v4i32) and
	//    broadcastmb_* (i8 operand -> v8i64 / v4i64 / v2i64): IntrNoMem, each
	//    mapped to a GCC builtin.
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// TODO: Remove the broadcast intrinsics with no gcc builtin and autoupgrade
	def int_x86_avx512_vbroadcast_ss_512 :
	Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_vbroadcast_sd_512 :
	Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_broadcastmw_512 :
	GCCBuiltin<"__builtin_ia32_broadcastmw512">,
	Intrinsic<[llvm_v16i32_ty], [llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_broadcastmw_256 :
	GCCBuiltin<"__builtin_ia32_broadcastmw256">,
	Intrinsic<[llvm_v8i32_ty], [llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_broadcastmw_128 :
	GCCBuiltin<"__builtin_ia32_broadcastmw128">,
	Intrinsic<[llvm_v4i32_ty], [llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_broadcastmb_512 :
	GCCBuiltin<"__builtin_ia32_broadcastmb512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_broadcastmb_256 :
	GCCBuiltin<"__builtin_ia32_broadcastmb256">,
	Intrinsic<[llvm_v4i64_ty], [llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_broadcastmb_128 :
	GCCBuiltin<"__builtin_ia32_broadcastmb128">,
	Intrinsic<[llvm_v2i64_ty], [llvm_i8_ty], [IntrNoMem]>;
	}

	// Arithmetic ops
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".

	def int_x86_avx512_mask_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;

	// Scalar add/div/mul/sub/max/min intrinsics bound to the
	// "__builtin_ia32_*_round_mask" builtins.
	// The ss forms use v4f32 and the sd forms use v2f64. Every record has the
	// same shape: three vectors of the result type, then an i8, then an i32,
	// with IntrNoMem as the only property.
	// NOTE(review): presumably (src1, src2, passthru, mask, rounding mode);
	// the operand roles are not visible in this file -- confirm before
	// relying on them.
	def int_x86_avx512_mask_add_ss_round : GCCBuiltin<"__builtin_ia32_addss_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_div_ss_round : GCCBuiltin<"__builtin_ia32_divss_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_mul_ss_round : GCCBuiltin<"__builtin_ia32_mulss_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_sub_ss_round : GCCBuiltin<"__builtin_ia32_subss_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_max_ss_round : GCCBuiltin<"__builtin_ia32_maxss_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_min_ss_round : GCCBuiltin<"__builtin_ia32_minss_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_add_sd_round : GCCBuiltin<"__builtin_ia32_addsd_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_div_sd_round : GCCBuiltin<"__builtin_ia32_divsd_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_mul_sd_round : GCCBuiltin<"__builtin_ia32_mulsd_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_sub_sd_round : GCCBuiltin<"__builtin_ia32_subsd_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_max_sd_round : GCCBuiltin<"__builtin_ia32_maxsd_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_min_sd_round : GCCBuiltin<"__builtin_ia32_minsd_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;

	// Scalar rndscale / range / reduce intrinsics (ss: v4f32, sd: v2f64).
	// All six records share one shape: three vectors of the result type, an
	// i8, and two trailing i32 operands; IntrNoMem only.
	// The reduce builtins are named "*_mask" rather than "*_round_mask" even
	// though their operand list matches the other four.
	// NOTE(review): the two i32 operands are presumably the immediate control
	// and the rounding/SAE control -- their order is not evident here, confirm.
	def int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_range_ss : GCCBuiltin<"__builtin_ia32_rangess128_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_range_sd : GCCBuiltin<"__builtin_ia32_rangesd128_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_reduce_ss : GCCBuiltin<"__builtin_ia32_reducess_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_reduce_sd : GCCBuiltin<"__builtin_ia32_reducesd_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty, llvm_i32_ty, llvm_i32_ty],
	[IntrNoMem]>;
	// scalef intrinsics: scalar (sd/ss) and packed 128/256/512-bit (pd/ps).
	// Every record takes three vectors of the result type followed by an
	// integer (i16 for the 16-lane ps_512 form, i8 everywhere else).
	// Only the scalar and 512-bit records carry an extra trailing i32; the
	// 128- and 256-bit records stop after the integer operand.
	// NOTE(review): integer is presumably the write mask and the trailing i32
	// the rounding control -- confirm against the backend.
	def int_x86_avx512_mask_scalef_sd : GCCBuiltin<"__builtin_ia32_scalefsd_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_scalef_ss : GCCBuiltin<"__builtin_ia32_scalefss_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_scalef_pd_128 : GCCBuiltin<"__builtin_ia32_scalefpd128_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_scalef_pd_256 : GCCBuiltin<"__builtin_ia32_scalefpd256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
	llvm_v4f64_ty, llvm_i8_ty],[IntrNoMem]>;
	def int_x86_avx512_mask_scalef_pd_512 : GCCBuiltin<"__builtin_ia32_scalefpd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_scalef_ps_128 : GCCBuiltin<"__builtin_ia32_scalefps128_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_scalef_ps_256 : GCCBuiltin<"__builtin_ia32_scalefps256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
	llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_scalef_ps_512 : GCCBuiltin<"__builtin_ia32_scalefps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;

	// sqrt intrinsics.
	// Scalar forms (ss/sd): three vectors of the result type, i8, i32.
	// NOTE(review): presumably (src1, src2, passthru, mask, rounding) --
	// confirm; the roles are not spelled out in this file.
	def int_x86_avx512_mask_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;

	// Packed forms: two vectors of the result type and an integer (i16 for
	// ps_512, i8 otherwise). Only the 512-bit records add a trailing i32.
	def int_x86_avx512_mask_sqrt_pd_128 : GCCBuiltin<"__builtin_ia32_sqrtpd128_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_sqrt_pd_256 : GCCBuiltin<"__builtin_ia32_sqrtpd256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_sqrt_pd_512 : GCCBuiltin<"__builtin_ia32_sqrtpd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_sqrt_ps_128 : GCCBuiltin<"__builtin_ia32_sqrtps128_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_sqrt_ps_256 : GCCBuiltin<"__builtin_ia32_sqrtps256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_sqrt_ps_512 : GCCBuiltin<"__builtin_ia32_sqrtps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	// fixupimm intrinsics, in "mask" and "maskz" flavours for each width.
	// Operand list, in order: two FP vectors of the result type, one integer
	// vector with the same lane count and element width (e.g. v2i64 for
	// v2f64), an i32, and an integer (i16 for ps_512, i8 otherwise).
	// The 512-bit and scalar (sd/ss) records append one more i32.
	// The mask and maskz records of a given width have identical signatures;
	// they differ only in record name and GCC builtin name.
	// NOTE(review): presumably the first i32 is the immediate, the i8/i16 the
	// write mask and the final i32 the SAE control -- confirm.
	def int_x86_avx512_mask_fixupimm_pd_128 :
	GCCBuiltin<"__builtin_ia32_fixupimmpd128_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_maskz_fixupimm_pd_128 :
	GCCBuiltin<"__builtin_ia32_fixupimmpd128_maskz">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_fixupimm_pd_256 :
	GCCBuiltin<"__builtin_ia32_fixupimmpd256_mask">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_maskz_fixupimm_pd_256 :
	GCCBuiltin<"__builtin_ia32_fixupimmpd256_maskz">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_fixupimm_pd_512 :
	GCCBuiltin<"__builtin_ia32_fixupimmpd512_mask">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_fixupimm_pd_512 :
	GCCBuiltin<"__builtin_ia32_fixupimmpd512_maskz">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_fixupimm_ps_128 :
	GCCBuiltin<"__builtin_ia32_fixupimmps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_maskz_fixupimm_ps_128 :
	GCCBuiltin<"__builtin_ia32_fixupimmps128_maskz">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_fixupimm_ps_256 :
	GCCBuiltin<"__builtin_ia32_fixupimmps256_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_maskz_fixupimm_ps_256 :
	GCCBuiltin<"__builtin_ia32_fixupimmps256_maskz">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_fixupimm_ps_512 :
	GCCBuiltin<"__builtin_ia32_fixupimmps512_mask">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16i32_ty, llvm_i32_ty,
	llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_fixupimm_ps_512 :
	GCCBuiltin<"__builtin_ia32_fixupimmps512_maskz">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16i32_ty, llvm_i32_ty,
	llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_fixupimm_sd :
	GCCBuiltin<"__builtin_ia32_fixupimmsd_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_fixupimm_sd :
	GCCBuiltin<"__builtin_ia32_fixupimmsd_maskz">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_fixupimm_ss :
	GCCBuiltin<"__builtin_ia32_fixupimmss_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_fixupimm_ss :
	GCCBuiltin<"__builtin_ia32_fixupimmss_maskz">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty,
	llvm_i32_ty], [IntrNoMem]>;
	// getexp intrinsics.
	// Packed forms: two vectors of the result type and an integer (i16 for
	// ps_512, i8 otherwise); only the 512-bit records add a trailing i32.
	// NOTE(review): presumably (src, passthru, mask[, SAE]) -- confirm.
	def int_x86_avx512_mask_getexp_pd_128 : GCCBuiltin<"__builtin_ia32_getexppd128_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_getexp_pd_256 : GCCBuiltin<"__builtin_ia32_getexppd256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_getexp_pd_512 : GCCBuiltin<"__builtin_ia32_getexppd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_getexp_ps_128 : GCCBuiltin<"__builtin_ia32_getexpps128_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_getexp_ps_256 : GCCBuiltin<"__builtin_ia32_getexpps256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_getexp_ps_512 : GCCBuiltin<"__builtin_ia32_getexpps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;

	// Scalar forms: three vectors of the result type, i8, i32.
	def int_x86_avx512_mask_getexp_ss : GCCBuiltin<"__builtin_ia32_getexpss128_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_getexp_sd : GCCBuiltin<"__builtin_ia32_getexpsd128_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;

	// getmant intrinsics.
	// Unlike the neighbouring families, the i32 operand sits in the middle of
	// the list rather than at the end:
	//   packed: (vector, i32, vector, integer [, i32 on 512-bit forms])
	//   scalar: (vector, vector, i32, vector, i8, i32)
	// The integer after the last vector is i16 for ps_512 and i8 otherwise.
	// NOTE(review): the mid-list i32 is presumably the immediate selecting
	// interval/sign control and the last vector the pass-through -- confirm.
	def int_x86_avx512_mask_getmant_pd_128 :
	GCCBuiltin<"__builtin_ia32_getmantpd128_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty,llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_getmant_pd_256 :
	GCCBuiltin<"__builtin_ia32_getmantpd256_mask">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty,llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_getmant_pd_512 :
	GCCBuiltin<"__builtin_ia32_getmantpd512_mask">,
	Intrinsic<[llvm_v8f64_ty],
	[llvm_v8f64_ty,llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty,llvm_i32_ty ],
	[IntrNoMem]>;

	def int_x86_avx512_mask_getmant_ps_128 :
	GCCBuiltin<"__builtin_ia32_getmantps128_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_getmant_ps_256 :
	GCCBuiltin<"__builtin_ia32_getmantps256_mask">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_getmant_ps_512 :
	GCCBuiltin<"__builtin_ia32_getmantps512_mask">,
	Intrinsic<[llvm_v16f32_ty],
	[llvm_v16f32_ty,llvm_i32_ty, llvm_v16f32_ty,llvm_i16_ty,llvm_i32_ty],
	[IntrNoMem]>;

	def int_x86_avx512_mask_getmant_ss :
	GCCBuiltin<"__builtin_ia32_getmantss_round_mask">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_getmant_sd :
	GCCBuiltin<"__builtin_ia32_getmantsd_round_mask">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;

	// rsqrt14 intrinsics.
	// The record names carry no "mask_" infix, but every GCC builtin name
	// ends in "_mask" and every record takes a trailing integer operand.
	// Scalar forms: three vectors of the result type plus i8.
	// NOTE(review): the trailing integer is presumably the write mask -- confirm.
	def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rsqrt14_sd : GCCBuiltin<"__builtin_ia32_rsqrt14sd_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty], [IntrNoMem]>;

	// Packed forms: two vectors of the result type plus an integer (i16 for
	// ps_512, i8 otherwise). No record in this family, including the 512-bit
	// ones, has a trailing i32.
	def int_x86_avx512_rsqrt14_pd_128 : GCCBuiltin<"__builtin_ia32_rsqrt14pd128_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rsqrt14_pd_256 : GCCBuiltin<"__builtin_ia32_rsqrt14pd256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rsqrt14_pd_512 : GCCBuiltin<"__builtin_ia32_rsqrt14pd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rsqrt14_ps_128 : GCCBuiltin<"__builtin_ia32_rsqrt14ps128_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rsqrt14_ps_256 : GCCBuiltin<"__builtin_ia32_rsqrt14ps256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rsqrt14_ps_512 : GCCBuiltin<"__builtin_ia32_rsqrt14ps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_i16_ty], [IntrNoMem]>;
	// rcp14 intrinsics; signatures mirror the rsqrt14 family exactly.
	// Scalar forms: three vectors of the result type plus i8.
	// NOTE(review): the trailing integer is presumably the write mask -- confirm.
	def int_x86_avx512_rcp14_ss : GCCBuiltin<"__builtin_ia32_rcp14ss_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rcp14_sd : GCCBuiltin<"__builtin_ia32_rcp14sd_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty], [IntrNoMem]>;

	// Packed forms: two vectors of the result type plus an integer (i16 for
	// ps_512, i8 otherwise); no trailing i32 on any width.
	def int_x86_avx512_rcp14_pd_128 : GCCBuiltin<"__builtin_ia32_rcp14pd128_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rcp14_pd_256 : GCCBuiltin<"__builtin_ia32_rcp14pd256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rcp14_pd_512 : GCCBuiltin<"__builtin_ia32_rcp14pd512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rcp14_ps_128 : GCCBuiltin<"__builtin_ia32_rcp14ps128_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rcp14_ps_256 : GCCBuiltin<"__builtin_ia32_rcp14ps256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_rcp14_ps_512 : GCCBuiltin<"__builtin_ia32_rcp14ps512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_i16_ty], [IntrNoMem]>;

	// rcp28 / exp2 / rsqrt28 intrinsics. Only 512-bit packed forms and (for
	// rcp28 and rsqrt28) scalar forms are defined in this group.
	// Packed: two vectors of the result type, an integer (i16 for ps, i8 for
	// pd) and a trailing i32.
	// NOTE(review): presumably (src, passthru, mask, SAE/rounding) -- confirm.
	def int_x86_avx512_rcp28_ps : GCCBuiltin<"__builtin_ia32_rcp28ps_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_rcp28_pd : GCCBuiltin<"__builtin_ia32_rcp28pd_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_exp2_ps : GCCBuiltin<"__builtin_ia32_exp2ps_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_exp2_pd : GCCBuiltin<"__builtin_ia32_exp2pd_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;

	// Scalar: three vectors of the result type, i8, i32.
	def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_rcp28_sd : GCCBuiltin<"__builtin_ia32_rcp28sd_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_rsqrt28_ps : GCCBuiltin<"__builtin_ia32_rsqrt28ps_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_i16_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_rsqrt28_pd : GCCBuiltin<"__builtin_ia32_rsqrt28pd_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_rsqrt28_ss : GCCBuiltin<"__builtin_ia32_rsqrt28ss_round_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_rsqrt28_sd : GCCBuiltin<"__builtin_ia32_rsqrt28sd_round_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrNoMem]>;
	// 512-bit psadbw: takes two v64i8 operands and returns v8i64. There is no
	// mask or pass-through operand. Marked Commutative in addition to
	// IntrNoMem, i.e. the two operands are declared interchangeable.
	def int_x86_avx512_psad_bw_512 : GCCBuiltin<"__builtin_ia32_psadbw512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
	[IntrNoMem, Commutative]>;
	}
	// Integer arithmetic ops
	//
	// All records in this block are IntrNoMem. In the "mask_" records the
	// trailing integer operand has exactly one bit per result element
	// (i8 for 8 lanes, i16 for 16, i32 for 32, i64 for 64), except
	// mask_pmaddw_d_128, whose 4-lane result still uses i8.
	// NOTE(review): that integer is presumably the write mask and the vector
	// operand preceding it the pass-through value -- confirm against the
	// X86 lowering tables.
	let TargetPrefix = "x86" in {
	// padds / paddus / psubs / psubus on byte (b) and word (w) elements:
	// three vectors of the result type followed by the mask integer.
	def int_x86_avx512_mask_padds_b_128 : GCCBuiltin<"__builtin_ia32_paddsb128_mask">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
	llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_padds_b_256 : GCCBuiltin<"__builtin_ia32_paddsb256_mask">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
	llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_padds_b_512 : GCCBuiltin<"__builtin_ia32_paddsb512_mask">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
	llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_padds_w_128 : GCCBuiltin<"__builtin_ia32_paddsw128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_padds_w_256 : GCCBuiltin<"__builtin_ia32_paddsw256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_padds_w_512 : GCCBuiltin<"__builtin_ia32_paddsw512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_paddus_b_128 : GCCBuiltin<"__builtin_ia32_paddusb128_mask">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
	llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_paddus_b_256 : GCCBuiltin<"__builtin_ia32_paddusb256_mask">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
	llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_paddus_b_512 : GCCBuiltin<"__builtin_ia32_paddusb512_mask">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
	llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_paddus_w_128 : GCCBuiltin<"__builtin_ia32_paddusw128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_paddus_w_256 : GCCBuiltin<"__builtin_ia32_paddusw256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_paddus_w_512 : GCCBuiltin<"__builtin_ia32_paddusw512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubs_b_128 : GCCBuiltin<"__builtin_ia32_psubsb128_mask">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
	llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubs_b_256 : GCCBuiltin<"__builtin_ia32_psubsb256_mask">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
	llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubs_b_512 : GCCBuiltin<"__builtin_ia32_psubsb512_mask">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
	llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubs_w_128 : GCCBuiltin<"__builtin_ia32_psubsw128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubs_w_256 : GCCBuiltin<"__builtin_ia32_psubsw256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubs_w_512 : GCCBuiltin<"__builtin_ia32_psubsw512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubus_b_128 : GCCBuiltin<"__builtin_ia32_psubusb128_mask">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
	llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubus_b_256 : GCCBuiltin<"__builtin_ia32_psubusb256_mask">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
	llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubus_b_512 : GCCBuiltin<"__builtin_ia32_psubusb512_mask">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
	llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubus_w_128 : GCCBuiltin<"__builtin_ia32_psubusw128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubus_w_256 : GCCBuiltin<"__builtin_ia32_psubusw256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_psubus_w_512 : GCCBuiltin<"__builtin_ia32_psubusw512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
	// pmulu_dq / pmul_dq: unmasked; two v16i32 inputs produce a v8i64 result.
	def int_x86_avx512_pmulu_dq_512 : GCCBuiltin<"__builtin_ia32_pmuludq512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
	def int_x86_avx512_pmul_dq_512 : GCCBuiltin<"__builtin_ia32_pmuldq512">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
	// pmulhu_w / pmulh_w: same shape as the saturating add/sub records above.
	def int_x86_avx512_mask_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pmulh_w_512 : GCCBuiltin<"__builtin_ia32_pmulhw512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pmulhu_w_128 : GCCBuiltin<"__builtin_ia32_pmulhuw128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pmulhu_w_256 : GCCBuiltin<"__builtin_ia32_pmulhuw256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pmulh_w_128 : GCCBuiltin<"__builtin_ia32_pmulhw128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_pmulh_w_256 : GCCBuiltin<"__builtin_ia32_pmulhw256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	// pmaddw_d: two i16 vectors in, an i32 vector with half as many lanes
	// out; the third operand has the (narrower-count) result type.
	def int_x86_avx512_mask_pmaddw_d_128 :
	GCCBuiltin<"__builtin_ia32_pmaddwd128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmaddw_d_256 :
	GCCBuiltin<"__builtin_ia32_pmaddwd256_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v16i16_ty, llvm_v16i16_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmaddw_d_512 :
	GCCBuiltin<"__builtin_ia32_pmaddwd512_mask">,
	Intrinsic<[llvm_v16i32_ty],
	[llvm_v32i16_ty, llvm_v32i16_ty, llvm_v16i32_ty, llvm_i16_ty],
	[IntrNoMem]>;
	// pmaddubs_w: two i8 vectors in, an i16 vector with half as many lanes
	// out; the third operand has the result type.
	def int_x86_avx512_mask_pmaddubs_w_128 :
	GCCBuiltin<"__builtin_ia32_pmaddubsw128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmaddubs_w_256 :
	GCCBuiltin<"__builtin_ia32_pmaddubsw256_mask">,
	Intrinsic<[llvm_v16i16_ty],
	[llvm_v32i8_ty, llvm_v32i8_ty, llvm_v16i16_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmaddubs_w_512 :
	GCCBuiltin<"__builtin_ia32_pmaddubsw512_mask">,
	Intrinsic<[llvm_v32i16_ty],
	[llvm_v64i8_ty, llvm_v64i8_ty, llvm_v32i16_ty, llvm_i32_ty],
	[IntrNoMem]>;

	// dbpsadbw: (i8 vector, i8 vector, i32, i16 vector of the result type,
	// mask integer). NOTE(review): the i32 is presumably the immediate
	// shuffle control -- confirm.
	def int_x86_avx512_mask_dbpsadbw_128 :
	GCCBuiltin<"__builtin_ia32_dbpsadbw128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_v8i16_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_dbpsadbw_256 :
	GCCBuiltin<"__builtin_ia32_dbpsadbw256_mask">,
	Intrinsic<[llvm_v16i16_ty],
	[llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty, llvm_v16i16_ty,
	llvm_i16_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_dbpsadbw_512 :
	GCCBuiltin<"__builtin_ia32_dbpsadbw512_mask">,
	Intrinsic<[llvm_v32i16_ty],
	[llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty, llvm_v32i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	}

	// Gather and Scatter ops
	let TargetPrefix = "x86" in {
	// 512-bit gathers. Operand list, in order: a vector of the result type,
	// a pointer, an index vector, an integer (i16 for the 16-lane dps/dpi
	// forms, i8 otherwise) and an i32. Properties: IntrReadMem plus
	// IntrArgMemOnly.
	// The q-indexed ps/pi records (gather_qps_512, gather_qpi_512) return
	// only 8 lanes (v8f32 / v8i32) with v8i64 indices, even though their GCC
	// builtin names end in "16sf" / "16si".
	// NOTE(review): presumably (passthru, base, index, mask, scale) --
	// confirm against the X86 backend.
	def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gathersiv8df">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
	llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_gather_dps_512 : GCCBuiltin<"__builtin_ia32_gathersiv16sf">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_ptr_ty,
	llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_gather_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherdiv8df">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
	llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_gather_qps_512 : GCCBuiltin<"__builtin_ia32_gatherdiv16sf">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty,
	llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;


	def int_x86_avx512_gather_dpq_512 : GCCBuiltin<"__builtin_ia32_gathersiv8di">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
	llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_gather_dpi_512 : GCCBuiltin<"__builtin_ia32_gathersiv16si">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
	llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_gather_qpq_512 : GCCBuiltin<"__builtin_ia32_gatherdiv8di">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
	llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_gather_qpi_512 : GCCBuiltin<"__builtin_ia32_gatherdiv16si">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty,
	llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	// gather3div* / gather3siv* records (128- and 256-bit vector types).
	// Same operand order as the 512-bit gathers: (vector of the result type,
	// pointer, index vector, i8, i32), with IntrReadMem and IntrArgMemOnly.
	// The integer operand is i8 for every record in this group.
	// Index and data lane counts do not always match:
	//   - div4_sf / div4_si: 4-lane result with a v2i64 index vector
	//   - div8_sf / div8_si: 4-lane result with a v4i64 index vector
	//   - siv2_df / siv2_di: 2-lane result with a v4i32 index vector
	// NOTE(review): presumably only the low lanes of the wider operand are
	// used in those cases -- confirm against the instruction definitions.
	def int_x86_avx512_gather3div2_df :
	GCCBuiltin<"__builtin_ia32_gather3div2df">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3div2_di :
	GCCBuiltin<"__builtin_ia32_gather3div2di">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3div4_df :
	GCCBuiltin<"__builtin_ia32_gather3div4df">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3div4_di :
	GCCBuiltin<"__builtin_ia32_gather3div4di">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3div4_sf :
	GCCBuiltin<"__builtin_ia32_gather3div4sf">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3div4_si :
	GCCBuiltin<"__builtin_ia32_gather3div4si">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3div8_sf :
	GCCBuiltin<"__builtin_ia32_gather3div8sf">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3div8_si :
	GCCBuiltin<"__builtin_ia32_gather3div8si">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3siv2_df :
	GCCBuiltin<"__builtin_ia32_gather3siv2df">,
	Intrinsic<[llvm_v2f64_ty],
	[llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3siv2_di :
	GCCBuiltin<"__builtin_ia32_gather3siv2di">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3siv4_df :
	GCCBuiltin<"__builtin_ia32_gather3siv4df">,
	Intrinsic<[llvm_v4f64_ty],
	[llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3siv4_di :
	GCCBuiltin<"__builtin_ia32_gather3siv4di">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3siv4_sf :
	GCCBuiltin<"__builtin_ia32_gather3siv4sf">,
	Intrinsic<[llvm_v4f32_ty],
	[llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3siv4_si :
	GCCBuiltin<"__builtin_ia32_gather3siv4si">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3siv8_sf :
	GCCBuiltin<"__builtin_ia32_gather3siv8sf">,
	Intrinsic<[llvm_v8f32_ty],
	[llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	def int_x86_avx512_gather3siv8_si :
	GCCBuiltin<"__builtin_ia32_gather3siv8si">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
	[IntrReadMem, IntrArgMemOnly]>;

	// scatter
	// 512-bit scatters. These records have an empty result list. Operand
	// list, in order: pointer, integer (i16 for the 16-lane dps/dpi forms,
	// i8 otherwise), index vector, data vector, i32.
	// Note the integer comes second here, whereas in the gather records it
	// follows the index vector.
	// The only property listed is IntrArgMemOnly (no IntrNoMem/IntrReadMem).
	// As with the gathers, the q-indexed ps/pi records use 8-lane data
	// (v8f32 / v8i32) despite "16sf" / "16si" in the builtin names.
	// NOTE(review): presumably (base, mask, index, data, scale) -- confirm.
	def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scattersiv8df">,
	Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
	llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_scatter_dps_512 : GCCBuiltin<"__builtin_ia32_scattersiv16sf">,
	Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty,
	llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_scatter_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterdiv8df">,
	Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
	llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_scatter_qps_512 : GCCBuiltin<"__builtin_ia32_scatterdiv16sf">,
	Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
	llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;


	def int_x86_avx512_scatter_dpq_512 : GCCBuiltin<"__builtin_ia32_scattersiv8di">,
	Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
	llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_scatter_dpi_512 : GCCBuiltin<"__builtin_ia32_scattersiv16si">,
	Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty,
	llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_scatter_qpq_512 : GCCBuiltin<"__builtin_ia32_scatterdiv8di">,
	Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,llvm_v8i64_ty, llvm_v8i64_ty,
	llvm_i32_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_scatter_qpi_512 : GCCBuiltin<"__builtin_ia32_scatterdiv16si">,
	Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_v8i32_ty,
	llvm_i32_ty],
	[IntrArgMemOnly]>;

	def int_x86_avx512_scatterdiv2_df :
	GCCBuiltin<"__builtin_ia32_scatterdiv2df">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;

	def int_x86_avx512_scatterdiv2_di :
	GCCBuiltin<"__builtin_ia32_scatterdiv2di">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;

	def int_x86_avx512_scatterdiv4_df :
	GCCBuiltin<"__builtin_ia32_scatterdiv4df">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;

	def int_x86_avx512_scatterdiv4_di :
	GCCBuiltin<"__builtin_ia32_scatterdiv4di">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scatterdiv4sf.
	// Operands: ptr, i8, v2i64, v4f32, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm). Note the index vector has two elements
	// while the data vector has four.
	def int_x86_avx512_scatterdiv4_sf
	    : GCCBuiltin<"__builtin_ia32_scatterdiv4sf">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v4f32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scatterdiv4si.
	// Operands: ptr, i8, v2i64, v4i32, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm). Note the index vector has two elements
	// while the data vector has four.
	def int_x86_avx512_scatterdiv4_si
	    : GCCBuiltin<"__builtin_ia32_scatterdiv4si">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v4i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scatterdiv8sf.
	// Operands: ptr, i8, v4i64, v4f32, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm).
	def int_x86_avx512_scatterdiv8_sf
	    : GCCBuiltin<"__builtin_ia32_scatterdiv8sf">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4f32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scatterdiv8si.
	// Operands: ptr, i8, v4i64, v4i32, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm).
	def int_x86_avx512_scatterdiv8_si
	    : GCCBuiltin<"__builtin_ia32_scatterdiv8si">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scattersiv2df.
	// Operands: ptr, i8, v4i32, v2f64, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm). Note the index vector has four elements
	// while the data vector has two.
	def int_x86_avx512_scattersiv2_df
	    : GCCBuiltin<"__builtin_ia32_scattersiv2df">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v2f64_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scattersiv2di.
	// Operands: ptr, i8, v4i32, v2i64, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm). Note the index vector has four elements
	// while the data vector has two.
	def int_x86_avx512_scattersiv2_di
	    : GCCBuiltin<"__builtin_ia32_scattersiv2di">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v2i64_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scattersiv4df.
	// Operands: ptr, i8, v4i32, v4f64, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm).
	def int_x86_avx512_scattersiv4_df
	    : GCCBuiltin<"__builtin_ia32_scattersiv4df">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4f64_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scattersiv4di.
	// Operands: ptr, i8, v4i32, v4i64, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm).
	def int_x86_avx512_scattersiv4_di
	    : GCCBuiltin<"__builtin_ia32_scattersiv4di">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4i64_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scattersiv4sf.
	// Operands: ptr, i8, v4i32, v4f32, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm).
	def int_x86_avx512_scattersiv4_sf
	    : GCCBuiltin<"__builtin_ia32_scattersiv4sf">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4f32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scattersiv4si.
	// Operands: ptr, i8, v4i32, v4i32, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm).
	def int_x86_avx512_scattersiv4_si
	    : GCCBuiltin<"__builtin_ia32_scattersiv4si">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scattersiv8sf.
	// Operands: ptr, i8, v8i32, v8f32, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm).
	def int_x86_avx512_scattersiv8_sf
	    : GCCBuiltin<"__builtin_ia32_scattersiv8sf">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8f32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// Void intrinsic bound to __builtin_ia32_scattersiv8si.
	// Operands: ptr, i8, v8i32, v8i32, i32 (presumably base, mask, index,
	// data, scale -- TODO confirm).
	def int_x86_avx512_scattersiv8_si
	    : GCCBuiltin<"__builtin_ia32_scattersiv8si">,
	      Intrinsic<[],
	                [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// gather prefetch
	// Void intrinsic bound to __builtin_ia32_gatherpfdpd.
	// Operands: i8, v8i32, ptr, i32, i32 (presumably mask, index, base,
	// scale, hint -- TODO confirm).
	def int_x86_avx512_gatherpf_dpd_512
	    : GCCBuiltin<"__builtin_ia32_gatherpfdpd">,
	      Intrinsic<[],
	                [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;
	// Void intrinsic bound to __builtin_ia32_gatherpfdps.
	// Operands: i16, v16i32, ptr, i32, i32 (presumably mask, index, base,
	// scale, hint -- TODO confirm).
	def int_x86_avx512_gatherpf_dps_512
	    : GCCBuiltin<"__builtin_ia32_gatherpfdps">,
	      Intrinsic<[],
	                [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;
	// Void intrinsic bound to __builtin_ia32_gatherpfqpd.
	// Operands: i8, v8i64, ptr, i32, i32 (presumably mask, index, base,
	// scale, hint -- TODO confirm).
	def int_x86_avx512_gatherpf_qpd_512
	    : GCCBuiltin<"__builtin_ia32_gatherpfqpd">,
	      Intrinsic<[],
	                [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;
	// Void intrinsic bound to __builtin_ia32_gatherpfqps.
	// Operands: i8, v8i64, ptr, i32, i32 (presumably mask, index, base,
	// scale, hint -- TODO confirm).
	def int_x86_avx512_gatherpf_qps_512
	    : GCCBuiltin<"__builtin_ia32_gatherpfqps">,
	      Intrinsic<[],
	                [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;

	// scatter prefetch
	// Void intrinsic bound to __builtin_ia32_scatterpfdpd.
	// Operands: i8, v8i32, ptr, i32, i32 (presumably mask, index, base,
	// scale, hint -- TODO confirm).
	def int_x86_avx512_scatterpf_dpd_512
	    : GCCBuiltin<"__builtin_ia32_scatterpfdpd">,
	      Intrinsic<[],
	                [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;
	// Void intrinsic bound to __builtin_ia32_scatterpfdps.
	// Operands: i16, v16i32, ptr, i32, i32 (presumably mask, index, base,
	// scale, hint -- TODO confirm).
	def int_x86_avx512_scatterpf_dps_512
	    : GCCBuiltin<"__builtin_ia32_scatterpfdps">,
	      Intrinsic<[],
	                [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;
	// Void intrinsic bound to __builtin_ia32_scatterpfqpd.
	// Operands: i8, v8i64, ptr, i32, i32 (presumably mask, index, base,
	// scale, hint -- TODO confirm).
	def int_x86_avx512_scatterpf_qpd_512
	    : GCCBuiltin<"__builtin_ia32_scatterpfqpd">,
	      Intrinsic<[],
	                [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;
	// Void intrinsic bound to __builtin_ia32_scatterpfqps.
	// Operands: i8, v8i64, ptr, i32, i32 (presumably mask, index, base,
	// scale, hint -- TODO confirm).
	def int_x86_avx512_scatterpf_qps_512
	    : GCCBuiltin<"__builtin_ia32_scatterpfqps">,
	      Intrinsic<[],
	                [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty,
	                 llvm_i32_ty],
	                [IntrArgMemOnly]>;
	}

	// AVX-512 conflict detection intrinsics (vpconflictd / vpconflictq).
	// Note: no leading-zero-count intrinsics are defined in this block.
	let TargetPrefix = "x86" in {
	  // Masked conflict intrinsics on 32-bit elements. Each def returns the
	  // same vector type as its first two operands; the last operand is an
	  // integer (i8 or i16), presumably the write mask -- TODO confirm.
	  def int_x86_avx512_mask_conflict_d_128
	      : GCCBuiltin<"__builtin_ia32_vpconflictsi_128_mask">,
	        Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
	                  [IntrNoMem]>;
	  def int_x86_avx512_mask_conflict_d_256
	      : GCCBuiltin<"__builtin_ia32_vpconflictsi_256_mask">,
	        Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
	                  [IntrNoMem]>;
	  def int_x86_avx512_mask_conflict_d_512
	      : GCCBuiltin<"__builtin_ia32_vpconflictsi_512_mask">,
	        Intrinsic<[llvm_v16i32_ty],
	                  [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
	                  [IntrNoMem]>;

	  // Same shape for 64-bit elements; the trailing integer is always i8.
	  def int_x86_avx512_mask_conflict_q_128
	      : GCCBuiltin<"__builtin_ia32_vpconflictdi_128_mask">,
	        Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
	                  [IntrNoMem]>;
	  def int_x86_avx512_mask_conflict_q_256
	      : GCCBuiltin<"__builtin_ia32_vpconflictdi_256_mask">,
	        Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
	                  [IntrNoMem]>;
	  def int_x86_avx512_mask_conflict_q_512
	      : GCCBuiltin<"__builtin_ia32_vpconflictdi_512_mask">,
	        Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
	                  [IntrNoMem]>;
	}

	// Compares
	let TargetPrefix = "x86" in {
	  // vcomi compares. Both defs return i32 and take two 128-bit vectors
	  // (v2f64 or v4f32) followed by two i32 operands -- presumably a
	  // comparison predicate and a rounding/SAE control; TODO confirm.
	  def int_x86_avx512_vcomi_sd
	      : GCCBuiltin<"__builtin_ia32_vcomisd">,
	        Intrinsic<[llvm_i32_ty],
	                  [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty],
	                  [IntrNoMem]>;
	  def int_x86_avx512_vcomi_ss
	      : GCCBuiltin<"__builtin_ia32_vcomiss">,
	        Intrinsic<[llvm_i32_ty],
	                  [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty],
	                  [IntrNoMem]>;
	}

	// Compress, Expand
	let TargetPrefix = "x86" in {
	// Masked compress / expand intrinsics. In every def of this block the
	// final operand is an integer with one bit per vector element (i8 when
	// the vector has eight or fewer elements) -- presumably the write mask;
	// TODO confirm against the X86 lowering.

	// Register compress, floating point: (vec, vec, int) -> vec, IntrNoMem.
	def int_x86_avx512_mask_compress_ps_512 :
	GCCBuiltin<"__builtin_ia32_compresssf512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_pd_512 :
	GCCBuiltin<"__builtin_ia32_compressdf512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_ps_256 :
	GCCBuiltin<"__builtin_ia32_compresssf256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_pd_256 :
	GCCBuiltin<"__builtin_ia32_compressdf256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_ps_128 :
	GCCBuiltin<"__builtin_ia32_compresssf128_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_pd_128 :
	GCCBuiltin<"__builtin_ia32_compressdf128_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty], [IntrNoMem]>;

	// Compress-store, floating point: no result; operands (ptr, vec, int);
	// IntrArgMemOnly.
	def int_x86_avx512_mask_compress_store_ps_512 :
	GCCBuiltin<"__builtin_ia32_compressstoresf512_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty,
	llvm_i16_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_pd_512 :
	GCCBuiltin<"__builtin_ia32_compressstoredf512_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty,
	llvm_i8_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_ps_256 :
	GCCBuiltin<"__builtin_ia32_compressstoresf256_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty,
	llvm_i8_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_pd_256 :
	GCCBuiltin<"__builtin_ia32_compressstoredf256_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty,
	llvm_i8_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_ps_128 :
	GCCBuiltin<"__builtin_ia32_compressstoresf128_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty,
	llvm_i8_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_pd_128 :
	GCCBuiltin<"__builtin_ia32_compressstoredf128_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty,
	llvm_i8_ty], [IntrArgMemOnly]>;

	// Register compress, 32- and 64-bit integer elements.
	def int_x86_avx512_mask_compress_d_512 :
	GCCBuiltin<"__builtin_ia32_compresssi512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_q_512 :
	GCCBuiltin<"__builtin_ia32_compressdi512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_d_256 :
	GCCBuiltin<"__builtin_ia32_compresssi256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_q_256 :
	GCCBuiltin<"__builtin_ia32_compressdi256_mask">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_d_128 :
	GCCBuiltin<"__builtin_ia32_compresssi128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_q_128 :
	GCCBuiltin<"__builtin_ia32_compressdi128_mask">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_i8_ty], [IntrNoMem]>;

	// Compress-store, 32- and 64-bit integer elements.
	def int_x86_avx512_mask_compress_store_d_512 :
	GCCBuiltin<"__builtin_ia32_compressstoresi512_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty,
	llvm_i16_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_q_512 :
	GCCBuiltin<"__builtin_ia32_compressstoredi512_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty,
	llvm_i8_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_d_256 :
	GCCBuiltin<"__builtin_ia32_compressstoresi256_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty,
	llvm_i8_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_q_256 :
	GCCBuiltin<"__builtin_ia32_compressstoredi256_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty,
	llvm_i8_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_d_128 :
	GCCBuiltin<"__builtin_ia32_compressstoresi128_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty,
	llvm_i8_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_q_128 :
	GCCBuiltin<"__builtin_ia32_compressstoredi128_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty,
	llvm_i8_ty], [IntrArgMemOnly]>;

	// Register compress, 8- and 16-bit integer elements. The trailing
	// integer grows up to i64 for the 64-element byte vector.
	def int_x86_avx512_mask_compress_b_512 :
	GCCBuiltin<"__builtin_ia32_compressqi512_mask">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
	llvm_i64_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_w_512 :
	GCCBuiltin<"__builtin_ia32_compresshi512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_b_256 :
	GCCBuiltin<"__builtin_ia32_compressqi256_mask">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_w_256 :
	GCCBuiltin<"__builtin_ia32_compresshi256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_b_128 :
	GCCBuiltin<"__builtin_ia32_compressqi128_mask">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_compress_w_128 :
	GCCBuiltin<"__builtin_ia32_compresshi128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_i8_ty], [IntrNoMem]>;

	// Compress-store, 8- and 16-bit integer elements.
	def int_x86_avx512_mask_compress_store_b_512 :
	GCCBuiltin<"__builtin_ia32_compressstoreqi512_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v64i8_ty,
	llvm_i64_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_w_512 :
	GCCBuiltin<"__builtin_ia32_compressstorehi512_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v32i16_ty,
	llvm_i32_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_b_256 :
	GCCBuiltin<"__builtin_ia32_compressstoreqi256_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty,
	llvm_i32_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_w_256 :
	GCCBuiltin<"__builtin_ia32_compressstorehi256_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v16i16_ty,
	llvm_i16_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_b_128 :
	GCCBuiltin<"__builtin_ia32_compressstoreqi128_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v16i8_ty,
	llvm_i16_ty], [IntrArgMemOnly]>;
	def int_x86_avx512_mask_compress_store_w_128 :
	GCCBuiltin<"__builtin_ia32_compressstorehi128_mask">,
	Intrinsic<[], [llvm_ptr_ty, llvm_v8i16_ty,
	llvm_i8_ty], [IntrArgMemOnly]>;

	// expand
	// Register expand, floating point: (vec, vec, int) -> vec, IntrNoMem.
	def int_x86_avx512_mask_expand_ps_512 :
	GCCBuiltin<"__builtin_ia32_expandsf512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_pd_512 :
	GCCBuiltin<"__builtin_ia32_expanddf512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_ps_256 :
	GCCBuiltin<"__builtin_ia32_expandsf256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_pd_256 :
	GCCBuiltin<"__builtin_ia32_expanddf256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_ps_128 :
	GCCBuiltin<"__builtin_ia32_expandsf128_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_pd_128 :
	GCCBuiltin<"__builtin_ia32_expanddf128_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i8_ty], [IntrNoMem]>;

	// Expand-load, floating point: (ptr, vec, int) -> vec; marked
	// IntrReadMem and IntrArgMemOnly.
	def int_x86_avx512_mask_expand_load_ps_512 :
	GCCBuiltin<"__builtin_ia32_expandloadsf512_mask">,
	Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty,
	llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_pd_512 :
	GCCBuiltin<"__builtin_ia32_expandloaddf512_mask">,
	Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty,
	llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_ps_256 :
	GCCBuiltin<"__builtin_ia32_expandloadsf256_mask">,
	Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8f32_ty,
	llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_pd_256 :
	GCCBuiltin<"__builtin_ia32_expandloaddf256_mask">,
	Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4f64_ty,
	llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_ps_128 :
	GCCBuiltin<"__builtin_ia32_expandloadsf128_mask">,
	Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4f32_ty,
	llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_pd_128 :
	GCCBuiltin<"__builtin_ia32_expandloaddf128_mask">,
	Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2f64_ty,
	llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;

	// Register expand, 32- and 64-bit integer elements.
	def int_x86_avx512_mask_expand_d_512 :
	GCCBuiltin<"__builtin_ia32_expandsi512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_q_512 :
	GCCBuiltin<"__builtin_ia32_expanddi512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_d_256 :
	GCCBuiltin<"__builtin_ia32_expandsi256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_q_256 :
	GCCBuiltin<"__builtin_ia32_expanddi256_mask">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_d_128 :
	GCCBuiltin<"__builtin_ia32_expandsi128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_q_128 :
	GCCBuiltin<"__builtin_ia32_expanddi128_mask">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_i8_ty], [IntrNoMem]>;

	// Expand-load, 32- and 64-bit integer elements.
	def int_x86_avx512_mask_expand_load_d_512 :
	GCCBuiltin<"__builtin_ia32_expandloadsi512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_ptr_ty, llvm_v16i32_ty,
	llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_q_512 :
	GCCBuiltin<"__builtin_ia32_expandloaddi512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty, llvm_v8i64_ty,
	llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_d_256 :
	GCCBuiltin<"__builtin_ia32_expandloadsi256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty,
	llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_q_256 :
	GCCBuiltin<"__builtin_ia32_expandloaddi256_mask">,
	Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty,
	llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_d_128 :
	GCCBuiltin<"__builtin_ia32_expandloadsi128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty,
	llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_q_128 :
	GCCBuiltin<"__builtin_ia32_expandloaddi128_mask">,
	Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty,
	llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;

	// Register expand, 8- and 16-bit integer elements.
	def int_x86_avx512_mask_expand_b_512 :
	GCCBuiltin<"__builtin_ia32_expandqi512_mask">,
	Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
	llvm_i64_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_w_512 :
	GCCBuiltin<"__builtin_ia32_expandhi512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_b_256 :
	GCCBuiltin<"__builtin_ia32_expandqi256_mask">,
	Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_w_256 :
	GCCBuiltin<"__builtin_ia32_expandhi256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_b_128 :
	GCCBuiltin<"__builtin_ia32_expandqi128_mask">,
	Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_expand_w_128 :
	GCCBuiltin<"__builtin_ia32_expandhi128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_i8_ty], [IntrNoMem]>;

	// Expand-load, 8- and 16-bit integer elements.
	def int_x86_avx512_mask_expand_load_b_512 :
	GCCBuiltin<"__builtin_ia32_expandloadqi512_mask">,
	Intrinsic<[llvm_v64i8_ty], [llvm_ptr_ty, llvm_v64i8_ty,
	llvm_i64_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_w_512 :
	GCCBuiltin<"__builtin_ia32_expandloadhi512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_ptr_ty, llvm_v32i16_ty,
	llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_b_256 :
	GCCBuiltin<"__builtin_ia32_expandloadqi256_mask">,
	Intrinsic<[llvm_v32i8_ty], [llvm_ptr_ty, llvm_v32i8_ty,
	llvm_i32_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_w_256 :
	GCCBuiltin<"__builtin_ia32_expandloadhi256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_ptr_ty, llvm_v16i16_ty,
	llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_b_128 :
	GCCBuiltin<"__builtin_ia32_expandloadqi128_mask">,
	Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty, llvm_v16i8_ty,
	llvm_i16_ty], [IntrReadMem, IntrArgMemOnly]>;
	def int_x86_avx512_mask_expand_load_w_128 :
	GCCBuiltin<"__builtin_ia32_expandloadhi128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_ptr_ty, llvm_v8i16_ty,
	llvm_i8_ty], [IntrReadMem, IntrArgMemOnly]>;
	}

	// VBMI2 Concat & Shift
	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
	// Every def in this block is IntrNoMem and returns the same vector type
	// as its vector operands. The final operand is an integer with one bit
	// per vector element (i8 when there are eight or fewer elements) --
	// presumably the write mask; TODO confirm.

	// vpshld with an i32 third operand (presumably the immediate shift
	// count -- TODO confirm): (vec, vec, i32, vec, int) -> vec.
	def int_x86_avx512_mask_vpshld_q_512 :
	GCCBuiltin<"__builtin_ia32_vpshldq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_v8i64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshld_q_256 :
	GCCBuiltin<"__builtin_ia32_vpshldq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_v4i64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshld_q_128 :
	GCCBuiltin<"__builtin_ia32_vpshldq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v2i64_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpshld_d_512 :
	GCCBuiltin<"__builtin_ia32_vpshldd512_mask">,
	Intrinsic<[llvm_v16i32_ty],
	[llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_v16i32_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshld_d_256 :
	GCCBuiltin<"__builtin_ia32_vpshldd256_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_v8i32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshld_d_128 :
	GCCBuiltin<"__builtin_ia32_vpshldd128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpshld_w_512 :
	GCCBuiltin<"__builtin_ia32_vpshldw512_mask">,
	Intrinsic<[llvm_v32i16_ty],
	[llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_v32i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshld_w_256 :
	GCCBuiltin<"__builtin_ia32_vpshldw256_mask">,
	Intrinsic<[llvm_v16i16_ty],
	[llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty, llvm_v16i16_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshld_w_128 :
	GCCBuiltin<"__builtin_ia32_vpshldw128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty,
	llvm_i8_ty], [IntrNoMem]>;

	// vpshrd: same operand layout as the vpshld defs above.
	def int_x86_avx512_mask_vpshrd_q_512 :
	GCCBuiltin<"__builtin_ia32_vpshrdq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_v8i64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrd_q_256 :
	GCCBuiltin<"__builtin_ia32_vpshrdq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_v4i64_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrd_q_128 :
	GCCBuiltin<"__builtin_ia32_vpshrdq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_v2i64_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpshrd_d_512 :
	GCCBuiltin<"__builtin_ia32_vpshrdd512_mask">,
	Intrinsic<[llvm_v16i32_ty],
	[llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_v16i32_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrd_d_256 :
	GCCBuiltin<"__builtin_ia32_vpshrdd256_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_v8i32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrd_d_128 :
	GCCBuiltin<"__builtin_ia32_vpshrdd128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_v4i32_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpshrd_w_512 :
	GCCBuiltin<"__builtin_ia32_vpshrdw512_mask">,
	Intrinsic<[llvm_v32i16_ty],
	[llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty, llvm_v32i16_ty,
	llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrd_w_256 :
	GCCBuiltin<"__builtin_ia32_vpshrdw256_mask">,
	Intrinsic<[llvm_v16i16_ty],
	[llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty, llvm_v16i16_ty,
	llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrd_w_128 :
	GCCBuiltin<"__builtin_ia32_vpshrdw128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty, llvm_v8i16_ty,
	llvm_i8_ty], [IntrNoMem]>;

	// vpshldv: three vector operands plus the trailing integer,
	// (vec, vec, vec, int) -> vec. Each width is defined twice, as a
	// "mask" and a "maskz" variant with identical signatures.
	def int_x86_avx512_mask_vpshldv_w_128 :
	GCCBuiltin<"__builtin_ia32_vpshldvw128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshldv_w_128 :
	GCCBuiltin<"__builtin_ia32_vpshldvw128_maskz">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshldv_w_256 :
	GCCBuiltin<"__builtin_ia32_vpshldvw256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshldv_w_256 :
	GCCBuiltin<"__builtin_ia32_vpshldvw256_maskz">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshldv_w_512 :
	GCCBuiltin<"__builtin_ia32_vpshldvw512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshldv_w_512 :
	GCCBuiltin<"__builtin_ia32_vpshldvw512_maskz">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpshldv_q_128 :
	GCCBuiltin<"__builtin_ia32_vpshldvq128_mask">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshldv_q_128 :
	GCCBuiltin<"__builtin_ia32_vpshldvq128_maskz">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshldv_q_256 :
	GCCBuiltin<"__builtin_ia32_vpshldvq256_mask">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
	llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshldv_q_256 :
	GCCBuiltin<"__builtin_ia32_vpshldvq256_maskz">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
	llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshldv_q_512 :
	GCCBuiltin<"__builtin_ia32_vpshldvq512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshldv_q_512 :
	GCCBuiltin<"__builtin_ia32_vpshldvq512_maskz">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpshldv_d_128 :
	GCCBuiltin<"__builtin_ia32_vpshldvd128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshldv_d_128 :
	GCCBuiltin<"__builtin_ia32_vpshldvd128_maskz">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshldv_d_256 :
	GCCBuiltin<"__builtin_ia32_vpshldvd256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshldv_d_256 :
	GCCBuiltin<"__builtin_ia32_vpshldvd256_maskz">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshldv_d_512 :
	GCCBuiltin<"__builtin_ia32_vpshldvd512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshldv_d_512 :
	GCCBuiltin<"__builtin_ia32_vpshldvd512_maskz">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;

	// vpshrdv: same operand layout and mask/maskz pairing as vpshldv.
	def int_x86_avx512_mask_vpshrdv_w_128 :
	GCCBuiltin<"__builtin_ia32_vpshrdvw128_mask">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshrdv_w_128 :
	GCCBuiltin<"__builtin_ia32_vpshrdvw128_maskz">,
	Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
	llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrdv_w_256 :
	GCCBuiltin<"__builtin_ia32_vpshrdvw256_mask">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshrdv_w_256 :
	GCCBuiltin<"__builtin_ia32_vpshrdvw256_maskz">,
	Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
	llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrdv_w_512 :
	GCCBuiltin<"__builtin_ia32_vpshrdvw512_mask">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshrdv_w_512 :
	GCCBuiltin<"__builtin_ia32_vpshrdvw512_maskz">,
	Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
	llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpshrdv_q_128 :
	GCCBuiltin<"__builtin_ia32_vpshrdvq128_mask">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshrdv_q_128 :
	GCCBuiltin<"__builtin_ia32_vpshrdvq128_maskz">,
	Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
	llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrdv_q_256 :
	GCCBuiltin<"__builtin_ia32_vpshrdvq256_mask">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
	llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshrdv_q_256 :
	GCCBuiltin<"__builtin_ia32_vpshrdvq256_maskz">,
	Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
	llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrdv_q_512 :
	GCCBuiltin<"__builtin_ia32_vpshrdvq512_mask">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshrdv_q_512 :
	GCCBuiltin<"__builtin_ia32_vpshrdvq512_maskz">,
	Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
	llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_vpshrdv_d_128 :
	GCCBuiltin<"__builtin_ia32_vpshrdvd128_mask">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshrdv_d_128 :
	GCCBuiltin<"__builtin_ia32_vpshrdvd128_maskz">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
	llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrdv_d_256 :
	GCCBuiltin<"__builtin_ia32_vpshrdvd256_mask">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshrdv_d_256 :
	GCCBuiltin<"__builtin_ia32_vpshrdvd256_maskz">,
	Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
	llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_vpshrdv_d_512 :
	GCCBuiltin<"__builtin_ia32_vpshrdvd512_mask">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	def int_x86_avx512_maskz_vpshrdv_d_512 :
	GCCBuiltin<"__builtin_ia32_vpshrdvd512_maskz">,
	Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
	llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
	}

	// truncate
	let TargetPrefix = "x86" in {
	// Register form: (v2i64, v16i8, i8) -> v16i8, IntrNoMem.
	// Operands presumably source, pass-through, mask -- TODO confirm.
	def int_x86_avx512_mask_pmov_qb_128
	    : GCCBuiltin<"__builtin_ia32_pmovqb128_mask">,
	      Intrinsic<[llvm_v16i8_ty], [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty],
	                [IntrNoMem]>;
	// Memory form: no result; operands (ptr, v2i64, i8), IntrArgMemOnly.
	// Presumably destination pointer, source, mask -- TODO confirm.
	def int_x86_avx512_mask_pmov_qb_mem_128
	    : GCCBuiltin<"__builtin_ia32_pmovqb128mem_mask">,
	      Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
	                [IntrArgMemOnly]>;
	// Register form: (v2i64, v16i8, i8) -> v16i8, IntrNoMem.
	// Operands presumably source, pass-through, mask -- TODO confirm.
	def int_x86_avx512_mask_pmovs_qb_128
	    : GCCBuiltin<"__builtin_ia32_pmovsqb128_mask">,
	      Intrinsic<[llvm_v16i8_ty], [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty],
	                [IntrNoMem]>;
	// Memory form: no result; operands (ptr, v2i64, i8), IntrArgMemOnly.
	// Presumably destination pointer, source, mask -- TODO confirm.
	def int_x86_avx512_mask_pmovs_qb_mem_128
	    : GCCBuiltin<"__builtin_ia32_pmovsqb128mem_mask">,
	      Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
	                [IntrArgMemOnly]>;
	// Register form: (v2i64, v16i8, i8) -> v16i8, IntrNoMem.
	// Operands presumably source, pass-through, mask -- TODO confirm.
	def int_x86_avx512_mask_pmovus_qb_128
	    : GCCBuiltin<"__builtin_ia32_pmovusqb128_mask">,
	      Intrinsic<[llvm_v16i8_ty], [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty],
	                [IntrNoMem]>;
	// Memory form: no result; operands (ptr, v2i64, i8), IntrArgMemOnly.
	// Presumably destination pointer, source, mask -- TODO confirm.
	def int_x86_avx512_mask_pmovus_qb_mem_128
	    : GCCBuiltin<"__builtin_ia32_pmovusqb128mem_mask">,
	      Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
	                [IntrArgMemOnly]>;
	// Register form: (v4i64, v16i8, i8) -> v16i8, IntrNoMem.
	// Operands presumably source, pass-through, mask -- TODO confirm.
	def int_x86_avx512_mask_pmov_qb_256
	    : GCCBuiltin<"__builtin_ia32_pmovqb256_mask">,
	      Intrinsic<[llvm_v16i8_ty], [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty],
	                [IntrNoMem]>;
	// Memory form: no result; operands (ptr, v4i64, i8), IntrArgMemOnly.
	// Presumably destination pointer, source, mask -- TODO confirm.
	def int_x86_avx512_mask_pmov_qb_mem_256
	    : GCCBuiltin<"__builtin_ia32_pmovqb256mem_mask">,
	      Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
	                [IntrArgMemOnly]>;
	// Register form: (v4i64, v16i8, i8) -> v16i8, IntrNoMem.
	// Operands presumably source, pass-through, mask -- TODO confirm.
	def int_x86_avx512_mask_pmovs_qb_256
	    : GCCBuiltin<"__builtin_ia32_pmovsqb256_mask">,
	      Intrinsic<[llvm_v16i8_ty], [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty],
	                [IntrNoMem]>;
	// Memory form: no result; operands (ptr, v4i64, i8), IntrArgMemOnly.
	// Presumably destination pointer, source, mask -- TODO confirm.
	def int_x86_avx512_mask_pmovs_qb_mem_256
	    : GCCBuiltin<"__builtin_ia32_pmovsqb256mem_mask">,
	      Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
	                [IntrArgMemOnly]>;
	// Register form: (v4i64, v16i8, i8) -> v16i8, IntrNoMem.
	// Operands presumably source, pass-through, mask -- TODO confirm.
	def int_x86_avx512_mask_pmovus_qb_256
	    : GCCBuiltin<"__builtin_ia32_pmovusqb256_mask">,
	      Intrinsic<[llvm_v16i8_ty], [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty],
	                [IntrNoMem]>;
	// Memory form: no result; operands (ptr, v4i64, i8), IntrArgMemOnly.
	// Presumably destination pointer, source, mask -- TODO confirm.
	def int_x86_avx512_mask_pmovus_qb_mem_256
	    : GCCBuiltin<"__builtin_ia32_pmovusqb256mem_mask">,
	      Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
	                [IntrArgMemOnly]>;
	// Register form: (v8i64, v16i8, i8) -> v16i8, IntrNoMem.
	// Operands presumably source, pass-through, mask -- TODO confirm.
	def int_x86_avx512_mask_pmov_qb_512
	    : GCCBuiltin<"__builtin_ia32_pmovqb512_mask">,
	      Intrinsic<[llvm_v16i8_ty], [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty],
	                [IntrNoMem]>;
	// Memory form: no result; operands (ptr, v8i64, i8), IntrArgMemOnly.
	// Presumably destination pointer, source, mask -- TODO confirm.
	def int_x86_avx512_mask_pmov_qb_mem_512
	    : GCCBuiltin<"__builtin_ia32_pmovqb512mem_mask">,
	      Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
	                [IntrArgMemOnly]>;
	// Register form: (v8i64, v16i8, i8) -> v16i8, IntrNoMem.
	// Operands presumably source, pass-through, mask -- TODO confirm.
	def int_x86_avx512_mask_pmovs_qb_512
	    : GCCBuiltin<"__builtin_ia32_pmovsqb512_mask">,
	      Intrinsic<[llvm_v16i8_ty], [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty],
	                [IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_qb_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovsqb512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_qb_512 :
	GCCBuiltin<"__builtin_ia32_pmovusqb512_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_qb_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovusqb512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_qw_128 :
	GCCBuiltin<"__builtin_ia32_pmovqw128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_qw_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovqw128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_qw_128 :
	GCCBuiltin<"__builtin_ia32_pmovsqw128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_qw_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovsqw128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_qw_128 :
	GCCBuiltin<"__builtin_ia32_pmovusqw128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_qw_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovusqw128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_qw_256 :
	GCCBuiltin<"__builtin_ia32_pmovqw256_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_qw_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovqw256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_qw_256 :
	GCCBuiltin<"__builtin_ia32_pmovsqw256_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_qw_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovsqw256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_qw_256 :
	GCCBuiltin<"__builtin_ia32_pmovusqw256_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_qw_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovusqw256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_qw_512 :
	GCCBuiltin<"__builtin_ia32_pmovqw512_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_qw_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovqw512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_qw_512 :
	GCCBuiltin<"__builtin_ia32_pmovsqw512_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_qw_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovsqw512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_qw_512 :
	GCCBuiltin<"__builtin_ia32_pmovusqw512_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_qw_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovusqw512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_qd_128 :
	GCCBuiltin<"__builtin_ia32_pmovqd128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_qd_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovqd128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_qd_128 :
	GCCBuiltin<"__builtin_ia32_pmovsqd128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_qd_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovsqd128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_qd_128 :
	GCCBuiltin<"__builtin_ia32_pmovusqd128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_qd_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovusqd128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_qd_256 :
	GCCBuiltin<"__builtin_ia32_pmovqd256_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_qd_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovqd256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_qd_256 :
	GCCBuiltin<"__builtin_ia32_pmovsqd256_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_qd_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovsqd256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_qd_256 :
	GCCBuiltin<"__builtin_ia32_pmovusqd256_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_qd_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovusqd256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_qd_512 :
	GCCBuiltin<"__builtin_ia32_pmovqd512_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_qd_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovqd512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_qd_512 :
	GCCBuiltin<"__builtin_ia32_pmovsqd512_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_qd_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovsqd512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_qd_512 :
	GCCBuiltin<"__builtin_ia32_pmovusqd512_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_qd_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovusqd512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_db_128 :
	GCCBuiltin<"__builtin_ia32_pmovdb128_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_db_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovdb128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_db_128 :
	GCCBuiltin<"__builtin_ia32_pmovsdb128_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_db_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovsdb128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_db_128 :
	GCCBuiltin<"__builtin_ia32_pmovusdb128_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_db_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovusdb128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_db_256 :
	GCCBuiltin<"__builtin_ia32_pmovdb256_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_db_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovdb256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_db_256 :
	GCCBuiltin<"__builtin_ia32_pmovsdb256_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_db_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovsdb256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_db_256 :
	GCCBuiltin<"__builtin_ia32_pmovusdb256_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_db_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovusdb256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_db_512 :
	GCCBuiltin<"__builtin_ia32_pmovdb512_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_db_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovdb512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_db_512 :
	GCCBuiltin<"__builtin_ia32_pmovsdb512_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_db_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovsdb512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_db_512 :
	GCCBuiltin<"__builtin_ia32_pmovusdb512_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_db_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovusdb512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_dw_128 :
	GCCBuiltin<"__builtin_ia32_pmovdw128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_dw_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovdw128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_dw_128 :
	GCCBuiltin<"__builtin_ia32_pmovsdw128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_dw_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovsdw128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_dw_128 :
	GCCBuiltin<"__builtin_ia32_pmovusdw128_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_dw_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovusdw128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_dw_256 :
	GCCBuiltin<"__builtin_ia32_pmovdw256_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_dw_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovdw256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_dw_256 :
	GCCBuiltin<"__builtin_ia32_pmovsdw256_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_dw_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovsdw256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_dw_256 :
	GCCBuiltin<"__builtin_ia32_pmovusdw256_mask">,
	Intrinsic<[llvm_v8i16_ty],
	[llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_dw_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovusdw256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_dw_512 :
	GCCBuiltin<"__builtin_ia32_pmovdw512_mask">,
	Intrinsic<[llvm_v16i16_ty],
	[llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_dw_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovdw512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_dw_512 :
	GCCBuiltin<"__builtin_ia32_pmovsdw512_mask">,
	Intrinsic<[llvm_v16i16_ty],
	[llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_dw_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovsdw512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_dw_512 :
	GCCBuiltin<"__builtin_ia32_pmovusdw512_mask">,
	Intrinsic<[llvm_v16i16_ty],
	[llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_dw_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovusdw512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_wb_128 :
	GCCBuiltin<"__builtin_ia32_pmovwb128_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_wb_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovwb128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_wb_128 :
	GCCBuiltin<"__builtin_ia32_pmovswb128_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_wb_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovswb128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_wb_128 :
	GCCBuiltin<"__builtin_ia32_pmovuswb128_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_wb_mem_128 :
	GCCBuiltin<"__builtin_ia32_pmovuswb128mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_wb_256 :
	GCCBuiltin<"__builtin_ia32_pmovwb256_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_wb_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovwb256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_wb_256 :
	GCCBuiltin<"__builtin_ia32_pmovswb256_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_wb_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovswb256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_wb_256 :
	GCCBuiltin<"__builtin_ia32_pmovuswb256_mask">,
	Intrinsic<[llvm_v16i8_ty],
	[llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_wb_mem_256 :
	GCCBuiltin<"__builtin_ia32_pmovuswb256mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmov_wb_512 :
	GCCBuiltin<"__builtin_ia32_pmovwb512_mask">,
	Intrinsic<[llvm_v32i8_ty],
	[llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmov_wb_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovwb512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovs_wb_512 :
	GCCBuiltin<"__builtin_ia32_pmovswb512_mask">,
	Intrinsic<[llvm_v32i8_ty],
	[llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovs_wb_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovswb512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;
	def int_x86_avx512_mask_pmovus_wb_512 :
	GCCBuiltin<"__builtin_ia32_pmovuswb512_mask">,
	Intrinsic<[llvm_v32i8_ty],
	[llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty],
	[IntrNoMem]>;
	def int_x86_avx512_mask_pmovus_wb_mem_512 :
	GCCBuiltin<"__builtin_ia32_pmovuswb512mem_mask">,
	Intrinsic<[],
	[llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty],
	[IntrArgMemOnly]>;
	}

	// Bitwise ternary logic
	let TargetPrefix = "x86" in {
	def int_x86_avx512_mask_pternlog_d_128 :
	GCCBuiltin<"__builtin_ia32_pternlogd128_mask">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_pternlog_d_128 :
	GCCBuiltin<"__builtin_ia32_pternlogd128_maskz">,
	Intrinsic<[llvm_v4i32_ty],
	[llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_pternlog_d_256 :
	GCCBuiltin<"__builtin_ia32_pternlogd256_mask">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_pternlog_d_256 :
	GCCBuiltin<"__builtin_ia32_pternlogd256_maskz">,
	Intrinsic<[llvm_v8i32_ty],
	[llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_pternlog_d_512 :
	GCCBuiltin<"__builtin_ia32_pternlogd512_mask">,
	Intrinsic<[llvm_v16i32_ty],
	[llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
	llvm_i16_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_pternlog_d_512 :
	GCCBuiltin<"__builtin_ia32_pternlogd512_maskz">,
	Intrinsic<[llvm_v16i32_ty],
	[llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
	llvm_i16_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_pternlog_q_128 :
	GCCBuiltin<"__builtin_ia32_pternlogq128_mask">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_pternlog_q_128 :
	GCCBuiltin<"__builtin_ia32_pternlogq128_maskz">,
	Intrinsic<[llvm_v2i64_ty],
	[llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_pternlog_q_256 :
	GCCBuiltin<"__builtin_ia32_pternlogq256_mask">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_pternlog_q_256 :
	GCCBuiltin<"__builtin_ia32_pternlogq256_maskz">,
	Intrinsic<[llvm_v4i64_ty],
	[llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_mask_pternlog_q_512 :
	GCCBuiltin<"__builtin_ia32_pternlogq512_mask">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
	llvm_i8_ty], [IntrNoMem]>;

	def int_x86_avx512_maskz_pternlog_q_512 :
	GCCBuiltin<"__builtin_ia32_pternlogq512_maskz">,
	Intrinsic<[llvm_v8i64_ty],
	[llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
	llvm_i8_ty], [IntrNoMem]>;
	}

	// Misc.
	let TargetPrefix = "x86" in {
	def int_x86_avx512_mask_cmp_ps_512 :
	GCCBuiltin<"__builtin_ia32_cmpps512_mask">,
	Intrinsic<[llvm_i16_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
	llvm_i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_cmp_pd_512 :
	GCCBuiltin<"__builtin_ia32_cmppd512_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
	llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_cmp_ps_256 :
	GCCBuiltin<"__builtin_ia32_cmpps256_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
	llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_cmp_pd_256 :
	GCCBuiltin<"__builtin_ia32_cmppd256_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
	llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_cmp_ps_128 :
	GCCBuiltin<"__builtin_ia32_cmpps128_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_cmp_pd_128 :
	GCCBuiltin<"__builtin_ia32_cmppd128_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_cmp_ss :
	GCCBuiltin<"__builtin_ia32_cmpss_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
	llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	def int_x86_avx512_mask_cmp_sd :
	GCCBuiltin<"__builtin_ia32_cmpsd_mask">,
	Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
	llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// SHA intrinsics
	let TargetPrefix = "x86" in {
	def int_x86_sha1rnds4 : GCCBuiltin<"__builtin_ia32_sha1rnds4">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
	[IntrNoMem]>;
	def int_x86_sha1nexte : GCCBuiltin<"__builtin_ia32_sha1nexte">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_sha1msg1 : GCCBuiltin<"__builtin_ia32_sha1msg1">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_sha1msg2 : GCCBuiltin<"__builtin_ia32_sha1msg2">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_sha256rnds2 : GCCBuiltin<"__builtin_ia32_sha256rnds2">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
	[IntrNoMem]>;
	def int_x86_sha256msg1 : GCCBuiltin<"__builtin_ia32_sha256msg1">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
	def int_x86_sha256msg2 : GCCBuiltin<"__builtin_ia32_sha256msg2">,
	Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
	}

	//===----------------------------------------------------------------------===//
	// Thread synchronization ops with timer.
	let TargetPrefix = "x86" in {
	def int_x86_monitorx
	: GCCBuiltin<"__builtin_ia32_monitorx">,
	Intrinsic<[], [ llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty ], []>;
	def int_x86_mwaitx
	: GCCBuiltin<"__builtin_ia32_mwaitx">,
	Intrinsic<[], [ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty ], []>;
	}

	//===----------------------------------------------------------------------===//
	// Cache-line zero
	let TargetPrefix = "x86" in {
	def int_x86_clzero : GCCBuiltin<"__builtin_ia32_clzero">,
	Intrinsic<[], [llvm_ptr_ty], []>;
	}
	Index: head/contrib/llvm/include/llvm/MC/MCAsmMacro.h
	===================================================================
	--- head/contrib/llvm/include/llvm/MC/MCAsmMacro.h (nonexistent)
	+++ head/contrib/llvm/include/llvm/MC/MCAsmMacro.h (revision 329410)
	@@ -0,0 +1,38 @@
	+//===- MCAsmMacro.h - Assembly Macros ---------------------------*- C++ -*-===//
	+//
	+// The LLVM Compiler Infrastructure
	+//
	+// This file is distributed under the University of Illinois Open Source
	+// License. See LICENSE.TXT for details.
	+//
	+//===----------------------------------------------------------------------===//
	+
	+#ifndef LLVM_MC_MCASMMACRO_H
	+#define LLVM_MC_MCASMMACRO_H
	+
	+#include "llvm/MC/MCParser/MCAsmLexer.h"
	+
	+namespace llvm {
	+
	+struct MCAsmMacroParameter {
	+ StringRef Name;
	+ std::vector<AsmToken> Value;
	+ bool Required = false;
	+ bool Vararg = false;
	+
	+ MCAsmMacroParameter() = default;
	+};
	+
	+typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
	+struct MCAsmMacro {
	+ StringRef Name;
	+ StringRef Body;
	+ MCAsmMacroParameters Parameters;
	+
	+public:
	+ MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
	+ : Name(N), Body(B), Parameters(std::move(P)) {}
	+};
	+}; // namespace llvm
	+
	+#endif

	Property changes on: head/contrib/llvm/include/llvm/MC/MCAsmMacro.h
	___________________________________________________________________
	Added: svn:eol-style
	## -0,0 +1 ##
	+native
	\ No newline at end of property
	Added: svn:keywords
	## -0,0 +1 ##
	+FreeBSD=%H
	\ No newline at end of property
	Added: svn:mime-type
	## -0,0 +1 ##
	+text/plain
	\ No newline at end of property
	Index: head/contrib/llvm/include/llvm/MC/MCContext.h
	===================================================================
	--- head/contrib/llvm/include/llvm/MC/MCContext.h (revision 329409)
	+++ head/contrib/llvm/include/llvm/MC/MCContext.h (revision 329410)
	@@ -1,697 +1,712 @@
	//===- MCContext.h - Machine Code Context -----------------------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_MC_MCCONTEXT_H
	#define LLVM_MC_MCCONTEXT_H

	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringMap.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/BinaryFormat/Dwarf.h"
	+#include "llvm/MC/MCAsmMacro.h"
	#include "llvm/MC/MCDwarf.h"
	#include "llvm/MC/MCSubtargetInfo.h"
	#include "llvm/MC/SectionKind.h"
	#include "llvm/Support/Allocator.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <map>
	#include <memory>
	#include <string>
	#include <utility>
	#include <vector>

	namespace llvm {

	class CodeViewContext;
	class MCAsmInfo;
	class MCLabel;
	class MCObjectFileInfo;
	class MCRegisterInfo;
	class MCSection;
	class MCSectionCOFF;
	class MCSectionELF;
	class MCSectionMachO;
	class MCSectionWasm;
	class MCStreamer;
	class MCSymbol;
	class MCSymbolELF;
	class MCSymbolWasm;
	class SMLoc;
	class SourceMgr;

	/// Context object for machine code objects. This class owns all of the
	/// sections that it creates.
	///
	class MCContext {
	public:
	using SymbolTable = StringMap<MCSymbol *, BumpPtrAllocator &>;

	private:
	/// The SourceMgr for this object, if any.
	const SourceMgr *SrcMgr;

	/// The SourceMgr for inline assembly, if any.
	SourceMgr *InlineSrcMgr;

	/// The MCAsmInfo for this target.
	const MCAsmInfo *MAI;

	/// The MCRegisterInfo for this target.
	const MCRegisterInfo *MRI;

	/// The MCObjectFileInfo for this target.
	const MCObjectFileInfo *MOFI;

	std::unique_ptr<CodeViewContext> CVContext;

	/// Allocator object used for creating machine code objects.
	///
	/// We use a bump pointer allocator to avoid the need to track all allocated
	/// objects.
	BumpPtrAllocator Allocator;

	SpecificBumpPtrAllocator<MCSectionCOFF> COFFAllocator;
	SpecificBumpPtrAllocator<MCSectionELF> ELFAllocator;
	SpecificBumpPtrAllocator<MCSectionMachO> MachOAllocator;
	SpecificBumpPtrAllocator<MCSectionWasm> WasmAllocator;

	/// Bindings of names to symbols.
	SymbolTable Symbols;

	/// A mapping from a local label number and an instance count to a symbol.
	/// For example, in the assembly
	/// 1:
	/// 2:
	/// 1:
	/// We have three labels represented by the pairs (1, 0), (2, 0) and (1, 1)
	DenseMap<std::pair<unsigned, unsigned>, MCSymbol *> LocalSymbols;

	/// Keeps track of names that were used both for user-declared and
	/// artificial symbols. The value is "true" if the name has been used for a
	/// non-section symbol (there can be at most one of those, plus an unlimited
	/// number of section symbols with the same name).
	StringMap<bool, BumpPtrAllocator &> UsedNames;

	/// The next ID to dole out to an unnamed assembler temporary symbol with
	/// a given prefix.
	StringMap<unsigned> NextID;

	/// Instances of directional local labels.
	DenseMap<unsigned, MCLabel *> Instances;
	/// NextInstance() creates the next instance of the directional local label
	/// for the LocalLabelVal and adds it to the map if needed.
	unsigned NextInstance(unsigned LocalLabelVal);
	/// GetInstance() gets the current instance of the directional local label
	/// for the LocalLabelVal and adds it to the map if needed.
	unsigned GetInstance(unsigned LocalLabelVal);

	/// The file name of the log file from the environment variable
	/// AS_SECURE_LOG_FILE, which must be set before the .secure_log_unique
	/// directive is used; otherwise it is an error.
	char *SecureLogFile;
	/// The stream that gets written to for the .secure_log_unique directive.
	std::unique_ptr<raw_fd_ostream> SecureLog;
	/// Boolean toggled when .secure_log_unique / .secure_log_reset is seen to
	/// catch errors if .secure_log_unique appears twice without
	/// .secure_log_reset appearing between them.
	bool SecureLogUsed = false;

	/// The compilation directory to use for DW_AT_comp_dir.
	SmallString<128> CompilationDir;

	/// The main file name if passed in explicitly.
	std::string MainFileName;

	/// The dwarf file and directory tables from the dwarf .file directive.
	/// We now emit a line table for each compile unit. To reduce the prologue
	/// size of each line table, the files and directories used by each compile
	/// unit are separated.
	std::map<unsigned, MCDwarfLineTable> MCDwarfLineTablesCUMap;

	/// The current dwarf line information from the last dwarf .loc directive.
	MCDwarfLoc CurrentDwarfLoc;
	bool DwarfLocSeen = false;

	/// Generate dwarf debugging info for assembly source files.
	bool GenDwarfForAssembly = false;

	/// The current dwarf file number when generating dwarf debugging info for
	/// assembly source files.
	unsigned GenDwarfFileNumber = 0;

	/// Sections for generating the .debug_ranges and .debug_aranges sections.
	SetVector<MCSection *> SectionsForRanges;

	/// The information gathered from labels that will have dwarf label
	/// entries when generating dwarf assembly source files.
	std::vector<MCGenDwarfLabelEntry> MCGenDwarfLabelEntries;

	/// The string to embed in the debug information for the compile unit, if
	/// non-empty.
	StringRef DwarfDebugFlags;

	/// The string to embed in as the dwarf AT_producer for the compile unit, if
	/// non-empty.
	StringRef DwarfDebugProducer;

	/// The maximum version of dwarf that we should emit.
	uint16_t DwarfVersion = 4;

	/// Honor temporary labels, this is useful for debugging semantic
	/// differences between temporary and non-temporary labels (primarily on
	/// Darwin).
	bool AllowTemporaryLabels = true;
	bool UseNamesOnTempLabels = true;

	/// The Compile Unit ID that we are currently processing.
	unsigned DwarfCompileUnitID = 0;

	struct ELFSectionKey {
	std::string SectionName;
	StringRef GroupName;
	unsigned UniqueID;

	ELFSectionKey(StringRef SectionName, StringRef GroupName,
	unsigned UniqueID)
	: SectionName(SectionName), GroupName(GroupName), UniqueID(UniqueID) {
	}

	bool operator<(const ELFSectionKey &Other) const {
	if (SectionName != Other.SectionName)
	return SectionName < Other.SectionName;
	if (GroupName != Other.GroupName)
	return GroupName < Other.GroupName;
	return UniqueID < Other.UniqueID;
	}
	};

	struct COFFSectionKey {
	std::string SectionName;
	StringRef GroupName;
	int SelectionKey;
	unsigned UniqueID;

	COFFSectionKey(StringRef SectionName, StringRef GroupName,
	int SelectionKey, unsigned UniqueID)
	: SectionName(SectionName), GroupName(GroupName),
	SelectionKey(SelectionKey), UniqueID(UniqueID) {}

	bool operator<(const COFFSectionKey &Other) const {
	if (SectionName != Other.SectionName)
	return SectionName < Other.SectionName;
	if (GroupName != Other.GroupName)
	return GroupName < Other.GroupName;
	if (SelectionKey != Other.SelectionKey)
	return SelectionKey < Other.SelectionKey;
	return UniqueID < Other.UniqueID;
	}
	};

	struct WasmSectionKey {
	std::string SectionName;
	StringRef GroupName;
	unsigned UniqueID;

	WasmSectionKey(StringRef SectionName, StringRef GroupName,
	unsigned UniqueID)
	: SectionName(SectionName), GroupName(GroupName), UniqueID(UniqueID) {
	}

	bool operator<(const WasmSectionKey &Other) const {
	if (SectionName != Other.SectionName)
	return SectionName < Other.SectionName;
	if (GroupName != Other.GroupName)
	return GroupName < Other.GroupName;
	return UniqueID < Other.UniqueID;
	}
	};

	// Per-object-format maps from a section key to the section object stored
	// for that key. The ELF, COFF and Wasm maps are ordered by the
	// *SectionKey::operator< definitions above; the Mach-O map is keyed by
	// string.
	StringMap<MCSectionMachO *> MachOUniquingMap;
	std::map<ELFSectionKey, MCSectionELF *> ELFUniquingMap;
	std::map<COFFSectionKey, MCSectionCOFF *> COFFUniquingMap;
	std::map<WasmSectionKey, MCSectionWasm *> WasmUniquingMap;
	// NOTE(review): presumably the names of ELF relocation sections created so
	// far -- confirm against the out-of-line createELFRelSection definition.
	StringMap<bool> RelSecNames;

	// NOTE(review): presumably the storage behind getSubtargetCopy() -- confirm.
	SpecificBumpPtrAllocator<MCSubtargetInfo> MCSubtargetAllocator;

	/// Do automatic reset in destructor
	bool AutoReset;

	/// Value reported by hadError(); starts out false.
	bool HadError = false;

	// Private factory helpers; all are defined out of line.
	//
	// createSymbolImpl returns and accepts pointers, like every other symbol
	// factory in this class (MCSymbol objects are handed around by pointer).
	MCSymbol *createSymbolImpl(const StringMapEntry<bool> *Name,
	                           bool CanBeUnnamed);
	MCSymbol *createSymbol(StringRef Name, bool AlwaysAddSuffix,
	                       bool IsTemporary);

	// NOTE(review): presumably the shared backend of
	// createDirectionalLocalSymbol() and getDirectionalLocalSymbol() -- confirm
	// against the definition.
	MCSymbol *getOrCreateDirectionalLocalSymbol(unsigned LocalLabelVal,
	                                            unsigned Instance);

	// NOTE(review): presumably the allocation step behind the public
	// getELFSection() overloads -- confirm against the definition.
	MCSectionELF *createELFSectionImpl(StringRef Section, unsigned Type,
	                                   unsigned Flags, SectionKind K,
	                                   unsigned EntrySize,
	                                   const MCSymbolELF *Group,
	                                   unsigned UniqueID,
	                                   const MCSymbolELF *Associated);

	+ /// \brief Map of currently defined macros.
	+ StringMap<MCAsmMacro> MacroMap;
	+
	public:
	/// Construct a context from target description objects.
	///
	/// \p MAI, \p MRI and \p MOFI are taken by pointer, matching the pointer
	/// types returned by getAsmInfo(), getRegisterInfo() and
	/// getObjectFileInfo(). \p Mgr may be null. NOTE(review): \p DoAutoReset
	/// presumably initialises AutoReset -- the constructor body is not in this
	/// header, confirm there.
	explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI,
	                   const MCObjectFileInfo *MOFI,
	                   const SourceMgr *Mgr = nullptr, bool DoAutoReset = true);
	/// Contexts are neither copy-constructible nor copy-assignable.
	MCContext(const MCContext &) = delete;
	MCContext &operator=(const MCContext &) = delete;
	~MCContext();

	/// Return the SrcMgr pointer held by this context.
	const SourceMgr *getSourceManager() const { return SrcMgr; }

	/// Store \p SM in InlineSrcMgr.
	void setInlineSourceManager(SourceMgr *SM) { InlineSrcMgr = SM; }

	/// Return the MAI pointer held by this context.
	const MCAsmInfo *getAsmInfo() const { return MAI; }

	/// Return the MRI pointer held by this context.
	const MCRegisterInfo *getRegisterInfo() const { return MRI; }

	/// Return the MOFI pointer held by this context.
	const MCObjectFileInfo *getObjectFileInfo() const { return MOFI; }

	/// Return the CodeView context by reference (defined out of line).
	CodeViewContext &getCVContext();

	/// Overwrite the AllowTemporaryLabels flag with \p Value.
	void setAllowTemporaryLabels(bool Value) { AllowTemporaryLabels = Value; }
	/// Overwrite the UseNamesOnTempLabels flag with \p Value.
	void setUseNamesOnTempLabels(bool Value) { UseNamesOnTempLabels = Value; }

	/// \name Module Lifetime Management
	/// @{

	/// reset - return object to right after construction state to prepare
	/// to process a new module
	void reset();

	/// @}

	/// \name Symbol Management
	/// @{

	/// Create and return a new linker temporary symbol with a unique but
	/// unspecified name.
	MCSymbol *createLinkerPrivateTempSymbol();

	/// Create and return a new assembler temporary symbol with a unique but
	/// unspecified name.
	MCSymbol *createTempSymbol(bool CanBeUnnamed = true);

	MCSymbol *createTempSymbol(const Twine &Name, bool AlwaysAddSuffix,
	bool CanBeUnnamed = true);

	/// Create the definition of a directional local symbol for numbered label
	/// (used for "1:" definitions).
	MCSymbol *createDirectionalLocalSymbol(unsigned LocalLabelVal);

	/// Create and return a directional local symbol for numbered label (used
	/// for "1b" or "1f" references).
	MCSymbol *getDirectionalLocalSymbol(unsigned LocalLabelVal, bool Before);

	/// Lookup the symbol inside with the specified \p Name. If it exists,
	/// return it. If not, create a forward reference and return it.
	///
	/// \param Name - The symbol name, which must be unique across all symbols.
	MCSymbol *getOrCreateSymbol(const Twine &Name);

	/// Gets a symbol that will be defined to the final stack offset of a local
	/// variable after codegen.
	///
	/// \param Idx - The index of a local variable passed to @llvm.localescape.
	MCSymbol *getOrCreateFrameAllocSymbol(StringRef FuncName, unsigned Idx);

	MCSymbol *getOrCreateParentFrameOffsetSymbol(StringRef FuncName);

	MCSymbol *getOrCreateLSDASymbol(StringRef FuncName);

	/// Get the symbol for \p Name, or null.
	MCSymbol *lookupSymbol(const Twine &Name) const;

	/// Set value for a symbol.
	void setSymbolValue(MCStreamer &Streamer, StringRef Sym, uint64_t Val);

	/// getSymbols - Get a reference for the symbol table for clients that
	/// want to, for example, iterate over all symbols. 'const' because we
	/// still want any modifications to the table itself to use the MCContext
	/// APIs.
	const SymbolTable &getSymbols() const { return Symbols; }

	/// @}

	/// \name Section Management
	/// @{

	enum : unsigned {
	/// Pass this value as the UniqueID during section creation to get the
	/// generic section with the given name and characteristics. The usual
	/// sections such as .text use this ID.
	GenericSectionID = ~0U
	};

	/// Return the MCSection for the specified mach-o section. This requires
	/// the operands to be valid.
	MCSectionMachO *getMachOSection(StringRef Segment, StringRef Section,
	unsigned TypeAndAttributes,
	unsigned Reserved2, SectionKind K,
	const char *BeginSymName = nullptr);

	/// Overload without a Reserved2 argument: forwards to the full overload,
	/// passing 0 for Reserved2 and all other arguments unchanged.
	MCSectionMachO *getMachOSection(StringRef Segment, StringRef Section,
	unsigned TypeAndAttributes, SectionKind K,
	const char *BeginSymName = nullptr) {
	return getMachOSection(Segment, Section, TypeAndAttributes, 0, K,
	BeginSymName);
	}

	/// Convenience overload: entry size 0 and an empty group name.
	MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
	                            unsigned Flags) {
	  return getELFSection(Section, Type, Flags, 0, "");
	}

	/// Convenience overload: requests the generic section by passing
	/// GenericSectionID as the unique ID.
	MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
	                            unsigned Flags, unsigned EntrySize,
	                            const Twine &Group) {
	  // GenericSectionID is ~0U, i.e. the value the former bare ~0 converted to.
	  return getELFSection(Section, Type, Flags, EntrySize, Group,
	                       GenericSectionID);
	}

	/// Convenience overload: no associated symbol (passes nullptr).
	MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
	                            unsigned Flags, unsigned EntrySize,
	                            const Twine &Group, unsigned UniqueID) {
	  return getELFSection(Section, Type, Flags, EntrySize, Group, UniqueID,
	                       nullptr);
	}

	MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
	unsigned Flags, unsigned EntrySize,
	const Twine &Group, unsigned UniqueID,
	const MCSymbolELF *Associated);

	MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
	unsigned Flags, unsigned EntrySize,
	const MCSymbolELF *Group, unsigned UniqueID,
	const MCSymbolELF *Associated);

	/// Get a section with the provided group identifier. This section is
	/// named by concatenating \p Prefix with '.' then \p Suffix. The \p Type
	/// describes the type of the section and \p Flags are used to further
	/// configure this named section.
	MCSectionELF *getELFNamedSection(const Twine &Prefix, const Twine &Suffix,
	unsigned Type, unsigned Flags,
	unsigned EntrySize = 0);

	MCSectionELF *createELFRelSection(const Twine &Name, unsigned Type,
	unsigned Flags, unsigned EntrySize,
	const MCSymbolELF *Group,
	const MCSectionELF *RelInfoSection);

	void renameELFSection(MCSectionELF *Section, StringRef Name);

	// Returns and accepts pointers, like the neighbouring createELFRelSection()
	// (sections and group symbols are handed around by pointer in this class).
	MCSectionELF *createELFGroupSection(const MCSymbolELF *Group);

	MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics,
	SectionKind Kind, StringRef COMDATSymName,
	int Selection,
	unsigned UniqueID = GenericSectionID,
	const char *BeginSymName = nullptr);

	MCSectionCOFF *getCOFFSection(StringRef Section, unsigned Characteristics,
	SectionKind Kind,
	const char *BeginSymName = nullptr);

	MCSectionCOFF *getCOFFSection(StringRef Section);

	/// Gets or creates a section equivalent to Sec that is associated with the
	/// section containing KeySym. For example, to create a debug info section
	/// associated with an inline function, pass the normal debug info section
	/// as Sec and the function symbol as KeySym.
	MCSectionCOFF *
	getAssociativeCOFFSection(MCSectionCOFF *Sec, const MCSymbol *KeySym,
	                          unsigned UniqueID = GenericSectionID);

	/// Convenience overload: no begin-symbol name.
	MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K) {
	  return getWasmSection(Section, K, nullptr);
	}

	/// Convenience overload: empty group name and the generic section ID.
	MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
	                              const char *BeginSymName) {
	  // GenericSectionID is ~0U, i.e. the value the former bare ~0 converted to.
	  return getWasmSection(Section, K, "", GenericSectionID, BeginSymName);
	}

	/// Convenience overload: no begin-symbol name.
	MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
	                              const Twine &Group, unsigned UniqueID) {
	  return getWasmSection(Section, K, Group, UniqueID, nullptr);
	}

	MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
	const Twine &Group, unsigned UniqueID,
	const char *BeginSymName);

	MCSectionWasm *getWasmSection(const Twine &Section, SectionKind K,
	const MCSymbolWasm *Group, unsigned UniqueID,
	const char *BeginSymName);

	// Create and save a copy of STI and return a reference to the copy.
	MCSubtargetInfo &getSubtargetCopy(const MCSubtargetInfo &STI);

	/// @}

	/// \name Dwarf Management
	/// @{

	/// \brief Get the compilation directory for DW_AT_comp_dir
	/// The compilation directory should be set with \c setCompilationDir before
	/// calling this function. If it is unset, an empty string will be returned.
	StringRef getCompilationDir() const { return CompilationDir; }

	/// \brief Set the compilation directory for DW_AT_comp_dir
	void setCompilationDir(StringRef S) { CompilationDir = S.str(); }

	/// \brief Get the main file name for use in error messages and debug
	/// info. This can be set to ensure we've got the correct file name
	/// after preprocessing or for -save-temps.
	const std::string &getMainFileName() const { return MainFileName; }

	/// \brief Set the main file name and override the default.
	// Copy explicitly via str(), as setCompilationDir() does, instead of
	// relying on an implicit StringRef -> std::string conversion.
	void setMainFileName(StringRef S) { MainFileName = S.str(); }

	/// Creates an entry in the dwarf file and directory tables.
	unsigned getDwarfFile(StringRef Directory, StringRef FileName,
	unsigned FileNumber, unsigned CUID);

	bool isValidDwarfFileNumber(unsigned FileNumber, unsigned CUID = 0);

	const std::map<unsigned, MCDwarfLineTable> &getMCDwarfLineTables() const {
	return MCDwarfLineTablesCUMap;
	}

	/// Return the line table for \p CUID via the map's operator[].
	/// NOTE(review): MCDwarfLineTablesCUMap appears to be a std::map (see the
	/// return type of getMCDwarfLineTables), so this inserts a default table
	/// when \p CUID is absent -- confirm.
	MCDwarfLineTable &getMCDwarfLineTable(unsigned CUID) {
	return MCDwarfLineTablesCUMap[CUID];
	}

	/// Const lookup: never inserts. A table for \p CUID must already exist;
	/// this is only checked by the assert.
	const MCDwarfLineTable &getMCDwarfLineTable(unsigned CUID) const {
	auto I = MCDwarfLineTablesCUMap.find(CUID);
	assert(I != MCDwarfLineTablesCUMap.end());
	return I->second;
	}

	/// Return the file list of the line table for \p CUID (default CU 0).
	/// Non-const, so it goes through the non-const getMCDwarfLineTable().
	const SmallVectorImpl<MCDwarfFile> &getMCDwarfFiles(unsigned CUID = 0) {
	return getMCDwarfLineTable(CUID).getMCDwarfFiles();
	}

	/// Return the directory list of the line table for \p CUID (default CU 0).
	/// Non-const, so it goes through the non-const getMCDwarfLineTable().
	const SmallVectorImpl<std::string> &getMCDwarfDirs(unsigned CUID = 0) {
	return getMCDwarfLineTable(CUID).getMCDwarfDirs();
	}

	/// Return true if at least one per-CU line table has a non-empty file list
	/// or a non-null label; false if there are no tables or none qualifies.
	bool hasMCLineSections() const {
	  for (const auto &Table : MCDwarfLineTablesCUMap)
	    if (!Table.second.getMCDwarfFiles().empty() || Table.second.getLabel())
	      return true;
	  return false;
	}

	unsigned getDwarfCompileUnitID() { return DwarfCompileUnitID; }

	void setDwarfCompileUnitID(unsigned CUIndex) {
	DwarfCompileUnitID = CUIndex;
	}

	void setMCLineTableCompilationDir(unsigned CUID, StringRef CompilationDir) {
	getMCDwarfLineTable(CUID).setCompilationDir(CompilationDir);
	}

	/// Saves the information from the currently parsed dwarf .loc directive
	/// and sets DwarfLocSeen. When the next instruction is assembled an entry
	/// in the line number table with this information and the address of the
	/// instruction will be created.
	void setCurrentDwarfLoc(unsigned FileNum, unsigned Line, unsigned Column,
	unsigned Flags, unsigned Isa,
	unsigned Discriminator) {
	CurrentDwarfLoc.setFileNum(FileNum);
	CurrentDwarfLoc.setLine(Line);
	CurrentDwarfLoc.setColumn(Column);
	CurrentDwarfLoc.setFlags(Flags);
	CurrentDwarfLoc.setIsa(Isa);
	CurrentDwarfLoc.setDiscriminator(Discriminator);
	DwarfLocSeen = true;
	}

	void clearDwarfLocSeen() { DwarfLocSeen = false; }

	bool getDwarfLocSeen() { return DwarfLocSeen; }
	const MCDwarfLoc &getCurrentDwarfLoc() { return CurrentDwarfLoc; }

	bool getGenDwarfForAssembly() { return GenDwarfForAssembly; }
	void setGenDwarfForAssembly(bool Value) { GenDwarfForAssembly = Value; }
	unsigned getGenDwarfFileNumber() { return GenDwarfFileNumber; }

	void setGenDwarfFileNumber(unsigned FileNumber) {
	GenDwarfFileNumber = FileNumber;
	}

	const SetVector<MCSection *> &getGenDwarfSectionSyms() {
	return SectionsForRanges;
	}

	bool addGenDwarfSection(MCSection *Sec) {
	return SectionsForRanges.insert(Sec);
	}

	void finalizeDwarfSections(MCStreamer &MCOS);

	const std::vector<MCGenDwarfLabelEntry> &getMCGenDwarfLabelEntries() const {
	return MCGenDwarfLabelEntries;
	}

	void addMCGenDwarfLabelEntry(const MCGenDwarfLabelEntry &E) {
	MCGenDwarfLabelEntries.push_back(E);
	}

	void setDwarfDebugFlags(StringRef S) { DwarfDebugFlags = S; }
	StringRef getDwarfDebugFlags() { return DwarfDebugFlags; }

	void setDwarfDebugProducer(StringRef S) { DwarfDebugProducer = S; }
	StringRef getDwarfDebugProducer() { return DwarfDebugProducer; }

	dwarf::DwarfFormat getDwarfFormat() const {
	// TODO: Support DWARF64
	return dwarf::DWARF32;
	}

	void setDwarfVersion(uint16_t v) { DwarfVersion = v; }
	uint16_t getDwarfVersion() const { return DwarfVersion; }

	/// @}

	char *getSecureLogFile() { return SecureLogFile; }
	raw_fd_ostream *getSecureLog() { return SecureLog.get(); }

	void setSecureLog(std::unique_ptr<raw_fd_ostream> Value) {
	SecureLog = std::move(Value);
	}

	bool getSecureLogUsed() { return SecureLogUsed; }
	void setSecureLogUsed(bool Value) { SecureLogUsed = Value; }

	/// Allocate \p Size bytes with alignment \p Align (default 8) from the
	/// context's Allocator and return the resulting pointer.
	void *allocate(unsigned Size, unsigned Align = 8) {
	return Allocator.Allocate(Size, Align);
	}

	/// No-op: the body is empty, so \p Ptr is not released by this call.
	void deallocate(void *Ptr) {}

	bool hadError() { return HadError; }
	void reportError(SMLoc L, const Twine &Msg);
	// Unrecoverable error has occurred. Display the best diagnostic we can
	// and bail via exit(1). For now, most MC backend errors are unrecoverable.
	// FIXME: We should really do something about that.
	LLVM_ATTRIBUTE_NORETURN void reportFatalError(SMLoc L,
	const Twine &Msg);
	+
	+ const MCAsmMacro *lookupMacro(StringRef Name) {
	+ StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
	+ return (I == MacroMap.end()) ? nullptr : &I->getValue();
	+ }
	+
	+ void defineMacro(StringRef Name, MCAsmMacro Macro) {
	+ MacroMap.insert(std::make_pair(Name, std::move(Macro)));
	+ }
	+
	+ void undefineMacro(StringRef Name) { MacroMap.erase(Name); }
	};

	} // end namespace llvm

	// operator new and delete aren't allowed inside namespaces.
	// The throw specifications are mandated by the standard.
	/// \brief Placement new for using the MCContext's allocator.
	///
	/// This placement form of operator new uses the MCContext's allocator for
	/// obtaining memory. It is a non-throwing new, which means that it returns
	/// null on error. (If that is what the allocator does. The current does, so if
	/// this ever changes, this operator will have to be changed, too.)
	/// Usage looks like this (assuming there's an MCContext 'Context' in scope):
	/// \code
	/// // Default alignment (8)
	/// IntegerLiteral *Ex = new (Context) IntegerLiteral(arguments);
	/// // Specific alignment
	/// IntegerLiteral *Ex2 = new (Context, 4) IntegerLiteral(arguments);
	/// \endcode
	/// Please note that you cannot use delete on the pointer; it must be
	/// deallocated using an explicit destructor call followed by
	/// \c Context.deallocate(Ptr).
	///
	/// \param Bytes The number of bytes to allocate. Calculated by the compiler.
	/// \param C The MCContext that provides the allocator.
	/// \param Alignment The alignment of the allocated memory (if the underlying
	/// allocator supports it).
	/// \return The allocated memory. Could be NULL.
	inline void *operator new(size_t Bytes, llvm::MCContext &C,
	size_t Alignment = 8) noexcept {
	return C.allocate(Bytes, Alignment);
	}
	/// \brief Placement delete companion to the new above.
	///
	/// This operator is just a companion to the new above. There is no way of
	/// invoking it directly; see the new operator for more details. This operator
	/// is called implicitly by the compiler if a placement new expression using
	/// the MCContext throws in the object constructor.
	inline void operator delete(void *Ptr, llvm::MCContext &C, size_t) noexcept {
	C.deallocate(Ptr);
	}

	/// This placement form of operator new[] uses the MCContext's allocator for
	/// obtaining memory. It is a non-throwing new[], which means that it returns
	/// null on error.
	/// Usage looks like this (assuming there's an MCContext 'Context' in scope):
	/// \code
	/// // Default alignment (8)
	/// char *data = new (Context) char[10];
	/// // Specific alignment
	/// char *data = new (Context, 4) char[10];
	/// \endcode
	/// Please note that you cannot use delete on the pointer; it must be
	/// deallocated using an explicit destructor call followed by
	/// \c Context.deallocate(Ptr).
	///
	/// \param Bytes The number of bytes to allocate. Calculated by the compiler.
	/// \param C The MCContext that provides the allocator.
	/// \param Alignment The alignment of the allocated memory (if the underlying
	/// allocator supports it).
	/// \return The allocated memory. Could be NULL.
	inline void *operator new[](size_t Bytes, llvm::MCContext &C,
	size_t Alignment = 8) noexcept {
	return C.allocate(Bytes, Alignment);
	}

	/// \brief Placement delete[] companion to the new[] above.
	///
	/// This operator is just a companion to the new[] above. There is no way of
	/// invoking it directly; see the new[] operator for more details. This operator
	/// is called implicitly by the compiler if a placement new[] expression using
	/// the MCContext throws in the object constructor.
	// NOTE(review): operator new[] above is declared as
	// (size_t, MCContext &, size_t), whereas this overload takes only
	// (void *, MCContext &). A placement deallocation function is matched by
	// parameter list, so this one may not be selected when a constructor throws
	// inside a placement new[] expression. MCContext::deallocate has an empty
	// body, so the observable behaviour is the same either way -- confirm
	// before relying on it.
	inline void operator delete[](void *Ptr, llvm::MCContext &C) noexcept {
	C.deallocate(Ptr);
	}

	#endif // LLVM_MC_MCCONTEXT_H
	Index: head/contrib/llvm/include/llvm/Support/GenericDomTreeConstruction.h
	===================================================================
	--- head/contrib/llvm/include/llvm/Support/GenericDomTreeConstruction.h (revision 329409)
	+++ head/contrib/llvm/include/llvm/Support/GenericDomTreeConstruction.h (revision 329410)
	@@ -1,1630 +1,1626 @@
	//===- GenericDomTreeConstruction.h - Dominator Calculation ------*- C++ -*-==//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	/// \file
	///
	/// Generic dominator tree construction - This file provides routines to
	/// construct immediate dominator information for a flow-graph based on the
	/// Semi-NCA algorithm described in this dissertation:
	///
	/// Linear-Time Algorithms for Dominators and Related Problems
	/// Loukas Georgiadis, Princeton University, November 2005, pp. 21-23:
	/// ftp://ftp.cs.princeton.edu/reports/2005/737.pdf
	///
	/// This implements the O(n*log(n)) versions of EVAL and LINK, because it turns
	/// out that the theoretically slower O(n*log(n)) implementation is actually
	/// faster than the almost-linear O(n*alpha(n)) version, even for large CFGs.
	///
	/// The file uses the Depth Based Search algorithm to perform incremental
	/// updates (insertion and deletions). The implemented algorithm is based on
	/// this publication:
	///
	/// An Experimental Study of Dynamic Dominators
	/// Loukas Georgiadis, et al., April 12 2016, pp. 5-7, 9-10:
	/// https://arxiv.org/pdf/1604.02711.pdf
	///
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H
	#define LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H

	#include <queue>
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/DepthFirstIterator.h"
	#include "llvm/ADT/PointerIntPair.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/GenericDomTree.h"

	#define DEBUG_TYPE "dom-tree-builder"

	namespace llvm {
	namespace DomTreeBuilder {

	template <typename DomTreeT>
	struct SemiNCAInfo {
	using NodePtr = typename DomTreeT::NodePtr;
	using NodeT = typename DomTreeT::NodeType;
	using TreeNodePtr = DomTreeNodeBase<NodeT> *;
	using RootsT = decltype(DomTreeT::Roots);
	static constexpr bool IsPostDom = DomTreeT::IsPostDominator;

	// Information record used by Semi-NCA during tree construction.
	struct InfoRec {
	// DFS visitation number assigned by runDFS; 0 means "not visited yet".
	unsigned DFSNum = 0;
	// DFS number of the node this one was reached from in runDFS. eval()
	// overwrites it while compressing paths.
	unsigned Parent = 0;
	// Starts equal to DFSNum in runDFS; runSemiNCA then lowers it to the
	// smallest value found over the node's ReverseChildren.
	unsigned Semi = 0;
	// Starts as the node itself in runDFS; eval() may replace it with an
	// ancestor's label that has a smaller Semi.
	NodePtr Label = nullptr;
	// Written by runSemiNCA: first the spanning-tree parent, then the final
	// immediate-dominator candidate.
	NodePtr IDom = nullptr;
	// Nodes from which runDFS saw this node as a child.
	SmallVector<NodePtr, 2> ReverseChildren;
	};

	// Number to node mapping is 1-based. Initialize the mapping to start with
	// a dummy element.
	std::vector<NodePtr> NumToNode = {nullptr};
	DenseMap<NodePtr, InfoRec> NodeToInfo;

	using UpdateT = typename DomTreeT::UpdateType;
	// State shared across one batch of CFG updates.
	struct BatchUpdateInfo {
	// The updates making up the batch.
	SmallVector<UpdateT, 4> Updates;
	// A node pointer tagged (in one spare bit) with the kind of update that
	// touches it.
	using NodePtrAndKind = PointerIntPair<NodePtr, 1, UpdateKind>;

	// In order to be able to walk a CFG that is out of sync with the CFG
	// DominatorTree last knew about, use the list of updates to reconstruct
	// previous CFG versions of the current CFG. For each node, we store a set
	// of its virtually added/deleted future successors and predecessors.
	// Note that these children are from the future relative to what the
	// DominatorTree knows about -- using them gets us some snapshot of the
	// CFG from the past (relative to the state of the CFG).
	DenseMap<NodePtr, SmallDenseSet<NodePtrAndKind, 4>> FutureSuccessors;
	DenseMap<NodePtr, SmallDenseSet<NodePtrAndKind, 4>> FuturePredecessors;
	// Remembers if the whole tree was recalculated at some point during the
	// current batch update.
	bool IsRecalculated = false;
	};

	BatchUpdateInfo *BatchUpdates;
	using BatchUpdatePtr = BatchUpdateInfo *;

	// If BUI is a nullptr, then there's no batch update in progress.
	SemiNCAInfo(BatchUpdatePtr BUI) : BatchUpdates(BUI) {}

	void clear() {
	NumToNode = {nullptr}; // Restore to initial state with a dummy start node.
	NodeToInfo.clear();
	// Don't reset the pointer to BatchUpdateInfo here -- if there's an update
	// in progress, we need this information to continue it.
	}

	// Collects the children of a node into a small vector. With Inverse ==
	// false the result is children<NodePtr>(N) in reversed order; with Inverse
	// == true it is inverse_children<NodePtr>(N) in iteration order. The
	// BatchUpdatePtr overload additionally adjusts the result for pending
	// batch updates.
	template <bool Inverse>
	struct ChildrenGetter {
	using ResultTy = SmallVector<NodePtr, 8>;

	// Inverse == false: children of N, reversed.
	static ResultTy Get(NodePtr N, std::integral_constant<bool, false>) {
	auto RChildren = reverse(children<NodePtr>(N));
	return ResultTy(RChildren.begin(), RChildren.end());
	}

	// Inverse == true: inverse children of N.
	static ResultTy Get(NodePtr N, std::integral_constant<bool, true>) {
	auto IChildren = inverse_children<NodePtr>(N);
	return ResultTy(IChildren.begin(), IChildren.end());
	}

	// Tag type used to pick one of the two overloads above at compile time.
	using Tag = std::integral_constant<bool, Inverse>;

	// The function below is the core part of the batch updater. It allows the
	// Depth Based Search algorithm to perform incremental updates in lockstep
	// with updates to the CFG. We emulated lockstep CFG updates by getting its
	// next snapshots by reverse-applying future updates.
	static ResultTy Get(NodePtr N, BatchUpdatePtr BUI) {
	ResultTy Res = Get(N, Tag());
	// If there's no batch update in progress, simply return node's children.
	if (!BUI) return Res;

	// CFG children are actually its most current children, and we have to
	// reverse-apply the future updates to get the node's children at the
	// point in time the update was performed.
	auto &FutureChildren = (Inverse != IsPostDom) ? BUI->FuturePredecessors
	: BUI->FutureSuccessors;
	auto FCIt = FutureChildren.find(N);
	// No pending updates touch N: the current children are already correct.
	if (FCIt == FutureChildren.end()) return Res;

	for (auto ChildAndKind : FCIt->second) {
	const NodePtr Child = ChildAndKind.getPointer();
	const UpdateKind UK = ChildAndKind.getInt();

	// Reverse-apply the future update.
	if (UK == UpdateKind::Insert) {
	// If there's an insertion in the future, it means that the edge must
	// exist in the current CFG, but was not present in it before.
	assert(llvm::find(Res, Child) != Res.end()
	&& "Expected child not found in the CFG");
	// Erase-remove drops every occurrence of Child from the result.
	Res.erase(std::remove(Res.begin(), Res.end(), Child), Res.end());
	DEBUG(dbgs() << "\tHiding edge " << BlockNamePrinter(N) << " -> "
	<< BlockNamePrinter(Child) << "\n");
	} else {
	// If there's a deletion in the future, it means that the edge cannot
	// exist in the current CFG, but existed in it before.
	assert(llvm::find(Res, Child) == Res.end() &&
	"Unexpected child found in the CFG");
	DEBUG(dbgs() << "\tShowing virtual edge " << BlockNamePrinter(N)
	<< " -> " << BlockNamePrinter(Child) << "\n");
	Res.push_back(Child);
	}
	}

	return Res;
	}
	};

	// Returns the IDom recorded for BB in NodeToInfo, or nullptr when BB has
	// no entry there. Never inserts into the map.
	NodePtr getIDom(NodePtr BB) const {
	  const auto Found = NodeToInfo.find(BB);
	  if (Found != NodeToInfo.end())
	    return Found->second.IDom;

	  return nullptr;
	}

	// Returns the tree node for BB, creating it if DT does not have one yet.
	// A missing node is created as a child of the node for BB's recorded
	// immediate dominator, which is obtained (and created if necessary) by
	// recursing on getIDom(BB).
	TreeNodePtr getNodeForBlock(NodePtr BB, DomTreeT &DT) {
	  if (TreeNodePtr Node = DT.getNode(BB)) return Node;

	  // Haven't calculated this node yet? Get or calculate the node for the
	  // immediate dominator.
	  NodePtr IDom = getIDom(BB);

	  // A null IDom is only acceptable when the tree has a node mapped to
	  // nullptr.
	  assert(IDom || DT.DomTreeNodes[nullptr]);
	  TreeNodePtr IDomNode = getNodeForBlock(IDom, DT);

	  // Add a new tree node for this NodeT, and link it as a child of
	  // IDomNode
	  return (DT.DomTreeNodes[BB] = IDomNode->addChild(
	              llvm::make_unique<DomTreeNodeBase<NodeT>>(BB, IDomNode)))
	      .get();
	}

	// Descend predicate that accepts every (node, successor) pair; both
	// arguments are ignored.
	static bool AlwaysDescend(NodePtr, NodePtr) { return true; }

	// Stream adaptor that prints a block as an operand, or the text "nullptr"
	// when the wrapped pointer is null.
	struct BlockNamePrinter {
	  NodePtr N;

	  BlockNamePrinter(NodePtr Block) : N(Block) {}
	  // A null tree node is printed like a null block.
	  BlockNamePrinter(TreeNodePtr TN) : N(TN ? TN->getBlock() : nullptr) {}

	  friend raw_ostream &operator<<(raw_ostream &O, const BlockNamePrinter &BP) {
	    if (BP.N) {
	      BP.N->printAsOperand(O, false);
	      return O;
	    }

	    O << "nullptr";
	    return O;
	  }
	};

	// Custom DFS implementation which can skip nodes based on a provided
	// predicate. It also collects ReverseChildren so that we don't have to spend
	// time getting predecessors in SemiNCA.
	//
	// If IsReverse is set to true, the DFS walk will be performed backwards
	// relative to IsPostDom -- using reverse edges for dominators and forward
	// edges for postdominators.
	// Parameters:
	//   V           - start node; must be non-null.
	//   LastNum     - last DFS number already handed out; numbering continues
	//                 at LastNum + 1.
	//   Condition   - called as Condition(BB, Succ); a false result keeps Succ
	//                 from being pushed from BB.
	//   AttachToNum - stored as V's Parent, but only if V already has an entry
	//                 in NodeToInfo when the walk starts.
	// Returns the last DFS number assigned.
	template <bool IsReverse = false, typename DescendCondition>
	unsigned runDFS(NodePtr V, unsigned LastNum, DescendCondition Condition,
	unsigned AttachToNum) {
	assert(V);
	// Explicit stack; nodes are numbered when popped, not when pushed.
	SmallVector<NodePtr, 64> WorkList = {V};
	if (NodeToInfo.count(V) != 0) NodeToInfo[V].Parent = AttachToNum;

	while (!WorkList.empty()) {
	const NodePtr BB = WorkList.pop_back_val();
	// operator[] creates the record if BB has none yet.
	auto &BBInfo = NodeToInfo[BB];

	// Visited nodes always have positive DFS numbers.
	if (BBInfo.DFSNum != 0) continue;
	BBInfo.DFSNum = BBInfo.Semi = ++LastNum;
	BBInfo.Label = BB;
	NumToNode.push_back(BB);

	constexpr bool Direction = IsReverse != IsPostDom; // XOR.
	for (const NodePtr Succ :
	ChildrenGetter<Direction>::Get(BB, BatchUpdates)) {
	const auto SIT = NodeToInfo.find(Succ);
	// Don't visit nodes more than once but remember to collect
	// ReverseChildren.
	if (SIT != NodeToInfo.end() && SIT->second.DFSNum != 0) {
	// Self-loops are not recorded as reverse children.
	if (Succ != BB) SIT->second.ReverseChildren.push_back(BB);
	continue;
	}

	if (!Condition(BB, Succ)) continue;

	// It's fine to add Succ to the map, because we know that it will be
	// visited later.
	auto &SuccInfo = NodeToInfo[Succ];
	WorkList.push_back(Succ);
	// LastNum is BB's DFS number here; a later push from another node
	// overwrites Parent until Succ is finally numbered.
	SuccInfo.Parent = LastNum;
	SuccInfo.ReverseChildren.push_back(BB);
	}
	}

	return LastNum;
	}

	// Iterative EVAL with path compression.
	// Returns VIn itself when its DFS number is below LastLinked. Otherwise it
	// walks VIn's Parent chain for as long as Parent >= LastLinked, then
	// unwinds: each node on the chain adopts its ancestor's Label when that
	// label has a smaller Semi, and is re-parented to the ancestor's Parent.
	// The result is VIn's (possibly updated) Label.
	NodePtr eval(NodePtr VIn, unsigned LastLinked) {
	auto &VInInfo = NodeToInfo[VIn];
	if (VInInfo.DFSNum < LastLinked)
	return VIn;

	// Work is used as an explicit stack in place of recursion.
	SmallVector<NodePtr, 32> Work;
	// Ancestors already seen, so each one is pushed at most once.
	SmallPtrSet<NodePtr, 32> Visited;

	if (VInInfo.Parent >= LastLinked)
	Work.push_back(VIn);

	while (!Work.empty()) {
	NodePtr V = Work.back();
	auto &VInfo = NodeToInfo[V];
	NodePtr VAncestor = NumToNode[VInfo.Parent];

	// Process Ancestor first
	if (Visited.insert(VAncestor).second && VInfo.Parent >= LastLinked) {
	Work.push_back(VAncestor);
	continue;
	}
	Work.pop_back();

	// Update VInfo based on Ancestor info
	if (VInfo.Parent < LastLinked)
	continue;

	auto &VAInfo = NodeToInfo[VAncestor];
	NodePtr VAncestorLabel = VAInfo.Label;
	NodePtr VLabel = VInfo.Label;
	if (NodeToInfo[VAncestorLabel].Semi < NodeToInfo[VLabel].Semi)
	VInfo.Label = VAncestorLabel;
	// Path compression: skip over the ancestor on later walks.
	VInfo.Parent = VAInfo.Parent;
	}

	return VInInfo.Label;
	}

	// This function requires DFS to be run before calling it.
	// Computes InfoRec::Semi and InfoRec::IDom for every node numbered in
	// NumToNode.
	//   DT       - consulted via getNode() for the level of predecessors.
	//   MinLevel - predecessors that have a tree node whose level is below this
	//              value are ignored in step #1 (default 0 ignores none).
	void runSemiNCA(DomTreeT &DT, const unsigned MinLevel = 0) {
	const unsigned NextDFSNum(NumToNode.size());
	// Initialize IDoms to spanning tree parents.
	for (unsigned i = 1; i < NextDFSNum; ++i) {
	const NodePtr V = NumToNode[i];
	auto &VInfo = NodeToInfo[V];
	VInfo.IDom = NumToNode[VInfo.Parent];
	}

	// Step #1: Calculate the semidominators of all vertices.
	// Runs from the highest DFS number down to 2; number 1 is not processed.
	for (unsigned i = NextDFSNum - 1; i >= 2; --i) {
	NodePtr W = NumToNode[i];
	auto &WInfo = NodeToInfo[W];

	// Initialize the semi dominator to point to the parent node.
	WInfo.Semi = WInfo.Parent;
	for (const auto &N : WInfo.ReverseChildren) {
	if (NodeToInfo.count(N) == 0) // Skip unreachable predecessors.
	continue;

	const TreeNodePtr TN = DT.getNode(N);
	// Skip predecessors whose level is above the subtree we are processing.
	if (TN && TN->getLevel() < MinLevel)
	continue;

	// eval() is asked with i + 1 as the LastLinked bound.
	unsigned SemiU = NodeToInfo[eval(N, i + 1)].Semi;
	if (SemiU < WInfo.Semi) WInfo.Semi = SemiU;
	}
	}

	// Step #2: Explicitly define the immediate dominator of each vertex.
	// IDom[i] = NCA(SDom[i], SpanningTreeParent(i)).
	// Note that the parents were stored in IDoms and later got invalidated
	// during path compression in Eval.
	for (unsigned i = 2; i < NextDFSNum; ++i) {
	const NodePtr W = NumToNode[i];
	auto &WInfo = NodeToInfo[W];
	const unsigned SDomNum = NodeToInfo[NumToNode[WInfo.Semi]].DFSNum;
	NodePtr WIDomCandidate = WInfo.IDom;
	// Climb the IDom chain until the candidate's DFS number is no larger
	// than the semidominator's.
	while (NodeToInfo[WIDomCandidate].DFSNum > SDomNum)
	WIDomCandidate = NodeToInfo[WIDomCandidate].IDom;

	WInfo.IDom = WIDomCandidate;
	}
	}

	// PostDominatorTree always has a virtual root that represents a virtual CFG
	// node that serves as a single exit from the function. All the other exits
	// (CFG nodes with terminators and nodes in infinite loops) are logically
	// connected to this virtual CFG exit node.
	// This function maps a nullptr CFG node to the virtual root tree node.
	void addVirtualRoot() {
	  assert(IsPostDom && "Only postdominators have a virtual root");
	  assert(NumToNode.size() == 1 && "SNCAInfo must be freshly constructed");

	  // The virtual root is keyed by nullptr and takes DFS number 1.
	  auto &RootInfo = NodeToInfo[nullptr];
	  RootInfo.Semi = 1;
	  RootInfo.DFSNum = 1;
	  RootInfo.Label = nullptr;

	  // Becomes NumToNode[1], right after the dummy element at index 0.
	  NumToNode.push_back(nullptr);
	}

	// For postdominators, nodes with no forward successors are trivial roots that
	// are always selected as tree roots. Roots with forward successors correspond
	// to CFG nodes within infinite loops.
	static bool HasForwardSuccessors(const NodePtr N, BatchUpdatePtr BUI) {
	  assert(N && "N must be a valid node");
	  // Non-inverse children, adjusted for any batch update described by BUI.
	  const auto Successors = ChildrenGetter<false>::Get(N, BUI);
	  return !Successors.empty();
	}

	// Returns the entry node of the graph DT.Parent points to, as reported by
	// GraphTraits. DT.Parent must be set.
	static NodePtr GetEntryNode(const DomTreeT &DT) {
	  assert(DT.Parent && "Parent not set");
	  using ParentTraits = GraphTraits<typename DomTreeT::ParentPtr>;
	  return ParentTraits::getEntryNode(DT.Parent);
	}

	// Finds all roots without relying on the set of roots already stored in the
	// tree.
	// We define roots to be some non-redundant set of the CFG nodes
	static RootsT FindRoots(const DomTreeT &DT, BatchUpdatePtr BUI) {
	assert(DT.Parent && "Parent pointer is not set");
	RootsT Roots;

	// For dominators, function entry CFG node is always a tree root node.
	if (!IsPostDom) {
	Roots.push_back(GetEntryNode(DT));
	return Roots;
	}

	SemiNCAInfo SNCA(BUI);

	// PostDominatorTree always has a virtual root.
	SNCA.addVirtualRoot();
	unsigned Num = 1;

	DEBUG(dbgs() << "\t\tLooking for trivial roots\n");

	// Step #1: Find all the trivial roots that will definitely remain tree
	// roots.
	unsigned Total = 0;
	// It may happen that there are some new nodes in the CFG that are result of
	// the ongoing batch update, but we cannot really pretend that they don't
	// exist -- we won't see any outgoing or incoming edges to them, so it's
	// fine to discover them here, as they would end up appearing in the CFG at
	// some point anyway.
	for (const NodePtr N : nodes(DT.Parent)) {
	++Total;
	// If it has no successors, it is definitely a root.
	if (!HasForwardSuccessors(N, BUI)) {
	Roots.push_back(N);
	// Run DFS not to walk this part of CFG later.
	Num = SNCA.runDFS(N, Num, AlwaysDescend, 1);
	DEBUG(dbgs() << "Found a new trivial root: " << BlockNamePrinter(N)
	<< "\n");
	DEBUG(dbgs() << "Last visited node: "
	<< BlockNamePrinter(SNCA.NumToNode[Num]) << "\n");
	}
	}

	DEBUG(dbgs() << "\t\tLooking for non-trivial roots\n");

	// Step #2: Find all non-trivial root candidates. Those are CFG nodes that
	// are reverse-unreachable, i.e. were not visited by previous DFS walks (CFG
	// nodes in infinite loops).
	bool HasNonTrivialRoots = false;
	// Accounting for the virtual exit, see if we had any reverse-unreachable
	// nodes.
	if (Total + 1 != Num) {
	HasNonTrivialRoots = true;
	// Make another DFS pass over all other nodes to find the
	// reverse-unreachable blocks, and find the furthest paths we'll be able
	// to make.
	// Note that this looks N^2, but it's really 2N worst case, if every node
	// is unreachable. This is because we are still going to only visit each
	// unreachable node once, we may just visit it in two directions,
	// depending on how lucky we get.
	SmallPtrSet<NodePtr, 4> ConnectToExitBlock;
	for (const NodePtr I : nodes(DT.Parent)) {
	if (SNCA.NodeToInfo.count(I) == 0) {
	DEBUG(dbgs() << "\t\t\tVisiting node " << BlockNamePrinter(I)
	<< "\n");
	// Find the furthest away we can get by following successors, then
	// follow them in reverse. This gives us some reasonable answer about
	// the post-dom tree inside any infinite loop. In particular, it
	// guarantees we get to the farthest away point along some
	// path. This also matches the GCC's behavior.
	// If we really wanted a totally complete picture of dominance inside
	// this infinite loop, we could do it with SCC-like algorithms to find
	// the lowest and highest points in the infinite loop. In theory, it
	// would be nice to give the canonical backedge for the loop, but it's
	// expensive and does not always lead to a minimal set of roots.
	DEBUG(dbgs() << "\t\t\tRunning forward DFS\n");

	const unsigned NewNum = SNCA.runDFS<true>(I, Num, AlwaysDescend, Num);
	const NodePtr FurthestAway = SNCA.NumToNode[NewNum];
	DEBUG(dbgs() << "\t\t\tFound a new furthest away node "
	<< "(non-trivial root): "
	<< BlockNamePrinter(FurthestAway) << "\n");
	ConnectToExitBlock.insert(FurthestAway);
	Roots.push_back(FurthestAway);
	DEBUG(dbgs() << "\t\t\tPrev DFSNum: " << Num << ", new DFSNum: "
	<< NewNum << "\n\t\t\tRemoving DFS info\n");
	for (unsigned i = NewNum; i > Num; --i) {
	const NodePtr N = SNCA.NumToNode[i];
	DEBUG(dbgs() << "\t\t\t\tRemoving DFS info for "
	<< BlockNamePrinter(N) << "\n");
	SNCA.NodeToInfo.erase(N);
	SNCA.NumToNode.pop_back();
	}
	const unsigned PrevNum = Num;
	DEBUG(dbgs() << "\t\t\tRunning reverse DFS\n");
	Num = SNCA.runDFS(FurthestAway, Num, AlwaysDescend, 1);
	for (unsigned i = PrevNum + 1; i <= Num; ++i)
	DEBUG(dbgs() << "\t\t\t\tfound node "
	<< BlockNamePrinter(SNCA.NumToNode[i]) << "\n");
	}
	}
	}

	DEBUG(dbgs() << "Total: " << Total << ", Num: " << Num << "\n");
	DEBUG(dbgs() << "Discovered CFG nodes:\n");
	DEBUG(for (size_t i = 0; i <= Num; ++i) dbgs()
	<< i << ": " << BlockNamePrinter(SNCA.NumToNode[i]) << "\n");

	assert((Total + 1 == Num) && "Everything should have been visited");

	// Step #3: If we found some non-trivial roots, make them non-redundant.
	if (HasNonTrivialRoots) RemoveRedundantRoots(DT, BUI, Roots);

	DEBUG(dbgs() << "Found roots: ");
	DEBUG(for (auto *Root : Roots) dbgs() << BlockNamePrinter(Root) << " ");
	DEBUG(dbgs() << "\n");

	return Roots;
	}

	// This function only makes sense for postdominators.
	// We define roots to be some set of CFG nodes where (reverse) DFS walks have
	// to start in order to visit all the CFG nodes (including the
	// reverse-unreachable ones).
	// When the search for non-trivial roots is done it may happen that some of
	// the non-trivial roots are reverse-reachable from other non-trivial roots,
	// which makes them redundant. This function removes them from the set of
	// input roots.
	//
	// Parameters:
	//   DT    - the tree being built; not referenced in this body.
	//   BUI   - batch update info forwarded to HasForwardSuccessors and to the
	//           local SemiNCAInfo used for the forward walks.
	//   Roots - candidate roots; pruned in place. Removal swaps the redundant
	//           root with the last element, so the order of Roots is not
	//           preserved.
	static void RemoveRedundantRoots(const DomTreeT &DT, BatchUpdatePtr BUI,
	RootsT &Roots) {
	assert(IsPostDom && "This function is for postdominators only");
	DEBUG(dbgs() << "Removing redundant roots\n");

	SemiNCAInfo SNCA(BUI);

	// Roots.size() is re-read on every iteration because the vector shrinks
	// whenever a redundant root is dropped.
	for (unsigned i = 0; i < Roots.size(); ++i) {
	auto &Root = Roots[i];
	// Trivial roots are always non-redundant.
	if (!HasForwardSuccessors(Root, BUI)) continue;
	DEBUG(dbgs() << "\tChecking if " << BlockNamePrinter(Root)
	<< " remains a root\n");
	// Discard the DFS state left over from the previous candidate.
	SNCA.clear();
	// Do a forward walk looking for the other roots.
	const unsigned Num = SNCA.runDFS<true>(Root, 0, AlwaysDescend, 0);
	// Skip the start node and begin from the second one (note that DFS uses
	// 1-based indexing).
	for (unsigned x = 2; x <= Num; ++x) {
	const NodePtr N = SNCA.NumToNode[x];
	// If we found another root in a (forward) DFS walk, remove the current
	// root from the set of roots, as it is reverse-reachable from the other
	// one.
	if (llvm::find(Roots, N) != Roots.end()) {
	DEBUG(dbgs() << "\tForward DFS walk found another root "
	<< BlockNamePrinter(N) << "\n\tRemoving root "
	<< BlockNamePrinter(Root) << "\n");
	// Swap-and-pop removal; Root must not be used after the pop_back below.
	std::swap(Root, Roots.back());
	Roots.pop_back();

	// Root at the back takes the current root's place.
	// Start the next loop iteration with the same index.
	// (When i is 0 the unsigned decrement wraps around and the loop's ++i
	// brings it back to 0.)
	--i;
	break;
	}
	}
	}
	}

	template <typename DescendCondition>
	void doFullDFSWalk(const DomTreeT &DT, DescendCondition DC) {
	if (!IsPostDom) {
	assert(DT.Roots.size() == 1 && "Dominators should have a singe root");
	runDFS(DT.Roots[0], 0, DC, 0);
	return;
	}

	addVirtualRoot();
	unsigned Num = 1;
	for (const NodePtr Root : DT.Roots) Num = runDFS(Root, Num, DC, 0);
	}

	// Rebuilds DT completely from the current CFG.
	//
	// DT is reset (its Parent pointer is saved and restored around the reset),
	// the roots are recomputed, a full DFS walk plus Semi-NCA is run, and the
	// tree nodes are recreated under a fresh root node.
	//
	// If BUI is non-null, BUI->IsRecalculated is set so that the caller can stop
	// applying the remaining batch updates. BUI is otherwise not consulted: both
	// the local SemiNCAInfo and FindRoots are given nullptr.
	static void CalculateFromScratch(DomTreeT &DT, BatchUpdatePtr BUI) {
	// reset() is followed by restoring Parent, so the tree stays associated
	// with the same parent object.
	auto *Parent = DT.Parent;
	DT.reset();
	DT.Parent = Parent;
	SemiNCAInfo SNCA(nullptr); // Since we are rebuilding the whole tree,
	// there's no point doing it incrementally.

	// Step #0: Number blocks in depth-first order and initialize variables used
	// in later stages of the algorithm.
	DT.Roots = FindRoots(DT, nullptr);
	SNCA.doFullDFSWalk(DT, AlwaysDescend);

	SNCA.runSemiNCA(DT);
	if (BUI) {
	BUI->IsRecalculated = true;
	DEBUG(dbgs() << "DomTree recalculated, skipping future batch updates\n");
	}

	// No roots were found, so there is no root node to create.
	if (DT.Roots.empty()) return;

	// Add a node for the root. If the tree is a PostDominatorTree it will be
	// the virtual exit (denoted by (BasicBlock *) nullptr) which postdominates
	// all real exits (including multiple exit blocks, infinite loops).
	NodePtr Root = IsPostDom ? nullptr : DT.Roots[0];

	// The map entry owns the new node; RootNode keeps a raw pointer to it.
	DT.RootNode = (DT.DomTreeNodes[Root] =
	llvm::make_unique<DomTreeNodeBase<NodeT>>(Root, nullptr))
	.get();
	SNCA.attachNewSubtree(DT, DT.RootNode);
	}

	// Creates tree nodes in DT for the blocks recorded in NumToNode (starting at
	// index 1) and hangs the discovered subtree under AttachTo.
	//
	// The block at NumToNode[1] gets AttachTo's block as its immediate
	// dominator. Every other block is linked under the tree node of the
	// immediate dominator returned by getIDom. Blocks that already have a tree
	// node are left untouched.
	void attachNewSubtree(DomTreeT& DT, const TreeNodePtr AttachTo) {
	// Attach the first unreachable block to AttachTo.
	NodeToInfo[NumToNode[1]].IDom = AttachTo->getBlock();
	// Loop over all of the discovered blocks in the function...
	for (size_t i = 1, e = NumToNode.size(); i != e; ++i) {
	NodePtr W = NumToNode[i];
	DEBUG(dbgs() << "\tdiscovered a new reachable node "
	<< BlockNamePrinter(W) << "\n");

	// Don't replace this with 'count', the insertion side effect is important
	if (DT.DomTreeNodes[W]) continue; // Skip blocks that already have a node.

	NodePtr ImmDom = getIDom(W);

	// Get or calculate the node for the immediate dominator.
	TreeNodePtr IDomNode = getNodeForBlock(ImmDom, DT);

	// Add a new tree node for this BasicBlock, and link it as a child of
	// IDomNode.
	DT.DomTreeNodes[W] = IDomNode->addChild(
	llvm::make_unique<DomTreeNodeBase<NodeT>>(W, IDomNode));
	}
	}

	// Re-parents tree nodes that already exist in DT according to the immediate
	// dominators stored in NodeToInfo.
	//
	// The block at NumToNode[1] is first given AttachTo's block as its immediate
	// dominator; then every block from index 1 onwards has its tree node's IDom
	// set to the tree node of its recorded immediate dominator. Each of these
	// blocks must already have a tree node.
	void reattachExistingSubtree(DomTreeT &DT, const TreeNodePtr AttachTo) {
	  const NodePtr SubtreeTop = NumToNode[1];
	  NodeToInfo[SubtreeTop].IDom = AttachTo->getBlock();

	  const size_t NumSlots = NumToNode.size();
	  for (size_t Idx = 1; Idx != NumSlots; ++Idx) {
	    const NodePtr Block = NumToNode[Idx];
	    const TreeNodePtr BlockTN = DT.getNode(Block);
	    assert(BlockTN);
	    BlockTN->setIDom(DT.getNode(NodeToInfo[Block].IDom));
	  }
	}

	// Helper struct used during edge insertions.
	// It bundles the work queue (Bucket), the sets used to avoid reprocessing
	// nodes (Affected, Visited), and the two result lists consumed once the
	// search is finished (AffectedQueue, VisitedNotAffectedQueue).
	struct InsertionInfo {
	// (level, tree node) pair stored in the bucket queue.
	using BucketElementTy = std::pair<unsigned, TreeNodePtr>;
	// Comparator for the bucket queue; compares the level component only.
	// NOTE(review): std::priority_queue::top() returns the element that orders
	// last under the comparator, so with `First.first > Second.first` top()
	// yields the entry with the SMALLEST level. Confirm this matches the
	// "descending order" intent stated on Bucket below.
	struct DecreasingLevel {
	bool operator()(const BucketElementTy &First,
	const BucketElementTy &Second) const {
	return First.first > Second.first;
	}
	};

	std::priority_queue<BucketElementTy, SmallVector<BucketElementTy, 8>,
	DecreasingLevel>
	Bucket; // Queue of tree nodes sorted by level in descending order.
	// Nodes already marked as affected (each is pushed to Bucket once).
	SmallDenseSet<TreeNodePtr, 8> Affected;
	// Maps a visited node to the level recorded for it when it was visited.
	SmallDenseMap<TreeNodePtr, unsigned, 8> Visited;
	// Affected nodes in the order they were popped from Bucket.
	SmallVector<TreeNodePtr, 8> AffectedQueue;
	// Nodes that were visited but not affected; their levels are refreshed
	// after the insertion.
	SmallVector<TreeNodePtr, 8> VisitedNotAffectedQueue;
	};

	// Updates DT for the insertion of the CFG edge From -> To.
	//
	// From may only be nullptr for postdominator trees, and To must never be
	// nullptr (both are asserted). When From has no tree node:
	//   - for (forward) dominators the edge is ignored;
	//   - for postdominators From gets a new tree node under the virtual root
	//     and is appended to DT.Roots.
	// The update is then dispatched to InsertUnreachable or InsertReachable
	// depending on whether To already has a tree node. DT.DFSInfoValid is
	// cleared before dispatching.
	static void InsertEdge(DomTreeT &DT, const BatchUpdatePtr BUI,
	const NodePtr From, const NodePtr To) {
	assert((From \|\| IsPostDom) &&
	"From has to be a valid CFG node or a virtual root");
	assert(To && "Cannot be a nullptr");
	DEBUG(dbgs() << "Inserting edge " << BlockNamePrinter(From) << " -> "
	<< BlockNamePrinter(To) << "\n");
	TreeNodePtr FromTN = DT.getNode(From);

	if (!FromTN) {
	// Ignore edges from unreachable nodes for (forward) dominators.
	if (!IsPostDom) return;

	// The unreachable node becomes a new root -- create a tree node for it.
	TreeNodePtr VirtualRoot = DT.getNode(nullptr);
	FromTN =
	(DT.DomTreeNodes[From] = VirtualRoot->addChild(
	llvm::make_unique<DomTreeNodeBase<NodeT>>(From, VirtualRoot)))
	.get();
	DT.Roots.push_back(From);
	}

	// The tree is about to change, so cached DFS info can no longer be used.
	DT.DFSInfoValid = false;

	const TreeNodePtr ToTN = DT.getNode(To);
	if (!ToTN)
	InsertUnreachable(DT, BUI, FromTN, To);
	else
	InsertReachable(DT, BUI, FromTN, ToTN);
	}

	// Postdominator-only check performed before inserting an edge into To.
	//
	// If To is currently a tree root (a child of the virtual root that is also
	// listed in DT.Roots), the whole tree is rebuilt from scratch and true is
	// returned; otherwise nothing is changed and false is returned. From is not
	// inspected here.
	static bool UpdateRootsBeforeInsertion(DomTreeT &DT, const BatchUpdatePtr BUI,
	                                       const TreeNodePtr From,
	                                       const TreeNodePtr To) {
	  assert(IsPostDom && "This function is only for postdominators");

	  // A node that is not attached to the virtual root cannot be a root, and
	  // a child of the virtual root only counts if DT.Roots lists its block.
	  const bool ToIsRoot =
	      DT.isVirtualRoot(To->getIDom()) &&
	      llvm::find(DT.Roots, To->getBlock()) != DT.Roots.end();
	  if (!ToIsRoot) return false;

	  DEBUG(dbgs() << "\t\tAfter the insertion, " << BlockNamePrinter(To)
	               << " is no longer a root\n\t\tRebuilding the tree!!!\n");

	  CalculateFromScratch(DT, BUI);
	  return true;
	}

	// Updates the set of roots after insertion or deletion. This ensures that
	// the roots after a series of updates are the same as the roots the tree
	// would have if it were built from scratch.
	//
	// NOTE(review): the lines prefixed with '-' and '+' below are the removed
	// and added sides of a patch hunk captured in this listing. The '+' side
	// compares the freshly computed roots with DT.Roots as unordered sets and
	// rebuilds the tree when they differ.
	static void UpdateRootsAfterUpdate(DomTreeT &DT, const BatchUpdatePtr BUI) {
	assert(IsPostDom && "This function is only for postdominators");

	// The tree has only trivial roots -- nothing to update.
	if (std::none_of(DT.Roots.begin(), DT.Roots.end(), [BUI](const NodePtr N) {
	return HasForwardSuccessors(N, BUI);
	}))
	return;

	// Recalculate the set of roots.
	- DT.Roots = FindRoots(DT, BUI);
	- for (const NodePtr R : DT.Roots) {
	- const TreeNodePtr TN = DT.getNode(R);
	- // A CFG node was selected as a tree root, but the corresponding tree node
	- // is not connected to the virtual root. This is because the incremental
	- // algorithm does not really know or use the set of roots and can make a
	- // different (implicit) decision about which nodes within an infinite loop
	- // becomes a root.
	- if (TN && !DT.isVirtualRoot(TN->getIDom())) {
	- DEBUG(dbgs() << "Root " << BlockNamePrinter(R)
	- << " is not virtual root's child\n"
	- << "The entire tree needs to be rebuilt\n");
	- // It should be possible to rotate the subtree instead of recalculating
	- // the whole tree, but this situation happens extremely rarely in
	- // practice.
	- CalculateFromScratch(DT, BUI);
	- return;
	- }
	+ auto Roots = FindRoots(DT, BUI);
	+ if (DT.Roots.size() != Roots.size() \|\|
	+ !std::is_permutation(DT.Roots.begin(), DT.Roots.end(), Roots.begin())) {
	+ // The roots chosen in the CFG have changed. This is because the
	+ // incremental algorithm does not really know or use the set of roots and
	+ // can make a different (implicit) decision about which node within an
	+ // infinite loop becomes a root.
	+
	+ DEBUG(dbgs() << "Roots are different in updated trees\n"
	+ << "The entire tree needs to be rebuilt\n");
	+ // It may be possible to update the tree without recalculating it, but
	+ // we do not know yet how to do it, and it happens rarely in practice.
	+ CalculateFromScratch(DT, BUI);
	+ return;
	}
	}

	// Handles insertion to a node already in the dominator tree.
	//
	// From and To are the tree nodes of the edge's endpoints. For
	// postdominators, UpdateRootsBeforeInsertion runs first and may rebuild the
	// whole tree, in which case nothing else is done. Otherwise the nearest
	// common dominator (NCD) of the endpoints is computed; if it is To or To's
	// immediate dominator nothing changes. Failing that, the affected nodes are
	// collected through II.Bucket / VisitInsertion and then re-parented by
	// UpdateInsertion.
	static void InsertReachable(DomTreeT &DT, const BatchUpdatePtr BUI,
	const TreeNodePtr From, const TreeNodePtr To) {
	DEBUG(dbgs() << "\tReachable " << BlockNamePrinter(From->getBlock())
	<< " -> " << BlockNamePrinter(To->getBlock()) << "\n");
	if (IsPostDom && UpdateRootsBeforeInsertion(DT, BUI, From, To)) return;
	// DT.findNCD expects both pointers to be valid. When From is a virtual
	// root, then its CFG block pointer is a nullptr, so we have to 'compute'
	// the NCD manually.
	const NodePtr NCDBlock =
	(From->getBlock() && To->getBlock())
	? DT.findNearestCommonDominator(From->getBlock(), To->getBlock())
	: nullptr;
	assert(NCDBlock \|\| DT.isPostDominator());
	const TreeNodePtr NCD = DT.getNode(NCDBlock);
	assert(NCD);

	DEBUG(dbgs() << "\t\tNCA == " << BlockNamePrinter(NCD) << "\n");
	const TreeNodePtr ToIDom = To->getIDom();

	// Nothing affected -- NCA property holds.
	// (Based on the lemma 2.5 from the second paper.)
	if (NCD == To \|\| NCD == ToIDom) return;

	// Identify and collect affected nodes.
	InsertionInfo II;
	DEBUG(dbgs() << "Marking " << BlockNamePrinter(To) << " as affected\n");
	II.Affected.insert(To);
	const unsigned ToLevel = To->getLevel();
	DEBUG(dbgs() << "Putting " << BlockNamePrinter(To) << " into a Bucket\n");
	II.Bucket.push({ToLevel, To});

	// VisitInsertion may push further affected nodes into II.Bucket, so the
	// loop runs until the queue drains.
	while (!II.Bucket.empty()) {
	// Read the node and its current level before popping the entry.
	const TreeNodePtr CurrentNode = II.Bucket.top().second;
	const unsigned CurrentLevel = CurrentNode->getLevel();
	II.Bucket.pop();
	DEBUG(dbgs() << "\tAdding to Visited and AffectedQueue: "
	<< BlockNamePrinter(CurrentNode) << "\n");

	II.Visited.insert({CurrentNode, CurrentLevel});
	II.AffectedQueue.push_back(CurrentNode);

	// Discover and collect affected successors of the current node.
	VisitInsertion(DT, BUI, CurrentNode, CurrentLevel, NCD, II);
	}

	// Finish by updating immediate dominators and levels.
	UpdateInsertion(DT, BUI, NCD, II);
	}

	// Visits an affected node and collects its affected successors.
	//
	// Starting at TN, walks the CFG successors (as produced by
	// ChildrenGetter<IsPostDom>) with an explicit stack. For each successor:
	//   - level > RootLevel: it is recorded in II.Visited and
	//     II.VisitedNotAffectedQueue and the walk continues through it;
	//   - otherwise, if its level is > NCD's level + 1 and it is not yet in
	//     II.Affected: it is marked affected and pushed into II.Bucket.
	// TN must have a CFG block and already be present in II.Visited.
	static void VisitInsertion(DomTreeT &DT, const BatchUpdatePtr BUI,
	const TreeNodePtr TN, const unsigned RootLevel,
	const TreeNodePtr NCD, InsertionInfo &II) {
	const unsigned NCDLevel = NCD->getLevel();
	DEBUG(dbgs() << "Visiting " << BlockNamePrinter(TN) << ", RootLevel "
	<< RootLevel << "\n");

	SmallVector<TreeNodePtr, 8> Stack = {TN};
	assert(TN->getBlock() && II.Visited.count(TN) && "Preconditions!");

	// Nodes whose successor lists were already scanned during this call.
	SmallPtrSet<TreeNodePtr, 8> Processed;

	do {
	TreeNodePtr Next = Stack.pop_back_val();
	DEBUG(dbgs() << " Next: " << BlockNamePrinter(Next) << "\n");

	for (const NodePtr Succ :
	ChildrenGetter<IsPostDom>::Get(Next->getBlock(), BUI)) {
	const TreeNodePtr SuccTN = DT.getNode(Succ);
	assert(SuccTN && "Unreachable successor found at reachable insertion");
	const unsigned SuccLevel = SuccTN->getLevel();

	DEBUG(dbgs() << "\tSuccessor " << BlockNamePrinter(Succ) << ", level = "
	<< SuccLevel << "\n");

	// Do not process the same node multiple times.
	// NOTE(review): this test depends only on Next, not on Succ, so it has
	// the same outcome for every successor of Next -- confirm whether it was
	// meant to sit outside the successor loop.
	if (Processed.count(Next) > 0)
	continue;

	// Succ dominated by subtree From -- not affected.
	// (Based on the lemma 2.5 from the second paper.)
	if (SuccLevel > RootLevel) {
	DEBUG(dbgs() << "\t\tDominated by subtree From\n");
	if (II.Visited.count(SuccTN) != 0) {
	DEBUG(dbgs() << "\t\t\talready visited at level "
	<< II.Visited[SuccTN] << "\n\t\t\tcurrent level "
	<< RootLevel << ")\n");

	// A node can be necessary to visit again if we see it again at
	// a lower level than before.
	if (II.Visited[SuccTN] >= RootLevel)
	continue;
	}

	DEBUG(dbgs() << "\t\tMarking visited not affected "
	<< BlockNamePrinter(Succ) << "\n");
	// NOTE(review): if SmallDenseMap::insert leaves an existing entry
	// untouched (as DenseMap::insert is documented to), a revisited node
	// keeps its previously stored level here -- confirm this is intended.
	II.Visited.insert({SuccTN, RootLevel});
	II.VisitedNotAffectedQueue.push_back(SuccTN);
	Stack.push_back(SuccTN);
	} else if ((SuccLevel > NCDLevel + 1) &&
	II.Affected.count(SuccTN) == 0) {
	DEBUG(dbgs() << "\t\tMarking affected and adding "
	<< BlockNamePrinter(Succ) << " to a Bucket\n");
	II.Affected.insert(SuccTN);
	II.Bucket.push({SuccLevel, SuccTN});
	}
	}

	Processed.insert(Next);
	} while (!Stack.empty());
	}

	// Finishes an insertion: every node collected in II.AffectedQueue gets NCD
	// as its new immediate dominator, then the levels of the visited but
	// unaffected nodes are refreshed. For postdominator trees the set of roots
	// is re-checked afterwards.
	static void UpdateInsertion(DomTreeT &DT, const BatchUpdatePtr BUI,
	                            const TreeNodePtr NCD, InsertionInfo &II) {
	  DEBUG(dbgs() << "Updating NCD = " << BlockNamePrinter(NCD) << "\n");

	  const auto &Queue = II.AffectedQueue;
	  for (size_t Idx = 0, End = Queue.size(); Idx != End; ++Idx) {
	    const TreeNodePtr Node = Queue[Idx];
	    DEBUG(dbgs() << "\tIDom(" << BlockNamePrinter(Node)
	                 << ") = " << BlockNamePrinter(NCD) << "\n");
	    Node->setIDom(NCD);
	  }

	  UpdateLevelsAfterInsertion(II);

	  if (!IsPostDom) return;
	  UpdateRootsAfterUpdate(DT, BUI);
	}

	// Calls UpdateLevel() on every node in II.VisitedNotAffectedQueue, in queue
	// order. These are the nodes that were visited during the insertion but
	// not marked as affected.
	static void UpdateLevelsAfterInsertion(InsertionInfo &II) {
	  DEBUG(dbgs() << "Updating levels for visited but not affected nodes\n");

	  const auto &Pending = II.VisitedNotAffectedQueue;
	  for (size_t Idx = 0, End = Pending.size(); Idx != End; ++Idx) {
	    const TreeNodePtr Node = Pending[Idx];
	    DEBUG(dbgs() << "\tlevel(" << BlockNamePrinter(Node) << ") = ("
	                 << BlockNamePrinter(Node->getIDom()) << ") "
	                 << Node->getIDom()->getLevel() << " + 1\n");
	    Node->UpdateLevel();
	  }
	}

	// Handles insertion of an edge whose destination To has no tree node yet.
	//
	// The region discovered from To is connected under From by
	// ComputeUnreachableDominators, which also reports every edge leading from
	// that region to a node that already had a tree node. Each reported edge is
	// then inserted as an ordinary reachable insertion.
	static void InsertUnreachable(DomTreeT &DT, const BatchUpdatePtr BUI,
	                              const TreeNodePtr From, const NodePtr To) {
	  DEBUG(dbgs() << "Inserting " << BlockNamePrinter(From)
	               << " -> (unreachable) " << BlockNamePrinter(To) << "\n");

	  // Edges (source block, destination tree node) into the part of the tree
	  // that existed before this insertion.
	  SmallVector<std::pair<NodePtr, TreeNodePtr>, 8> DiscoveredEdgesToReachable;
	  ComputeUnreachableDominators(DT, BUI, To, From, DiscoveredEdgesToReachable);

	  DEBUG(dbgs() << "Inserted " << BlockNamePrinter(From)
	               << " -> (prev unreachable) " << BlockNamePrinter(To) << "\n");

	  // Replay the connecting edges found during the discovery walk.
	  const size_t NumEdges = DiscoveredEdgesToReachable.size();
	  for (size_t Idx = 0; Idx != NumEdges; ++Idx) {
	    const auto &Edge = DiscoveredEdgesToReachable[Idx];
	    DEBUG(dbgs() << "\tInserting discovered connecting edge "
	                 << BlockNamePrinter(Edge.first) << " -> "
	                 << BlockNamePrinter(Edge.second) << "\n");
	    InsertReachable(DT, BUI, DT.getNode(Edge.first), Edge.second);
	  }
	}

	// Connects nodes that become reachable with an insertion.
	//
	// Runs a DFS from Root that descends only into nodes that have no tree node
	// in DT. Every edge that ends at a node which already has a tree node is not
	// followed; it is appended to DiscoveredConnectingEdges as
	// (source block, destination tree node) instead. Semi-NCA is then run on the
	// discovered region and the resulting subtree is attached under Incoming.
	// Root itself must not have a tree node yet (asserted).
	static void ComputeUnreachableDominators(
	DomTreeT &DT, const BatchUpdatePtr BUI, const NodePtr Root,
	const TreeNodePtr Incoming,
	SmallVectorImpl<std::pair<NodePtr, TreeNodePtr>>
	&DiscoveredConnectingEdges) {
	assert(!DT.getNode(Root) && "Root must not be reachable");

	// Visit only previously unreachable nodes.
	// Returns true when the DFS should descend into To.
	auto UnreachableDescender = [&DT, &DiscoveredConnectingEdges](NodePtr From,
	NodePtr To) {
	const TreeNodePtr ToTN = DT.getNode(To);
	if (!ToTN) return true;

	// To already has a tree node: remember the edge and do not descend.
	DiscoveredConnectingEdges.push_back({From, ToTN});
	return false;
	};

	SemiNCAInfo SNCA(BUI);
	SNCA.runDFS(Root, 0, UnreachableDescender, 0);
	SNCA.runSemiNCA(DT);
	SNCA.attachNewSubtree(DT, Incoming);

	DEBUG(dbgs() << "After adding unreachable nodes\n");
	}

	// Updates DT for the deletion of the CFG edge From -> To.
	//
	// Both endpoints must be non-null. In builds without NDEBUG it is also
	// asserted that the edge is already gone from the CFG. Nothing is done when
	// either endpoint has no tree node. Otherwise, unless To is the nearest
	// common dominator of From and To, the deletion is dispatched to
	// DeleteReachable or DeleteUnreachable. For postdominators the set of roots
	// is re-checked at the end.
	static void DeleteEdge(DomTreeT &DT, const BatchUpdatePtr BUI,
	const NodePtr From, const NodePtr To) {
	assert(From && To && "Cannot disconnect nullptrs");
	DEBUG(dbgs() << "Deleting edge " << BlockNamePrinter(From) << " -> "
	<< BlockNamePrinter(To) << "\n");

	#ifndef NDEBUG
	// Ensure that the edge was in fact deleted from the CFG before informing
	// the DomTree about it.
	// The check is O(N), so run it only in debug configuration.
	auto IsSuccessor = [BUI](const NodePtr SuccCandidate, const NodePtr Of) {
	auto Successors = ChildrenGetter<IsPostDom>::Get(Of, BUI);
	return llvm::find(Successors, SuccCandidate) != Successors.end();
	};
	(void)IsSuccessor;
	assert(!IsSuccessor(To, From) && "Deleted edge still exists in the CFG!");
	#endif

	const TreeNodePtr FromTN = DT.getNode(From);
	// Deletion in an unreachable subtree -- nothing to do.
	if (!FromTN) return;

	const TreeNodePtr ToTN = DT.getNode(To);
	if (!ToTN) {
	DEBUG(dbgs() << "\tTo (" << BlockNamePrinter(To)
	<< ") already unreachable -- there is no edge to delete\n");
	return;
	}

	const NodePtr NCDBlock = DT.findNearestCommonDominator(From, To);
	const TreeNodePtr NCD = DT.getNode(NCDBlock);

	// If To dominates From -- nothing to do.
	if (ToTN != NCD) {
	DT.DFSInfoValid = false;

	const TreeNodePtr ToIDom = ToTN->getIDom();
	DEBUG(dbgs() << "\tNCD " << BlockNamePrinter(NCD) << ", ToIDom "
	<< BlockNamePrinter(ToIDom) << "\n");

	// To remains reachable after deletion.
	// (Based on the caption under Figure 4. from the second paper.)
	if (FromTN != ToIDom \|\| HasProperSupport(DT, BUI, ToTN))
	DeleteReachable(DT, BUI, FromTN, ToTN);
	else
	DeleteUnreachable(DT, BUI, ToTN);
	}

	if (IsPostDom) UpdateRootsAfterUpdate(DT, BUI);
	}

	// Handles deletions that leave destination nodes reachable.
	//
	// The subtree rooted at the nearest common dominator of the two endpoints is
	// rebuilt: a DFS restricted to tree nodes deeper than that dominator
	// collects the region, Semi-NCA recomputes it, and the existing tree nodes
	// are re-parented. If that dominator is the tree's root node (it has no
	// IDom), the whole tree is recalculated instead.
	static void DeleteReachable(DomTreeT &DT, const BatchUpdatePtr BUI,
	const TreeNodePtr FromTN,
	const TreeNodePtr ToTN) {
	DEBUG(dbgs() << "Deleting reachable " << BlockNamePrinter(FromTN) << " -> "
	<< BlockNamePrinter(ToTN) << "\n");
	DEBUG(dbgs() << "\tRebuilding subtree\n");

	// Find the top of the subtree that needs to be rebuilt.
	// (Based on the lemma 2.6 from the second paper.)
	const NodePtr ToIDom =
	DT.findNearestCommonDominator(FromTN->getBlock(), ToTN->getBlock());
	assert(ToIDom \|\| DT.isPostDominator());
	const TreeNodePtr ToIDomTN = DT.getNode(ToIDom);
	assert(ToIDomTN);
	const TreeNodePtr PrevIDomSubTree = ToIDomTN->getIDom();
	// Top of the subtree to rebuild is the root node. Rebuild the tree from
	// scratch.
	if (!PrevIDomSubTree) {
	DEBUG(dbgs() << "The entire tree needs to be rebuilt\n");
	CalculateFromScratch(DT, BUI);
	return;
	}

	// Only visit nodes in the subtree starting at To.
	// The DFS descends into a node only if its tree level is strictly greater
	// than the level of the subtree's top (ToIDomTN).
	const unsigned Level = ToIDomTN->getLevel();
	auto DescendBelow = [Level, &DT](NodePtr, NodePtr To) {
	return DT.getNode(To)->getLevel() > Level;
	};

	DEBUG(dbgs() << "\tTop of subtree: " << BlockNamePrinter(ToIDomTN) << "\n");

	SemiNCAInfo SNCA(BUI);
	SNCA.runDFS(ToIDom, 0, DescendBelow, 0);
	DEBUG(dbgs() << "\tRunning Semi-NCA\n");
	SNCA.runSemiNCA(DT, Level);
	SNCA.reattachExistingSubtree(DT, PrevIDomSubTree);
	}

	// Checks if a node has proper support, as defined on the page 3 and later
	// explained on the page 7 of the second paper.
	//
	// Returns true as soon as some predecessor of TN's block (as produced by
	// ChildrenGetter<!IsPostDom>) that has a tree node yields a nearest common
	// dominator with TN's block that is not TN's block itself. Returns false
	// when no predecessor qualifies.
	static bool HasProperSupport(DomTreeT &DT, const BatchUpdatePtr BUI,
	                             const TreeNodePtr TN) {
	  DEBUG(dbgs() << "IsReachableFromIDom " << BlockNamePrinter(TN) << "\n");

	  const auto Preds = ChildrenGetter<!IsPostDom>::Get(TN->getBlock(), BUI);
	  for (const NodePtr Pred : Preds) {
	    DEBUG(dbgs() << "\tPred " << BlockNamePrinter(Pred) << "\n");

	    // Predecessors without a tree node cannot provide support.
	    if (DT.getNode(Pred)) {
	      const NodePtr Support =
	          DT.findNearestCommonDominator(TN->getBlock(), Pred);
	      DEBUG(dbgs() << "\tSupport " << BlockNamePrinter(Support) << "\n");

	      const bool SupportIsSelf = (Support == TN->getBlock());
	      if (!SupportIsSelf) {
	        DEBUG(dbgs() << "\t" << BlockNamePrinter(TN)
	                     << " is reachable from support "
	                     << BlockNamePrinter(Support) << "\n");
	        return true;
	      }
	    }
	  }

	  return false;
	}

	// Handle deletions that make destination node unreachable.
	// (Based on the lemma 2.7 from the second paper.)
	//
	// Postdominators: ToTN's block is appended to DT.Roots and the deletion is
	// modelled as inserting an edge from the virtual root to ToTN.
	//
	// Dominators: a DFS from ToTN's block collects the subtree below ToTN and
	// records, in AffectedQueue, the nodes at which the walk stopped because
	// their level is not greater than ToTN's. MinNode is the shallowest nearest
	// common dominator found for those nodes. If MinNode is the tree root, the
	// tree is recalculated from scratch; otherwise the collected nodes are
	// erased and, unless MinNode is ToTN itself, the subtree under MinNode is
	// rebuilt with Semi-NCA.
	static void DeleteUnreachable(DomTreeT &DT, const BatchUpdatePtr BUI,
	const TreeNodePtr ToTN) {
	DEBUG(dbgs() << "Deleting unreachable subtree " << BlockNamePrinter(ToTN)
	<< "\n");
	assert(ToTN);
	assert(ToTN->getBlock());

	if (IsPostDom) {
	// Deletion makes a region reverse-unreachable and creates a new root.
	// Simulate that by inserting an edge from the virtual root to ToTN and
	// adding it as a new root.
	DEBUG(dbgs() << "\tDeletion made a region reverse-unreachable\n");
	DEBUG(dbgs() << "\tAdding new root " << BlockNamePrinter(ToTN) << "\n");
	DT.Roots.push_back(ToTN->getBlock());
	InsertReachable(DT, BUI, DT.getNode(nullptr), ToTN);
	return;
	}

	SmallVector<NodePtr, 16> AffectedQueue;
	const unsigned Level = ToTN->getLevel();

	// Traverse destination node's descendants with greater level in the tree
	// and collect visited nodes.
	// Nodes whose level is not greater than Level are not descended into; they
	// are recorded once in AffectedQueue instead.
	auto DescendAndCollect = [Level, &AffectedQueue, &DT](NodePtr, NodePtr To) {
	const TreeNodePtr TN = DT.getNode(To);
	assert(TN);
	if (TN->getLevel() > Level) return true;
	if (llvm::find(AffectedQueue, To) == AffectedQueue.end())
	AffectedQueue.push_back(To);

	return false;
	};

	SemiNCAInfo SNCA(BUI);
	unsigned LastDFSNum =
	SNCA.runDFS(ToTN->getBlock(), 0, DescendAndCollect, 0);

	TreeNodePtr MinNode = ToTN;

	// Identify the top of the subtree to rebuild by finding the NCD of all
	// the affected nodes.
	for (const NodePtr N : AffectedQueue) {
	const TreeNodePtr TN = DT.getNode(N);
	const NodePtr NCDBlock =
	DT.findNearestCommonDominator(TN->getBlock(), ToTN->getBlock());
	assert(NCDBlock \|\| DT.isPostDominator());
	const TreeNodePtr NCD = DT.getNode(NCDBlock);
	assert(NCD);

	DEBUG(dbgs() << "Processing affected node " << BlockNamePrinter(TN)
	<< " with NCD = " << BlockNamePrinter(NCD)
	<< ", MinNode =" << BlockNamePrinter(MinNode) << "\n");
	// Keep the shallowest NCD seen so far (ignoring the case NCD == TN).
	if (NCD != TN && NCD->getLevel() < MinNode->getLevel()) MinNode = NCD;
	}

	// Root reached, rebuild the whole tree from scratch.
	if (!MinNode->getIDom()) {
	DEBUG(dbgs() << "The entire tree needs to be rebuilt\n");
	CalculateFromScratch(DT, BUI);
	return;
	}

	// Erase the unreachable subtree in reverse preorder to process all children
	// before deleting their parent.
	for (unsigned i = LastDFSNum; i > 0; --i) {
	const NodePtr N = SNCA.NumToNode[i];
	const TreeNodePtr TN = DT.getNode(N);
	DEBUG(dbgs() << "Erasing node " << BlockNamePrinter(TN) << "\n");

	EraseNode(DT, TN);
	}

	// The affected subtree starts at the To node -- there's no extra work to do.
	if (MinNode == ToTN) return;

	DEBUG(dbgs() << "DeleteUnreachable: running DFS with MinNode = "
	<< BlockNamePrinter(MinNode) << "\n");
	const unsigned MinLevel = MinNode->getLevel();
	const TreeNodePtr PrevIDom = MinNode->getIDom();
	assert(PrevIDom);
	// Reuse the same SemiNCAInfo for the second walk.
	SNCA.clear();

	// Identify nodes that remain in the affected subtree.
	// Erased nodes no longer have a tree node, hence the null check.
	auto DescendBelow = [MinLevel, &DT](NodePtr, NodePtr To) {
	const TreeNodePtr ToTN = DT.getNode(To);
	return ToTN && ToTN->getLevel() > MinLevel;
	};
	SNCA.runDFS(MinNode->getBlock(), 0, DescendBelow, 0);

	DEBUG(dbgs() << "Previous IDom(MinNode) = " << BlockNamePrinter(PrevIDom)
	<< "\nRunning Semi-NCA\n");

	// Rebuild the remaining part of affected subtree.
	SNCA.runSemiNCA(DT, MinLevel);
	SNCA.reattachExistingSubtree(DT, PrevIDom);
	}

	// Removes a single leaf tree node from the dominator tree.
	//
	// TN must be non-null, have no children, and have an immediate dominator
	// that lists it as a child. TN is unlinked from its parent's child list by
	// swap-and-pop (so the order of the remaining children may change) and its
	// entry is erased from DT.DomTreeNodes.
	static void EraseNode(DomTreeT &DT, const TreeNodePtr TN) {
	  assert(TN);
	  assert(TN->getNumChildren() == 0 && "Not a tree leaf");

	  const TreeNodePtr Parent = TN->getIDom();
	  assert(Parent);

	  auto &Siblings = Parent->Children;
	  auto Pos = llvm::find(Siblings, TN);
	  assert(Pos != Siblings.end());
	  std::swap(*Pos, Siblings.back());
	  Siblings.pop_back();

	  DT.DomTreeNodes.erase(TN->getBlock());
	}

	//~~
	//===--------------------- DomTree Batch Updater --------------------------===
	//~~

	// Applies a sequence of CFG edge updates to DT.
	//
	// An empty sequence is a no-op and a single update goes straight through
	// DT.insertEdge / DT.deleteEdge. Longer sequences are legalized first, then
	// applied one at a time through ApplyNextUpdate, with the not-yet-applied
	// updates recorded in BUI.FutureSuccessors / BUI.FuturePredecessors.
	// Processing stops early once BUI.IsRecalculated becomes true.
	static void ApplyUpdates(DomTreeT &DT, ArrayRef<UpdateT> Updates) {
	const size_t NumUpdates = Updates.size();
	if (NumUpdates == 0)
	return;

	// Take the fast path for a single update and avoid running the batch update
	// machinery.
	if (NumUpdates == 1) {
	const auto &Update = Updates.front();
	if (Update.getKind() == UpdateKind::Insert)
	DT.insertEdge(Update.getFrom(), Update.getTo());
	else
	DT.deleteEdge(Update.getFrom(), Update.getTo());

	return;
	}

	BatchUpdateInfo BUI;
	LegalizeUpdates(Updates, BUI.Updates);

	// Legalization may drop updates, so use the legalized count from here on.
	const size_t NumLegalized = BUI.Updates.size();
	BUI.FutureSuccessors.reserve(NumLegalized);
	BUI.FuturePredecessors.reserve(NumLegalized);

	// Use the legalized future updates to initialize future successors and
	// predecessors. Note that these sets will only decrease size over time, as
	// the next CFG snapshots slowly approach the actual (current) CFG.
	for (UpdateT &U : BUI.Updates) {
	BUI.FutureSuccessors[U.getFrom()].insert({U.getTo(), U.getKind()});
	BUI.FuturePredecessors[U.getTo()].insert({U.getFrom(), U.getKind()});
	}

	DEBUG(dbgs() << "About to apply " << NumLegalized << " updates\n");
	// Only dump the individual updates when there are fewer than 32 of them.
	DEBUG(if (NumLegalized < 32) for (const auto &U
	: reverse(BUI.Updates)) dbgs()
	<< '\t' << U << "\n");
	DEBUG(dbgs() << "\n");

	// If the DominatorTree was recalculated at some point, stop the batch
	// updates. Full recalculations ignore batch updates and look at the actual
	// CFG.
	for (size_t i = 0; i < NumLegalized && !BUI.IsRecalculated; ++i)
	ApplyNextUpdate(DT, BUI);
	}

	// This function serves double purpose:
	// a) It removes redundant updates, which makes it easier to reverse-apply
	// them when traversing CFG.
	// b) It optimizes away updates that cancel each other out, as the end result
	// is the same.
	//
	// It relies on the property of the incremental updates that says that the
	// order of updates doesn't matter. This allows us to reorder them and end up
	// with the exact same DomTree every time.
	//
	// Following the same logic, the function doesn't care about the order of
	// input updates, so it's OK to pass it an unordered sequence of updates, that
	// doesn't make sense when applied sequentially, eg. performing double
	// insertions or deletions and then doing an opposite update.
	//
	// In the future, it should be possible to schedule updates in way that
	// minimizes the amount of work needed to be done during incremental updates.
	//
	// Parameters:
	//   AllUpdates - the raw updates as supplied by the caller.
	//   Result     - cleared, then filled with at most one update per edge. For
	//                postdominators the stored edges have From and To swapped.
	//                The entries are sorted so that the update whose edge
	//                appears LAST in AllUpdates comes first.
	static void LegalizeUpdates(ArrayRef<UpdateT> AllUpdates,
	SmallVectorImpl<UpdateT> &Result) {
	DEBUG(dbgs() << "Legalizing " << AllUpdates.size() << " updates\n");
	// Count the total number of insertions of each edge.
	// Each insertion adds 1 and deletion subtracts 1. The end number should be
	// one of {-1 (deletion), 0 (NOP), +1 (insertion)}. Otherwise, the sequence
	// of updates contains multiple updates of the same kind and we assert for
	// that case.
	SmallDenseMap<std::pair<NodePtr, NodePtr>, int, 4> Operations;
	Operations.reserve(AllUpdates.size());

	for (const auto &U : AllUpdates) {
	NodePtr From = U.getFrom();
	NodePtr To = U.getTo();
	if (IsPostDom) std::swap(From, To); // Reverse edge for postdominators.

	Operations[{From, To}] += (U.getKind() == UpdateKind::Insert ? 1 : -1);
	}

	Result.clear();
	Result.reserve(Operations.size());
	for (auto &Op : Operations) {
	const int NumInsertions = Op.second;
	assert(std::abs(NumInsertions) <= 1 && "Unbalanced operations!");
	// Insertions and deletions of this edge cancelled each other out.
	if (NumInsertions == 0) continue;
	const UpdateKind UK =
	NumInsertions > 0 ? UpdateKind::Insert : UpdateKind::Delete;
	Result.push_back({UK, Op.first.first, Op.first.second});
	}

	// Make the order consistent by not relying on pointer values within the
	// set. Reuse the old Operations map.
	// In the future, we should sort by something else to minimize the amount
	// of work needed to perform the series of updates.
	// After this loop Operations maps each edge to the index of its last
	// occurrence in AllUpdates.
	for (size_t i = 0, e = AllUpdates.size(); i != e; ++i) {
	const auto &U = AllUpdates[i];
	if (!IsPostDom)
	Operations[{U.getFrom(), U.getTo()}] = int(i);
	else
	Operations[{U.getTo(), U.getFrom()}] = int(i);
	}

	// Sort by descending index.
	std::sort(Result.begin(), Result.end(),
	[&Operations](const UpdateT &A, const UpdateT &B) {
	return Operations[{A.getFrom(), A.getTo()}] >
	Operations[{B.getFrom(), B.getTo()}];
	});
	}

	// Applies the update at the back of BUI.Updates to DT.
	//
	// The update is popped, removed from the future-successor and
	// future-predecessor bookkeeping (dropping map entries that become empty),
	// and then handed to InsertEdge or DeleteEdge according to its kind.
	// BUI.Updates must not be empty.
	static void ApplyNextUpdate(DomTreeT &DT, BatchUpdateInfo &BUI) {
	  assert(!BUI.Updates.empty() && "No updates to apply!");
	  UpdateT CurrentUpdate = BUI.Updates.pop_back_val();
	  DEBUG(dbgs() << "Applying update: " << CurrentUpdate << "\n");

	  const auto Src = CurrentUpdate.getFrom();
	  const auto Dst = CurrentUpdate.getTo();
	  const auto Kind = CurrentUpdate.getKind();

	  // Move to the next snapshot of the CFG by removing the reverse-applied
	  // current update.
	  auto &PendingSuccs = BUI.FutureSuccessors[Src];
	  PendingSuccs.erase({Dst, Kind});
	  if (PendingSuccs.empty())
	    BUI.FutureSuccessors.erase(Src);

	  auto &PendingPreds = BUI.FuturePredecessors[Dst];
	  PendingPreds.erase({Src, Kind});
	  if (PendingPreds.empty())
	    BUI.FuturePredecessors.erase(Dst);

	  if (Kind == UpdateKind::Insert) {
	    InsertEdge(DT, &BUI, Src, Dst);
	  } else {
	    DeleteEdge(DT, &BUI, Src, Dst);
	  }
	}

	//~~
	//===--------------- DomTree correctness verification ---------------------===
	//~~

	// Check if the tree has correct roots. A DominatorTree always has a single
	// root which is the function's entry node. A PostDominatorTree can have
	// multiple roots - one for each node with no successors and for infinite
	// loops.
	//
	// Returns true when the roots are consistent. On failure a diagnostic is
	// written to errs() and false is returned.
	bool verifyRoots(const DomTreeT &DT) {
	if (!DT.Parent && !DT.Roots.empty()) {
	errs() << "Tree has no parent but has roots!\n";
	errs().flush();
	return false;
	}

	if (!IsPostDom) {
	if (DT.Roots.empty()) {
	errs() << "Tree doesn't have a root!\n";
	errs().flush();
	return false;
	}

	if (DT.getRoot() != GetEntryNode(DT)) {
	errs() << "Tree's root is not its parent's entry node!\n";
	errs().flush();
	return false;
	}
	}

	// Compare the stored roots with freshly computed ones, ignoring order.
	RootsT ComputedRoots = FindRoots(DT, nullptr);
	if (DT.Roots.size() != ComputedRoots.size() \|\|
	!std::is_permutation(DT.Roots.begin(), DT.Roots.end(),
	ComputedRoots.begin())) {
	errs() << "Tree has different roots than freshly computed ones!\n";
	errs() << "\tPDT roots: ";
	for (const NodePtr N : DT.Roots) errs() << BlockNamePrinter(N) << ", ";
	errs() << "\n\tComputed roots: ";
	for (const NodePtr N : ComputedRoots)
	errs() << BlockNamePrinter(N) << ", ";
	errs() << "\n";
	errs().flush();
	return false;
	}

	return true;
	}

	// Checks if the tree contains all reachable nodes in the input graph.
	bool verifyReachability(const DomTreeT &DT) {
	clear();
	doFullDFSWalk(DT, AlwaysDescend);

	for (auto &NodeToTN : DT.DomTreeNodes) {
	const TreeNodePtr TN = NodeToTN.second.get();
	const NodePtr BB = TN->getBlock();

	// Virtual root has a corresponding virtual CFG node.
	if (DT.isVirtualRoot(TN)) continue;

	if (NodeToInfo.count(BB) == 0) {
	errs() << "DomTree node " << BlockNamePrinter(BB)
	<< " not found by DFS walk!\n";
	errs().flush();

	return false;
	}
	}

	for (const NodePtr N : NumToNode) {
	if (N && !DT.getNode(N)) {
	errs() << "CFG node " << BlockNamePrinter(N)
	<< " not found in the DomTree!\n";
	errs().flush();

	return false;
	}
	}

	return true;
	}

	// Check if for every parent with a level L in the tree all of its children
	// have level L + 1.
	static bool VerifyLevels(const DomTreeT &DT) {
	for (auto &NodeToTN : DT.DomTreeNodes) {
	const TreeNodePtr TN = NodeToTN.second.get();
	const NodePtr BB = TN->getBlock();
	if (!BB) continue;

	const TreeNodePtr IDom = TN->getIDom();
	if (!IDom && TN->getLevel() != 0) {
	errs() << "Node without an IDom " << BlockNamePrinter(BB)
	<< " has a nonzero level " << TN->getLevel() << "!\n";
	errs().flush();

	return false;
	}

	if (IDom && TN->getLevel() != IDom->getLevel() + 1) {
	errs() << "Node " << BlockNamePrinter(BB) << " has level "
	<< TN->getLevel() << " while its IDom "
	<< BlockNamePrinter(IDom->getBlock()) << " has level "
	<< IDom->getLevel() << "!\n";
	errs().flush();

	return false;
	}
	}

	return true;
	}

	// Check if the computed DFS numbers are correct. Note that DFS info may not
	// be valid, and when that is the case, we don't verify the numbers.
	static bool VerifyDFSNumbers(const DomTreeT &DT) {
	  // Nothing to verify when the DFS info is stale or the tree has no parent.
	  if (!DT.DFSInfoValid || !DT.Parent)
	    return true;

	  // For a post-dominator tree the root tree node is looked up with a null
	  // block; otherwise the first root is used.
	  const NodePtr RootBB = IsPostDom ? nullptr : DT.getRoots()[0];
	  const TreeNodePtr Root = DT.getNode(RootBB);

	  // Prints "<node> {DFSIn, DFSOut}" to errs().
	  auto PrintNodeAndDFSNums = [](const TreeNodePtr TN) {
	    errs() << BlockNamePrinter(TN) << " {" << TN->getDFSNumIn() << ", "
	           << TN->getDFSNumOut() << '}';
	  };

	  // Verify the root's DFS In number. Although DFS numbering would also work
	  // if we started from some other value, we assume 0-based numbering.
	  if (Root->getDFSNumIn() != 0) {
	    errs() << "DFSIn number for the tree root is not:\n\t";
	    PrintNodeAndDFSNums(Root);
	    errs() << '\n';
	    errs().flush();
	    return false;
	  }

	  // For each tree node verify if children's DFS numbers cover their parent's
	  // DFS numbers with no gaps.
	  for (const auto &NodeToTN : DT.DomTreeNodes) {
	    const TreeNodePtr Node = NodeToTN.second.get();

	    // Handle tree leaves.
	    if (Node->getChildren().empty()) {
	      if (Node->getDFSNumIn() + 1 != Node->getDFSNumOut()) {
	        errs() << "Tree leaf should have DFSOut = DFSIn + 1:\n\t";
	        PrintNodeAndDFSNums(Node);
	        errs() << '\n';
	        errs().flush();
	        return false;
	      }

	      continue;
	    }

	    // Make a copy and sort it such that it is possible to check if there are
	    // no gaps between DFS numbers of adjacent children.
	    SmallVector<TreeNodePtr, 8> Children(Node->begin(), Node->end());
	    std::sort(Children.begin(), Children.end(),
	              [](const TreeNodePtr Ch1, const TreeNodePtr Ch2) {
	                return Ch1->getDFSNumIn() < Ch2->getDFSNumIn();
	              });

	    // Reports the parent, the offending child (and optionally the next
	    // child), followed by the complete sorted child list.
	    auto PrintChildrenError = [Node, &Children, PrintNodeAndDFSNums](
	        const TreeNodePtr FirstCh, const TreeNodePtr SecondCh) {
	      assert(FirstCh);

	      errs() << "Incorrect DFS numbers for:\n\tParent ";
	      PrintNodeAndDFSNums(Node);

	      errs() << "\n\tChild ";
	      PrintNodeAndDFSNums(FirstCh);

	      if (SecondCh) {
	        errs() << "\n\tSecond child ";
	        PrintNodeAndDFSNums(SecondCh);
	      }

	      errs() << "\nAll children: ";
	      for (const TreeNodePtr Ch : Children) {
	        PrintNodeAndDFSNums(Ch);
	        errs() << ", ";
	      }

	      errs() << '\n';
	      errs().flush();
	    };

	    // The first child must start right after the parent's In number.
	    if (Children.front()->getDFSNumIn() != Node->getDFSNumIn() + 1) {
	      PrintChildrenError(Children.front(), nullptr);
	      return false;
	    }

	    // The last child must end right before the parent's Out number.
	    if (Children.back()->getDFSNumOut() + 1 != Node->getDFSNumOut()) {
	      PrintChildrenError(Children.back(), nullptr);
	      return false;
	    }

	    // Adjacent children must abut: Out of one + 1 == In of the next.
	    // Children is non-empty here, so size() - 1 cannot wrap.
	    for (size_t i = 0, e = Children.size() - 1; i != e; ++i) {
	      if (Children[i]->getDFSNumOut() + 1 != Children[i + 1]->getDFSNumIn()) {
	        PrintChildrenError(Children[i], Children[i + 1]);
	        return false;
	      }
	    }
	  }

	  return true;
	}

	// The below routines verify the correctness of the dominator tree relative to
	// the CFG it's coming from. A tree is a dominator tree iff it has two
	// properties, called the parent property and the sibling property. Tarjan
	// and Lengauer prove (but don't explicitly name) the properties as part of
	// the proofs in their 1979 paper, but the proofs are mostly part of proving
	// things about semidominators and idoms, and some of them are simply asserted
	// based on even earlier papers (see, e.g., lemma 2). Some papers refer to
	// these properties as "valid" and "co-valid". See, e.g., "Dominators,
	// directed bipolar orders, and independent spanning trees" by Loukas
	// Georgiadis and Robert E. Tarjan, as well as "Dominator Tree Verification
	// and Vertex-Disjoint Paths" by the same authors.

	// A very simple and direct explanation of these properties can be found in
	// "An Experimental Study of Dynamic Dominators", found at
	// https://arxiv.org/abs/1604.02711

	// The easiest way to think of the parent property is that it's a requirement
	// of being a dominator. Let's just take immediate dominators. For PARENT to
	// be an immediate dominator of CHILD, all paths in the CFG must go through
	// PARENT before they hit CHILD. This implies that if you were to cut PARENT
	// out of the CFG, there should be no paths to CHILD that are reachable. If
	// there are, then you now have a path from PARENT to CHILD that goes around
	// PARENT and still reaches CHILD, which by definition, means PARENT can't be
	// a dominator of CHILD (let alone an immediate one).

	// The sibling property is similar. It says that for each pair of sibling
	// nodes in the dominator tree (LEFT and RIGHT), they must not dominate each
	// other. If sibling LEFT dominated sibling RIGHT, it means there are no
	// paths in the CFG from the root to sibling RIGHT that do not go through
	// LEFT, and thus, LEFT is really an ancestor (in the dominator tree) of
	// RIGHT, not a sibling.

	// It is possible to verify the parent and sibling properties in
	// linear time, but the algorithms are complex. Instead, we do it in a
	// straightforward N^2 and N^3 way below, using direct path reachability.


	// Checks if the tree has the parent property: if for all edges from V to W in
	// the input graph, such that V is reachable, the parent of W in the tree is
	// an ancestor of V in the tree.
	//
	// This means that if a node gets disconnected from the graph, then all of
	// the nodes it dominated previously will now become unreachable.
	bool verifyParentProperty(const DomTreeT &DT) {
	  for (auto &NodeToTN : DT.DomTreeNodes) {
	    const TreeNodePtr TN = NodeToTN.second.get();
	    const NodePtr BB = TN->getBlock();
	    // Nodes without a block and nodes without children have nothing to
	    // check.
	    if (!BB || TN->getChildren().empty()) continue;

	    DEBUG(dbgs() << "Verifying parent property of node "
	                 << BlockNamePrinter(TN) << "\n");
	    // Re-walk the graph while refusing every edge that starts or ends at
	    // BB, i.e. as if BB had been cut out.
	    clear();
	    doFullDFSWalk(DT, [BB](NodePtr From, NodePtr To) {
	      return From != BB && To != BB;
	    });

	    // None of BB's tree children may have been reached by that walk.
	    for (TreeNodePtr Child : TN->getChildren())
	      if (NodeToInfo.count(Child->getBlock()) != 0) {
	        errs() << "Child " << BlockNamePrinter(Child)
	               << " reachable after its parent " << BlockNamePrinter(BB)
	               << " is removed!\n";
	        errs().flush();

	        return false;
	      }
	  }

	  return true;
	}

	// Check if the tree has sibling property: if a node V does not dominate a
	// node W for all siblings V and W in the tree.
	//
	// This means that if a node gets disconnected from the graph, then all of its
	// siblings will now still be reachable.
	bool verifySiblingProperty(const DomTreeT &DT) {
	  for (auto &NodeToTN : DT.DomTreeNodes) {
	    const TreeNodePtr TN = NodeToTN.second.get();
	    const NodePtr BB = TN->getBlock();
	    // Nodes without a block and nodes without children have no sibling
	    // groups to check.
	    if (!BB || TN->getChildren().empty()) continue;

	    const auto &Siblings = TN->getChildren();
	    for (const TreeNodePtr N : Siblings) {
	      // Re-walk the graph while refusing every edge that touches N's
	      // block, i.e. as if that block had been cut out.
	      clear();
	      NodePtr BBN = N->getBlock();
	      doFullDFSWalk(DT, [BBN](NodePtr From, NodePtr To) {
	        return From != BBN && To != BBN;
	      });

	      // Every other sibling must still have been reached.
	      for (const TreeNodePtr S : Siblings) {
	        if (S == N) continue;

	        if (NodeToInfo.count(S->getBlock()) == 0) {
	          errs() << "Node " << BlockNamePrinter(S)
	                 << " not reachable when its sibling " << BlockNamePrinter(N)
	                 << " is removed!\n";
	          errs().flush();

	          return false;
	        }
	      }
	    }
	  }

	  return true;
	}
	};

	template <class DomTreeT>
	void Calculate(DomTreeT &DT) {
	SemiNCAInfo<DomTreeT>::CalculateFromScratch(DT, nullptr);
	}

	template <class DomTreeT>
	void InsertEdge(DomTreeT &DT, typename DomTreeT::NodePtr From,
	typename DomTreeT::NodePtr To) {
	if (DT.isPostDominator()) std::swap(From, To);
	SemiNCAInfo<DomTreeT>::InsertEdge(DT, nullptr, From, To);
	}

	template <class DomTreeT>
	void DeleteEdge(DomTreeT &DT, typename DomTreeT::NodePtr From,
	typename DomTreeT::NodePtr To) {
	if (DT.isPostDominator()) std::swap(From, To);
	SemiNCAInfo<DomTreeT>::DeleteEdge(DT, nullptr, From, To);
	}

	template <class DomTreeT>
	void ApplyUpdates(DomTreeT &DT,
	ArrayRef<typename DomTreeT::UpdateType> Updates) {
	SemiNCAInfo<DomTreeT>::ApplyUpdates(DT, Updates);
	}

	/// Runs every verifier on DT in a fixed order and stops at the first one
	/// that fails. Returns true only if all of them pass.
	template <class DomTreeT>
	bool Verify(const DomTreeT &DT) {
	  SemiNCAInfo<DomTreeT> SNCA(nullptr);

	  if (!SNCA.verifyRoots(DT))
	    return false;
	  if (!SNCA.verifyReachability(DT))
	    return false;
	  if (!SNCA.VerifyLevels(DT))
	    return false;
	  if (!SNCA.verifyParentProperty(DT))
	    return false;
	  if (!SNCA.verifySiblingProperty(DT))
	    return false;
	  return SNCA.VerifyDFSNumbers(DT);
	}

	} // namespace DomTreeBuilder
	} // namespace llvm

	#undef DEBUG_TYPE

	#endif
	Index: head/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp (revision 329409)
	+++ head/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp (revision 329410)
	@@ -1,320 +1,321 @@
	//===-- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp -------- C++ ---===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Common functionality for different debug information format backends.
	// LLVM currently supports DWARF and CodeView.
	//
	//===----------------------------------------------------------------------===//

	#include "DebugHandlerBase.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/CodeGen/AsmPrinter.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/IR/DebugInfo.h"
	#include "llvm/MC/MCStreamer.h"

	using namespace llvm;

	/// Tries to describe a DBG_VALUE as a register plus a chain of
	/// "add offset, then load" steps.
	///
	/// Returns None when the instruction is not a debug value, when its first
	/// operand is not a register, or when the expression contains an operation
	/// other than DW_OP_constu, DW_OP_plus_uconst, DW_OP_LLVM_fragment or
	/// DW_OP_deref.
	Optional<DbgVariableLocation>
	DbgVariableLocation::extractFromMachineInstruction(
	    const MachineInstr &Instruction) {
	  DbgVariableLocation Location;
	  if (!Instruction.isDebugValue())
	    return None;
	  if (!Instruction.getOperand(0).isReg())
	    return None;
	  Location.Register = Instruction.getOperand(0).getReg();
	  Location.FragmentInfo.reset();
	  // We only handle expressions generated by DIExpression::appendOffset,
	  // which doesn't require a full stack machine.
	  int64_t Offset = 0;
	  const DIExpression *DIExpr = Instruction.getDebugExpression();
	  auto Op = DIExpr->expr_op_begin();
	  while (Op != DIExpr->expr_op_end()) {
	    switch (Op->getOp()) {
	    case dwarf::DW_OP_constu: {
	      // Keep the operand at full width; narrowing it to int would corrupt
	      // constants that do not fit in 32 bits.
	      const uint64_t Value = Op->getArg(0);
	      ++Op;
	      // A DW_OP_constu that ends the expression has no following
	      // operation to combine with. Go back to the loop test (which now
	      // fails) instead of falling through to the ++Op below, which would
	      // step the iterator past the end.
	      if (Op == DIExpr->expr_op_end())
	        continue;
	      switch (Op->getOp()) {
	      case dwarf::DW_OP_minus:
	        Offset -= Value;
	        break;
	      case dwarf::DW_OP_plus:
	        Offset += Value;
	        break;
	      default:
	        // Not a plus/minus pair: let the outer switch handle this
	        // operation on the next iteration without skipping it.
	        continue;
	      }
	    } break;
	    case dwarf::DW_OP_plus_uconst:
	      Offset += Op->getArg(0);
	      break;
	    case dwarf::DW_OP_LLVM_fragment:
	      Location.FragmentInfo = {Op->getArg(1), Op->getArg(0)};
	      break;
	    case dwarf::DW_OP_deref:
	      // Record the offset accumulated so far as one load step and start
	      // accumulating the next one.
	      Location.LoadChain.push_back(Offset);
	      Offset = 0;
	      break;
	    default:
	      return None;
	    }
	    ++Op;
	  }

	  // Do one final implicit DW_OP_deref if this was an indirect DBG_VALUE
	  // instruction.
	  // FIXME: Replace these with DIExpression.
	  if (Instruction.isIndirectDebugValue())
	    Location.LoadChain.push_back(Offset);

	  return Location;
	}

	/// Stores the printer and takes the MachineModuleInfo pointer from it.
	/// MMI is read through the Asm member, so A is dereferenced here.
	DebugHandlerBase::DebugHandlerBase(AsmPrinter *A)
	    : Asm(A),
	      MMI(Asm->MMI) {
	}

	// Each LexicalScope has a first instruction and a last instruction that mark
	// the beginning and end of the scope respectively. Create an inverse map that
	// lists the scopes that start (and end) with each instruction. One instruction
	// may start (or end) multiple scopes. Ignore scopes that are not reachable.
	void DebugHandlerBase::identifyScopeMarkers() {
	SmallVector<LexicalScope *, 4> WorkList;
	WorkList.push_back(LScopes.getCurrentFunctionScope());
	while (!WorkList.empty()) {
	LexicalScope *S = WorkList.pop_back_val();

	const SmallVectorImpl<LexicalScope *> &Children = S->getChildren();
	if (!Children.empty())
	WorkList.append(Children.begin(), Children.end());

	if (S->isAbstractScope())
	continue;

	for (const InsnRange &R : S->getRanges()) {
	assert(R.first && "InsnRange does not have first instruction!");
	assert(R.second && "InsnRange does not have second instruction!");
	requestLabelBeforeInsn(R.first);
	requestLabelAfterInsn(R.second);
	}
	}
	}

	// Return Label preceding the instruction.
	MCSymbol *DebugHandlerBase::getLabelBeforeInsn(const MachineInstr *MI) {
	  // lookup() yields a null pointer when MI has no entry; that is treated
	  // as a caller error here.
	  MCSymbol *Label = LabelsBeforeInsn.lookup(MI);
	  assert(Label && "Didn't insert label before instruction");
	  return Label;
	}

	// Return Label immediately following the instruction.
	MCSymbol *DebugHandlerBase::getLabelAfterInsn(const MachineInstr *MI) {
	  // Unlike getLabelBeforeInsn, the lookup result is returned unchecked.
	  return LabelsAfterInsn.lookup(MI);
	}

	/// Three-way comparison of the bit ranges covered by two fragment
	/// expressions.
	///
	/// Returns -1 if P1's fragment ends at or before P2's begins, 1 if P2's
	/// fragment ends at or before P1's begins, and 0 if the two ranges overlap.
	/// Both expressions must carry fragment info; it is dereferenced
	/// unconditionally.
	int DebugHandlerBase::fragmentCmp(const DIExpression *P1,
	                                  const DIExpression *P2) {
	  auto Fragment1 = *P1->getFragmentInfo();
	  auto Fragment2 = *P2->getFragmentInfo();
	  // Use 64-bit arithmetic so large bit offsets/sizes are neither truncated
	  // nor wrapped when the end positions are computed.
	  uint64_t l1 = Fragment1.OffsetInBits;
	  uint64_t l2 = Fragment2.OffsetInBits;
	  uint64_t r1 = l1 + Fragment1.SizeInBits;
	  uint64_t r2 = l2 + Fragment2.SizeInBits;
	  if (r1 <= l2)
	    return -1;
	  else if (r2 <= l1)
	    return 1;
	  else
	    return 0;
	}

	/// Returns true if the two expressions may describe overlapping bits.
	/// An expression that is not a fragment is treated as overlapping
	/// everything.
	bool DebugHandlerBase::fragmentsOverlap(const DIExpression *P1,
	                                        const DIExpression *P2) {
	  if (!P1->isFragment() || !P2->isFragment())
	    return true;
	  return fragmentCmp(P1, P2) == 0;
	}

	/// If this type is derived from a base type then return base type size.
	uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {
	// Resolve the reference; a null result is treated as a caller error.
	DIType *Ty = TyRef.resolve();
	assert(Ty);
	// Anything that is not a DIDerivedType reports its own size.
	DIDerivedType *DDTy = dyn_cast<DIDerivedType>(Ty);
	if (!DDTy)
	return Ty->getSizeInBits();

	unsigned Tag = DDTy->getTag();

	// Only member, typedef and const/volatile/restrict/atomic wrappers are
	// looked through; any other derived tag reports its own size.
	if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
	Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
	Tag != dwarf::DW_TAG_restrict_type && Tag != dwarf::DW_TAG_atomic_type)
	return DDTy->getSizeInBits();

	DIType *BaseType = DDTy->getBaseType().resolve();

	// NOTE(review): the three lines below carry '-' / '+' prefixes and look
	// like unified-diff markers (assert replaced by a null check that returns
	// 0) rather than compilable code - confirm against the applied revision.
	- assert(BaseType && "Unexpected invalid base type");
	+ if (!BaseType)
	+ return 0;

	// If this is a derived type, go ahead and get the base type, unless it's a
	// reference then it's just the size of the field. Pointer types have no need
	// of this since they're a different type of qualification on the type.
	if (BaseType->getTag() == dwarf::DW_TAG_reference_type \|\|
	BaseType->getTag() == dwarf::DW_TAG_rvalue_reference_type)
	return Ty->getSizeInBits();

	// Recurse on the base type to look through further wrappers.
	return getBaseTypeSize(BaseType);
	}

	/// Returns true when the module has debug info, MF's function has a
	/// subprogram attached, and that subprogram's unit has an emission kind
	/// other than NoDebug.
	static bool hasDebugInfo(const MachineModuleInfo *MMI,
	                         const MachineFunction *MF) {
	  if (!MMI->hasDebugInfo()) {
	    return false;
	  }

	  auto *Subprogram = MF->getFunction().getSubprogram();
	  if (!Subprogram) {
	    return false;
	  }

	  assert(Subprogram->getUnit());
	  auto Kind = Subprogram->getUnit()->getEmissionKind();
	  return !(Kind == DICompileUnit::NoDebug);
	}

	/// Per-function setup: builds lexical scopes, requests scope labels,
	/// computes the DBG_VALUE history and requests labels around each history
	/// range, then hands over to beginFunctionImpl().
	void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
	  PrevInstBB = nullptr;

	  // Without a printer or without debug info for this function there is
	  // nothing to prepare.
	  if (!Asm || !hasDebugInfo(MMI, MF)) {
	    skippedNonDebugFunction();
	    return;
	  }

	  // Grab the lexical scopes for the function, if we don't have any of those
	  // then we're not going to be able to do anything.
	  LScopes.initialize(*MF);
	  if (LScopes.empty()) {
	    beginFunctionImpl(MF);
	    return;
	  }

	  // Make sure that each lexical scope will have a begin/end label.
	  identifyScopeMarkers();

	  // Calculate history for local variables.
	  assert(DbgValues.empty() && "DbgValues map wasn't cleaned!");
	  calculateDbgValueHistory(MF, Asm->MF->getSubtarget().getRegisterInfo(),
	                           DbgValues);

	  // Request labels for the full history.
	  for (const auto &I : DbgValues) {
	    const auto &Ranges = I.second;
	    if (Ranges.empty())
	      continue;

	    // The first mention of a function argument gets the CurrentFnBegin
	    // label, so arguments are visible when breaking at function entry.
	    const DILocalVariable *DIVar = Ranges.front().first->getDebugVariable();
	    if (DIVar->isParameter() &&
	        getDISubprogram(DIVar->getScope())->describes(&MF->getFunction())) {
	      LabelsBeforeInsn[Ranges.front().first] = Asm->getFunctionBegin();
	      if (Ranges.front().first->getDebugExpression()->isFragment()) {
	        // Mark all non-overlapping initial fragments. The iterator has its
	        // own name so it does not shadow the outer loop variable I.
	        for (auto RI = Ranges.begin(); RI != Ranges.end(); ++RI) {
	          const DIExpression *Fragment = RI->first->getDebugExpression();
	          if (std::all_of(Ranges.begin(), RI,
	                          [&](DbgValueHistoryMap::InstrRange Pred) {
	                            return !fragmentsOverlap(
	                                Fragment, Pred.first->getDebugExpression());
	                          }))
	            LabelsBeforeInsn[RI->first] = Asm->getFunctionBegin();
	          else
	            break;
	        }
	      }
	    }

	    // Every range gets a label before its first instruction and, when it
	    // has an end, a label after its last instruction.
	    for (const auto &Range : Ranges) {
	      requestLabelBeforeInsn(Range.first);
	      if (Range.second)
	        requestLabelAfterInsn(Range.second);
	    }
	  }

	  PrevInstLoc = DebugLoc();
	  PrevLabel = Asm->getFunctionBegin();
	  beginFunctionImpl(MF);
	}

	/// Records MI as the current instruction and, if a label was requested
	/// before it and none is assigned yet, assigns one (emitting a new temp
	/// symbol only when no previous label can be reused).
	void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
	  if (!MMI->hasDebugInfo())
	    return;

	  assert(CurMI == nullptr);
	  CurMI = MI;

	  // Insert labels where requested.
	  DenseMap<const MachineInstr *, MCSymbol *>::iterator I =
	      LabelsBeforeInsn.find(MI);

	  // No label needed.
	  if (I == LabelsBeforeInsn.end())
	    return;

	  // Label already assigned.
	  if (I->second)
	    return;

	  if (!PrevLabel) {
	    PrevLabel = MMI->getContext().createTempSymbol();
	    Asm->OutStreamer->EmitLabel(PrevLabel);
	  }
	  I->second = PrevLabel;
	}

	/// Finishes the current instruction and, if a label was requested after
	/// it and none is assigned yet, assigns one.
	void DebugHandlerBase::endInstruction() {
	  if (!MMI->hasDebugInfo())
	    return;

	  assert(CurMI != nullptr);
	  // Don't create a new label after DBG_VALUE and other instructions that don't
	  // generate code.
	  if (!CurMI->isMetaInstruction()) {
	    PrevLabel = nullptr;
	    PrevInstBB = CurMI->getParent();
	  }

	  DenseMap<const MachineInstr *, MCSymbol *>::iterator I =
	      LabelsAfterInsn.find(CurMI);
	  // The lookup above is the last use of CurMI for this instruction.
	  CurMI = nullptr;

	  // No label needed.
	  if (I == LabelsAfterInsn.end())
	    return;

	  // Label already assigned.
	  if (I->second)
	    return;

	  // We need a label after this instruction.
	  if (!PrevLabel) {
	    PrevLabel = MMI->getContext().createTempSymbol();
	    Asm->OutStreamer->EmitLabel(PrevLabel);
	  }
	  I->second = PrevLabel;
	}

	void DebugHandlerBase::endFunction(const MachineFunction *MF) {
	if (hasDebugInfo(MMI, MF))
	endFunctionImpl(MF);
	DbgValues.clear();
	LabelsBeforeInsn.clear();
	LabelsAfterInsn.clear();
	}
	Index: head/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp (revision 329409)
	+++ head/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp (revision 329410)
	@@ -1,1629 +1,1630 @@
	//===-- llvm/CodeGen/DwarfUnit.cpp - Dwarf Type and Compile Units ---------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains support for constructing a dwarf compile unit.
	//
	//===----------------------------------------------------------------------===//

	#include "DwarfUnit.h"
	#include "AddressPool.h"
	#include "DwarfCompileUnit.h"
	#include "DwarfDebug.h"
	#include "DwarfExpression.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/TargetLoweringObjectFile.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCDwarf.h"
	#include "llvm/MC/MCSection.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MachineLocation.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include <cassert>
	#include <cstdint>
	#include <string>
	#include <utility>

	using namespace llvm;

	#define DEBUG_TYPE "dwarfdebug"

	// Hidden boolean command-line option "generate-type-units"; it is
	// initialized to false.
	static cl::opt<bool>
	GenerateDwarfTypeUnits("generate-type-units", cl::Hidden,
	cl::desc("Generate DWARF4 type units."),
	cl::init(false));

	/// Binds the expression emitter to a printer, a unit, and the DIELoc that
	/// receives the emitted values. The DWARF version is taken from AP.
	DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU,
	                                       DIELoc &DIE)
	    : DwarfExpression(AP.getDwarfVersion()),
	      AP(AP),
	      DU(DU),
	      DIE(DIE) {
	}

	void DIEDwarfExpression::emitOp(uint8_t Op, const char* Comment) {
	DU.addUInt(DIE, dwarf::DW_FORM_data1, Op);
	}

	void DIEDwarfExpression::emitSigned(int64_t Value) {
	DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value);
	}

	void DIEDwarfExpression::emitUnsigned(uint64_t Value) {
	DU.addUInt(DIE, dwarf::DW_FORM_udata, Value);
	}

	/// Returns true when MachineReg is the frame register that TRI reports for
	/// the printer's current machine function.
	bool DIEDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
	                                         unsigned MachineReg) {
	  const auto FrameReg = TRI.getFrameRegister(*AP.MF);
	  return MachineReg == FrameReg;
	}

	/// Creates a unit with tag UnitTag for Node. The DWARF version and the code
	/// pointer size for the DIEUnit base are read from the printer A, so A is
	/// dereferenced here.
	DwarfUnit::DwarfUnit(dwarf::Tag UnitTag, const DICompileUnit *Node,
	                     AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU)
	    : DIEUnit(A->getDwarfVersion(), A->MAI->getCodePointerSize(), UnitTag),
	      CUNode(Node), Asm(A), DD(DW), DU(DWU), IndexTyDie(nullptr) {
	}

	/// Creates a type unit that belongs to compile unit CU. When a split line
	/// table is supplied, a DW_AT_stmt_list section offset of 0 is added to the
	/// unit DIE.
	DwarfTypeUnit::DwarfTypeUnit(DwarfCompileUnit &CU, AsmPrinter *A,
	                             DwarfDebug *DW, DwarfFile *DWU,
	                             MCDwarfDwoLineTable *SplitLineTable)
	    : DwarfUnit(dwarf::DW_TAG_type_unit, CU.getCUNode(), A, DW, DWU), CU(CU),
	      SplitLineTable(SplitLineTable) {
	  if (SplitLineTable)
	    addSectionOffset(getUnitDie(), dwarf::DW_AT_stmt_list, 0);
	}

	/// Explicitly runs the destructors of the blocks and locs that addBlock()
	/// recorded; the objects are only destroyed here, not deallocated.
	DwarfUnit::~DwarfUnit() {
	  for (auto *Block : DIEBlocks) {
	    Block->~DIEBlock();
	  }
	  for (auto *Loc : DIELocs) {
	    Loc->~DIELoc();
	  }
	}

	int64_t DwarfUnit::getDefaultLowerBound() const {
	switch (getLanguage()) {
	default:
	break;

	// The languages below have valid values in all DWARF versions.
	case dwarf::DW_LANG_C:
	case dwarf::DW_LANG_C89:
	case dwarf::DW_LANG_C_plus_plus:
	return 0;

	case dwarf::DW_LANG_Fortran77:
	case dwarf::DW_LANG_Fortran90:
	return 1;

	// The languages below have valid values only if the DWARF version >= 3.
	case dwarf::DW_LANG_C99:
	case dwarf::DW_LANG_ObjC:
	case dwarf::DW_LANG_ObjC_plus_plus:
	if (DD->getDwarfVersion() >= 3)
	return 0;
	break;

	case dwarf::DW_LANG_Fortran95:
	if (DD->getDwarfVersion() >= 3)
	return 1;
	break;

	// Starting with DWARF v4, all defined languages have valid values.
	case dwarf::DW_LANG_D:
	case dwarf::DW_LANG_Java:
	case dwarf::DW_LANG_Python:
	case dwarf::DW_LANG_UPC:
	if (DD->getDwarfVersion() >= 4)
	return 0;
	break;

	case dwarf::DW_LANG_Ada83:
	case dwarf::DW_LANG_Ada95:
	case dwarf::DW_LANG_Cobol74:
	case dwarf::DW_LANG_Cobol85:
	case dwarf::DW_LANG_Modula2:
	case dwarf::DW_LANG_Pascal83:
	case dwarf::DW_LANG_PLI:
	if (DD->getDwarfVersion() >= 4)
	return 1;
	break;

	// The languages below are new in DWARF v5.
	case dwarf::DW_LANG_BLISS:
	case dwarf::DW_LANG_C11:
	case dwarf::DW_LANG_C_plus_plus_03:
	case dwarf::DW_LANG_C_plus_plus_11:
	case dwarf::DW_LANG_C_plus_plus_14:
	case dwarf::DW_LANG_Dylan:
	case dwarf::DW_LANG_Go:
	case dwarf::DW_LANG_Haskell:
	case dwarf::DW_LANG_OCaml:
	case dwarf::DW_LANG_OpenCL:
	case dwarf::DW_LANG_RenderScript:
	case dwarf::DW_LANG_Rust:
	case dwarf::DW_LANG_Swift:
	if (DD->getDwarfVersion() >= 5)
	return 0;
	break;

	case dwarf::DW_LANG_Fortran03:
	case dwarf::DW_LANG_Fortran08:
	case dwarf::DW_LANG_Julia:
	case dwarf::DW_LANG_Modula3:
	if (DD->getDwarfVersion() >= 5)
	return 1;
	break;
	}

	return -1;
	}

	/// Check whether the DIE for this MDNode can be shared across CUs.
	bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const {
	  // When the MDNode can be part of the type system, the DIE can be shared
	  // across CUs.
	  // Combining type units and cross-CU DIE sharing is lower value (since
	  // cross-CU DIE sharing is used in LTO and removes type redundancy at that
	  // level already) but may be implementable for some value in projects
	  // building multiple independent libraries with LTO and then linking those
	  // together.
	  if (isDwoUnit() && !DD->shareAcrossDWOCUs())
	    return false;
	  // Shareable nodes are types and subprograms that are not definitions,
	  // and only while type units are not being generated.
	  return (isa<DIType>(D) ||
	          (isa<DISubprogram>(D) && !cast<DISubprogram>(D)->isDefinition())) &&
	         !GenerateDwarfTypeUnits;
	}

	/// Looks up the DIE recorded for D: in the shared DwarfFile map when D is
	/// shareable across CUs, otherwise in this unit's own map. The per-unit
	/// lookup yields a null pointer when D has no entry.
	DIE *DwarfUnit::getDIE(const DINode *D) const {
	  if (isShareableAcrossCUs(D))
	    return DU->getDIE(D);
	  return MDNodeToDieMap.lookup(D);
	}

	/// Records D as the DIE for Desc: in the shared DwarfFile map when Desc is
	/// shareable across CUs, otherwise in this unit's own map.
	void DwarfUnit::insertDIE(const DINode *Desc, DIE *D) {
	  if (isShareableAcrossCUs(Desc)) {
	    DU->insertDIE(Desc, D);
	    return;
	  }
	  MDNodeToDieMap.insert(std::make_pair(Desc, D));
	}

	/// Adds a flag attribute with value 1. DWARF version 4 and later use
	/// DW_FORM_flag_present; earlier versions use DW_FORM_flag.
	void DwarfUnit::addFlag(DIE &Die, dwarf::Attribute Attribute) {
	  const dwarf::Form FlagForm = DD->getDwarfVersion() >= 4
	                                   ? dwarf::DW_FORM_flag_present
	                                   : dwarf::DW_FORM_flag;
	  Die.addValue(DIEValueAllocator, Attribute, FlagForm, DIEInteger(1));
	}

	/// Adds an unsigned integer attribute. When no form is given, the form is
	/// chosen by DIEInteger::BestForm for an unsigned value.
	void DwarfUnit::addUInt(DIEValueList &Die, dwarf::Attribute Attribute,
	                        Optional<dwarf::Form> Form, uint64_t Integer) {
	  const dwarf::Form Chosen =
	      Form ? *Form : DIEInteger::BestForm(false, Integer);
	  assert(Chosen != dwarf::DW_FORM_implicit_const &&
	         "DW_FORM_implicit_const is used only for signed integers");
	  Die.addValue(DIEValueAllocator, Attribute, Chosen, DIEInteger(Integer));
	}

	/// Adds an unsigned integer with the given form and attribute value 0.
	void DwarfUnit::addUInt(DIEValueList &Block, dwarf::Form Form,
	                        uint64_t Integer) {
	  const auto NoAttribute = static_cast<dwarf::Attribute>(0);
	  addUInt(Block, NoAttribute, Form, Integer);
	}

	/// Adds a signed integer attribute. When no form is given, the form is
	/// chosen by DIEInteger::BestForm for a signed value.
	void DwarfUnit::addSInt(DIEValueList &Die, dwarf::Attribute Attribute,
	                        Optional<dwarf::Form> Form, int64_t Integer) {
	  const dwarf::Form Chosen =
	      Form ? *Form : DIEInteger::BestForm(true, Integer);
	  Die.addValue(DIEValueAllocator, Attribute, Chosen, DIEInteger(Integer));
	}

	/// Adds a signed integer to a location expression with attribute value 0.
	void DwarfUnit::addSInt(DIELoc &Die, Optional<dwarf::Form> Form,
	                        int64_t Integer) {
	  const auto NoAttribute = static_cast<dwarf::Attribute>(0);
	  addSInt(Die, NoAttribute, Form, Integer);
	}

	/// Adds a string attribute backed by an entry in the DwarfFile's string
	/// pool. Dwo units use DW_FORM_GNU_str_index; other units use
	/// DW_FORM_strp.
	void DwarfUnit::addString(DIE &Die, dwarf::Attribute Attribute,
	                          StringRef String) {
	  const dwarf::Form StrForm =
	      isDwoUnit() ? dwarf::DW_FORM_GNU_str_index : dwarf::DW_FORM_strp;
	  Die.addValue(DIEValueAllocator, Attribute, StrForm,
	               DIEString(DU->getStringPool().getEntry(*Asm, String)));
	}

	/// Adds a label attribute with the given form and returns the iterator
	/// produced by addValue.
	DIEValueList::value_iterator DwarfUnit::addLabel(DIEValueList &Die,
	                                                 dwarf::Attribute Attribute,
	                                                 dwarf::Form Form,
	                                                 const MCSymbol *Label) {
	  DIELabel LabelValue(Label);
	  return Die.addValue(DIEValueAllocator, Attribute, Form, LabelValue);
	}

	/// Adds a label to a location expression with attribute value 0; the
	/// iterator returned by the general overload is discarded.
	void DwarfUnit::addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label) {
	  const auto NoAttribute = static_cast<dwarf::Attribute>(0);
	  addLabel(Die, NoAttribute, Form, Label);
	}

	/// Adds a section-offset attribute. DWARF version 4 and later use
	/// DW_FORM_sec_offset; earlier versions use DW_FORM_data4.
	void DwarfUnit::addSectionOffset(DIE &Die, dwarf::Attribute Attribute,
	                                 uint64_t Integer) {
	  const dwarf::Form OffsetForm = DD->getDwarfVersion() >= 4
	                                     ? dwarf::DW_FORM_sec_offset
	                                     : dwarf::DW_FORM_data4;
	  addUInt(Die, Attribute, OffsetForm, Integer);
	}

	/// Returns a file ID for the given file. The split line table is used when
	/// one is present; otherwise the request goes to the owning compile unit.
	/// Note the differing argument order of the two callees.
	unsigned DwarfTypeUnit::getOrCreateSourceID(StringRef FileName, StringRef DirName) {
	  if (SplitLineTable)
	    return SplitLineTable->getFile(DirName, FileName);
	  return getCU().getOrCreateSourceID(FileName, DirName);
	}

	/// Appends an address operation for Sym to a location expression:
	/// DW_OP_GNU_addr_index plus an address-pool index under split DWARF,
	/// otherwise DW_OP_addr plus the label.
	void DwarfUnit::addOpAddress(DIELoc &Die, const MCSymbol *Sym) {
	  if (DD->useSplitDwarf()) {
	    addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
	    addUInt(Die, dwarf::DW_FORM_GNU_addr_index,
	            DD->getAddressPool().getIndex(Sym));
	    return;
	  }

	  addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
	  addLabel(Die, dwarf::DW_FORM_udata, Sym);
	}

	/// Adds a DW_FORM_data4 attribute whose value is a DIEDelta built from Hi
	/// and Lo. The DIEDelta is placement-allocated from DIEValueAllocator.
	void DwarfUnit::addLabelDelta(DIE &Die, dwarf::Attribute Attribute,
	                              const MCSymbol *Hi, const MCSymbol *Lo) {
	  Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_data4,
	               new (DIEValueAllocator) DIEDelta(Hi, Lo));
	}

	/// Convenience overload: wraps Entry in a DIEEntry and forwards.
	void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIE &Entry) {
	  DIEEntry Wrapped(Entry);
	  addDIEEntry(Die, Attribute, Wrapped);
	}

	/// Marks Die as a declaration and attaches the 8-byte type signature as
	/// DW_AT_signature / DW_FORM_ref_sig8.
	void DwarfUnit::addDIETypeSignature(DIE &Die, uint64_t Signature) {
	  // Flag the type unit reference as a declaration so that if it contains
	  // members (implicit special members, static data member definitions, member
	  // declarations for definitions in this CU, etc) consumers don't get confused
	  // and think this is a full definition.
	  addFlag(Die, dwarf::DW_AT_declaration);

	  DIEInteger SignatureValue(Signature);
	  Die.addValue(DIEValueAllocator, dwarf::DW_AT_signature,
	               dwarf::DW_FORM_ref_sig8, SignatureValue);
	}

	/// Adds a reference from Die to Entry. The form is DW_FORM_ref4 when both
	/// DIEs resolve to the same unit and DW_FORM_ref_addr otherwise.
	void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute,
	                            DIEEntry Entry) {
	  const DIEUnit *OwnerCU = Die.getUnit();
	  const DIEUnit *TargetCU = Entry.getEntry().getUnit();

	  // We assume that Die belongs to this CU, if it is not linked to any CU yet.
	  if (!OwnerCU)
	    OwnerCU = getUnitDie().getUnit();
	  // The same fallback is applied to the referenced DIE.
	  if (!TargetCU)
	    TargetCU = getUnitDie().getUnit();

	  const dwarf::Form RefForm =
	      TargetCU == OwnerCU ? dwarf::DW_FORM_ref4 : dwarf::DW_FORM_ref_addr;
	  Die.addValue(DIEValueAllocator, Attribute, RefForm, Entry);
	}

	/// Creates a DIE with the given tag as a child of Parent. When N is
	/// non-null the new DIE is also recorded for N via insertDIE().
	DIE &DwarfUnit::createAndAddDIE(unsigned Tag, DIE &Parent, const DINode *N) {
	  const auto DieTag = static_cast<dwarf::Tag>(Tag);
	  DIE &Child = Parent.addChild(DIE::get(DIEValueAllocator, DieTag));
	  if (N) {
	    insertDIE(N, &Child);
	  }
	  return Child;
	}

	/// Attaches a location expression to Die. The size is computed first, and
	/// the loc is remembered so ~DwarfUnit can run its destructor.
	void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Loc) {
	  Loc->ComputeSize(Asm);
	  // Memoize so we can call the destructor later on.
	  DIELocs.push_back(Loc);

	  const dwarf::Form LocForm = Loc->BestForm(DD->getDwarfVersion());
	  Die.addValue(DIEValueAllocator, Attribute, LocForm, Loc);
	}

	/// Attaches a data block to Die. The size is computed first, and the block
	/// is remembered so ~DwarfUnit can run its destructor.
	void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute,
	                         DIEBlock *Block) {
	  Block->ComputeSize(Asm);
	  // Memoize so we can call the destructor later on.
	  DIEBlocks.push_back(Block);

	  const dwarf::Form BlockForm = Block->BestForm();
	  Die.addValue(DIEValueAllocator, Attribute, BlockForm, Block);
	}

	/// Adds DW_AT_decl_file and DW_AT_decl_line to Die. Does nothing when Line
	/// is 0.
	void DwarfUnit::addSourceLine(DIE &Die, unsigned Line, StringRef File,
	                              StringRef Directory) {
	  if (Line == 0) {
	    return;
	  }

	  const unsigned SourceID = getOrCreateSourceID(File, Directory);
	  assert(SourceID && "Invalid file id");
	  addUInt(Die, dwarf::DW_AT_decl_file, None, SourceID);
	  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
	}

	/// Adds source-line attributes for a local variable. The file name and
	/// directory come from the variable's scope.
	void DwarfUnit::addSourceLine(DIE &Die, const DILocalVariable *V) {
	  assert(V);

	  const unsigned Line = V->getLine();
	  const StringRef File = V->getScope()->getFilename();
	  const StringRef Directory = V->getScope()->getDirectory();
	  addSourceLine(Die, Line, File, Directory);
	}

	/// Adds source-line attributes for a global variable.
	void DwarfUnit::addSourceLine(DIE &Die, const DIGlobalVariable *G) {
	  assert(G);

	  const unsigned Line = G->getLine();
	  const StringRef File = G->getFilename();
	  const StringRef Directory = G->getDirectory();
	  addSourceLine(Die, Line, File, Directory);
	}

	/// Adds source-line attributes for a subprogram.
	void DwarfUnit::addSourceLine(DIE &Die, const DISubprogram *SP) {
	  assert(SP);

	  const unsigned Line = SP->getLine();
	  const StringRef File = SP->getFilename();
	  const StringRef Directory = SP->getDirectory();
	  addSourceLine(Die, Line, File, Directory);
	}

	/// Adds source-line attributes for a type.
	void DwarfUnit::addSourceLine(DIE &Die, const DIType *Ty) {
	  assert(Ty);

	  const unsigned Line = Ty->getLine();
	  const StringRef File = Ty->getFilename();
	  const StringRef Directory = Ty->getDirectory();
	  addSourceLine(Die, Line, File, Directory);
	}

	/// Adds source-line attributes for an Objective-C property.
	void DwarfUnit::addSourceLine(DIE &Die, const DIObjCProperty *Ty) {
	  assert(Ty);

	  const unsigned Line = Ty->getLine();
	  const StringRef File = Ty->getFilename();
	  const StringRef Directory = Ty->getDirectory();
	  addSourceLine(Die, Line, File, Directory);
	}

	/* Byref variables, in Blocks, are declared by the programmer as "SomeType
	VarName;", but the compiler creates a __Block_byref_x_VarName struct, and
	gives the variable VarName either the struct, or a pointer to the struct, as
	its type. This is necessary for various behind-the-scenes things the
	compiler needs to do with by-reference variables in Blocks.

	However, as far as the original programmer is concerned, the variable
	should still have type 'SomeType', as originally declared.

	The function getBlockByrefType dives into the __Block_byref_x_VarName
	struct to find the original type of the variable, which is then assigned to
	the variable's Debug Information Entry as its real type. So far, so good.
	However now the debugger will expect the variable VarName to have the type
	SomeType. So we need the location attribute for the variable to be an
	expression that explains to the debugger how to navigate through the
	pointers and struct to find the actual variable of type SomeType.

	The following function does just that. We start by getting
	the "normal" location for the variable. This will be the location
	of either the struct __Block_byref_x_VarName or the pointer to the
	struct __Block_byref_x_VarName.

	The struct will look something like:

	struct __Block_byref_x_VarName {
	... <various fields>
	struct __Block_byref_x_VarName *forwarding;
	... <various other fields>
	SomeType VarName;
	... <maybe more fields>
	};

	If we are given the struct directly (as our starting point) we
	need to tell the debugger to:

	1). Add the offset of the forwarding field.

	2). Follow that pointer to get the real __Block_byref_x_VarName
	struct to use (the real one may have been copied onto the heap).

	3). Add the offset for the field VarName, to find the actual variable.

	If we started with a pointer to the struct, then we need to
	dereference that pointer first, before the other steps.
	Translating this into DWARF ops, we will need to append the following
	to the current location description for the variable:

	DW_OP_deref -- optional, if we start with a pointer
	DW_OP_plus_uconst <forward_fld_offset>
	DW_OP_deref
	DW_OP_plus_uconst <varName_fld_offset>

	That is what this function does. */

	void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die,
	dwarf::Attribute Attribute,
	const MachineLocation &Location) {
	const DIType *Ty = DV.getType();
	const DIType *TmpTy = Ty;
	uint16_t Tag = Ty->getTag();
	bool isPointer = false;

	StringRef varName = DV.getName();

	if (Tag == dwarf::DW_TAG_pointer_type) {
	auto *DTy = cast<DIDerivedType>(Ty);
	TmpTy = resolve(DTy->getBaseType());
	isPointer = true;
	}

	// Find the __forwarding field and the variable field in the __Block_byref
	// struct.
	DINodeArray Fields = cast<DICompositeType>(TmpTy)->getElements();
	const DIDerivedType *varField = nullptr;
	const DIDerivedType *forwardingField = nullptr;

	for (unsigned i = 0, N = Fields.size(); i < N; ++i) {
	auto *DT = cast<DIDerivedType>(Fields[i]);
	StringRef fieldName = DT->getName();
	if (fieldName == "__forwarding")
	forwardingField = DT;
	else if (fieldName == varName)
	varField = DT;
	}

	// Get the offsets for the forwarding field and the variable field.
	unsigned forwardingFieldOffset = forwardingField->getOffsetInBits() >> 3;
	unsigned varFieldOffset = varField->getOffsetInBits() >> 2;

	// Decode the original location, and use that as the start of the byref
	// variable's location.
	DIELoc *Loc = new (DIEValueAllocator) DIELoc;
	DIEDwarfExpression DwarfExpr(Asm, this, *Loc);
	if (Location.isIndirect())
	DwarfExpr.setMemoryLocationKind();

	SmallVector<uint64_t, 6> Ops;
	// If we started with a pointer to the __Block_byref... struct, then
	// the first thing we need to do is dereference the pointer (DW_OP_deref).
	if (isPointer)
	Ops.push_back(dwarf::DW_OP_deref);

	// Next add the offset for the '__forwarding' field:
	// DW_OP_plus_uconst ForwardingFieldOffset. Note there's no point in
	// adding the offset if it's 0.
	if (forwardingFieldOffset > 0) {
	Ops.push_back(dwarf::DW_OP_plus_uconst);
	Ops.push_back(forwardingFieldOffset);
	}

	// Now dereference the __forwarding field to get to the real __Block_byref
	// struct: DW_OP_deref.
	Ops.push_back(dwarf::DW_OP_deref);

	// Now that we've got the real __Block_byref... struct, add the offset
	// for the variable's field to get to the location of the actual variable:
	// DW_OP_plus_uconst varFieldOffset. Again, don't add if it's 0.
	if (varFieldOffset > 0) {
	Ops.push_back(dwarf::DW_OP_plus_uconst);
	Ops.push_back(varFieldOffset);
	}

	DIExpressionCursor Cursor(Ops);
	const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
	if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
	return;
	DwarfExpr.addExpression(std::move(Cursor));

	// Now attach the location information to the DIE.
	addBlock(Die, Attribute, DwarfExpr.finalize());
	}

	/// Return true if type encoding is unsigned.
	static bool isUnsignedDIType(DwarfDebug DD, const DIType Ty) {
	if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
	// FIXME: Enums without a fixed underlying type have unknown signedness
	// here, leading to incorrectly emitted constants.
	if (CTy->getTag() == dwarf::DW_TAG_enumeration_type)
	return false;

	// (Pieces of) aggregate types that get hacked apart by SROA may be
	// represented by a constant. Encode them as unsigned bytes.
	return true;
	}

	if (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
	dwarf::Tag T = (dwarf::Tag)Ty->getTag();
	// Encode pointer constants as unsigned bytes. This is used at least for
	// null pointer constant emission.
	// FIXME: reference and rvalue_reference /probably/ shouldn't be allowed
	// here, but accept them for now due to a bug in SROA producing bogus
	// dbg.values.
	if (T == dwarf::DW_TAG_pointer_type \|\|
	T == dwarf::DW_TAG_ptr_to_member_type \|\|
	T == dwarf::DW_TAG_reference_type \|\|
	T == dwarf::DW_TAG_rvalue_reference_type)
	return true;
	assert(T == dwarf::DW_TAG_typedef \|\| T == dwarf::DW_TAG_const_type \|\|
	T == dwarf::DW_TAG_volatile_type \|\|
	T == dwarf::DW_TAG_restrict_type \|\| T == dwarf::DW_TAG_atomic_type);
	DITypeRef Deriv = DTy->getBaseType();
	assert(Deriv && "Expected valid base type");
	return isUnsignedDIType(DD, DD->resolve(Deriv));
	}

	auto *BTy = cast<DIBasicType>(Ty);
	unsigned Encoding = BTy->getEncoding();
	assert((Encoding == dwarf::DW_ATE_unsigned \|\|
	Encoding == dwarf::DW_ATE_unsigned_char \|\|
	Encoding == dwarf::DW_ATE_signed \|\|
	Encoding == dwarf::DW_ATE_signed_char \|\|
	Encoding == dwarf::DW_ATE_float \|\| Encoding == dwarf::DW_ATE_UTF \|\|
	Encoding == dwarf::DW_ATE_boolean \|\|
	(Ty->getTag() == dwarf::DW_TAG_unspecified_type &&
	Ty->getName() == "decltype(nullptr)")) &&
	"Unsupported encoding");
	return Encoding == dwarf::DW_ATE_unsigned \|\|
	Encoding == dwarf::DW_ATE_unsigned_char \|\|
	Encoding == dwarf::DW_ATE_UTF \|\| Encoding == dwarf::DW_ATE_boolean \|\|
	Ty->getTag() == dwarf::DW_TAG_unspecified_type;
	}

	/// Add a DW_AT_const_value block to \p Die holding the raw bytes of the
	/// floating point immediate in \p MO, emitted in target byte order.
	void DwarfUnit::addConstantFPValue(DIE &Die, const MachineOperand &MO) {
	  assert(MO.isFPImm() && "Invalid machine operand!");
	  DIEBlock *Block = new (DIEValueAllocator) DIEBlock;
	  APFloat FPImm = MO.getFPImm()->getValueAPF();

	  // Get the raw data form of the floating point.
	  const APInt FltVal = FPImm.bitcastToAPInt();
	  const char *FltPtr = (const char *)FltVal.getRawData();

	  int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte.
	  bool LittleEndian = Asm->getDataLayout().isLittleEndian();
	  // Walk the bytes forwards for little-endian targets and backwards for
	  // big-endian ones.
	  int Incr = (LittleEndian ? 1 : -1);
	  int Start = (LittleEndian ? 0 : NumBytes - 1);
	  int Stop = (LittleEndian ? NumBytes : -1);

	  // Output the constant to DWARF one byte at a time.
	  for (; Start != Stop; Start += Incr)
	    addUInt(*Block, dwarf::DW_FORM_data1, (unsigned char)0xFF & FltPtr[Start]);

	  addBlock(Die, dwarf::DW_AT_const_value, Block);
	}

	/// Add the value of floating point constant \p CFP to \p Die by forwarding
	/// its bit pattern to addConstantValue as an unsigned integer.
	void DwarfUnit::addConstantFPValue(DIE &Die, const ConstantFP *CFP) {
	  // Treat the value as an unsigned bag of bits.
	  const APInt RawBits = CFP->getValueAPF().bitcastToAPInt();
	  addConstantValue(Die, RawBits, true);
	}

	/// Add the value of integer constant \p CI, of type \p Ty, to \p Die.
	void DwarfUnit::addConstantValue(DIE &Die, const ConstantInt *CI,
	                                 const DIType *Ty) {
	  const APInt &IntVal = CI->getValue();
	  addConstantValue(Die, IntVal, Ty);
	}

	/// Add the immediate held by \p MO to \p Die; the signedness of the emitted
	/// value is derived from \p Ty.
	void DwarfUnit::addConstantValue(DIE &Die, const MachineOperand &MO,
	                                 const DIType *Ty) {
	  assert(MO.isImm() && "Invalid machine operand!");

	  const bool Unsigned = isUnsignedDIType(DD, Ty);
	  addConstantValue(Die, Unsigned, MO.getImm());
	}

	/// Add \p Val to \p Die as DW_AT_const_value, using DW_FORM_udata when
	/// \p Unsigned is set and DW_FORM_sdata otherwise.
	void DwarfUnit::addConstantValue(DIE &Die, bool Unsigned, uint64_t Val) {
	  // FIXME: This is a bit conservative/simple - it emits negative values always
	  // sign extended to 64 bits rather than minimizing the number of bytes.
	  const dwarf::Form ValueForm =
	      Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata;
	  addUInt(Die, dwarf::DW_AT_const_value, ValueForm, Val);
	}

	/// Add \p Val to \p Die; the signedness of the emitted value is derived from
	/// \p Ty.
	void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, const DIType *Ty) {
	  const bool Unsigned = isUnsignedDIType(DD, Ty);
	  addConstantValue(Die, Val, Unsigned);
	}

	/// Add \p Val to \p Die as DW_AT_const_value.
	///
	/// Values of at most 64 bits are emitted as a single zero- or sign-extended
	/// integer depending on \p Unsigned. Wider values are emitted as a block of
	/// bytes in target byte order.
	void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, bool Unsigned) {
	  if (Val.getBitWidth() <= 64) {
	    const uint64_t Bits = Unsigned ? Val.getZExtValue() : Val.getSExtValue();
	    addConstantValue(Die, Unsigned, Bits);
	    return;
	  }

	  DIEBlock *Block = new (DIEValueAllocator) DIEBlock;

	  // Raw 64-bit words of the large APInt.
	  const uint64_t *Words = Val.getRawData();

	  const int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte.
	  const bool LittleEndian = Asm->getDataLayout().isLittleEndian();

	  // Output the constant to DWARF one byte at a time.
	  for (int i = 0; i < NumBytes; ++i) {
	    // Byte position within the raw data: ascending for little-endian
	    // targets, descending for big-endian ones.
	    const int SrcByte = LittleEndian ? i : NumBytes - 1 - i;
	    const uint8_t c = Words[SrcByte / 8] >> (8 * (SrcByte & 7));
	    addUInt(*Block, dwarf::DW_FORM_data1, c);
	  }

	  addBlock(Die, dwarf::DW_AT_const_value, Block);
	}

	/// Add \p LinkageName to \p Die unless it is empty. DW_AT_linkage_name is
	/// used for DWARF version 4 and later, DW_AT_MIPS_linkage_name otherwise.
	void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) {
	  if (LinkageName.empty())
	    return;

	  const dwarf::Attribute NameAttr = DD->getDwarfVersion() >= 4
	                                        ? dwarf::DW_AT_linkage_name
	                                        : dwarf::DW_AT_MIPS_linkage_name;
	  addString(Die, NameAttr, GlobalValue::dropLLVMManglingEscape(LinkageName));
	}

	/// Construct a child DIE of \p Buffer for every template type parameter and
	/// template value parameter in \p TParams. Other element kinds are ignored.
	void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) {
	  for (const auto *Param : TParams) {
	    if (auto *TypeParam = dyn_cast<DITemplateTypeParameter>(Param)) {
	      constructTemplateTypeParameterDIE(Buffer, TypeParam);
	      continue;
	    }
	    if (auto *ValueParam = dyn_cast<DITemplateValueParameter>(Param))
	      constructTemplateValueParameterDIE(Buffer, ValueParam);
	  }
	}

	/// Add thrown types: one DW_TAG_thrown_type child of \p Die, carrying a type
	/// reference, for every entry of \p ThrownTypes.
	void DwarfUnit::addThrownTypes(DIE &Die, DINodeArray ThrownTypes) {
	  for (const auto *Thrown : ThrownTypes) {
	    const DIType *ThrownTy = cast<DIType>(Thrown);
	    DIE &ThrownDIE = createAndAddDIE(dwarf::DW_TAG_thrown_type, Die);
	    addType(ThrownDIE, ThrownTy);
	  }
	}

	/// Return the DIE for scope \p Context, creating it when the scope is a
	/// type, namespace, subprogram or module. A null or file scope maps to the
	/// unit DIE; any other scope kind is only looked up with getDIE.
	DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) {
	  if (!Context || isa<DIFile>(Context))
	    return &getUnitDie();
	  if (auto *T = dyn_cast<DIType>(Context))
	    return getOrCreateTypeDIE(T);
	  if (auto *NS = dyn_cast<DINamespace>(Context))
	    return getOrCreateNameSpace(NS);
	  if (auto *SP = dyn_cast<DISubprogram>(Context))
	    return getOrCreateSubprogramDIE(SP);
	  if (auto *M = dyn_cast<DIModule>(Context))
	    return getOrCreateModule(M);
	  return getDIE(Context);
	}

	/// Return the DIE for composite type \p Ty, constructing it (and its context
	/// DIE) and updating the accelerator tables if it does not exist yet.
	DIE *DwarfTypeUnit::createTypeDIE(const DICompositeType *Ty) {
	  // The context is constructed before the lookup below.
	  auto *Context = resolve(Ty->getScope());
	  DIE *ContextDIE = getOrCreateContextDIE(Context);

	  if (DIE *TyDIE = getDIE(Ty))
	    return TyDIE;

	  // Create new type.
	  DIE &TyDIE = createAndAddDIE(Ty->getTag(), *ContextDIE, Ty);

	  constructTypeDIE(TyDIE, cast<DICompositeType>(Ty));

	  updateAcceleratorTables(Context, Ty, TyDIE);
	  return &TyDIE;
	}

	/// Return the DIE for the type described by \p TyNode, creating it if needed.
	///
	/// Returns nullptr when \p TyNode is null. restrict (DWARF <= 2) and atomic
	/// (DWARF < 5) qualifiers are skipped in favour of their base type.
	DIE *DwarfUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
	  if (!TyNode)
	    return nullptr;

	  auto *Ty = cast<DIType>(TyNode);

	  // DW_TAG_restrict_type is not supported in DWARF2
	  if (Ty->getTag() == dwarf::DW_TAG_restrict_type && DD->getDwarfVersion() <= 2)
	    return getOrCreateTypeDIE(resolve(cast<DIDerivedType>(Ty)->getBaseType()));

	  // DW_TAG_atomic_type is not supported in DWARF < 5
	  if (Ty->getTag() == dwarf::DW_TAG_atomic_type && DD->getDwarfVersion() < 5)
	    return getOrCreateTypeDIE(resolve(cast<DIDerivedType>(Ty)->getBaseType()));

	  // Construct the context before querying for the existence of the DIE in case
	  // such construction creates the DIE.
	  auto *Context = resolve(Ty->getScope());
	  DIE *ContextDIE = getOrCreateContextDIE(Context);
	  assert(ContextDIE);

	  if (DIE *TyDIE = getDIE(Ty))
	    return TyDIE;

	  // Create new type.
	  DIE &TyDIE = createAndAddDIE(Ty->getTag(), *ContextDIE, Ty);

	  updateAcceleratorTables(Context, Ty, TyDIE);

	  if (auto *BT = dyn_cast<DIBasicType>(Ty))
	    constructTypeDIE(TyDIE, BT);
	  else if (auto *STy = dyn_cast<DISubroutineType>(Ty))
	    constructTypeDIE(TyDIE, STy);
	  else if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
	    if (GenerateDwarfTypeUnits && !Ty->isForwardDecl())
	      if (MDString *TypeId = CTy->getRawIdentifier()) {
	        DD->addDwarfTypeUnitType(getCU(), TypeId->getString(), TyDIE, CTy);
	        // Skip updating the accelerator tables since this is not the full type.
	        return &TyDIE;
	      }
	    constructTypeDIE(TyDIE, CTy);
	  } else {
	    constructTypeDIE(TyDIE, cast<DIDerivedType>(Ty));
	  }

	  return &TyDIE;
	}

	/// Register the named, non-forward-declared type \p Ty with the accelerator
	/// type table, and with the global types when \p Context is null, a compile
	/// unit, a file or a namespace. Unnamed types and forward declarations are
	/// ignored.
	void DwarfUnit::updateAcceleratorTables(const DIScope *Context,
	                                        const DIType *Ty, const DIE &TyDIE) {
	  if (!Ty->getName().empty() && !Ty->isForwardDecl()) {
	    bool IsImplementation = false;
	    if (auto *CT = dyn_cast<DICompositeType>(Ty)) {
	      // A runtime language of 0 actually means C/C++ and that any
	      // non-negative value is some version of Objective-C/C++.
	      IsImplementation = CT->getRuntimeLang() == 0 || CT->isObjcClassComplete();
	    }
	    unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
	    DD->addAccelType(Ty->getName(), TyDIE, Flags);

	    if (!Context || isa<DICompileUnit>(Context) || isa<DIFile>(Context) ||
	        isa<DINamespace>(Context))
	      addGlobalType(Ty, TyDIE, Context);
	  }
	}

	/// Add to \p Entity an \p Attribute that references the DIE of type \p Ty,
	/// creating that DIE if necessary. \p Ty must not be null.
	void DwarfUnit::addType(DIE &Entity, const DIType *Ty,
	                        dwarf::Attribute Attribute) {
	  assert(Ty && "Trying to add a type that doesn't exist?");
	  DIE *TypeDIE = getOrCreateTypeDIE(Ty);
	  addDIEEntry(Entity, Attribute, DIEEntry(*TypeDIE));
	}

	/// Build the "Outer::Inner::" qualification prefix for \p Context.
	///
	/// Returns an empty string for a null context or when the unit's language is
	/// not DW_LANG_C_plus_plus. Unnamed namespaces appear as
	/// "(anonymous namespace)"; other unnamed scopes are left out.
	std::string DwarfUnit::getParentContextString(const DIScope *Context) const {
	  if (!Context)
	    return "";

	  // FIXME: Decide whether to implement this for non-C++ languages.
	  if (getLanguage() != dwarf::DW_LANG_C_plus_plus)
	    return "";

	  // Collect the scopes from the innermost one outwards, stopping at the
	  // compile unit.
	  SmallVector<const DIScope *, 1> Parents;
	  for (const DIScope *Scope = Context; !isa<DICompileUnit>(Scope);
	       Scope = resolve(Scope->getScope())) {
	    Parents.push_back(Scope);
	    // Structure, etc types will have a NULL context if they're at the top
	    // level.
	    if (!Scope->getScope())
	      break;
	  }

	  // Walk the list backwards to go from the outermost construct to the
	  // innermost.
	  std::string CS;
	  for (auto It = Parents.rbegin(), End = Parents.rend(); It != End; ++It) {
	    const DIScope *Ctx = *It;
	    StringRef Name = Ctx->getName();
	    if (Name.empty() && isa<DINamespace>(Ctx))
	      Name = "(anonymous namespace)";
	    if (Name.empty())
	      continue;
	    CS += Name;
	    CS += "::";
	  }
	  return CS;
	}

	/// Fill \p Buffer with the attributes of basic type \p BTy: its name (if
	/// any), encoding and byte size. An unspecified type only gets the name.
	void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIBasicType *BTy) {
	  // Add name if not anonymous or intermediate type.
	  StringRef TypeName = BTy->getName();
	  if (!TypeName.empty())
	    addString(Buffer, dwarf::DW_AT_name, TypeName);

	  // An unspecified type only has a name attribute.
	  if (BTy->getTag() == dwarf::DW_TAG_unspecified_type)
	    return;

	  addUInt(Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
	          BTy->getEncoding());

	  // Convert the size from bits to bytes.
	  const uint64_t SizeInBytes = BTy->getSizeInBits() >> 3;
	  addUInt(Buffer, dwarf::DW_AT_byte_size, None, SizeInBytes);
	}

	/// Fill \p Buffer with the attributes of derived type \p DTy: base type,
	/// name, byte size (not for pointer-like tags), containing type for
	/// pointers to members, source line, and the DWARF address class for
	/// pointer and reference types that carry one.
	void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {
	  // Get core information.
	  StringRef Name = DTy->getName();
	  uint64_t Size = DTy->getSizeInBits() >> 3;
	  uint16_t Tag = Buffer.getTag();

	  // Map to main type, void will not have a type.
	  const DIType *FromTy = resolve(DTy->getBaseType());
	  if (FromTy)
	    addType(Buffer, FromTy);

	  // Add name if not anonymous or intermediate type.
	  if (!Name.empty())
	    addString(Buffer, dwarf::DW_AT_name, Name);

	  // Add size if non-zero (derived types might be zero-sized.)
	  if (Size && Tag != dwarf::DW_TAG_pointer_type
	           && Tag != dwarf::DW_TAG_ptr_to_member_type
	           && Tag != dwarf::DW_TAG_reference_type
	           && Tag != dwarf::DW_TAG_rvalue_reference_type)
	    addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size);

	  if (Tag == dwarf::DW_TAG_ptr_to_member_type)
	    addDIEEntry(
	        Buffer, dwarf::DW_AT_containing_type,
	        *getOrCreateTypeDIE(resolve(cast<DIDerivedType>(DTy)->getClassType())));
	  // Add source line info if available and TyDesc is not a forward declaration.
	  if (!DTy->isForwardDecl())
	    addSourceLine(Buffer, DTy);

	  // If DWARF address space value is other than None, add it for pointer and
	  // reference types as DW_AT_address_class.
	  if (DTy->getDWARFAddressSpace() && (Tag == dwarf::DW_TAG_pointer_type ||
	                                      Tag == dwarf::DW_TAG_reference_type))
	    addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4,
	            DTy->getDWARFAddressSpace().getValue());
	}

	/// Add a child DIE to \p Buffer for every entry of \p Args after the first
	/// (entry 0 is skipped). A null entry becomes a
	/// DW_TAG_unspecified_parameters DIE and must be the last one; every other
	/// entry becomes a DW_TAG_formal_parameter DIE with its type.
	void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
	  for (unsigned i = 1, N = Args.size(); i < N; ++i) {
	    const DIType *Ty = resolve(Args[i]);
	    if (Ty) {
	      DIE &Param = createAndAddDIE(dwarf::DW_TAG_formal_parameter, Buffer);
	      addType(Param, Ty);
	      if (Ty->isArtificial())
	        addFlag(Param, dwarf::DW_AT_artificial);
	      continue;
	    }

	    assert(i == N-1 && "Unspecified parameter must be the last argument");
	    createAndAddDIE(dwarf::DW_TAG_unspecified_parameters, Buffer);
	  }
	}

	/// Fill \p Buffer with the attributes of subroutine type \p CTy: return
	/// type, parameters, the prototyped flag for C-like languages, the calling
	/// convention when it is explicit, and reference qualifiers.
	void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {
	  // Add return type. A void return won't have a type.
	  auto Elements = cast<DISubroutineType>(CTy)->getTypeArray();
	  if (Elements.size())
	    if (auto RTy = resolve(Elements[0]))
	      addType(Buffer, RTy);

	  // A type array of exactly two entries whose second entry is null is
	  // treated as unprototyped.
	  bool isPrototyped = true;
	  if (Elements.size() == 2 && !Elements[1])
	    isPrototyped = false;

	  constructSubprogramArguments(Buffer, Elements);

	  // Add prototype flag if we're dealing with a C language and the function has
	  // been prototyped.
	  uint16_t Language = getLanguage();
	  if (isPrototyped &&
	      (Language == dwarf::DW_LANG_C89 || Language == dwarf::DW_LANG_C99 ||
	       Language == dwarf::DW_LANG_ObjC))
	    addFlag(Buffer, dwarf::DW_AT_prototyped);

	  // Add a DW_AT_calling_convention if this has an explicit convention.
	  if (CTy->getCC() && CTy->getCC() != dwarf::DW_CC_normal)
	    addUInt(Buffer, dwarf::DW_AT_calling_convention, dwarf::DW_FORM_data1,
	            CTy->getCC());

	  if (CTy->isLValueReference())
	    addFlag(Buffer, dwarf::DW_AT_reference);

	  if (CTy->isRValueReference())
	    addFlag(Buffer, dwarf::DW_AT_rvalue_reference);
	}

	/// Fill \p Buffer with the attributes and children of composite type \p CTy.
	///
	/// Arrays and enumerations are delegated to their dedicated helpers. For
	/// structure, union and class types the members, friends, static members,
	/// member functions and Objective-C properties are emitted as children.
	/// Name, size, declaration flag, source line, runtime language and
	/// alignment are added afterwards where applicable.
	void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
	  // Add name if not anonymous or intermediate type.
	  StringRef Name = CTy->getName();

	  uint64_t Size = CTy->getSizeInBits() >> 3;
	  uint16_t Tag = Buffer.getTag();

	  switch (Tag) {
	  case dwarf::DW_TAG_array_type:
	    constructArrayTypeDIE(Buffer, CTy);
	    break;
	  case dwarf::DW_TAG_enumeration_type:
	    constructEnumTypeDIE(Buffer, CTy);
	    break;
	  case dwarf::DW_TAG_structure_type:
	  case dwarf::DW_TAG_union_type:
	  case dwarf::DW_TAG_class_type: {
	    // Add elements to structure type.
	    DINodeArray Elements = CTy->getElements();
	    for (const auto *Element : Elements) {
	      if (!Element)
	        continue;
	      if (auto *SP = dyn_cast<DISubprogram>(Element))
	        getOrCreateSubprogramDIE(SP);
	      else if (auto *DDTy = dyn_cast<DIDerivedType>(Element)) {
	        if (DDTy->getTag() == dwarf::DW_TAG_friend) {
	          DIE &ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer);
	          addType(ElemDie, resolve(DDTy->getBaseType()), dwarf::DW_AT_friend);
	        } else if (DDTy->isStaticMember()) {
	          getOrCreateStaticMemberDIE(DDTy);
	        } else {
	          constructMemberDIE(Buffer, DDTy);
	        }
	      } else if (auto *Property = dyn_cast<DIObjCProperty>(Element)) {
	        DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer);
	        StringRef PropertyName = Property->getName();
	        addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
	        if (Property->getType())
	          addType(ElemDie, resolve(Property->getType()));
	        addSourceLine(ElemDie, Property);
	        StringRef GetterName = Property->getGetterName();
	        if (!GetterName.empty())
	          addString(ElemDie, dwarf::DW_AT_APPLE_property_getter, GetterName);
	        StringRef SetterName = Property->getSetterName();
	        if (!SetterName.empty())
	          addString(ElemDie, dwarf::DW_AT_APPLE_property_setter, SetterName);
	        if (unsigned PropertyAttributes = Property->getAttributes())
	          addUInt(ElemDie, dwarf::DW_AT_APPLE_property_attribute, None,
	                  PropertyAttributes);
	      }
	    }

	    if (CTy->isAppleBlockExtension())
	      addFlag(Buffer, dwarf::DW_AT_APPLE_block);

	    // This is outside the DWARF spec, but GDB expects a DW_AT_containing_type
	    // inside C++ composite types to point to the base class with the vtable.
	    // Rust uses DW_AT_containing_type to link a vtable to the type
	    // for which it was created.
	    if (auto *ContainingType = resolve(CTy->getVTableHolder()))
	      addDIEEntry(Buffer, dwarf::DW_AT_containing_type,
	                  *getOrCreateTypeDIE(ContainingType));

	    if (CTy->isObjcClassComplete())
	      addFlag(Buffer, dwarf::DW_AT_APPLE_objc_complete_type);

	    // Add template parameters to a class, structure or union types.
	    // FIXME: The support isn't in the metadata for this yet.
	    if (Tag == dwarf::DW_TAG_class_type ||
	        Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)
	      addTemplateParams(Buffer, CTy->getTemplateParams());

	    break;
	  }
	  default:
	    break;
	  }

	  // Add name if not anonymous or intermediate type.
	  if (!Name.empty())
	    addString(Buffer, dwarf::DW_AT_name, Name);

	  if (Tag == dwarf::DW_TAG_enumeration_type ||
	      Tag == dwarf::DW_TAG_class_type || Tag == dwarf::DW_TAG_structure_type ||
	      Tag == dwarf::DW_TAG_union_type) {
	    // Add size if non-zero (derived types might be zero-sized.)
	    // TODO: Do we care about size for enum forward declarations?
	    if (Size)
	      addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size);
	    else if (!CTy->isForwardDecl())
	      // Add zero size if it is not a forward declaration.
	      addUInt(Buffer, dwarf::DW_AT_byte_size, None, 0);

	    // If we're a forward decl, say so.
	    if (CTy->isForwardDecl())
	      addFlag(Buffer, dwarf::DW_AT_declaration);

	    // Add source line info if available.
	    if (!CTy->isForwardDecl())
	      addSourceLine(Buffer, CTy);

	    // No harm in adding the runtime language to the declaration.
	    unsigned RLang = CTy->getRuntimeLang();
	    if (RLang)
	      addUInt(Buffer, dwarf::DW_AT_APPLE_runtime_class, dwarf::DW_FORM_data1,
	              RLang);

	    // Add align info if available.
	    if (uint32_t AlignInBytes = CTy->getAlignInBytes())
	      addUInt(Buffer, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata,
	              AlignInBytes);
	  }
	}

	/// Add a DW_TAG_template_type_parameter child to \p Buffer describing \p TP,
	/// with its type and name when present.
	void DwarfUnit::constructTemplateTypeParameterDIE(
	    DIE &Buffer, const DITemplateTypeParameter *TP) {
	  DIE &TypeParamDIE =
	      createAndAddDIE(dwarf::DW_TAG_template_type_parameter, Buffer);

	  // Add the type if it exists, it could be void and therefore no type.
	  if (TP->getType())
	    addType(TypeParamDIE, resolve(TP->getType()));

	  StringRef ParamName = TP->getName();
	  if (!ParamName.empty())
	    addString(TypeParamDIE, dwarf::DW_AT_name, ParamName);
	}

	/// Add a child DIE to \p Buffer describing template value parameter \p VP.
	///
	/// The DIE uses the parameter's own tag. Depending on the kind of value it
	/// receives a constant value, a location holding the address of a global, a
	/// template name, or nested template parameters for a parameter pack.
	void DwarfUnit::constructTemplateValueParameterDIE(
	    DIE &Buffer, const DITemplateValueParameter *VP) {
	  const auto ParamTag = VP->getTag();
	  DIE &ParamDIE = createAndAddDIE(ParamTag, Buffer);

	  // Add the type if there is one, template template and template parameter
	  // packs will not have a type.
	  if (ParamTag == dwarf::DW_TAG_template_value_parameter)
	    addType(ParamDIE, resolve(VP->getType()));

	  StringRef ParamName = VP->getName();
	  if (!ParamName.empty())
	    addString(ParamDIE, dwarf::DW_AT_name, ParamName);

	  Metadata *Val = VP->getValue();
	  if (!Val)
	    return;

	  if (ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Val)) {
	    addConstantValue(ParamDIE, CI, resolve(VP->getType()));
	    return;
	  }

	  if (GlobalValue *GV = mdconst::dyn_extract<GlobalValue>(Val)) {
	    // We cannot describe the location of dllimport'd entities: the
	    // computation of their address requires loads from the IAT.
	    if (GV->hasDLLImportStorageClass())
	      return;

	    // For declaration non-type template parameters (such as global values
	    // and functions)
	    DIELoc *Loc = new (DIEValueAllocator) DIELoc;
	    addOpAddress(*Loc, Asm->getSymbol(GV));
	    // Emit DW_OP_stack_value to use the address as the immediate value of
	    // the parameter, rather than a pointer to it.
	    addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
	    addBlock(ParamDIE, dwarf::DW_AT_location, Loc);
	    return;
	  }

	  if (ParamTag == dwarf::DW_TAG_GNU_template_template_param) {
	    assert(isa<MDString>(Val));
	    addString(ParamDIE, dwarf::DW_AT_GNU_template_name,
	              cast<MDString>(Val)->getString());
	    return;
	  }

	  if (ParamTag == dwarf::DW_TAG_GNU_template_parameter_pack)
	    addTemplateParams(ParamDIE, cast<MDTuple>(Val));
	}

	/// Return the DIE for namespace \p NS, creating it on first use.
	///
	/// A new DIE is registered with the accelerator namespace table and the
	/// global names; an unnamed namespace is registered as
	/// "(anonymous namespace)" and gets no DW_AT_name.
	DIE *DwarfUnit::getOrCreateNameSpace(const DINamespace *NS) {
	  // Construct the context before querying for the existence of the DIE in case
	  // such construction creates the DIE.
	  DIE *ContextDIE = getOrCreateContextDIE(NS->getScope());

	  if (DIE *NDie = getDIE(NS))
	    return NDie;
	  DIE &NDie = createAndAddDIE(dwarf::DW_TAG_namespace, *ContextDIE, NS);

	  StringRef Name = NS->getName();
	  if (!Name.empty())
	    addString(NDie, dwarf::DW_AT_name, NS->getName());
	  else
	    Name = "(anonymous namespace)";
	  DD->addAccelNamespace(Name, NDie);
	  addGlobalName(Name, NDie, NS->getScope());
	  if (NS->getExportSymbols())
	    addFlag(NDie, dwarf::DW_AT_export_symbols);
	  return &NDie;
	}

	/// Return the DIE for module \p M, creating it on first use with its name,
	/// configuration macros, include path and sysroot when those are non-empty.
	DIE *DwarfUnit::getOrCreateModule(const DIModule *M) {
	  // Construct the context before querying for the existence of the DIE in case
	  // such construction creates the DIE.
	  DIE *ContextDIE = getOrCreateContextDIE(M->getScope());

	  if (DIE *MDie = getDIE(M))
	    return MDie;
	  DIE &MDie = createAndAddDIE(dwarf::DW_TAG_module, *ContextDIE, M);

	  if (!M->getName().empty()) {
	    addString(MDie, dwarf::DW_AT_name, M->getName());
	    addGlobalName(M->getName(), MDie, M->getScope());
	  }
	  if (!M->getConfigurationMacros().empty())
	    addString(MDie, dwarf::DW_AT_LLVM_config_macros,
	              M->getConfigurationMacros());
	  if (!M->getIncludePath().empty())
	    addString(MDie, dwarf::DW_AT_LLVM_include_path, M->getIncludePath());
	  if (!M->getISysRoot().empty())
	    addString(MDie, dwarf::DW_AT_LLVM_isysroot, M->getISysRoot());

	  return &MDie;
	}

	/// Return the DIE for subprogram \p SP, creating it on first use.
	///
	/// With \p Minimal set the DIE is parented directly to the unit DIE and no
	/// declaration DIE is built. For a definition only the bare DIE is created
	/// here; its attributes are filled in later. Declarations get their
	/// attributes applied immediately.
	DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal) {
	  // Construct the context before querying for the existence of the DIE in case
	  // such construction creates the DIE (as is the case for member function
	  // declarations).
	  DIE *ContextDIE =
	      Minimal ? &getUnitDie() : getOrCreateContextDIE(resolve(SP->getScope()));

	  if (DIE *SPDie = getDIE(SP))
	    return SPDie;

	  if (auto *SPDecl = SP->getDeclaration()) {
	    if (!Minimal) {
	      // Add subprogram definitions to the CU die directly.
	      ContextDIE = &getUnitDie();
	      // Build the decl now to ensure it precedes the definition.
	      getOrCreateSubprogramDIE(SPDecl);
	    }
	  }

	  // DW_TAG_inlined_subroutine may refer to this DIE.
	  DIE &SPDie = createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, SP);

	  // Stop here and fill this in later, depending on whether or not this
	  // subprogram turns out to have inlined instances or not.
	  if (SP->isDefinition())
	    return &SPDie;

	  applySubprogramAttributes(SP, SPDie);
	  return &SPDie;
	}

	/// Add to \p SPDie the attributes specific to the definition of \p SP.
	///
	/// When \p SP has a separate declaration, only the file/line that differ
	/// from the declaration are emitted, and a DW_AT_specification pointing at
	/// the declaration DIE is added.
	///
	/// \returns true if a DW_AT_specification was added, false if \p SP has no
	/// declaration.
	bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,
	                                                    DIE &SPDie) {
	  DIE *DeclDie = nullptr;
	  StringRef DeclLinkageName;
	  if (auto *SPDecl = SP->getDeclaration()) {
	    DeclDie = getDIE(SPDecl);
	    assert(DeclDie && "This DIE should've already been constructed when the "
	                      "definition DIE was created in "
	                      "getOrCreateSubprogramDIE");
	    // Look at the Decl's linkage name only if we emitted it.
	    if (DD->useAllLinkageNames())
	      DeclLinkageName = SPDecl->getLinkageName();
	    unsigned DeclID =
	        getOrCreateSourceID(SPDecl->getFilename(), SPDecl->getDirectory());
	    unsigned DefID = getOrCreateSourceID(SP->getFilename(), SP->getDirectory());
	    if (DeclID != DefID)
	      addUInt(SPDie, dwarf::DW_AT_decl_file, None, DefID);

	    if (SP->getLine() != SPDecl->getLine())
	      addUInt(SPDie, dwarf::DW_AT_decl_line, None, SP->getLine());
	  }

	  // Add function template parameters.
	  addTemplateParams(SPDie, SP->getTemplateParams());

	  // Add the linkage name if we have one and it isn't in the Decl.
	  StringRef LinkageName = SP->getLinkageName();
	  assert(((LinkageName.empty() || DeclLinkageName.empty()) ||
	          LinkageName == DeclLinkageName) &&
	         "decl has a linkage name and it is different");
	  if (DeclLinkageName.empty() &&
	      // Always emit it for abstract subprograms.
	      (DD->useAllLinkageNames() || DU->getAbstractSPDies().lookup(SP)))
	    addLinkageName(SPDie, LinkageName);

	  if (!DeclDie)
	    return false;

	  // Refer to the function declaration where all the other attributes will be
	  // found.
	  addDIEEntry(SPDie, dwarf::DW_AT_specification, *DeclDie);
	  return true;
	}

	void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
	bool SkipSPAttributes) {
	// If -fdebug-info-for-profiling is enabled, need to emit the subprogram
	// and its source location.
	bool SkipSPSourceLocation = SkipSPAttributes &&
	!CUNode->getDebugInfoForProfiling();
	if (!SkipSPSourceLocation)
	if (applySubprogramDefinitionAttributes(SP, SPDie))
	return;

	// Constructors and operators for anonymous aggregates do not have names.
	if (!SP->getName().empty())
	addString(SPDie, dwarf::DW_AT_name, SP->getName());

	if (!SkipSPSourceLocation)
	addSourceLine(SPDie, SP);

	// Skip the rest of the attributes under -gmlt to save space.
	if (SkipSPAttributes)
	return;

	// Add the prototype if we have a prototype and we have a C like
	// language.
	uint16_t Language = getLanguage();
	if (SP->isPrototyped() &&
	(Language == dwarf::DW_LANG_C89 \|\| Language == dwarf::DW_LANG_C99 \|\|
	Language == dwarf::DW_LANG_ObjC))
	addFlag(SPDie, dwarf::DW_AT_prototyped);

	unsigned CC = 0;
	DITypeRefArray Args;
	if (const DISubroutineType *SPTy = SP->getType()) {
	Args = SPTy->getTypeArray();
	CC = SPTy->getCC();
	}

	// Add a DW_AT_calling_convention if this has an explicit convention.
	if (CC && CC != dwarf::DW_CC_normal)
	addUInt(SPDie, dwarf::DW_AT_calling_convention, dwarf::DW_FORM_data1, CC);

	// Add a return type. If this is a type like a C/C++ void type we don't add a
	// return type.
	if (Args.size())
	if (auto Ty = resolve(Args[0]))
	addType(SPDie, Ty);

	unsigned VK = SP->getVirtuality();
	if (VK) {
	addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1, VK);
	if (SP->getVirtualIndex() != -1u) {
	DIELoc *Block = getDIELoc();
	addUInt(*Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
	addUInt(*Block, dwarf::DW_FORM_udata, SP->getVirtualIndex());
	addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, Block);
	}
	ContainingTypeMap.insert(
	std::make_pair(&SPDie, resolve(SP->getContainingType())));
	}

	if (!SP->isDefinition()) {
	addFlag(SPDie, dwarf::DW_AT_declaration);

	// Add arguments. Do not add arguments for subprogram definition. They will
	// be handled while processing variables.
	constructSubprogramArguments(SPDie, Args);
	}

	addThrownTypes(SPDie, SP->getThrownTypes());

	if (SP->isArtificial())
	addFlag(SPDie, dwarf::DW_AT_artificial);

	if (!SP->isLocalToUnit())
	addFlag(SPDie, dwarf::DW_AT_external);

	if (DD->useAppleExtensionAttributes()) {
	if (SP->isOptimized())
	addFlag(SPDie, dwarf::DW_AT_APPLE_optimized);

	if (unsigned isa = Asm->getISAEncoding())
	addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
	}

	if (SP->isLValueReference())
	addFlag(SPDie, dwarf::DW_AT_reference);

	if (SP->isRValueReference())
	addFlag(SPDie, dwarf::DW_AT_rvalue_reference);

	if (SP->isNoReturn())
	addFlag(SPDie, dwarf::DW_AT_noreturn);

	if (SP->isProtected())
	addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
	dwarf::DW_ACCESS_protected);
	else if (SP->isPrivate())
	addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
	dwarf::DW_ACCESS_private);
	else if (SP->isPublic())
	addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
	dwarf::DW_ACCESS_public);

	if (SP->isExplicit())
	addFlag(SPDie, dwarf::DW_AT_explicit);

	if (SP->isMainSubprogram())
	addFlag(SPDie, dwarf::DW_AT_main_subprogram);
	}

	/// Add a DW_TAG_subrange_type child describing \p SR to \p Buffer, typed by
	/// \p IndexTy. The lower bound is emitted when it differs from the default
	/// lower bound (or when that default is -1); the count is emitted unless it
	/// is -1.
	void DwarfUnit::constructSubrangeDIE(DIE &Buffer, const DISubrange *SR,
	                                     DIE *IndexTy) {
	  DIE &DW_Subrange = createAndAddDIE(dwarf::DW_TAG_subrange_type, Buffer);
	  addDIEEntry(DW_Subrange, dwarf::DW_AT_type, *IndexTy);

	  // The LowerBound value defines the lower bounds which is typically zero for
	  // C/C++. The Count value is the number of elements. Values are 64 bit. If
	  // Count == -1 then the array is unbounded and we do not emit
	  // DW_AT_lower_bound and DW_AT_count attributes.
	  int64_t LowerBound = SR->getLowerBound();
	  int64_t DefaultLowerBound = getDefaultLowerBound();
	  int64_t Count = SR->getCount();

	  if (DefaultLowerBound == -1 || LowerBound != DefaultLowerBound)
	    addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, None, LowerBound);

	  if (Count != -1)
	    // FIXME: An unbounded array should reference the expression that defines
	    // the array.
	    addUInt(DW_Subrange, dwarf::DW_AT_count, None, Count);
	}

	/// Return the unit's shared array-index type DIE, building it on first use.
	///
	/// The DIE is a DW_TAG_base_type child of the unit DIE named "sizetype",
	/// sizeof(int64_t) bytes wide, with unsigned encoding. It is cached in
	/// IndexTyDie so later calls return the same DIE.
	DIE *DwarfUnit::getIndexTyDie() {
	  if (!IndexTyDie) {
	    // Construct an integer type to use for indexes.
	    DIE &SizeTypeDie = createAndAddDIE(dwarf::DW_TAG_base_type, getUnitDie());
	    IndexTyDie = &SizeTypeDie;
	    addString(SizeTypeDie, dwarf::DW_AT_name, "sizetype");
	    addUInt(SizeTypeDie, dwarf::DW_AT_byte_size, None, sizeof(int64_t));
	    addUInt(SizeTypeDie, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
	            dwarf::DW_ATE_unsigned);
	  }
	  return IndexTyDie;
	}

	/// Fill in \p Buffer, the DIE of the array type \p CTy.
	///
	/// Adds DW_AT_GNU_vector for vector types, then the element type, then one
	/// subrange child per element of CTy tagged DW_TAG_subrange_type. All
	/// subranges share the index type returned by getIndexTyDie().
	void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
	  if (CTy->isVector())
	    addFlag(Buffer, dwarf::DW_AT_GNU_vector);

	  // The element type goes in before the index type is requested.
	  addType(Buffer, resolve(CTy->getBaseType()));

	  // FIXME: This type should be passed down from the front end
	  // as different languages may have different sizes for indexes.
	  DIE *IndexTy = getIndexTyDie();

	  DINodeArray Dims = CTy->getElements();
	  const unsigned NumDims = Dims.size();
	  for (unsigned Idx = 0; Idx != NumDims; ++Idx) {
	    // FIXME: Should this really be such a loose cast?
	    auto *Node = dyn_cast_or_null<DINode>(Dims[Idx]);
	    if (!Node)
	      continue;
	    if (Node->getTag() != dwarf::DW_TAG_subrange_type)
	      continue;
	    constructSubrangeDIE(Buffer, cast<DISubrange>(Node), IndexTy);
	  }
	}

	/// Fill in \p Buffer, the DIE of the enumeration type \p CTy.
	///
	/// Each DIEnumerator element becomes a DW_TAG_enumerator child carrying its
	/// name and a signed DW_AT_const_value. If CTy has a base type that
	/// resolves to non-null, that type and DW_AT_enum_class are added as well.
	void DwarfUnit::constructEnumTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
	  DINodeArray Members = CTy->getElements();

	  const unsigned NumMembers = Members.size();
	  for (unsigned Idx = 0; Idx != NumMembers; ++Idx) {
	    // Elements that are null or not enumerators are skipped.
	    auto *Enum = dyn_cast_or_null<DIEnumerator>(Members[Idx]);
	    if (!Enum)
	      continue;

	    DIE &EnumeratorDie = createAndAddDIE(dwarf::DW_TAG_enumerator, Buffer);
	    addString(EnumeratorDie, dwarf::DW_AT_name, Enum->getName());
	    int64_t Value = Enum->getValue();
	    addSInt(EnumeratorDie, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata,
	            Value);
	  }

	  if (const DIType *UnderlyingTy = resolve(CTy->getBaseType())) {
	    addType(Buffer, UnderlyingTy);
	    addFlag(Buffer, dwarf::DW_AT_enum_class);
	  }
	}

	void DwarfUnit::constructContainingTypeDIEs() {
	for (auto CI = ContainingTypeMap.begin(), CE = ContainingTypeMap.end();
	CI != CE; ++CI) {
	DIE &SPDie = *CI->first;
	const DINode *D = CI->second;
	if (!D)
	continue;
	DIE *NDie = getDIE(D);
	if (!NDie)
	continue;
	addDIEEntry(SPDie, dwarf::DW_AT_containing_type, *NDie);
	}
	}

	/// Build the DIE for the member (or base class) \p DT as a child of
	/// \p Buffer. The DIE uses DT's own tag and receives, in order: name, type,
	/// source line, data member location (plus bitfield attributes where
	/// applicable), accessibility, virtuality, the Objective-C property
	/// reference and the artificial flag.
	void DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
	DIE &MemberDie = createAndAddDIE(DT->getTag(), Buffer);
	StringRef Name = DT->getName();
	if (!Name.empty())
	addString(MemberDie, dwarf::DW_AT_name, Name);

	// The change below adds the type only when the base type resolves to a
	// non-null DIType.
	- addType(MemberDie, resolve(DT->getBaseType()));
	+ if (DIType *Resolved = resolve(DT->getBaseType()))
	+ addType(MemberDie, Resolved);

	addSourceLine(MemberDie, DT);

	if (DT->getTag() == dwarf::DW_TAG_inheritance && DT->isVirtual()) {

	// For C++, virtual base classes are not at fixed offset. Use following
	// expression to extract appropriate offset from vtable.
	// BaseAddr = ObAddr + *((*ObAddr) - Offset)

	DIELoc *VBaseLocationDie = new (DIEValueAllocator) DIELoc;
	addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
	addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
	addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
	addUInt(*VBaseLocationDie, dwarf::DW_FORM_udata, DT->getOffsetInBits());
	addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
	addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
	addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);

	addBlock(MemberDie, dwarf::DW_AT_data_member_location, VBaseLocationDie);
	} else {
	uint64_t Size = DT->getSizeInBits();
	uint64_t FieldSize = DD->getBaseTypeSize(DT);
	uint32_t AlignInBytes = DT->getAlignInBytes();
	uint64_t OffsetInBytes;

	// A member is treated as a bitfield when its base type has a known
	// (non-zero) size that differs from the member's own size.
	bool IsBitfield = FieldSize && Size != FieldSize;
	if (IsBitfield) {
	// Handle bitfield, assume bytes are 8 bits.
	if (DD->useDWARF2Bitfields())
	addUInt(MemberDie, dwarf::DW_AT_byte_size, None, FieldSize/8);
	addUInt(MemberDie, dwarf::DW_AT_bit_size, None, Size);

	uint64_t Offset = DT->getOffsetInBits();
	// We can't use DT->getAlignInBits() here: AlignInBits for member type
	// is non-zero if and only if alignment was forced (e.g. _Alignas()),
	// which can't be done with bitfields. Thus we use FieldSize here.
	uint32_t AlignInBits = FieldSize;
	uint32_t AlignMask = ~(AlignInBits - 1);
	// The bits from the start of the storage unit to the start of the field.
	uint64_t StartBitOffset = Offset - (Offset & AlignMask);
	// The byte offset of the field's aligned storage unit inside the struct.
	OffsetInBytes = (Offset - StartBitOffset) / 8;

	if (DD->useDWARF2Bitfields()) {
	// DWARF2 style: DW_AT_bit_offset is relative to the storage unit and
	// OffsetInBytes is recomputed from that unit's start (FieldOffset).
	uint64_t HiMark = (Offset + FieldSize) & AlignMask;
	uint64_t FieldOffset = (HiMark - FieldSize);
	Offset -= FieldOffset;

	// Maybe we need to work from the other end.
	if (Asm->getDataLayout().isLittleEndian())
	Offset = FieldSize - (Offset + Size);

	addUInt(MemberDie, dwarf::DW_AT_bit_offset, None, Offset);
	OffsetInBytes = FieldOffset >> 3;
	} else {
	// Otherwise the unmodified bit offset is emitted directly.
	addUInt(MemberDie, dwarf::DW_AT_data_bit_offset, None, Offset);
	}
	} else {
	// This is not a bitfield.
	OffsetInBytes = DT->getOffsetInBits() / 8;
	if (AlignInBytes)
	addUInt(MemberDie, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata,
	AlignInBytes);
	}

	// DWARF version <= 2 gets the byte offset as a DW_OP_plus_uconst location
	// block; later versions get a plain constant, except for bitfields that
	// were described with DW_AT_data_bit_offset above, which get no
	// DW_AT_data_member_location at all.
	if (DD->getDwarfVersion() <= 2) {
	DIELoc *MemLocationDie = new (DIEValueAllocator) DIELoc;
	addUInt(*MemLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
	addUInt(*MemLocationDie, dwarf::DW_FORM_udata, OffsetInBytes);
	addBlock(MemberDie, dwarf::DW_AT_data_member_location, MemLocationDie);
	} else if (!IsBitfield \|\| DD->useDWARF2Bitfields())
	addUInt(MemberDie, dwarf::DW_AT_data_member_location, None,
	OffsetInBytes);
	}

	// Accessibility is only emitted when one of the three flags is set.
	if (DT->isProtected())
	addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
	dwarf::DW_ACCESS_protected);
	else if (DT->isPrivate())
	addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
	dwarf::DW_ACCESS_private);
	// Otherwise C++ member and base classes are considered public.
	else if (DT->isPublic())
	addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
	dwarf::DW_ACCESS_public);
	if (DT->isVirtual())
	addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1,
	dwarf::DW_VIRTUALITY_virtual);

	// Objective-C properties.
	// The reference is added only if the property node already has a DIE.
	if (DINode *PNode = DT->getObjCProperty())
	if (DIE *PDie = getDIE(PNode))
	MemberDie.addValue(DIEValueAllocator, dwarf::DW_AT_APPLE_property,
	dwarf::DW_FORM_ref4, DIEEntry(*PDie));

	if (DT->isArtificial())
	addFlag(MemberDie, dwarf::DW_AT_artificial);
	}

	/// Return the DIE for the static data member \p DT, creating it if needed.
	///
	/// \param DT the static member's derived-type node; may be null.
	/// \returns nullptr when \p DT is null, the DIE already registered for
	/// \p DT if there is one, and otherwise a new declaration DIE added under
	/// the DIE of DT's scope.
	DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) {
	  if (!DT)
	    return nullptr;

	  // Construct the context before querying for the existence of the DIE in
	  // case such construction creates the DIE.
	  DIE *ContextDIE = getOrCreateContextDIE(resolve(DT->getScope()));
	  assert(dwarf::isType(ContextDIE->getTag()) &&
	         "Static member should belong to a type.");

	  if (DIE *StaticMemberDIE = getDIE(DT))
	    return StaticMemberDIE;

	  DIE &StaticMemberDIE = createAndAddDIE(DT->getTag(), *ContextDIE, DT);

	  const DIType *Ty = resolve(DT->getBaseType());

	  addString(StaticMemberDIE, dwarf::DW_AT_name, DT->getName());
	  addType(StaticMemberDIE, Ty);
	  addSourceLine(StaticMemberDIE, DT);
	  // The member is always emitted as an external declaration.
	  addFlag(StaticMemberDIE, dwarf::DW_AT_external);
	  addFlag(StaticMemberDIE, dwarf::DW_AT_declaration);

	  // FIXME: We could omit private if the parent is a class_type, and
	  // public if the parent is something else.
	  if (DT->isProtected())
	    addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
	            dwarf::DW_ACCESS_protected);
	  else if (DT->isPrivate())
	    addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
	            dwarf::DW_ACCESS_private);
	  else if (DT->isPublic())
	    addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
	            dwarf::DW_ACCESS_public);

	  // Record the initializer when it is an integer or floating-point constant.
	  if (const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(DT->getConstant()))
	    addConstantValue(StaticMemberDIE, CI, Ty);
	  if (const ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(DT->getConstant()))
	    addConstantFPValue(StaticMemberDIE, CFP);

	  if (uint32_t AlignInBytes = DT->getAlignInBytes())
	    addUInt(StaticMemberDIE, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata,
	            AlignInBytes);

	  return &StaticMemberDIE;
	}

	void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) {
	// Emit size of content not including length itself
	Asm->OutStreamer->AddComment("Length of Unit");
	Asm->EmitInt32(getHeaderSize() + getUnitDie().getSize());

	Asm->OutStreamer->AddComment("DWARF version number");
	unsigned Version = DD->getDwarfVersion();
	Asm->EmitInt16(Version);

	// DWARF v5 reorders the address size and adds a unit type.
	if (Version >= 5) {
	Asm->OutStreamer->AddComment("DWARF Unit Type");
	Asm->EmitInt8(UT);
	Asm->OutStreamer->AddComment("Address Size (in bytes)");
	Asm->EmitInt8(Asm->MAI->getCodePointerSize());
	}

	// We share one abbreviations table across all units so it's always at the
	// start of the section. Use a relocatable offset where needed to ensure
	// linking doesn't invalidate that offset.
	Asm->OutStreamer->AddComment("Offset Into Abbrev. Section");
	const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
	if (UseOffsets)
	Asm->EmitInt32(0);
	else
	Asm->emitDwarfSymbolReference(
	TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false);

	if (Version <= 4) {
	Asm->OutStreamer->AddComment("Address Size (in bytes)");
	Asm->EmitInt8(Asm->MAI->getCodePointerSize());
	}
	}

	void DwarfTypeUnit::emitHeader(bool UseOffsets) {
	DwarfUnit::emitCommonHeader(UseOffsets,
	DD->useSplitDwarf() ? dwarf::DW_UT_split_type
	: dwarf::DW_UT_type);
	Asm->OutStreamer->AddComment("Type Signature");
	Asm->OutStreamer->EmitIntValue(TypeSignature, sizeof(TypeSignature));
	Asm->OutStreamer->AddComment("Type DIE Offset");
	// In a skeleton type unit there is no type DIE so emit a zero offset.
	Asm->OutStreamer->EmitIntValue(Ty ? Ty->getOffset() : 0,
	sizeof(Ty->getOffset()));
	}

	/// Add attribute \p Attribute to \p Die whose value is a DIEDelta built
	/// from the symbols \p Hi and \p Lo.
	///
	/// The form is DW_FORM_sec_offset for DWARF version >= 4 and DW_FORM_data4
	/// otherwise. Returns the iterator produced by DIE::addValue.
	DIE::value_iterator
	DwarfUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute,
	                           const MCSymbol *Hi, const MCSymbol *Lo) {
	  return Die.addValue(DIEValueAllocator, Attribute,
	                      DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
	                                                 : dwarf::DW_FORM_data4,
	                      new (DIEValueAllocator) DIEDelta(Hi, Lo));
	}

	/// Add attribute \p Attribute to \p Die referring to \p Label.
	///
	/// If the target uses relocations across DWARF sections, the label itself
	/// is added (DW_FORM_sec_offset for DWARF version >= 4, DW_FORM_data4
	/// otherwise). Otherwise the value is the delta between \p Label and
	/// \p Sec, added through addSectionDelta().
	DIE::value_iterator
	DwarfUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
	                           const MCSymbol *Label, const MCSymbol *Sec) {
	  if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
	    return addLabel(Die, Attribute,
	                    DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
	                                               : dwarf::DW_FORM_data4,
	                    Label);
	  return addSectionDelta(Die, Attribute, Label, Sec);
	}

	/// Report whether this type unit is a dwo unit.
	bool DwarfTypeUnit::isDwoUnit() const {
	  // There are no skeleton type units, so under split DWARF every type unit
	  // is a dwo type unit.
	  const bool UsingSplitDwarf = DD->useSplitDwarf();
	  return UsingSplitDwarf;
	}

	/// Forward the global name \p Name with its \p Context to the unit returned
	/// by getCU(). \p Die is not used by this implementation.
	void DwarfTypeUnit::addGlobalName(StringRef Name, const DIE &Die,
	const DIScope *Context) {
	getCU().addGlobalNameForTypeUnit(Name, Context);
	}

	/// Forward the global type \p Ty with its \p Context to the unit returned
	/// by getCU(). \p Die is not used by this implementation.
	void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die,
	const DIScope *Context) {
	getCU().addGlobalTypeUnitType(Ty, Context);
	}

	/// Return the begin symbol of this unit's section when the target uses
	/// relocations across DWARF sections and this is not a dwo unit; return
	/// nullptr in every other case.
	const MCSymbol *DwarfUnit::getCrossSectionRelativeBaseAddress() const {
	  // isDwoUnit() is only consulted when cross-section relocations are used.
	  if (Asm->MAI->doesDwarfUseRelocationsAcrossSections() && !isDwoUnit())
	    return getSection()->getBeginSymbol();
	  return nullptr;
	}
	Index: head/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp (revision 329409)
	+++ head/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp (revision 329410)
	@@ -1,329 +1,326 @@
	//===--- LivePhysRegs.cpp - Live Physical Register Set --------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the LivePhysRegs utility for tracking liveness of
	// physical registers across machine instructions in forward or backward order.
	// A more detailed description can be found in the corresponding header file.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/CodeGen/LivePhysRegs.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBundle.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	using namespace llvm;


	/// \brief Remove all registers from the set that get clobbered by the
	/// register mask operand \p MO.
	///
	/// If \p Clobbers is non-null, every live register removed this way is
	/// appended to it, paired with a pointer to \p MO.
	void LivePhysRegs::removeRegsInMask(const MachineOperand &MO,
	    SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> *Clobbers) {
	  SparseSet<unsigned>::iterator LRI = LiveRegs.begin();
	  while (LRI != LiveRegs.end()) {
	    if (MO.clobbersPhysReg(*LRI)) {
	      if (Clobbers)
	        Clobbers->push_back(std::make_pair(*LRI, &MO));
	      // erase() hands back the iterator to continue from.
	      LRI = LiveRegs.erase(LRI);
	    } else
	      ++LRI;
	  }
	}

	/// Remove defined registers and regmask kills from the set.
	void LivePhysRegs::removeDefs(const MachineInstr &MI) {
	for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
	if (O->isReg()) {
	if (!O->isDef())
	continue;
	unsigned Reg = O->getReg();
	if (!TargetRegisterInfo::isPhysicalRegister(Reg))
	continue;
	removeReg(Reg);
	} else if (O->isRegMask())
	removeRegsInMask(*O);
	}
	}

	/// Add uses to the set.
	void LivePhysRegs::addUses(const MachineInstr &MI) {
	for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
	if (!O->isReg() \|\| !O->readsReg())
	continue;
	unsigned Reg = O->getReg();
	if (!TargetRegisterInfo::isPhysicalRegister(Reg))
	continue;
	addReg(Reg);
	}
	}

	/// Simulates liveness when stepping backwards over an instruction(bundle):
	/// Remove Defs, add uses. This is the recommended way of calculating liveness.
	/// The defs are removed before the uses are added.
	void LivePhysRegs::stepBackward(const MachineInstr &MI) {
	// Remove defined registers and regmask kills from the set.
	removeDefs(MI);

	// Add uses to the set.
	addUses(MI);
	}

	/// Simulates liveness when stepping forward over an instruction(bundle): Remove
	/// killed-uses, add defs. This is the not recommended way, because it depends
	/// on accurate kill flags. If possible use stepBackward() instead of this
	/// function.
	void LivePhysRegs::stepForward(const MachineInstr &MI,
	SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> &Clobbers) {
	// Remove killed registers from the set.
	for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
	if (O->isReg()) {
	unsigned Reg = O->getReg();
	if (!TargetRegisterInfo::isPhysicalRegister(Reg))
	continue;
	if (O->isDef()) {
	// Note, dead defs are still recorded. The caller should decide how to
	// handle them.
	Clobbers.push_back(std::make_pair(Reg, &*O));
	} else {
	if (!O->isKill())
	continue;
	assert(O->isUse());
	removeReg(Reg);
	}
	} else if (O->isRegMask())
	removeRegsInMask(*O, &Clobbers);
	}

	// Add defs to the set.
	for (auto Reg : Clobbers) {
	// Skip dead defs. They shouldn't be added to the set.
	if (Reg.second->isReg() && Reg.second->isDead())
	continue;
	addReg(Reg.first);
	}
	}

	/// Print the currently live registers to \p OS on a single line.
	///
	/// Prints " (uninitialized)" when no TargetRegisterInfo is set and
	/// " (empty)" when the set holds no registers.
	void LivePhysRegs::print(raw_ostream &OS) const {
	  OS << "Live Registers:";
	  if (!TRI)
	    OS << " (uninitialized)\n";
	  else if (empty())
	    OS << " (empty)\n";
	  else {
	    for (auto Reg : *this)
	      OS << " " << printReg(Reg, TRI);
	    OS << "\n";
	  }
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	/// Write the set to dbgs(), prefixed by a single space.
	LLVM_DUMP_METHOD void LivePhysRegs::dump() const {
	  raw_ostream &OS = dbgs();
	  OS << " ";
	  OS << *this;
	}
	#endif

	bool LivePhysRegs::available(const MachineRegisterInfo &MRI,
	unsigned Reg) const {
	if (LiveRegs.count(Reg))
	return false;
	if (MRI.isReserved(Reg))
	return false;
	for (MCRegAliasIterator R(Reg, TRI, false); R.isValid(); ++R) {
	if (LiveRegs.count(*R))
	return false;
	}
	return true;
	}

	/// Add live-in registers of basic block \p MBB to \p LiveRegs.
	void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) {
	for (const auto &LI : MBB.liveins()) {
	unsigned Reg = LI.PhysReg;
	LaneBitmask Mask = LI.LaneMask;
	MCSubRegIndexIterator S(Reg, TRI);
	assert(Mask.any() && "Invalid livein mask");
	if (Mask.all() \|\| !S.isValid()) {
	addReg(Reg);
	continue;
	}
	for (; S.isValid(); ++S) {
	unsigned SI = S.getSubRegIndex();
	if ((Mask & TRI->getSubRegIndexLaneMask(SI)).any())
	addReg(S.getSubReg());
	}
	}
	}

	/// Adds all callee saved registers to \p LiveRegs.
	///
	/// Walks the array returned by MRI.getCalleeSavedRegs() until the first
	/// zero entry; a null array pointer adds nothing.
	static void addCalleeSavedRegs(LivePhysRegs &LiveRegs,
	                               const MachineFunction &MF) {
	  const MachineRegisterInfo &MRI = MF.getRegInfo();
	  for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR)
	    LiveRegs.addReg(*CSR);
	}

	void LivePhysRegs::addPristines(const MachineFunction &MF) {
	const MachineFrameInfo &MFI = MF.getFrameInfo();
	if (!MFI.isCalleeSavedInfoValid())
	return;
	/// This function will usually be called on an empty object, handle this
	/// as a special case.
	if (empty()) {
	/// Add all callee saved regs, then remove the ones that are saved and
	/// restored.
	addCalleeSavedRegs(*this, MF);
	/// Remove the ones that are not saved/restored; they are pristine.
	for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
	removeReg(Info.getReg());
	return;
	}
	/// If a callee-saved register that is not pristine is already present
	/// in the set, we should make sure that it stays in it. Precompute the
	/// set of pristine registers in a separate object.
	/// Add all callee saved regs, then remove the ones that are saved+restored.
	LivePhysRegs Pristine(*TRI);
	addCalleeSavedRegs(Pristine, MF);
	/// Remove the ones that are not saved/restored; they are pristine.
	for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
	Pristine.removeReg(Info.getReg());
	for (MCPhysReg R : Pristine)
	addReg(R);
	}

	/// Add the live-out registers of \p MBB to the set, without pristines.
	/// After the change below, the live-ins of all successors are always
	/// merged, and for a return block every callee saved register whose
	/// CalleeSavedInfo reports isRestored() is added as well (only if the
	/// callee-saved info is valid).
	void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {
	- if (!MBB.succ_empty()) {
	- // To get the live-outs we simply merge the live-ins of all successors.
	- for (const MachineBasicBlock *Succ : MBB.successors())
	- addBlockLiveIns(*Succ);
	- } else if (MBB.isReturnBlock()) {
	- // For the return block: Add all callee saved registers that are saved and
	- // restored (somewhere); This does not include callee saved registers that
	- // are unused and hence not saved and restored; they are called pristine.
	+ // To get the live-outs we simply merge the live-ins of all successors.
	+ for (const MachineBasicBlock *Succ : MBB.successors())
	+ addBlockLiveIns(*Succ);
	+ if (MBB.isReturnBlock()) {
	+ // Return blocks are a special case because we currently don't mark up
	+ // return instructions completely: specifically, there is no explicit
	+ // use for callee-saved registers. So we add all callee saved registers
	+ // that are saved and restored (somewhere). This does not include
	+ // callee saved registers that are unused and hence not saved and
	+ // restored; they are called pristine.
	+ // FIXME: PEI should add explicit markings to return instructions
	+ // instead of implicitly handling them here.
	const MachineFunction &MF = *MBB.getParent();
	const MachineFrameInfo &MFI = MF.getFrameInfo();
	if (MFI.isCalleeSavedInfoValid()) {
	for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
	if (Info.isRestored())
	addReg(Info.getReg());
	}
	}
	}

	/// Add the live-out registers of \p MBB to the set, including pristines.
	/// After the change below this is unconditional: pristines of the parent
	/// function first, then addLiveOutsNoPristines(MBB).
	void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {
	const MachineFunction &MF = *MBB.getParent();
	- if (!MBB.succ_empty()) {
	- addPristines(MF);
	- addLiveOutsNoPristines(MBB);
	- } else if (MBB.isReturnBlock()) {
	- // For the return block: Add all callee saved registers.
	- const MachineFrameInfo &MFI = MF.getFrameInfo();
	- if (MFI.isCalleeSavedInfoValid())
	- addCalleeSavedRegs(*this, MF);
	- }
	+ addPristines(MF);
	+ addLiveOutsNoPristines(MBB);
	}

	void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) {
	const MachineFunction &MF = *MBB.getParent();
	addPristines(MF);
	addBlockLiveIns(MBB);
	}

	/// Compute the live-in set of \p MBB into \p LiveRegs.
	///
	/// \p LiveRegs is re-initialized, seeded with the block's live-outs
	/// (without pristines) and then stepped backward over every instruction
	/// from the last to the first.
	void llvm::computeLiveIns(LivePhysRegs &LiveRegs,
	                          const MachineBasicBlock &MBB) {
	  const MachineFunction &MF = *MBB.getParent();
	  const TargetRegisterInfo &TRI = *MF.getRegInfo().getTargetRegisterInfo();

	  LiveRegs.init(TRI);
	  LiveRegs.addLiveOutsNoPristines(MBB);
	  for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I)
	    LiveRegs.stepBackward(*I);
	}

	/// Populate the (empty) live-in list of \p MBB from \p LiveRegs.
	///
	/// Reserved registers are skipped. A register is also skipped when one of
	/// its super registers is in \p LiveRegs and not reserved, because that
	/// super register gets added instead.
	void llvm::addLiveIns(MachineBasicBlock &MBB, const LivePhysRegs &LiveRegs) {
	  assert(MBB.livein_empty() && "Expected empty live-in list");
	  const MachineFunction &MF = *MBB.getParent();
	  const MachineRegisterInfo &MRI = MF.getRegInfo();
	  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
	  for (MCPhysReg Reg : LiveRegs) {
	    if (MRI.isReserved(Reg))
	      continue;
	    // Skip the register if we are about to add one of its super registers.
	    bool ContainsSuperReg = false;
	    for (MCSuperRegIterator SReg(Reg, &TRI); SReg.isValid(); ++SReg) {
	      if (LiveRegs.contains(*SReg) && !MRI.isReserved(*SReg)) {
	        ContainsSuperReg = true;
	        break;
	      }
	    }
	    if (ContainsSuperReg)
	      continue;
	    MBB.addLiveIn(Reg);
	  }
	}

	/// Recompute the dead and kill flags of the register operands in \p MBB.
	///
	/// Liveness starts from the block's live-outs (without pristines) and is
	/// tracked backwards. For each instruction the dead flags are set first,
	/// then the defs are removed, then the kill flags are set, and finally the
	/// uses are added. Debug operands and operands with register 0 are left
	/// untouched.
	void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) {
	const MachineFunction &MF = *MBB.getParent();
	const MachineRegisterInfo &MRI = MF.getRegInfo();
	const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();

	// We walk through the block backwards and start with the live outs.
	LivePhysRegs LiveRegs;
	LiveRegs.init(TRI);
	LiveRegs.addLiveOutsNoPristines(MBB);

	for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
	// Recompute dead flags.
	for (MIBundleOperands MO(MI); MO.isValid(); ++MO) {
	if (!MO->isReg() \|\| !MO->isDef() \|\| MO->isDebug())
	continue;

	unsigned Reg = MO->getReg();
	if (Reg == 0)
	continue;
	assert(TargetRegisterInfo::isPhysicalRegister(Reg));

	// A def is dead when its register is available after the instruction.
	bool IsNotLive = LiveRegs.available(MRI, Reg);
	MO->setIsDead(IsNotLive);
	}

	// Step backward over defs.
	LiveRegs.removeDefs(MI);

	// Recompute kill flags.
	for (MIBundleOperands MO(MI); MO.isValid(); ++MO) {
	if (!MO->isReg() \|\| !MO->readsReg() \|\| MO->isDebug())
	continue;

	unsigned Reg = MO->getReg();
	if (Reg == 0)
	continue;
	assert(TargetRegisterInfo::isPhysicalRegister(Reg));

	// A use is a kill when its register is available once the defs of this
	// instruction have been removed.
	bool IsNotLive = LiveRegs.available(MRI, Reg);
	MO->setIsKill(IsNotLive);
	}

	// Complete the stepbackward.
	LiveRegs.addUses(MI);
	}
	}

	/// Compute the live-ins of \p MBB into \p LiveRegs with computeLiveIns()
	/// and then store them in the block's live-in list with addLiveIns().
	void llvm::computeAndAddLiveIns(LivePhysRegs &LiveRegs,
	MachineBasicBlock &MBB) {
	computeLiveIns(LiveRegs, MBB);
	addLiveIns(MBB, LiveRegs);
	}
	Index: head/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (revision 329409)
	+++ head/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (revision 329410)
	@@ -1,17751 +1,17753 @@
	//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run
	// both before and after the DAG is legalized.
	//
	// This pass is not a substitute for the LLVM IR instcombine pass. This pass is
	// primarily intended to handle simplification opportunities that are implicit
	// in the LLVM IR and exposed by the various codegen lowering phases.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallBitVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/MemoryLocation.h"
	#include "llvm/CodeGen/DAGCombine.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <functional>
	#include <iterator>
	#include <string>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "dagcombine"

	// Statistics counters for this pass.
	STATISTIC(NodesCombined , "Number of dag nodes combined");
	STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
	STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
	STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
	STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int");
	STATISTIC(SlicedLoads, "Number of load sliced");

	/// Hidden option: let the DAG combiner use IR alias analysis.
	static cl::opt<bool>
	CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
	cl::desc("Enable DAG combiner's use of IR alias analysis"));

	/// Hidden option, initialized to true: let the DAG combiner use TBAA.
	static cl::opt<bool>
	UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
	cl::desc("Enable DAG combiner's use of TBAA"));

	// This option is only declared when NDEBUG is not defined.
	#ifndef NDEBUG
	/// Hidden option: restrict DAG-combiner alias analysis to one function.
	static cl::opt<std::string>
	CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
	cl::desc("Only use DAG-combiner alias analysis in this"
	" function"));
	#endif

	/// Hidden option to stress test load slicing, i.e., when this option
	/// is enabled, load slicing bypasses most of its profitability guards.
	static cl::opt<bool>
	StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
	cl::desc("Bypass the profitability model of load slicing"),
	cl::init(false));

	/// Hidden option, initialized to true: allow the DAG combiner to split
	/// indexing from loads.
	static cl::opt<bool>
	MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
	cl::desc("DAG combiner may split indexing from loads"));

	namespace {

	class DAGCombiner {
	SelectionDAG &DAG;
	const TargetLowering &TLI;
	CombineLevel Level;
	CodeGenOpt::Level OptLevel;
	bool LegalOperations = false;
	bool LegalTypes = false;
	bool ForCodeSize;

	/// \brief Worklist of all of the nodes that need to be simplified.
	///
	/// This must behave as a stack -- new nodes to process are pushed onto the
	/// back and when processing we pop off of the back.
	///
	/// The worklist will not contain duplicates but may contain null entries
	/// due to nodes being deleted from the underlying DAG.
	SmallVector<SDNode *, 64> Worklist;

	/// \brief Mapping from an SDNode to its position on the worklist.
	///
	/// This is used to find and remove nodes from the worklist (by nulling
	/// them) when they are deleted from the underlying DAG. It relies on
	/// stable indices of nodes within the worklist.
	DenseMap<SDNode *, unsigned> WorklistMap;

	/// \brief Set of nodes which have been combined (at least once).
	///
	/// This is used to allow us to reliably add any operands of a DAG node
	/// which have not yet been combined to the worklist.
	SmallPtrSet<SDNode *, 32> CombinedNodes;

	// AA - Used for DAG load/store alias analysis.
	AliasAnalysis *AA;

	/// When an instruction is simplified, add all users of the instruction to
	/// the work lists because they might get more simplified now.
	void AddUsersToWorklist(SDNode *N) {
	for (SDNode *Node : N->uses())
	AddToWorklist(Node);
	}

	/// Call the node-specific routine that folds each particular type of node.
	SDValue visit(SDNode *N);

	public:
	DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
	: DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
	OptLevel(OL), AA(AA) {
	ForCodeSize = DAG.getMachineFunction().getFunction().optForSize();

	MaximumLegalStoreInBits = 0;
	for (MVT VT : MVT::all_valuetypes())
	if (EVT(VT).isSimple() && VT != MVT::Other &&
	TLI.isTypeLegal(EVT(VT)) &&
	VT.getSizeInBits() >= MaximumLegalStoreInBits)
	MaximumLegalStoreInBits = VT.getSizeInBits();
	}

	/// Add to the worklist making sure its instance is at the back (next to be
	/// processed.)
	void AddToWorklist(SDNode *N) {
	assert(N->getOpcode() != ISD::DELETED_NODE &&
	"Deleted Node added to Worklist");

	// Skip handle nodes as they can't usefully be combined and confuse the
	// zero-use deletion strategy.
	if (N->getOpcode() == ISD::HANDLENODE)
	return;

	if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
	Worklist.push_back(N);
	}

	/// Remove all instances of N from the worklist.
	void removeFromWorklist(SDNode *N) {
	CombinedNodes.erase(N);

	auto It = WorklistMap.find(N);
	if (It == WorklistMap.end())
	return; // Not in the worklist.

	// Null out the entry rather than erasing it to avoid a linear operation.
	Worklist[It->second] = nullptr;
	WorklistMap.erase(It);
	}

	void deleteAndRecombine(SDNode *N);
	bool recursivelyDeleteUnusedNodes(SDNode *N);

	/// Replaces all uses of the results of one DAG node with new values.
	SDValue CombineTo(SDNode N, const SDValue To, unsigned NumTo,
	bool AddTo = true);

	/// Replaces all uses of the results of one DAG node with new values.
	SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
	return CombineTo(N, &Res, 1, AddTo);
	}

	/// Replaces all uses of the results of one DAG node with new values.
	SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
	bool AddTo = true) {
	SDValue To[] = { Res0, Res1 };
	return CombineTo(N, To, 2, AddTo);
	}

	void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);

	private:
	unsigned MaximumLegalStoreInBits;

	/// Check the specified integer node value to see if it can be simplified or
	/// if things it uses can be simplified by bit propagation.
	/// If so, return true.
	bool SimplifyDemandedBits(SDValue Op) {
	unsigned BitWidth = Op.getScalarValueSizeInBits();
	APInt Demanded = APInt::getAllOnesValue(BitWidth);
	return SimplifyDemandedBits(Op, Demanded);
	}

	bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded);

	bool CombineToPreIndexedLoadStore(SDNode *N);
	bool CombineToPostIndexedLoadStore(SDNode *N);
	SDValue SplitIndexingFromLoad(LoadSDNode *LD);
	bool SliceUpLoad(SDNode *N);

	/// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
	/// load.
	///
	/// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
	/// \param InVecVT type of the input vector to EVE with bitcasts resolved.
	/// \param EltNo index of the vector element to load.
	/// \param OriginalLoad load that EVE came from to be replaced.
	/// \returns EVE on success SDValue() on failure.
	SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
	SDNode EVE, EVT InVecVT, SDValue EltNo, LoadSDNode OriginalLoad);
	void ReplaceLoadWithPromotedLoad(SDNode Load, SDNode ExtLoad);
	/// Returns \p Op widened to \p PVT. Unindexed loads become extending loads
	/// (and \p Replace is set to true), AssertSext/AssertZext are rebuilt on a
	/// promoted operand, constants are extended, and everything else is wrapped
	/// in ANY_EXTEND when that is legal for \p PVT. Returns an empty SDValue if
	/// no promotion is possible.
	SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
	/// Promotes \p Op to \p PVT and sign-extends in-register from the original
	/// type. Returns an empty SDValue if SIGN_EXTEND_INREG is not legal for
	/// \p PVT or if promotion fails.
	SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
	/// Promotes \p Op to \p PVT and zero-extends in-register from the original
	/// type. Returns an empty SDValue if promotion fails.
	SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
	/// Promote the specified integer binary operation if the target indicates
	/// it is beneficial.
	SDValue PromoteIntBinOp(SDValue Op);
	SDValue PromoteIntShiftOp(SDValue Op);
	SDValue PromoteExtend(SDValue Op);
	bool PromoteLoad(SDValue Op);

	// Declared here; defined out of line.
	void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs, SDValue Trunc,
	SDValue ExtLoad, const SDLoc &DL,
	ISD::NodeType ExtType);

	/// Call the node-specific routine that knows how to fold each
	/// particular type of node. If that doesn't do anything, try the
	/// target-specific DAG combines.
	SDValue combine(SDNode *N);

	// Visitation implementation - Implement dag node combining for different
	// node types. Each routine reports its outcome through the returned
	// SDValue:
	//   SDValue.getNode() == 0 - No change was made.
	//   SDValue.getNode() == N - N was replaced, is dead and has been handled.
	//   otherwise              - N should be replaced by the returned Operand.
	//
	SDValue visitTokenFactor(SDNode *N);
	SDValue visitMERGE_VALUES(SDNode *N);
	SDValue visitADD(SDNode *N);
	SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference);
	SDValue visitSUB(SDNode *N);
	SDValue visitADDC(SDNode *N);
	SDValue visitUADDO(SDNode *N);
	SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N);
	SDValue visitSUBC(SDNode *N);
	SDValue visitUSUBO(SDNode *N);
	SDValue visitADDE(SDNode *N);
	SDValue visitADDCARRY(SDNode *N);
	SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N);
	SDValue visitSUBE(SDNode *N);
	SDValue visitSUBCARRY(SDNode *N);
	SDValue visitMUL(SDNode *N);
	SDValue useDivRem(SDNode *N);
	SDValue visitSDIV(SDNode *N);
	SDValue visitUDIV(SDNode *N);
	SDValue visitREM(SDNode *N);
	SDValue visitMULHU(SDNode *N);
	SDValue visitMULHS(SDNode *N);
	SDValue visitSMUL_LOHI(SDNode *N);
	SDValue visitUMUL_LOHI(SDNode *N);
	SDValue visitSMULO(SDNode *N);
	SDValue visitUMULO(SDNode *N);
	SDValue visitIMINMAX(SDNode *N);
	SDValue visitAND(SDNode *N);
	SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *LocReference);
	SDValue visitOR(SDNode *N);
	SDValue visitORLike(SDValue N0, SDValue N1, SDNode *LocReference);
	SDValue visitXOR(SDNode *N);
	SDValue SimplifyVBinOp(SDNode *N);
	SDValue visitSHL(SDNode *N);
	SDValue visitSRA(SDNode *N);
	SDValue visitSRL(SDNode *N);
	SDValue visitRotate(SDNode *N);
	SDValue visitABS(SDNode *N);
	SDValue visitBSWAP(SDNode *N);
	SDValue visitBITREVERSE(SDNode *N);
	SDValue visitCTLZ(SDNode *N);
	SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
	SDValue visitCTTZ(SDNode *N);
	SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
	SDValue visitCTPOP(SDNode *N);
	SDValue visitSELECT(SDNode *N);
	SDValue visitVSELECT(SDNode *N);
	SDValue visitSELECT_CC(SDNode *N);
	SDValue visitSETCC(SDNode *N);
	SDValue visitSETCCE(SDNode *N);
	SDValue visitSETCCCARRY(SDNode *N);
	SDValue visitSIGN_EXTEND(SDNode *N);
	SDValue visitZERO_EXTEND(SDNode *N);
	SDValue visitANY_EXTEND(SDNode *N);
	SDValue visitAssertExt(SDNode *N);
	SDValue visitSIGN_EXTEND_INREG(SDNode *N);
	SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
	SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
	SDValue visitTRUNCATE(SDNode *N);
	SDValue visitBITCAST(SDNode *N);
	SDValue visitBUILD_PAIR(SDNode *N);
	SDValue visitFADD(SDNode *N);
	SDValue visitFSUB(SDNode *N);
	SDValue visitFMUL(SDNode *N);
	SDValue visitFMA(SDNode *N);
	SDValue visitFDIV(SDNode *N);
	SDValue visitFREM(SDNode *N);
	SDValue visitFSQRT(SDNode *N);
	SDValue visitFCOPYSIGN(SDNode *N);
	SDValue visitSINT_TO_FP(SDNode *N);
	SDValue visitUINT_TO_FP(SDNode *N);
	SDValue visitFP_TO_SINT(SDNode *N);
	SDValue visitFP_TO_UINT(SDNode *N);
	SDValue visitFP_ROUND(SDNode *N);
	SDValue visitFP_ROUND_INREG(SDNode *N);
	SDValue visitFP_EXTEND(SDNode *N);
	SDValue visitFNEG(SDNode *N);
	SDValue visitFABS(SDNode *N);
	SDValue visitFCEIL(SDNode *N);
	SDValue visitFTRUNC(SDNode *N);
	SDValue visitFFLOOR(SDNode *N);
	SDValue visitFMINNUM(SDNode *N);
	SDValue visitFMAXNUM(SDNode *N);
	SDValue visitBRCOND(SDNode *N);
	SDValue visitBR_CC(SDNode *N);
	SDValue visitLOAD(SDNode *N);

	SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
	SDValue replaceStoreOfFPConstant(StoreSDNode *ST);

	SDValue visitSTORE(SDNode *N);
	SDValue visitINSERT_VECTOR_ELT(SDNode *N);
	SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
	SDValue visitBUILD_VECTOR(SDNode *N);
	SDValue visitCONCAT_VECTORS(SDNode *N);
	SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
	SDValue visitVECTOR_SHUFFLE(SDNode *N);
	SDValue visitSCALAR_TO_VECTOR(SDNode *N);
	SDValue visitINSERT_SUBVECTOR(SDNode *N);
	SDValue visitMLOAD(SDNode *N);
	SDValue visitMSTORE(SDNode *N);
	SDValue visitMGATHER(SDNode *N);
	SDValue visitMSCATTER(SDNode *N);
	SDValue visitFP_TO_FP16(SDNode *N);
	SDValue visitFP16_TO_FP(SDNode *N);

	SDValue visitFADDForFMACombine(SDNode *N);
	SDValue visitFSUBForFMACombine(SDNode *N);
	SDValue visitFMULForFMADistributiveCombine(SDNode *N);

	SDValue XformToShuffleWithZero(SDNode *N);
	/// Tries to reassociate (op (op x, c1), y) and (op x, (op y, c1)) so that
	/// constants end up together and can be folded. Returns an empty SDValue
	/// when no reassociation applies.
	SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue LHS,
	SDValue RHS);

	/// Combine helper taking a shift node \p N together with its constant
	/// amount \p Amt; defined out of line.
	SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);

	// Folding and matching helpers; declared here and defined out of line.
	SDValue foldSelectOfConstants(SDNode *N);
	SDValue foldVSelectOfConstants(SDNode *N);
	SDValue foldBinOpIntoSelect(SDNode *BO);
	bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
	SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
	SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
	SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
	SDValue N2, SDValue N3, ISD::CondCode CC,
	bool NotExtCompare = false);
	SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
	SDValue N2, SDValue N3, ISD::CondCode CC);
	SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
	const SDLoc &DL);
	SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
	const SDLoc &DL, bool foldBooleans = true);

	/// Returns true if \p N is a SETCC, or a SELECT_CC choosing between the
	/// target's true and false constants; on success \p LHS, \p RHS and \p CC
	/// are set to the compared operands and the condition code operand.
	bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
	SDValue &CC) const;
	/// Returns true if \p N is SetCC-equivalent and its node has a single use.
	bool isOneUseSetCC(SDValue N) const;

	SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
	unsigned HiOp);
	SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
	SDValue CombineExtLoad(SDNode *N);
	SDValue combineRepeatedFPDivisors(SDNode *N);
	SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
	SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
	SDValue BuildSDIV(SDNode *N);
	SDValue BuildSDIVPow2(SDNode *N);
	SDValue BuildUDIV(SDNode *N);
	SDValue BuildLogBase2(SDValue Op, const SDLoc &DL);
	SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags);
	SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
	SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
	SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
	SDValue buildSqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations,
	SDNodeFlags Flags, bool Reciprocal);
	SDValue buildSqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations,
	SDNodeFlags Flags, bool Reciprocal);
	SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
	bool DemandHighBits = true);
	SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
	SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
	SDValue InnerPos, SDValue InnerNeg,
	unsigned PosOpcode, unsigned NegOpcode,
	const SDLoc &DL);
	SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
	SDValue MatchLoadCombine(SDNode *N);
	SDValue ReduceLoadWidth(SDNode *N);
	SDValue ReduceLoadOpStoreWidth(SDNode *N);
	SDValue splitMergedValStore(StoreSDNode *ST);
	SDValue TransformFPLoadStorePair(SDNode *N);
	SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
	SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
	SDValue reduceBuildVecToShuffle(SDNode *N);
	SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
	ArrayRef<int> VectorMask, SDValue VecIn1,
	SDValue VecIn2, unsigned LeftIdx);
	SDValue matchVSelectOpSizesWithSetCC(SDNode *N);

	/// Walk up the chain starting from \p OriginalChain, skipping non-aliasing
	/// memory nodes, looking for aliasing nodes and adding them to the
	/// \p Aliases vector.
	void GatherAllAliases(SDNode *N, SDValue OriginalChain,
	SmallVectorImpl<SDValue> &Aliases);

	/// Return true if there is any possibility that the two addresses overlap.
	bool isAlias(LSBaseSDNode Op0, LSBaseSDNode Op1) const;

	/// Walk up chain skipping non-aliasing memory nodes, looking for a better
	/// chain (aliasing node).
	SDValue FindBetterChain(SDNode *N, SDValue Chain);

	/// Try to replace a store and any possibly adjacent stores on
	/// consecutive chains with better chains. Return true only if \p St is
	/// replaced.
	///
	/// Notice that other chains may still be replaced even if the function
	/// returns false.
	bool findBetterNeighborChains(StoreSDNode *St);

	/// Match "(X shl/srl V1) & V2" where V2 may not be present.
	bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask);

	/// Pairs a pointer to an LSBaseSDNode with its position within a sequence
	/// of memory operations connected by a chain.
	struct MemOpLink {
	  // The memory node itself.
	  LSBaseSDNode *MemNode;

	  // Where this access sits relative to the base pointer.
	  int64_t OffsetFromBase;

	  MemOpLink(LSBaseSDNode *Node, int64_t BaseOffset)
	      : MemNode(Node), OffsetFromBase(BaseOffset) {}
	};

	/// This is a helper function for visitMUL to check the profitability
	/// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
	/// \p MulNode is the original multiply, \p AddNode is (add x, c1),
	/// and \p ConstNode is c2.
	bool isMulAddWithConstProfitable(SDNode *MulNode,
	SDValue &AddNode,
	SDValue &ConstNode);

	/// This is a helper function for visitAND and visitZERO_EXTEND. Returns
	/// true if the (and (load x) c) pattern matches an extload. ExtVT returns
	/// the type of the loaded value to be extended.
	bool isAndLoadExtLoad(ConstantSDNode AndC, LoadSDNode LoadN,
	EVT LoadResultTy, EVT &ExtVT);

	/// Helper function to calculate whether the given load \p LoadN can have
	/// its width reduced to \p ExtVT.
	bool isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
	EVT &ExtVT, unsigned ShAmt = 0);

	/// Used by BackwardsPropagateMask to find suitable loads.
	bool SearchForAndLoads(SDNode N, SmallPtrSetImpl<LoadSDNode> &Loads,
	SmallPtrSetImpl<SDNode*> &NodeWithConsts,
	ConstantSDNode Mask, SDNode &UncombinedNode);
	/// Attempt to propagate a given AND node back to load leaves so that they
	/// can be combined into narrow loads.
	bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG);

	/// Helper function for MergeConsecutiveStores which merges the
	/// component store chains.
	SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
	unsigned NumStores);

	/// This is a helper function for MergeConsecutiveStores. When the
	/// source elements of the consecutive stores are all constants or
	/// all extracted vector elements, try to merge them into one
	/// larger store introducing bitcasts if necessary. \return True
	/// if a merged store was created.
	bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
	EVT MemVT, unsigned NumStores,
	bool IsConstantSrc, bool UseVector,
	bool UseTrunc);

	/// This is a helper function for MergeConsecutiveStores. Stores
	/// that potentially may be merged with St are placed in
	/// StoreNodes.
	void getStoreMergeCandidates(StoreSDNode *St,
	SmallVectorImpl<MemOpLink> &StoreNodes);

	/// Helper function for MergeConsecutiveStores. Checks if
	/// candidate stores have indirect dependency through their
	/// operands. \return True if safe to merge.
	bool checkMergeStoreCandidatesForDependencies(
	SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores);

	/// Merge consecutive store operations into a wide store.
	/// This optimization uses wide integers or vectors when possible.
	/// NOTE(review): the return type is bool, so this cannot report a count of
	/// merged stores as the comment used to claim; presumably it returns true
	/// when at least one merge happened - confirm against the definition.
	bool MergeConsecutiveStores(StoreSDNode *N);

	/// \brief Try to transform a truncation where C is a constant:
	/// (trunc (and X, C)) -> (and (trunc X), (trunc C))
	///
	/// \p N needs to be a truncation and its first operand an AND. Other
	/// requirements are checked by the function (e.g. that trunc is
	/// single-use) and if missed an empty SDValue is returned.
	SDValue distributeTruncateThroughAnd(SDNode *N);

	public:
	/// Runs the DAG combiner on all nodes in the worklist, at combine level
	/// \p AtLevel.
	void Run(CombineLevel AtLevel);

	/// Returns the SelectionDAG this combiner operates on.
	SelectionDAG &getDAG() const {
	  return DAG;
	}

	/// Returns a type large enough to hold any valid shift amount for a value
	/// of type \p LHSTy - before type legalization these can be huge.
	EVT getShiftAmountTy(EVT LHSTy) {
	  assert(LHSTy.isInteger() && "Shift amount is not an integer type!");

	  // For vectors the shift amount type is the vector type itself.
	  if (LHSTy.isVector())
	    return LHSTy;

	  auto &Layout = DAG.getDataLayout();
	  // Once types are legal, ask the target; before that use the pointer type.
	  if (LegalTypes)
	    return TLI.getScalarShiftAmountTy(Layout, LHSTy);
	  return TLI.getPointerTy(Layout);
	}

	/// Returns true when running before type legalization, or when \p VT is
	/// legal according to the target.
	bool isTypeLegal(const EVT &VT) {
	  // Before type legalization every type is acceptable.
	  return LegalTypes ? TLI.isTypeLegal(VT) : true;
	}

	/// Convenience wrapper around TargetLowering::getSetCCResultType that
	/// supplies the DAG's data layout and context.
	EVT getSetCCResultType(EVT VT) const {
	  const auto &Layout = DAG.getDataLayout();
	  auto &Ctx = *DAG.getContext();
	  return TLI.getSetCCResultType(Layout, Ctx, VT);
	}
	};

	/// This class is a DAGUpdateListener that removes any deleted
	/// nodes from the worklist.
	class WorklistRemover : public SelectionDAG::DAGUpdateListener {
	DAGCombiner &DC;

	public:
	explicit WorklistRemover(DAGCombiner &dc)
	: SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}

	void NodeDeleted(SDNode N, SDNode E) override {
	DC.removeFromWorklist(N);
	}
	};

	} // end anonymous namespace

	//===----------------------------------------------------------------------===//
	// TargetLowering::DAGCombinerInfo implementation
	//===----------------------------------------------------------------------===//

	/// Forwards to DAGCombiner::AddToWorklist on the combiner stored in DC.
	void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
	  // DC needs an explicit cast back to the concrete combiner type.
	  DAGCombiner *Combiner = (DAGCombiner*)DC;
	  Combiner->AddToWorklist(N);
	}

	/// Forwards to the array-based DAGCombiner::CombineTo, passing the elements
	/// of \p To as the replacement values.
	SDValue TargetLowering::DAGCombinerInfo::
	CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) {
	  DAGCombiner *Combiner = (DAGCombiner*)DC;
	  const SDValue *Values = &To[0];
	  return Combiner->CombineTo(N, Values, To.size(), AddTo);
	}

	/// Forwards to the single-result DAGCombiner::CombineTo.
	SDValue TargetLowering::DAGCombinerInfo::
	CombineTo(SDNode *N, SDValue Res, bool AddTo) {
	  DAGCombiner *Combiner = (DAGCombiner*)DC;
	  return Combiner->CombineTo(N, Res, AddTo);
	}

	/// Forwards to the two-result DAGCombiner::CombineTo.
	SDValue TargetLowering::DAGCombinerInfo::
	CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
	  DAGCombiner *Combiner = (DAGCombiner*)DC;
	  return Combiner->CombineTo(N, Res0, Res1, AddTo);
	}

	/// Forwards to DAGCombiner::CommitTargetLoweringOpt on the combiner stored
	/// in DC.
	void TargetLowering::DAGCombinerInfo::
	CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
	  DAGCombiner *Combiner = (DAGCombiner*)DC;
	  Combiner->CommitTargetLoweringOpt(TLO);
	}

	//===----------------------------------------------------------------------===//
	// Helper Functions
	//===----------------------------------------------------------------------===//

	/// Removes \p N from the worklist, re-queues those of its operands that may
	/// have become dead or further simplifiable, then deletes \p N from the DAG.
	void DAGCombiner::deleteAndRecombine(SDNode *N) {
	  removeFromWorklist(N);

	  // If the operands of this node are only used by the node, they will now be
	  // dead. Make sure to re-visit them and recursively delete dead nodes.
	  for (const SDValue &Op : N->ops()) {
	    // For an operand generating multiple values, one of the values may
	    // become dead allowing further simplification (e.g. split index
	    // arithmetic from an indexed load).
	    if (Op->hasOneUse() || Op->getNumValues() > 1)
	      AddToWorklist(Op.getNode());
	  }

	  DAG.DeleteNode(N);
	}

	/// Return 1 if we can compute the negated form of the specified expression for
	/// the same cost as the expression itself, or 2 if we can compute the negated
	/// form more cheaply than the expression itself. Returns 0 if the expression
	/// cannot be negated for free (including when \p Op has multiple uses or the
	/// recursion \p Depth limit is exceeded).
	static char isNegatibleForFree(SDValue Op, bool LegalOperations,
	                               const TargetLowering &TLI,
	                               const TargetOptions *Options,
	                               unsigned Depth = 0) {
	  // fneg is removable even if it has multiple uses.
	  if (Op.getOpcode() == ISD::FNEG)
	    return 2;

	  // Don't allow anything with multiple uses.
	  if (!Op.hasOneUse())
	    return 0;

	  // Don't recurse exponentially.
	  if (Depth > 6)
	    return 0;

	  switch (Op.getOpcode()) {
	  default:
	    return 0;
	  case ISD::ConstantFP: {
	    if (!LegalOperations)
	      return 1;

	    // Don't invert constant FP values after legalization unless the target says
	    // the negated constant is legal.
	    EVT VT = Op.getValueType();
	    return TLI.isOperationLegal(ISD::ConstantFP, VT) ||
	           TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT);
	  }
	  case ISD::FADD:
	    // FIXME: determine better conditions for this xform.
	    if (!Options->UnsafeFPMath)
	      return 0;

	    // After operation legalization, it might not be legal to create new FSUBs.
	    if (LegalOperations &&
	        !TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType()))
	      return 0;

	    // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
	    if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
	                                    Options, Depth + 1))
	      return V;
	    // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
	    return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
	                              Depth + 1);
	  case ISD::FSUB:
	    // We can't turn -(A-B) into B-A when we honor signed zeros.
	    if (!Options->NoSignedZerosFPMath &&
	        !Op.getNode()->getFlags().hasNoSignedZeros())
	      return 0;

	    // fold (fneg (fsub A, B)) -> (fsub B, A)
	    return 1;

	  case ISD::FMUL:
	  case ISD::FDIV:
	    if (Options->HonorSignDependentRoundingFPMath())
	      return 0;

	    // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
	    if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
	                                    Options, Depth + 1))
	      return V;

	    return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
	                              Depth + 1);

	  case ISD::FP_EXTEND:
	  case ISD::FP_ROUND:
	  case ISD::FSIN:
	    // Negation passes through these: defer to the single operand.
	    return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options,
	                              Depth + 1);
	  }
	}

	/// Builds and returns the negated form of \p Op. Only valid for expressions
	/// on which isNegatibleForFree returned non-zero; the cases here mirror that
	/// function one for one.
	static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
	                                    bool LegalOperations, unsigned Depth = 0) {
	  const TargetOptions &Options = DAG.getTarget().Options;
	  // fneg is removable even if it has multiple uses.
	  if (Op.getOpcode() == ISD::FNEG)
	    return Op.getOperand(0);

	  // Don't allow anything with multiple uses.
	  assert(Op.hasOneUse() && "Unknown reuse!");

	  assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");

	  const SDNodeFlags Flags = Op.getNode()->getFlags();
	  const unsigned Opcode = Op.getOpcode();
	  const EVT VT = Op.getValueType();
	  const unsigned NextDepth = Depth + 1;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDLoc DL(Op);

	  switch (Opcode) {
	  default:
	    llvm_unreachable("Unknown code");
	  case ISD::ConstantFP: {
	    APFloat Negated = cast<ConstantFPSDNode>(Op)->getValueAPF();
	    Negated.changeSign();
	    return DAG.getConstantFP(Negated, DL, VT);
	  }
	  case ISD::FADD: {
	    // FIXME: determine better conditions for this xform.
	    assert(Options.UnsafeFPMath);

	    SDValue A = Op.getOperand(0);
	    SDValue B = Op.getOperand(1);

	    // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
	    if (isNegatibleForFree(A, LegalOperations, TLI, &Options, NextDepth)) {
	      SDValue NegA = GetNegatedExpression(A, DAG, LegalOperations, NextDepth);
	      return DAG.getNode(ISD::FSUB, DL, VT, NegA, B, Flags);
	    }
	    // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
	    SDValue NegB = GetNegatedExpression(B, DAG, LegalOperations, NextDepth);
	    return DAG.getNode(ISD::FSUB, DL, VT, NegB, A, Flags);
	  }
	  case ISD::FSUB: {
	    SDValue A = Op.getOperand(0);
	    SDValue B = Op.getOperand(1);

	    // fold (fneg (fsub 0, B)) -> B
	    if (auto *N0CFP = dyn_cast<ConstantFPSDNode>(A))
	      if (N0CFP->isZero())
	        return B;

	    // fold (fneg (fsub A, B)) -> (fsub B, A)
	    return DAG.getNode(ISD::FSUB, DL, VT, B, A, Flags);
	  }
	  case ISD::FMUL:
	  case ISD::FDIV: {
	    assert(!Options.HonorSignDependentRoundingFPMath());

	    SDValue X = Op.getOperand(0);
	    SDValue Y = Op.getOperand(1);

	    // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
	    if (isNegatibleForFree(X, LegalOperations, TLI, &Options, NextDepth)) {
	      SDValue NegX = GetNegatedExpression(X, DAG, LegalOperations, NextDepth);
	      return DAG.getNode(Opcode, DL, VT, NegX, Y, Flags);
	    }

	    // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
	    SDValue NegY = GetNegatedExpression(Y, DAG, LegalOperations, NextDepth);
	    return DAG.getNode(Opcode, DL, VT, X, NegY, Flags);
	  }
	  case ISD::FP_EXTEND:
	  case ISD::FSIN: {
	    // Rebuild the same unary node on top of the negated operand.
	    SDValue NegSrc =
	        GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, NextDepth);
	    return DAG.getNode(Opcode, DL, VT, NegSrc);
	  }
	  case ISD::FP_ROUND: {
	    // FP_ROUND keeps its second operand unchanged.
	    SDValue NegSrc =
	        GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, NextDepth);
	    return DAG.getNode(ISD::FP_ROUND, DL, VT, NegSrc, Op.getOperand(1));
	  }
	  }
	}

	// Most APInt operations need both operands to have the same bit width. This
	// helper zero extends \p LHS and \p RHS to a common width: the larger of
	// the two widths plus \p Offset, where the extra bits let callers create
	// bitwidths that won't overflow.
	static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) {
	  const unsigned WidestOperand = std::max(LHS.getBitWidth(), RHS.getBitWidth());
	  const unsigned TargetWidth = Offset + WidestOperand;
	  LHS = LHS.zextOrSelf(TargetWidth);
	  RHS = RHS.zextOrSelf(TargetWidth);
	}

	// Return true if this node is a setcc, or is a select_cc
	// that selects between the target values used for true and false, making it
	// equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to
	// the appropriate nodes based on the type of node we are checking. This
	// simplifies life a bit for the callers. LHS, RHS and CC are left untouched
	// when false is returned.
	bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
	                                    SDValue &CC) const {
	  if (N.getOpcode() == ISD::SETCC) {
	    LHS = N.getOperand(0);
	    RHS = N.getOperand(1);
	    CC = N.getOperand(2);
	    return true;
	  }

	  // Otherwise only a select_cc picking the target's true/false constants
	  // qualifies.
	  if (N.getOpcode() != ISD::SELECT_CC ||
	      !TLI.isConstTrueVal(N.getOperand(2).getNode()) ||
	      !TLI.isConstFalseVal(N.getOperand(3).getNode()))
	    return false;

	  if (TLI.getBooleanContents(N.getValueType()) ==
	      TargetLowering::UndefinedBooleanContent)
	    return false;

	  // For select_cc the condition code is operand 4.
	  LHS = N.getOperand(0);
	  RHS = N.getOperand(1);
	  CC = N.getOperand(4);
	  return true;
	}

	/// Return true if \p N is a SetCC-equivalent operation whose node has only
	/// one use. When this holds, users may invert the operation for free if that
	/// is profitable.
	bool DAGCombiner::isOneUseSetCC(SDValue N) const {
	  // The decomposed operands are not needed here, only the verdict.
	  SDValue Lhs, Rhs, Cond;
	  return isSetCCEquivalent(N, Lhs, Rhs, Cond) && N.getNode()->hasOneUse();
	}

	// \brief Returns the node of \p N if it is a constant float or a build
	// vector of constant floats, and nullptr otherwise.
	static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
	  if (!isa<ConstantFPSDNode>(N) &&
	      !ISD::isBuildVectorOfConstantFPSDNodes(N.getNode()))
	    return nullptr;
	  return N.getNode();
	}

	// Determines if it is a constant integer or a build vector of constant
	// integers (and undefs).
	// Do not permit build vector implicit truncation.
	// When NoOpaques is true, opaque constants are rejected as well.
	static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) {
	  if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N))
	    return !(Const->isOpaque() && NoOpaques);
	  if (N.getOpcode() != ISD::BUILD_VECTOR)
	    return false;
	  unsigned BitWidth = N.getScalarValueSizeInBits();
	  for (const SDValue &Op : N->op_values()) {
	    if (Op.isUndef())
	      continue;
	    ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op);
	    // A width mismatch means the element would be implicitly truncated.
	    if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth ||
	        (Const->isOpaque() && NoOpaques))
	      return false;
	  }
	  return true;
	}

	// Returns true when \p N is a constant null integer, or a splatted vector
	// of a constant null integer (with no undefs).
	// Build vector implicit truncation is not an issue for null values.
	static bool isNullConstantOrNullSplatConstant(SDValue N) {
	  ConstantSDNode *Splat = isConstOrConstSplat(N);
	  return Splat && Splat->isNullValue();
	}

	// Returns true when \p N is a constant integer of one, or a splatted vector
	// of a constant integer of one (with no undefs).
	// Do not permit build vector implicit truncation: the constant must have
	// exactly the scalar width of \p N.
	static bool isOneConstantOrOneSplatConstant(SDValue N) {
	  const unsigned ScalarBits = N.getScalarValueSizeInBits();
	  ConstantSDNode *Splat = isConstOrConstSplat(N);
	  if (!Splat)
	    return false;
	  return Splat->isOne() && Splat->getAPIntValue().getBitWidth() == ScalarBits;
	}

	// Returns true when \p N is a constant integer of all ones, or a splatted
	// vector of a constant integer of all ones (with no undefs).
	// Do not permit build vector implicit truncation: the constant must have
	// exactly the scalar width of \p N.
	static bool isAllOnesConstantOrAllOnesSplatConstant(SDValue N) {
	  const unsigned ScalarBits = N.getScalarValueSizeInBits();
	  ConstantSDNode *Splat = isConstOrConstSplat(N);
	  if (!Splat)
	    return false;
	  return Splat->isAllOnesValue() &&
	         Splat->getAPIntValue().getBitWidth() == ScalarBits;
	}

	// Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with
	// undef's. Both integer-constant and FP-constant build vectors qualify.
	static bool isAnyConstantBuildVector(const SDNode *N) {
	  return ISD::isBuildVectorOfConstantSDNodes(N) ||
	         ISD::isBuildVectorOfConstantFPSDNodes(N);
	}

	// Attempt to match a unary predicate against a scalar/splat constant or
	// every element of a constant BUILD_VECTOR.
	// Returns false if Op is neither a ConstantSDNode nor a BUILD_VECTOR, or if
	// any vector element is not a constant of the vector's scalar type.
	static bool matchUnaryPredicate(SDValue Op,
	                                std::function<bool(ConstantSDNode *)> Match) {
	  if (auto *Cst = dyn_cast<ConstantSDNode>(Op))
	    return Match(Cst);

	  if (ISD::BUILD_VECTOR != Op.getOpcode())
	    return false;

	  EVT SVT = Op.getValueType().getScalarType();
	  for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	    auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(i));
	    // Elements whose type differs from the scalar type are rejected.
	    if (!Cst || Cst->getValueType(0) != SVT || !Match(Cst))
	      return false;
	  }
	  return true;
	}

	// Attempt to match a binary predicate against a pair of scalar/splat constants
	// or every element of a pair of constant BUILD_VECTORs.
	// Returns false if the operand types differ, if either side is not a
	// constant / BUILD_VECTOR of constants, or if any element pair fails Match.
	static bool matchBinaryPredicate(
	    SDValue LHS, SDValue RHS,
	    std::function<bool(ConstantSDNode *, ConstantSDNode *)> Match) {
	  if (LHS.getValueType() != RHS.getValueType())
	    return false;

	  if (auto *LHSCst = dyn_cast<ConstantSDNode>(LHS))
	    if (auto *RHSCst = dyn_cast<ConstantSDNode>(RHS))
	      return Match(LHSCst, RHSCst);

	  if (ISD::BUILD_VECTOR != LHS.getOpcode() ||
	      ISD::BUILD_VECTOR != RHS.getOpcode())
	    return false;

	  EVT SVT = LHS.getValueType().getScalarType();
	  for (unsigned i = 0, e = LHS.getNumOperands(); i != e; ++i) {
	    auto *LHSCst = dyn_cast<ConstantSDNode>(LHS.getOperand(i));
	    auto *RHSCst = dyn_cast<ConstantSDNode>(RHS.getOperand(i));
	    if (!LHSCst || !RHSCst)
	      return false;
	    // Both elements must have exactly the vector's scalar type.
	    if (LHSCst->getValueType(0) != SVT ||
	        LHSCst->getValueType(0) != RHSCst->getValueType(0))
	      return false;
	    if (!Match(LHSCst, RHSCst))
	      return false;
	  }
	  return true;
	}

	/// Tries to reassociate (Opc N0, N1) when one operand is itself an Opc node
	/// with a constant second operand, so that constants end up together.
	/// Returns the rewritten node, or an empty SDValue when nothing applies.
	SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
	                                    SDValue N1) {
	  const EVT VT = N0.getValueType();

	  // Left operand is (op x, c1).
	  SDNode *LeftInnerC = nullptr;
	  if (N0.getOpcode() == Opc)
	    LeftInnerC = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
	  if (LeftInnerC) {
	    if (SDNode *OuterC = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
	      // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
	      SDValue Folded =
	          DAG.FoldConstantArithmetic(Opc, DL, VT, LeftInnerC, OuterC);
	      if (!Folded)
	        return SDValue();
	      return DAG.getNode(Opc, DL, VT, N0.getOperand(0), Folded);
	    }
	    if (N0.hasOneUse()) {
	      // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one
	      // use
	      SDValue Inner = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
	      if (!Inner.getNode())
	        return SDValue();
	      AddToWorklist(Inner.getNode());
	      return DAG.getNode(Opc, DL, VT, Inner, N0.getOperand(1));
	    }
	  }

	  // Right operand is (op x, c1).
	  SDNode *RightInnerC = nullptr;
	  if (N1.getOpcode() == Opc)
	    RightInnerC = DAG.isConstantIntBuildVectorOrConstantInt(N1.getOperand(1));
	  if (RightInnerC) {
	    if (SDNode *OuterC = DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
	      // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
	      SDValue Folded =
	          DAG.FoldConstantArithmetic(Opc, DL, VT, RightInnerC, OuterC);
	      if (!Folded)
	        return SDValue();
	      return DAG.getNode(Opc, DL, VT, N1.getOperand(0), Folded);
	    }
	    if (N1.hasOneUse()) {
	      // reassoc. (op x, (op y, c1)) -> (op (op x, y), c1) iff x+c1 has one
	      // use
	      SDValue Inner = DAG.getNode(Opc, SDLoc(N0), VT, N0, N1.getOperand(0));
	      if (!Inner.getNode())
	        return SDValue();
	      AddToWorklist(Inner.getNode());
	      return DAG.getNode(Opc, DL, VT, Inner, N1.getOperand(1));
	    }
	  }

	  return SDValue();
	}

	/// Replaces all uses of the results of \p N with the \p NumTo values in the
	/// array \p To (one per result of \p N). When \p AddTo is true the
	/// replacement nodes and their users are pushed onto the worklist. \p N is
	/// deleted if it ends up without uses. Returns SDValue(N, 0).
	SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
	                               bool AddTo) {
	  assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
	  ++NodesCombined;
	  DEBUG(dbgs() << "\nReplacing.1 ";
	        N->dump(&DAG);
	        dbgs() << "\nWith: ";
	        To[0].getNode()->dump(&DAG);
	        dbgs() << " and " << NumTo-1 << " other values\n");
	  // Each non-null replacement must have the type of the result it replaces.
	  for (unsigned i = 0, e = NumTo; i != e; ++i)
	    assert((!To[i].getNode() ||
	            N->getValueType(i) == To[i].getValueType()) &&
	           "Cannot combine value to value of different type!");

	  // Keep the worklist consistent while the replacement deletes nodes.
	  WorklistRemover DeadNodes(*this);
	  DAG.ReplaceAllUsesWith(N, To);
	  if (AddTo) {
	    // Push the new nodes and any users onto the worklist
	    for (unsigned i = 0, e = NumTo; i != e; ++i) {
	      if (To[i].getNode()) {
	        AddToWorklist(To[i].getNode());
	        AddUsersToWorklist(To[i].getNode());
	      }
	    }
	  }

	  // Finally, if the node is now dead, remove it from the graph. The node
	  // may not be dead if the replacement process recursively simplified to
	  // something else needing this node.
	  if (N->use_empty())
	    deleteAndRecombine(N);
	  return SDValue(N, 0);
	}

	/// Applies the replacement recorded in \p TLO: every use of TLO.Old becomes a
	/// use of TLO.New, the new node and its users are re-queued, and the old node
	/// is deleted if nothing uses it any more.
	void DAGCombiner::
	CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
	  // Nodes that become isomorphic to existing ones during the replacement are
	  // deleted; the listener makes sure they also leave our worklist.
	  WorklistRemover DeadNodes(*this);
	  DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);

	  // Queue the replacement and its (possibly new) users.
	  SDNode *NewNode = TLO.New.getNode();
	  AddToWorklist(NewNode);
	  AddUsersToWorklist(NewNode);

	  // The old node may still be alive if the replacement process recursively
	  // simplified to something that needs it; only delete it when it is dead.
	  SDNode *OldNode = TLO.Old.getNode();
	  if (OldNode->use_empty())
	    deleteAndRecombine(OldNode);
	}

	/// Asks the target to simplify \p Op (or things it uses) given that only the
	/// bits in \p Demanded are needed. On success the replacement is committed to
	/// the DAG and true is returned; otherwise returns false.
	bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
	  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
	  KnownBits Known;
	  const bool Simplified = TLI.SimplifyDemandedBits(Op, Demanded, Known, TLO);
	  if (Simplified) {
	    // Revisit the node.
	    AddToWorklist(Op.getNode());

	    // Replace the old value with the new one.
	    ++NodesCombined;
	    DEBUG(dbgs() << "\nReplacing.2 ";
	          TLO.Old.getNode()->dump(&DAG);
	          dbgs() << "\nWith: ";
	          TLO.New.getNode()->dump(&DAG);
	          dbgs() << '\n');

	    CommitTargetLoweringOpt(TLO);
	  }
	  return Simplified;
	}

	void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode Load, SDNode ExtLoad) {
	SDLoc DL(Load);
	EVT VT = Load->getValueType(0);
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0));

	DEBUG(dbgs() << "\nReplacing.9 ";
	Load->dump(&DAG);
	dbgs() << "\nWith: ";
	Trunc.getNode()->dump(&DAG);
	dbgs() << '\n');
	WorklistRemover DeadNodes(*this);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
	deleteAndRecombine(Load);
	AddToWorklist(Trunc.getNode());
	}

	/// Returns \p Op widened to the promoted type \p PVT, or an empty SDValue if
	/// that is not possible. \p Replace is set to true only when \p Op was an
	/// unindexed load and the result is a new extending load.
	SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
	  Replace = false;
	  SDLoc DL(Op);

	  // Unindexed loads are re-issued as extending loads of the wider type.
	  if (ISD::isUNINDEXEDLoad(Op.getNode())) {
	    LoadSDNode *Load = cast<LoadSDNode>(Op);
	    EVT MemVT = Load->getMemoryVT();

	    // Keep an existing extension kind; for plain loads prefer ZEXTLOAD when
	    // the target supports it and fall back to EXTLOAD otherwise.
	    ISD::LoadExtType ExtType;
	    if (!ISD::isNON_EXTLoad(Load))
	      ExtType = Load->getExtensionType();
	    else if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT))
	      ExtType = ISD::ZEXTLOAD;
	    else
	      ExtType = ISD::EXTLOAD;

	    Replace = true;
	    return DAG.getExtLoad(ExtType, DL, PVT,
	                          Load->getChain(), Load->getBasePtr(),
	                          MemVT, Load->getMemOperand());
	  }

	  const unsigned Opc = Op.getOpcode();
	  if (Opc == ISD::AssertSext) {
	    if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT))
	      return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1));
	  } else if (Opc == ISD::AssertZext) {
	    if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT))
	      return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1));
	  } else if (Opc == ISD::Constant) {
	    unsigned ExtOpc = ISD::ZERO_EXTEND;
	    if (Op.getValueType().isByteSized())
	      ExtOpc = ISD::SIGN_EXTEND;
	    return DAG.getNode(ExtOpc, DL, PVT, Op);
	  }

	  // Everything else (including failed Assert* promotion) falls back to an
	  // ANY_EXTEND, provided the target considers it legal for PVT.
	  if (TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
	    return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op);
	  return SDValue();
	}

	/// Promotes \p Op to \p PVT and sign-extends the result in-register from
	/// \p Op's original type. Returns an empty SDValue if SIGN_EXTEND_INREG is
	/// not legal for \p PVT or if the operand cannot be promoted.
	SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
	  if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
	    return SDValue();

	  // Capture type and location up front: replacing a promoted load below
	  // deletes Op's node.
	  const EVT OrigVT = Op.getValueType();
	  SDLoc DL(Op);

	  bool ReplaceLoad = false;
	  SDValue Promoted = PromoteOperand(Op, PVT, ReplaceLoad);
	  if (!Promoted.getNode())
	    return SDValue();
	  AddToWorklist(Promoted.getNode());

	  if (ReplaceLoad)
	    ReplaceLoadWithPromotedLoad(Op.getNode(), Promoted.getNode());

	  SDValue InRegVT = DAG.getValueType(OrigVT);
	  return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Promoted.getValueType(),
	                     Promoted, InRegVT);
	}

	/// Promote \p Op to \p PVT and zero-extend it in register from the original
	/// type of \p Op.
	///
	/// Returns a null SDValue when PromoteOperand() could not produce a promoted
	/// value.
	SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
	  EVT NarrowVT = Op.getValueType();
	  SDLoc DL(Op);

	  bool WasLoad = false;
	  SDValue Wide = PromoteOperand(Op, PVT, WasLoad);
	  if (!Wide)
	    return SDValue();
	  AddToWorklist(Wide.getNode());

	  // A promoted load must also take over the other users of the old load.
	  if (WasLoad)
	    ReplaceLoadWithPromotedLoad(Op.getNode(), Wide.getNode());

	  return DAG.getZeroExtendInReg(Wide, DL, NarrowVT);
	}

	/// Promote the specified integer binary operation if the target indicates it is
	/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
	/// i32 since i16 instructions are longer.
	///
	/// Only runs once operations are legal and only for scalar integer types.
	/// When the promotion is performed the replacement is installed with
	/// CombineTo() and \p Op itself is returned; otherwise a null SDValue is
	/// returned.
	SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
	  if (!LegalOperations)
	    return SDValue();

	  EVT VT = Op.getValueType();
	  if (VT.isVector() || !VT.isInteger())
	    return SDValue();

	  // If operation type is 'undesirable', e.g. i16 on x86, consider
	  // promoting it.
	  unsigned Opc = Op.getOpcode();
	  if (TLI.isTypeDesirableForOp(Opc, VT))
	    return SDValue();

	  EVT PVT = VT;
	  // Consult target whether it is a good idea to promote this operation and
	  // what's the right type to promote it to.
	  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
	    assert(PVT != VT && "Don't know what type to promote to!");

	    DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));

	    // NOTE(review): PromoteOperand() can return a null SDValue, and unlike
	    // PromoteIntShiftOp() the results are not checked here - confirm that
	    // targets requesting promotion always make ANY_EXTEND to PVT legal.
	    bool Replace0 = false;
	    SDValue N0 = Op.getOperand(0);
	    SDValue NN0 = PromoteOperand(N0, PVT, Replace0);

	    bool Replace1 = false;
	    SDValue N1 = Op.getOperand(1);
	    SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
	    SDLoc DL(Op);

	    // Perform the operation in the wide type and truncate the result back.
	    SDValue RV =
	        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));

	    // We are always replacing N0/N1's use in N and only need
	    // additional replacements if there are additional uses.
	    Replace0 &= !N0->hasOneUse();
	    Replace1 &= (N0 != N1) && !N1->hasOneUse();

	    // Combine Op here so it is preserved past replacements.
	    CombineTo(Op.getNode(), RV);

	    // If operands have a use ordering, make sure we deal with
	    // predecessor first.
	    if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) {
	      std::swap(N0, N1);
	      std::swap(NN0, NN1);
	    }

	    if (Replace0) {
	      AddToWorklist(NN0.getNode());
	      ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
	    }
	    if (Replace1) {
	      AddToWorklist(NN1.getNode());
	      ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
	    }
	    return Op;
	  }
	  return SDValue();
	}

	/// Promote the specified integer shift operation if the target indicates it is
	/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
	/// i32 since i16 instructions are longer.
	///
	/// Only runs once operations are legal and only for scalar integer types.
	/// Returns the truncated wide shift on success, or a null SDValue when no
	/// promotion happened or \p Op was deleted along the way.
	SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
	  if (!LegalOperations)
	    return SDValue();

	  EVT VT = Op.getValueType();
	  if (VT.isVector() || !VT.isInteger())
	    return SDValue();

	  // If operation type is 'undesirable', e.g. i16 on x86, consider
	  // promoting it.
	  unsigned Opc = Op.getOpcode();
	  if (TLI.isTypeDesirableForOp(Opc, VT))
	    return SDValue();

	  EVT PVT = VT;
	  // Consult target whether it is a good idea to promote this operation and
	  // what's the right type to promote it to.
	  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
	    assert(PVT != VT && "Don't know what type to promote to!");

	    DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));

	    bool Replace = false;
	    SDValue N0 = Op.getOperand(0);
	    SDValue N1 = Op.getOperand(1);
	    // Right shifts pull in the high bits of the widened value, so SRA
	    // uses the sign-extending promotion and SRL the zero-extending one.
	    // Every other opcode uses the generic promotion.
	    if (Opc == ISD::SRA)
	      N0 = SExtPromoteOperand(N0, PVT);
	    else if (Opc == ISD::SRL)
	      N0 = ZExtPromoteOperand(N0, PVT);
	    else
	      N0 = PromoteOperand(N0, PVT, Replace);

	    if (!N0.getNode())
	      return SDValue();

	    SDLoc DL(Op);
	    SDValue RV =
	        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));

	    AddToWorklist(N0.getNode());
	    if (Replace)
	      ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());

	    // Deal with Op being deleted.
	    if (Op && Op.getOpcode() != ISD::DELETED_NODE)
	      return RV;
	  }
	  return SDValue();
	}

	/// Try to simplify an extension node whose result type the target considers
	/// undesirable.
	///
	/// Only runs once operations are legal and only for scalar integer types.
	/// When the target asks for promotion, a node with the same opcode, result
	/// type and operand is requested from the DAG again (presumably so that
	/// getNode() can apply the folds listed below); otherwise a null SDValue is
	/// returned.
	SDValue DAGCombiner::PromoteExtend(SDValue Op) {
	  if (!LegalOperations)
	    return SDValue();

	  EVT VT = Op.getValueType();
	  if (VT.isVector() || !VT.isInteger())
	    return SDValue();

	  // If operation type is 'undesirable', e.g. i16 on x86, consider
	  // promoting it.
	  unsigned Opc = Op.getOpcode();
	  if (TLI.isTypeDesirableForOp(Opc, VT))
	    return SDValue();

	  EVT PVT = VT;
	  // Consult target whether it is a good idea to promote this operation and
	  // what's the right type to promote it to.
	  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
	    assert(PVT != VT && "Don't know what type to promote to!");
	    // fold (aext (aext x)) -> (aext x)
	    // fold (aext (zext x)) -> (zext x)
	    // fold (aext (sext x)) -> (sext x)
	    DEBUG(dbgs() << "\nPromoting ";
	          Op.getNode()->dump(&DAG));
	    return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
	  }
	  return SDValue();
	}

	/// Promote an unindexed scalar integer load of an undesirable type to an
	/// extending load of the type chosen by the target.
	///
	/// All users of the loaded value are redirected to a TRUNCATE of the new
	/// load, users of result 1 are redirected to result 1 of the new load, and
	/// the old load is handed to deleteAndRecombine(). Returns true when the
	/// load was replaced, false otherwise.
	bool DAGCombiner::PromoteLoad(SDValue Op) {
	  if (!LegalOperations)
	    return false;

	  if (!ISD::isUNINDEXEDLoad(Op.getNode()))
	    return false;

	  EVT VT = Op.getValueType();
	  if (VT.isVector() || !VT.isInteger())
	    return false;

	  // If operation type is 'undesirable', e.g. i16 on x86, consider
	  // promoting it.
	  unsigned Opc = Op.getOpcode();
	  if (TLI.isTypeDesirableForOp(Opc, VT))
	    return false;

	  EVT PVT = VT;
	  // Consult target whether it is a good idea to promote this operation and
	  // what's the right type to promote it to.
	  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
	    assert(PVT != VT && "Don't know what type to promote to!");

	    SDLoc DL(Op);
	    SDNode *N = Op.getNode();
	    LoadSDNode *LD = cast<LoadSDNode>(N);
	    EVT MemVT = LD->getMemoryVT();
	    // An already-extending load keeps its extension kind; a plain load
	    // prefers ZEXTLOAD when that is legal and falls back to EXTLOAD.
	    ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
	      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
	                                                       : ISD::EXTLOAD)
	      : LD->getExtensionType();
	    SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT,
	                                   LD->getChain(), LD->getBasePtr(),
	                                   MemVT, LD->getMemOperand());
	    SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);

	    DEBUG(dbgs() << "\nPromoting ";
	          N->dump(&DAG);
	          dbgs() << "\nTo: ";
	          Result.getNode()->dump(&DAG);
	          dbgs() << '\n');
	    // Keep the worklist in sync with any node removed by the replacements.
	    WorklistRemover DeadNodes(*this);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
	    deleteAndRecombine(N);
	    AddToWorklist(Result.getNode());
	    return true;
	  }
	  return false;
	}

	/// \brief Delete \p N if it has no uses, then keep deleting any operand that
	/// becomes use-free as a result.
	///
	/// Deleted nodes are also removed from the worklist. Operands that lose a
	/// user but stay alive are added to the worklist, since a reduced use count
	/// may enable other combines. Returns false (and does nothing) when \p N
	/// still has uses, true otherwise.
	bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
	  if (!N->use_empty())
	    return false;

	  SmallSetVector<SDNode *, 16> Pending;
	  Pending.insert(N);
	  while (!Pending.empty()) {
	    SDNode *Cur = Pending.pop_back_val();
	    if (!Cur)
	      continue;

	    // Still referenced: just revisit it later.
	    if (!Cur->use_empty()) {
	      AddToWorklist(Cur);
	      continue;
	    }

	    // Dead: remember its operands before the node goes away.
	    for (const SDValue &Operand : Cur->op_values())
	      Pending.insert(Operand.getNode());

	    removeFromWorklist(Cur);
	    DAG.DeleteNode(Cur);
	  }
	  return true;
	}

	//===----------------------------------------------------------------------===//
	// Main DAG Combiner implementation
	//===----------------------------------------------------------------------===//

	/// Run the combiner over the whole DAG at combine level \p AtLevel.
	///
	/// Every node is placed on the worklist; nodes are then popped one at a time,
	/// deleted if dead, optionally re-legalized, and passed to combine(). When
	/// combine() yields a different node, all uses of the old node are redirected
	/// to it and the new node plus its users are queued again. The loop ends when
	/// the worklist map is empty, after which the DAG root is refreshed and dead
	/// nodes are removed.
	void DAGCombiner::Run(CombineLevel AtLevel) {
	// Set the instance variables, so that the various visit routines may use them.
	Level = AtLevel;
	LegalOperations = Level >= AfterLegalizeVectorOps;
	LegalTypes = Level >= AfterLegalizeTypes;

	// Add all the dag nodes to the worklist.
	for (SDNode &Node : DAG.allnodes())
	AddToWorklist(&Node);

	// Create a dummy node (which is not added to allnodes), that adds a reference
	// to the root node, preventing it from being deleted, and tracking any
	// changes of the root.
	HandleSDNode Dummy(DAG.getRoot());

	// While the worklist isn't empty, find a node and try to combine it.
	while (!WorklistMap.empty()) {
	SDNode *N;
	// The Worklist holds the SDNodes in order, but it may contain null entries.
	do {
	N = Worklist.pop_back_val();
	} while (!N);

	// The (void) cast keeps the variable "used" when assert() compiles away.
	bool GoodWorklistEntry = WorklistMap.erase(N);
	(void)GoodWorklistEntry;
	assert(GoodWorklistEntry &&
	"Found a worklist entry without a corresponding map entry!");

	// If N has no uses, it is dead. Make sure to revisit all N's operands once
	// N is deleted from the DAG, since they too may now be dead or may have a
	// reduced number of uses, allowing other xforms.
	if (recursivelyDeleteUnusedNodes(N))
	continue;

	WorklistRemover DeadNodes(*this);

	// If this combine is running after legalizing the DAG, re-legalize any
	// nodes pulled off the worklist.
	if (Level == AfterLegalizeDAG) {
	SmallSetVector<SDNode *, 16> UpdatedNodes;
	bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);

	// Everything the legalizer touched, and its users, must be revisited.
	for (SDNode *LN : UpdatedNodes) {
	AddToWorklist(LN);
	AddUsersToWorklist(LN);
	}
	// N is no longer usable after legalization; move on to the next entry.
	if (!NIsValid)
	continue;
	}

	DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));

	// Add any operands of the new node which have not yet been combined to the
	// worklist as well. Because the worklist uniques things already, this
	// won't repeatedly process the same operand.
	CombinedNodes.insert(N);
	for (const SDValue &ChildN : N->op_values())
	if (!CombinedNodes.count(ChildN.getNode()))
	AddToWorklist(ChildN.getNode());

	SDValue RV = combine(N);

	// A null result means no combine applied to N.
	if (!RV.getNode())
	continue;

	++NodesCombined;

	// If we get back the same node we passed in, rather than a new node or
	// zero, we know that the node must have defined multiple values and
	// CombineTo was used. Since CombineTo takes care of the worklist
	// mechanics for us, we have no work to do in this case.
	if (RV.getNode() == N)
	continue;

	assert(N->getOpcode() != ISD::DELETED_NODE &&
	RV.getOpcode() != ISD::DELETED_NODE &&
	"Node was deleted but visit returned new node!");

	DEBUG(dbgs() << " ... into: ";
	RV.getNode()->dump(&DAG));

	// Same number of results: replace node-for-node. Otherwise N must have a
	// single result of RV's type and only that value is replaced.
	if (N->getNumValues() == RV.getNode()->getNumValues())
	DAG.ReplaceAllUsesWith(N, RV.getNode());
	else {
	assert(N->getValueType(0) == RV.getValueType() &&
	N->getNumValues() == 1 && "Type mismatch");
	DAG.ReplaceAllUsesWith(N, &RV);
	}

	// Push the new node and any users onto the worklist
	AddToWorklist(RV.getNode());
	AddUsersToWorklist(RV.getNode());

	// Finally, if the node is now dead, remove it from the graph. The node
	// may not be dead if the replacement process recursively simplified to
	// something else needing this node. This will also take care of adding any
	// operands which have lost a user to the worklist.
	recursivelyDeleteUnusedNodes(N);
	}

	// If the root changed (e.g. it was a dead load), update the root.
	DAG.setRoot(Dummy.getValue());
	DAG.RemoveDeadNodes();
	}

	/// Dispatch \p N to the visit routine for its opcode.
	///
	/// Several opcodes share one handler (SREM/UREM, the integer min/max family,
	/// ROTR/ROTL, AssertSext/AssertZext). Opcodes without an entry return a null
	/// SDValue; combine() treats a null result as "nothing happened".
	SDValue DAGCombiner::visit(SDNode *N) {
	switch (N->getOpcode()) {
	default: break;
	case ISD::TokenFactor: return visitTokenFactor(N);
	case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
	case ISD::ADD: return visitADD(N);
	case ISD::SUB: return visitSUB(N);
	case ISD::ADDC: return visitADDC(N);
	case ISD::UADDO: return visitUADDO(N);
	case ISD::SUBC: return visitSUBC(N);
	case ISD::USUBO: return visitUSUBO(N);
	case ISD::ADDE: return visitADDE(N);
	case ISD::ADDCARRY: return visitADDCARRY(N);
	case ISD::SUBE: return visitSUBE(N);
	case ISD::SUBCARRY: return visitSUBCARRY(N);
	case ISD::MUL: return visitMUL(N);
	case ISD::SDIV: return visitSDIV(N);
	case ISD::UDIV: return visitUDIV(N);
	case ISD::SREM:
	case ISD::UREM: return visitREM(N);
	case ISD::MULHU: return visitMULHU(N);
	case ISD::MULHS: return visitMULHS(N);
	case ISD::SMUL_LOHI: return visitSMUL_LOHI(N);
	case ISD::UMUL_LOHI: return visitUMUL_LOHI(N);
	case ISD::SMULO: return visitSMULO(N);
	case ISD::UMULO: return visitUMULO(N);
	case ISD::SMIN:
	case ISD::SMAX:
	case ISD::UMIN:
	case ISD::UMAX: return visitIMINMAX(N);
	case ISD::AND: return visitAND(N);
	case ISD::OR: return visitOR(N);
	case ISD::XOR: return visitXOR(N);
	case ISD::SHL: return visitSHL(N);
	case ISD::SRA: return visitSRA(N);
	case ISD::SRL: return visitSRL(N);
	case ISD::ROTR:
	case ISD::ROTL: return visitRotate(N);
	case ISD::ABS: return visitABS(N);
	case ISD::BSWAP: return visitBSWAP(N);
	case ISD::BITREVERSE: return visitBITREVERSE(N);
	case ISD::CTLZ: return visitCTLZ(N);
	case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N);
	case ISD::CTTZ: return visitCTTZ(N);
	case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N);
	case ISD::CTPOP: return visitCTPOP(N);
	case ISD::SELECT: return visitSELECT(N);
	case ISD::VSELECT: return visitVSELECT(N);
	case ISD::SELECT_CC: return visitSELECT_CC(N);
	case ISD::SETCC: return visitSETCC(N);
	case ISD::SETCCE: return visitSETCCE(N);
	case ISD::SETCCCARRY: return visitSETCCCARRY(N);
	case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
	case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
	case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
	case ISD::AssertSext:
	case ISD::AssertZext: return visitAssertExt(N);
	case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
	case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
	case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
	case ISD::TRUNCATE: return visitTRUNCATE(N);
	case ISD::BITCAST: return visitBITCAST(N);
	case ISD::BUILD_PAIR: return visitBUILD_PAIR(N);
	case ISD::FADD: return visitFADD(N);
	case ISD::FSUB: return visitFSUB(N);
	case ISD::FMUL: return visitFMUL(N);
	case ISD::FMA: return visitFMA(N);
	case ISD::FDIV: return visitFDIV(N);
	case ISD::FREM: return visitFREM(N);
	case ISD::FSQRT: return visitFSQRT(N);
	case ISD::FCOPYSIGN: return visitFCOPYSIGN(N);
	case ISD::SINT_TO_FP: return visitSINT_TO_FP(N);
	case ISD::UINT_TO_FP: return visitUINT_TO_FP(N);
	case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
	case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
	case ISD::FP_ROUND: return visitFP_ROUND(N);
	case ISD::FP_ROUND_INREG: return visitFP_ROUND_INREG(N);
	case ISD::FP_EXTEND: return visitFP_EXTEND(N);
	case ISD::FNEG: return visitFNEG(N);
	case ISD::FABS: return visitFABS(N);
	case ISD::FFLOOR: return visitFFLOOR(N);
	case ISD::FMINNUM: return visitFMINNUM(N);
	case ISD::FMAXNUM: return visitFMAXNUM(N);
	case ISD::FCEIL: return visitFCEIL(N);
	case ISD::FTRUNC: return visitFTRUNC(N);
	case ISD::BRCOND: return visitBRCOND(N);
	case ISD::BR_CC: return visitBR_CC(N);
	case ISD::LOAD: return visitLOAD(N);
	case ISD::STORE: return visitSTORE(N);
	case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N);
	case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
	case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
	case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N);
	case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
	case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
	case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N);
	case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N);
	case ISD::MGATHER: return visitMGATHER(N);
	case ISD::MLOAD: return visitMLOAD(N);
	case ISD::MSCATTER: return visitMSCATTER(N);
	case ISD::MSTORE: return visitMSTORE(N);
	case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
	case ISD::FP16_TO_FP: return visitFP16_TO_FP(N);
	}
	// No handler for this opcode.
	return SDValue();
	}

	/// Try every combine strategy on \p N, in order, stopping at the first one
	/// that yields a result:
	///   1. the generic visit routine for the opcode,
	///   2. the target's PerformDAGCombine hook (for target opcodes, or when the
	///      target registered a combine for this opcode),
	///   3. type promotion of selected integer operations and loads,
	///   4. CSE against an existing node with commuted operands.
	/// Returns the replacement value, or a null SDValue when nothing applied.
	SDValue DAGCombiner::combine(SDNode *N) {
	  SDValue RV = visit(N);

	  // If nothing happened, try a target-specific DAG combine.
	  if (!RV.getNode()) {
	    assert(N->getOpcode() != ISD::DELETED_NODE &&
	           "Node was deleted but visit returned NULL!");

	    if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
	        TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {

	      // Expose the DAG combiner to the target combiner impls.
	      TargetLowering::DAGCombinerInfo
	        DagCombineInfo(DAG, Level, false, this);

	      RV = TLI.PerformDAGCombine(N, DagCombineInfo);
	    }
	  }

	  // If nothing happened still, try promoting the operation.
	  if (!RV.getNode()) {
	    switch (N->getOpcode()) {
	    default: break;
	    case ISD::ADD:
	    case ISD::SUB:
	    case ISD::MUL:
	    case ISD::AND:
	    case ISD::OR:
	    case ISD::XOR:
	      RV = PromoteIntBinOp(SDValue(N, 0));
	      break;
	    case ISD::SHL:
	    case ISD::SRA:
	    case ISD::SRL:
	      RV = PromoteIntShiftOp(SDValue(N, 0));
	      break;
	    case ISD::SIGN_EXTEND:
	    case ISD::ZERO_EXTEND:
	    case ISD::ANY_EXTEND:
	      RV = PromoteExtend(SDValue(N, 0));
	      break;
	    case ISD::LOAD:
	      // PromoteLoad() performs the replacement itself; returning N tells
	      // the caller that the worklist has already been handled.
	      if (PromoteLoad(SDValue(N, 0)))
	        RV = SDValue(N, 0);
	      break;
	    }
	  }

	  // If N is a commutative binary node, try eliminate it if the commuted
	  // version is already present in the DAG.
	  if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) &&
	      N->getNumValues() == 1) {
	    SDValue N0 = N->getOperand(0);
	    SDValue N1 = N->getOperand(1);

	    // Constant operands are canonicalized to RHS.
	    if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) {
	      SDValue Ops[] = {N1, N0};
	      SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
	                                            N->getFlags());
	      if (CSENode)
	        return SDValue(CSENode, 0);
	    }
	  }

	  return RV;
	}

	/// Return the first operand of \p N whose value type is MVT::Other (its input
	/// chain), or a null SDValue when there is none.
	///
	/// The first and last operands are probed before the interior ones.
	static SDValue getInputChainForNode(SDNode *N) {
	  const unsigned NumOps = N->getNumOperands();
	  if (NumOps == 0)
	    return SDValue();

	  const unsigned LastIdx = NumOps - 1;

	  SDValue Front = N->getOperand(0);
	  if (Front.getValueType() == MVT::Other)
	    return Front;

	  SDValue Back = N->getOperand(LastIdx);
	  if (Back.getValueType() == MVT::Other)
	    return Back;

	  // Interior operands, in order. The loop is empty for one or two operands.
	  for (unsigned Idx = 1; Idx < LastIdx; ++Idx) {
	    SDValue Candidate = N->getOperand(Idx);
	    if (Candidate.getValueType() == MVT::Other)
	      return Candidate;
	  }
	  return SDValue();
	}

	/// Simplify a TokenFactor node.
	///
	/// Three things are attempted:
	///  - a two-operand TokenFactor where one operand's input chain is the other
	///    operand collapses to the first of those operands;
	///  - single-use TokenFactor operands are flattened into this node, and entry
	///    tokens and duplicate operands are dropped;
	///  - operands that are reached by walking up the chain of another operand
	///    are pruned (the walk visits at most 1024 worklist entries).
	/// Returns the replacement chain, or a null SDValue when nothing changed.
	SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
	// If N has two operands, where one has an input chain equal to the other,
	// the 'other' chain is redundant.
	if (N->getNumOperands() == 2) {
	if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
	return N->getOperand(0);
	if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
	return N->getOperand(1);
	}

	SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
	SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
	SmallPtrSet<SDNode*, 16> SeenOps; // Nodes already present in Ops.
	bool Changed = false; // If we should replace this token factor.

	// Start out with this token factor.
	TFs.push_back(N);

	// Iterate through token factors. The TFs list grows when new token factors
	// are encountered.
	for (unsigned i = 0; i < TFs.size(); ++i) {
	SDNode *TF = TFs[i];

	// Check each of the operands.
	for (const SDValue &Op : TF->op_values()) {
	switch (Op.getOpcode()) {
	case ISD::EntryToken:
	// Entry tokens don't need to be added to the list. They are
	// redundant.
	Changed = true;
	break;

	case ISD::TokenFactor:
	if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
	// Queue up for processing.
	TFs.push_back(Op.getNode());
	// Clean up in case the token factor is removed.
	AddToWorklist(Op.getNode());
	Changed = true;
	break;
	}
	// A multi-use or already queued token factor is kept as a plain operand.
	LLVM_FALLTHROUGH;

	default:
	// Only add if it isn't already in the list.
	if (SeenOps.insert(Op.getNode()).second)
	Ops.push_back(Op);
	else
	Changed = true;
	break;
	}
	}
	}

	// Remove Nodes that are chained to another node in the list. Do so
	// by walking up chains breadth-first stopping when we've seen
	// another operand. In general we must climb to the EntryNode, but we can exit
	// early if we find all remaining work is associated with just one operand as
	// no further pruning is possible.

	// List of nodes to search through and original Ops from which they originate.
	SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
	SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
	SmallPtrSet<SDNode *, 16> SeenChains;
	bool DidPruneOps = false;

	// Seed the search with one worklist entry per operand.
	unsigned NumLeftToConsider = 0;
	for (const SDValue &Op : Ops) {
	Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
	OpWorkCount.push_back(1);
	}

	// Note: this lambda shadows the member function DAGCombiner::AddToWorklist
	// for the remainder of this function.
	auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
	// If this is an Op, we can remove the op from the list. Re-mark any
	// search associated with it as from the current OpNumber.
	if (SeenOps.count(Op) != 0) {
	Changed = true;
	DidPruneOps = true;
	unsigned OrigOpNumber = 0;
	while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
	OrigOpNumber++;
	assert((OrigOpNumber != Ops.size()) &&
	"expected to find TokenFactor Operand");
	// Re-mark worklist from OrigOpNumber to OpNumber
	for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
	if (Worklist[i].second == OrigOpNumber) {
	Worklist[i].second = OpNumber;
	}
	}
	OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
	OpWorkCount[OrigOpNumber] = 0;
	NumLeftToConsider--;
	}
	// Add if it's a new chain
	if (SeenChains.insert(Op).second) {
	OpWorkCount[OpNumber]++;
	Worklist.push_back(std::make_pair(Op, OpNumber));
	}
	};

	// The "i < 1024" bound caps the number of worklist entries examined.
	for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
	// We need to consider at least 2 Ops to prune.
	if (NumLeftToConsider <= 1)
	break;
	auto CurNode = Worklist[i].first;
	auto CurOpNumber = Worklist[i].second;
	assert((OpWorkCount[CurOpNumber] > 0) &&
	"Node should not appear in worklist");
	switch (CurNode->getOpcode()) {
	case ISD::EntryToken:
	// Hitting EntryToken is the only way for the search to terminate without
	// hitting another operand's search. Prevent us from marking this operand
	// considered.
	NumLeftToConsider++;
	break;
	case ISD::TokenFactor:
	for (const SDValue &Op : CurNode->op_values())
	AddToWorklist(i, Op.getNode(), CurOpNumber);
	break;
	case ISD::CopyFromReg:
	case ISD::CopyToReg:
	AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
	break;
	default:
	if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
	AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
	break;
	}
	// This worklist entry is done; an operand with no work left is settled.
	OpWorkCount[CurOpNumber]--;
	if (OpWorkCount[CurOpNumber] == 0)
	NumLeftToConsider--;
	}

	// If we've changed things around then replace token factor.
	if (Changed) {
	SDValue Result;
	if (Ops.empty()) {
	// The entry token is the only possible outcome.
	Result = DAG.getEntryNode();
	} else {
	if (DidPruneOps) {
	SmallVector<SDValue, 8> PrunedOps;
	// Keep only the operands that were not reached while walking up the
	// chain of another operand.
	for (const SDValue &Op : Ops) {
	if (SeenChains.count(Op.getNode()) == 0)
	PrunedOps.push_back(Op);
	}
	Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps);
	} else {
	Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
	}
	}
	return Result;
	}
	return SDValue();
	}

	/// MERGE_VALUES can always be eliminated: each result is replaced by the
	/// operand with the same index, then the node is deleted.
	SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
	  WorklistRemover DeadNodes(*this);

	  // Queue the users first so that they get retried once their operands have
	  // been rewritten.
	  AddUsersToWorklist(N);

	  // Replacing results may cause a different MERGE_VALUES to be CSE'd with N
	  // and bring its uses along, so repeat until N really has no uses left.
	  // Only then is it safe to delete the node.
	  for (;;) {
	    const unsigned NumOps = N->getNumOperands();
	    for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
	      SDValue Result(N, Idx);
	      DAG.ReplaceAllUsesOfValueWith(Result, N->getOperand(Idx));
	    }
	    if (N->use_empty())
	      break;
	  }

	  deleteAndRecombine(N);
	  return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}

	/// Return \p N as a ConstantSDNode pointer when it is a ConstantSDNode whose
	/// isOpaque() is false; return nullptr in every other case.
	static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
	  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
	    if (!C->isOpaque())
	      return C;
	  return nullptr;
	}

	/// Fold a binary operator with a constant right-hand side into a single-use
	/// select of constants on its left-hand side:
	///   binop (select Cond, CT, CF), C1 --> select Cond, CT op C1, CF op C1
	///
	/// \p BO must be one of the integer or floating-point binary opcodes listed
	/// in the assertion below. Returns the new select, or a null SDValue when the
	/// pattern does not match or either arm fails to fold to a constant/undef.
	SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
	  auto BinOpcode = BO->getOpcode();
	  assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB ||
	          BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV ||
	          BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM ||
	          BinOpcode == ISD::UREM || BinOpcode == ISD::AND ||
	          BinOpcode == ISD::OR || BinOpcode == ISD::XOR ||
	          BinOpcode == ISD::SHL || BinOpcode == ISD::SRL ||
	          BinOpcode == ISD::SRA || BinOpcode == ISD::FADD ||
	          BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL ||
	          BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) &&
	         "Unexpected binary operator");

	  // Bail out if any constants are opaque because we can't constant fold those.
	  SDValue C1 = BO->getOperand(1);
	  if (!isConstantOrConstantVector(C1, true) &&
	      !isConstantFPBuildVectorOrConstantFP(C1))
	    return SDValue();

	  // Don't do this unless the old select is going away. We want to eliminate the
	  // binary operator, not replace a binop with a select.
	  // TODO: Handle ISD::SELECT_CC.
	  SDValue Sel = BO->getOperand(0);
	  if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
	    return SDValue();

	  SDValue CT = Sel.getOperand(1);
	  if (!isConstantOrConstantVector(CT, true) &&
	      !isConstantFPBuildVectorOrConstantFP(CT))
	    return SDValue();

	  SDValue CF = Sel.getOperand(2);
	  if (!isConstantOrConstantVector(CF, true) &&
	      !isConstantFPBuildVectorOrConstantFP(CF))
	    return SDValue();

	  // We have a select-of-constants followed by a binary operator with a
	  // constant. Eliminate the binop by pulling the constant math into the select.
	  // Example: add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1
	  EVT VT = Sel.getValueType();
	  SDLoc DL(Sel);
	  // Give up unless the true arm folded to undef or a non-opaque constant.
	  SDValue NewCT = DAG.getNode(BinOpcode, DL, VT, CT, C1);
	  if (!NewCT.isUndef() &&
	      !isConstantOrConstantVector(NewCT, true) &&
	      !isConstantFPBuildVectorOrConstantFP(NewCT))
	    return SDValue();

	  // Same requirement for the false arm.
	  SDValue NewCF = DAG.getNode(BinOpcode, DL, VT, CF, C1);
	  if (!NewCF.isUndef() &&
	      !isConstantOrConstantVector(NewCF, true) &&
	      !isConstantFPBuildVectorOrConstantFP(NewCF))
	    return SDValue();

	  return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
	}

	/// Combine an ISD::ADD node.
	///
	/// The folds are tried in the order written below and the first match wins.
	/// Returns the replacement value, SDValue(N, 0) when SimplifyDemandedBits()
	/// reported a change for N, or a null SDValue when no fold applied.
	SDValue DAGCombiner::visitADD(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();
	  SDLoc DL(N);

	  // fold vector ops
	  if (VT.isVector()) {
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	    // fold (add x, 0) -> x, vector edition
	    if (ISD::isBuildVectorAllZeros(N1.getNode()))
	      return N0;
	    if (ISD::isBuildVectorAllZeros(N0.getNode()))
	      return N1;
	  }

	  // fold (add x, undef) -> undef
	  if (N0.isUndef())
	    return N0;

	  if (N1.isUndef())
	    return N1;

	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
	    // canonicalize constant to RHS
	    if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
	      return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
	    // fold (add c1, c2) -> c1+c2
	    return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, N0.getNode(),
	                                      N1.getNode());
	  }

	  // fold (add x, 0) -> x
	  if (isNullConstant(N1))
	    return N0;

	  if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) {
	    // fold ((c1-A)+c2) -> (c1+c2)-A
	    if (N0.getOpcode() == ISD::SUB &&
	        isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) {
	      // FIXME: Adding 2 constants should be handled by FoldConstantArithmetic.
	      return DAG.getNode(ISD::SUB, DL, VT,
	                         DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
	                         N0.getOperand(1));
	    }

	    // add (sext i1 X), 1 -> zext (not i1 X)
	    // We don't transform this pattern:
	    //   add (zext i1 X), -1 -> sext (not i1 X)
	    // because most (?) targets generate better code for the zext form.
	    if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
	        isOneConstantOrOneSplatConstant(N1)) {
	      SDValue X = N0.getOperand(0);
	      if ((!LegalOperations ||
	           (TLI.isOperationLegal(ISD::XOR, X.getValueType()) &&
	            TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) &&
	          X.getScalarValueSizeInBits() == 1) {
	        SDValue Not = DAG.getNOT(DL, X, X.getValueType());
	        return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
	      }
	    }

	    // Undo the add -> or combine to merge constant offsets from a frame index.
	    if (N0.getOpcode() == ISD::OR &&
	        isa<FrameIndexSDNode>(N0.getOperand(0)) &&
	        isa<ConstantSDNode>(N0.getOperand(1)) &&
	        DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) {
	      SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1));
	      return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);
	    }
	  }

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // reassociate add
	  if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1))
	    return RADD;

	  // fold ((0-A) + B) -> B-A
	  if (N0.getOpcode() == ISD::SUB &&
	      isNullConstantOrNullSplatConstant(N0.getOperand(0)))
	    return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));

	  // fold (A + (0-B)) -> A-B
	  if (N1.getOpcode() == ISD::SUB &&
	      isNullConstantOrNullSplatConstant(N1.getOperand(0)))
	    return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1));

	  // fold (A+(B-A)) -> B
	  if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
	    return N1.getOperand(0);

	  // fold ((B-A)+A) -> B
	  if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
	    return N0.getOperand(0);

	  // fold (A+(B-(A+C))) to (B-C)
	  if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
	      N0 == N1.getOperand(1).getOperand(0))
	    return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
	                       N1.getOperand(1).getOperand(1));

	  // fold (A+(B-(C+A))) to (B-C)
	  if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
	      N0 == N1.getOperand(1).getOperand(1))
	    return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
	                       N1.getOperand(1).getOperand(0));

	  // fold (A+((B-A)+or-C)) to (B+or-C)
	  if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
	      N1.getOperand(0).getOpcode() == ISD::SUB &&
	      N0 == N1.getOperand(0).getOperand(1))
	    return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0),
	                       N1.getOperand(1));

	  // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
	  if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) {
	    SDValue N00 = N0.getOperand(0);
	    SDValue N01 = N0.getOperand(1);
	    SDValue N10 = N1.getOperand(0);
	    SDValue N11 = N1.getOperand(1);

	    if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10))
	      return DAG.getNode(ISD::SUB, DL, VT,
	                         DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10),
	                         DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
	  }

	  // Returning N itself signals that the node was updated in place.
	  if (SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  // fold (a+b) -> (a|b) iff a and b share no bits.
	  if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
	      DAG.haveNoCommonBitsSet(N0, N1))
	    return DAG.getNode(ISD::OR, DL, VT, N0, N1);

	  // The remaining folds are not symmetric, so try both operand orders.
	  if (SDValue Combined = visitADDLike(N0, N1, N))
	    return Combined;

	  if (SDValue Combined = visitADDLike(N1, N0, N))
	    return Combined;

	  return SDValue();
	}

	/// Return the carry value underlying \p V, or a null SDValue if \p V is not
	/// known to be a 0/1 carry.
	///
	/// TRUNCATE, ZERO_EXTEND and (AND x, 1) wrappers are peeled off first. The
	/// remaining value must be result #1 of ADDCARRY, SUBCARRY, UADDO or USUBO.
	/// It is accepted if it was masked with 1 on the way, or if the target's
	/// boolean contents for its type are zero-or-one.
	static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
	  bool Masked = false;

	  // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization.
	  while (true) {
	    if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) {
	      V = V.getOperand(0);
	      continue;
	    }

	    if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) {
	      Masked = true;
	      V = V.getOperand(0);
	      continue;
	    }

	    break;
	  }

	  // If this is not a carry, return.
	  if (V.getResNo() != 1)
	    return SDValue();

	  if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY &&
	      V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO)
	    return SDValue();

	  // If the result is masked, then no matter what kind of bool it is we can
	  // return. If it isn't, then we need to make sure the bool type is either 0 or
	  // 1 and not other values.
	  if (Masked ||
	      TLI.getBooleanContents(V.getValueType()) ==
	          TargetLoweringBase::ZeroOrOneBooleanContent)
	    return V;

	  return SDValue();
	}

	/// Try the add folds that depend on which operand is on which side.
	///
	/// \p N0 and \p N1 are the operands in the order under test, and
	/// \p LocReference supplies the debug location for any node that is created.
	/// Returns the replacement value, or a null SDValue when no fold applied.
	SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) {
	  EVT VT = N0.getValueType();
	  SDLoc DL(LocReference);
	  const unsigned RHSOpc = N1.getOpcode();

	  // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
	  if (RHSOpc == ISD::SHL) {
	    SDValue Shifted = N1.getOperand(0);
	    if (Shifted.getOpcode() == ISD::SUB &&
	        isNullConstantOrNullSplatConstant(Shifted.getOperand(0))) {
	      SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, Shifted.getOperand(1),
	                                N1.getOperand(1));
	      return DAG.getNode(ISD::SUB, DL, VT, N0, Shl);
	    }
	  }

	  if (RHSOpc == ISD::AND) {
	    SDValue AndOp0 = N1.getOperand(0);
	    unsigned NumSignBits = DAG.ComputeNumSignBits(AndOp0);
	    unsigned DestBits = VT.getScalarSizeInBits();

	    // (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x))
	    // and similar xforms where the inner op is either ~0 or 0.
	    bool InnerIsAllOnesOrZero = NumSignBits == DestBits;
	    if (InnerIsAllOnesOrZero &&
	        isOneConstantOrOneSplatConstant(N1->getOperand(1)))
	      return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0);
	  }

	  // add (sext i1), X -> sub X, (zext i1)
	  if (N0.getOpcode() == ISD::SIGN_EXTEND &&
	      N0.getOperand(0).getValueType() == MVT::i1 &&
	      !TLI.isOperationLegal(ISD::SIGN_EXTEND, MVT::i1)) {
	    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
	    return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
	  }

	  // add X, (sextinreg Y i1) -> sub X, (and Y 1)
	  if (RHSOpc == ISD::SIGN_EXTEND_INREG) {
	    VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
	    if (TN->getVT() == MVT::i1) {
	      SDValue One = DAG.getConstant(1, DL, VT);
	      SDValue LowBit = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), One);
	      return DAG.getNode(ISD::SUB, DL, VT, N0, LowBit);
	    }
	  }

	  // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
	  if (RHSOpc == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
	      N1.getResNo() == 0)
	    return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(), N0,
	                       N1.getOperand(0), N1.getOperand(2));

	  // (add X, Carry) -> (addcarry X, 0, Carry)
	  if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) {
	    if (SDValue Carry = getAsCarry(TLI, N1)) {
	      SDValue Zero = DAG.getConstant(0, DL, VT);
	      return DAG.getNode(ISD::ADDCARRY, DL,
	                         DAG.getVTList(VT, Carry.getValueType()), N0, Zero,
	                         Carry);
	    }
	  }

	  return SDValue();
	}

	/// Combine an ISD::ADDC node (add producing a glue carry-out).
	///
	/// Folds tried, in order: dead carry -> plain ADD, constant moved to the
	/// right-hand side, (addc x, 0) -> x, and "cannot overflow" -> plain ADD.
	/// Whenever the node is replaced, the carry result becomes CARRY_FALSE.
	/// Returns an empty SDValue when nothing applies.
	SDValue DAGCombiner::visitADDC(SDNode *N) {
	  SDLoc DL(N);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = LHS.getValueType();

	  // Builds the "no carry" glue value used by every replacement below.
	  auto NoCarry = [&]() {
	    return DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue);
	  };

	  // Nobody reads the carry: a plain ADD is enough.
	  if (!N->hasAnyUseOfValue(1))
	    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, LHS, RHS), NoCarry());

	  // Keep a lone constant operand on the right-hand side.
	  if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS))
	    return DAG.getNode(ISD::ADDC, DL, N->getVTList(), RHS, LHS);

	  // (addc x, 0) -> x with no carry out.
	  if (isNullConstant(RHS))
	    return CombineTo(N, LHS, NoCarry());

	  // The addition provably never overflows: use ADD and a false carry.
	  if (DAG.computeOverflowKind(LHS, RHS) == SelectionDAG::OFK_Never)
	    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, LHS, RHS), NoCarry());

	  return SDValue();
	}

	// Combine an ISD::UADDO node (unsigned add with overflow result).
	// Result #0 is the sum and result #1 is the carry, whose type is CarryVT.
	// Vector types are not handled. Returns the replacement, or an empty SDValue
	// when no fold applies.
	SDValue DAGCombiner::visitUADDO(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N0.getValueType();
	if (VT.isVector())
	return SDValue();

	EVT CarryVT = N->getValueType(1);
	SDLoc DL(N);

	// If the flag result is dead, turn this into an ADD.
	// The unused carry result is replaced with UNDEF.
	if (!N->hasAnyUseOfValue(1))
	return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
	DAG.getUNDEF(CarryVT));

	// canonicalize constant to RHS.
	ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
	ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
	if (N0C && !N1C)
	return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0);

	// fold (uaddo x, 0) -> x + no carry out
	if (isNullConstant(N1))
	return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));

	// If it cannot overflow, transform into an add.
	if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
	return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
	DAG.getConstant(0, DL, CarryVT));

	// The remaining folds are operand-order sensitive, so try both orders.
	if (SDValue Combined = visitUADDOLike(N0, N1, N))
	return Combined;

	if (SDValue Combined = visitUADDOLike(N1, N0, N))
	return Combined;

	return SDValue();
	}

	/// Operand-order-sensitive folds for UADDO; \p N0 and \p N1 are the operands
	/// of \p N in the order chosen by the caller.
	/// Returns the replacement node, or an empty SDValue when nothing applies.
	SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
	  EVT VT = N0.getValueType();
	  SDLoc DL(N);

	  // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
	  // valid only when Y + 1 is known not to overflow.
	  const bool IsAddCarryOfZero =
	      N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1));
	  if (IsAddCarryOfZero) {
	    SDValue Y = N1.getOperand(0);
	    SDValue One = DAG.getConstant(1, DL, Y.getValueType());
	    if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never)
	      return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N0, Y,
	                         N1.getOperand(2));
	  }

	  // (uaddo X, Carry) -> (addcarry X, 0, Carry)
	  if (!TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
	    return SDValue();

	  SDValue Carry = getAsCarry(TLI, N1);
	  if (!Carry)
	    return SDValue();

	  return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N0,
	                     DAG.getConstant(0, DL, VT), Carry);
	}

	/// Combine an ISD::ADDE node (add with glue carry-in and carry-out).
	/// Moves a lone constant operand to the right-hand side and turns an ADDE
	/// whose carry-in is CARRY_FALSE into an ADDC.
	/// Returns an empty SDValue when nothing applies.
	SDValue DAGCombiner::visitADDE(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  SDValue CarryIn = N->getOperand(2);

	  // Keep a lone constant operand on the right-hand side.
	  const bool OnlyLHSIsConstant =
	      isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS);
	  if (OnlyLHSIsConstant)
	    return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(), RHS, LHS,
	                       CarryIn);

	  // (adde x, y, false) -> (addc x, y)
	  if (CarryIn.getOpcode() != ISD::CARRY_FALSE)
	    return SDValue();

	  return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), LHS, RHS);
	}

	// Combine an ISD::ADDCARRY node: operands are (N0, N1, CarryIn), and the node
	// produces a sum (result #0) and a carry (result #1).
	// Returns the replacement, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	SDValue CarryIn = N->getOperand(2);
	SDLoc DL(N);

	// canonicalize constant to RHS
	ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
	ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
	if (N0C && !N1C)
	return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);

	// fold (addcarry x, y, false) -> (uaddo x, y)
	if (isNullConstant(CarryIn))
	return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);

	// fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
	// The carry-in is first converted to VT, then masked down to its low bit.
	if (isNullConstant(N0) && isNullConstant(N1)) {
	EVT VT = N0.getValueType();
	EVT CarryVT = CarryIn.getValueType();
	SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
	AddToWorklist(CarryExt.getNode());
	return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
	DAG.getConstant(1, DL, VT)),
	DAG.getConstant(0, DL, CarryVT));
	}

	// The remaining folds are operand-order sensitive, so try both orders.
	if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
	return Combined;

	if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N))
	return Combined;

	return SDValue();
	}

	// Operand-order-sensitive folds for ADDCARRY. N0 and N1 are the two addends
	// of N in the order chosen by the caller, CarryIn is N's carry operand.
	// Returns the replacement, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
	SDNode *N) {
	// Iff the flag result is dead:
	// (addcarry (add\|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry)
	// For UADDO, N0 must be the sum result (result #0).
	if ((N0.getOpcode() == ISD::ADD \|\|
	(N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0)) &&
	isNullConstant(N1) && !N->hasAnyUseOfValue(1))
	return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
	N0.getOperand(0), N0.getOperand(1), CarryIn);

	/**
	 * When one of the addcarry arguments is itself a carry, we may be facing
	 * a diamond carry propagation. In which case we try to transform the DAG
	 * to ensure linear carry propagation if that is possible.
	 *
	 * We are trying to get:
	 *   (addcarry X, 0, (addcarry A, B, Z):Carry)
	 */
	if (auto Y = getAsCarry(TLI, N1)) {
	/**
	 *            (uaddo A, B)
	 *               /      \
	 *           Carry      Sum
	 *             |          \
	 *             |      (addcarry *, 0, Z)
	 *             |        /
	 *              \    Carry
	 *               |    /
	 *          (addcarry X, *, *)
	 */
	// Matched when Y is a UADDO carry and CarryIn is the carry result of an
	// (addcarry Sum, 0, Z) whose first operand is that same UADDO's sum.
	if (Y.getOpcode() == ISD::UADDO &&
	CarryIn.getResNo() == 1 &&
	CarryIn.getOpcode() == ISD::ADDCARRY &&
	isNullConstant(CarryIn.getOperand(1)) &&
	CarryIn.getOperand(0) == Y.getValue(0)) {
	// Fold Z into the first addition, then feed its carry linearly into N0.
	auto NewY = DAG.getNode(ISD::ADDCARRY, SDLoc(N), Y->getVTList(),
	Y.getOperand(0), Y.getOperand(1),
	CarryIn.getOperand(2));
	AddToWorklist(NewY.getNode());
	return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0,
	DAG.getConstant(0, SDLoc(N), N0.getValueType()),
	NewY.getValue(1));
	}
	}

	return SDValue();
	}

	/// Produce a zero constant of type \p VT when that is allowed.
	///
	/// Scalars always fold. A vector zero is only emitted when operations need
	/// not be legal yet, or when BUILD_VECTOR is legal for \p VT; otherwise an
	/// empty SDValue is returned. \p LegalTypes is accepted but not consulted.
	static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
	                             SelectionDAG &DAG, bool LegalOperations,
	                             bool LegalTypes) {
	  const bool CanEmitZero = !VT.isVector() || !LegalOperations ||
	                           TLI.isOperationLegal(ISD::BUILD_VECTOR, VT);
	  return CanEmitZero ? DAG.getConstant(0, DL, VT) : SDValue();
	}

	// Combine an ISD::SUB node (N0 - N1).
	// The folds below are tried in order and the first match wins, so their
	// relative order matters. Returns the replacement, or an empty SDValue when
	// no fold applies.
	SDValue DAGCombiner::visitSUB(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N0.getValueType();
	SDLoc DL(N);

	// fold vector ops
	if (VT.isVector()) {
	if (SDValue FoldedVOp = SimplifyVBinOp(N))
	return FoldedVOp;

	// fold (sub x, 0) -> x, vector edition
	if (ISD::isBuildVectorAllZeros(N1.getNode()))
	return N0;
	}

	// fold (sub x, x) -> 0
	// FIXME: Refactor this and xor and other similar operations together.
	if (N0 == N1)
	return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations, LegalTypes);
	if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
	// fold (sub c1, c2) -> c1-c2
	return DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N0.getNode(),
	N1.getNode());
	}

	if (SDValue NewSel = foldBinOpIntoSelect(N))
	return NewSel;

	ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);

	// fold (sub x, c) -> (add x, -c)
	// Every non-null N1C returns here, so N1C is null for the rest of the
	// function.
	if (N1C) {
	return DAG.getNode(ISD::ADD, DL, VT, N0,
	DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
	}

	// Folds for negation, i.e. (sub 0, X).
	if (isNullConstantOrNullSplatConstant(N0)) {
	unsigned BitWidth = VT.getScalarSizeInBits();
	// Right-shifting everything out but the sign bit followed by negation is
	// the same as flipping arithmetic/logical shift type without the negation:
	// -(X >>u 31) -> (X >>s 31)
	// -(X >>s 31) -> (X >>u 31)
	if (N1->getOpcode() == ISD::SRA \|\| N1->getOpcode() == ISD::SRL) {
	ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1));
	if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1) {
	auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA;
	if (!LegalOperations \|\| TLI.isOperationLegal(NewSh, VT))
	return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1));
	}
	}

	// 0 - X --> 0 if the sub is NUW.
	if (N->getFlags().hasNoUnsignedWrap())
	return N0;

	// True when every bit of N1 other than the sign bit is known to be zero.
	if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) {
	// N1 is either 0 or the minimum signed value. If the sub is NSW, then
	// N1 must be 0 because negating the minimum signed value is undefined.
	if (N->getFlags().hasNoSignedWrap())
	return N0;

	// 0 - X --> X if X is 0 or the minimum signed value.
	return N1;
	}
	}

	// Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
	if (isAllOnesConstantOrAllOnesSplatConstant(N0))
	return DAG.getNode(ISD::XOR, DL, VT, N1, N0);

	// fold A-(A-B) -> B
	if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
	return N1.getOperand(1);

	// fold (A+B)-A -> B
	if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
	return N0.getOperand(1);

	// fold (A+B)-B -> A
	if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
	return N0.getOperand(0);

	// fold C2-(A+C1) -> (C2-C1)-A
	if (N1.getOpcode() == ISD::ADD) {
	SDValue N11 = N1.getOperand(1);
	if (isConstantOrConstantVector(N0, /* NoOpaques */ true) &&
	isConstantOrConstantVector(N11, /* NoOpaques */ true)) {
	SDValue NewC = DAG.getNode(ISD::SUB, DL, VT, N0, N11);
	return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));
	}
	}

	// fold ((A+(B+or-C))-B) -> A+or-C
	// The opcode of the inner node (ADD or SUB) is reused for the result.
	if (N0.getOpcode() == ISD::ADD &&
	(N0.getOperand(1).getOpcode() == ISD::SUB \|\|
	N0.getOperand(1).getOpcode() == ISD::ADD) &&
	N0.getOperand(1).getOperand(0) == N1)
	return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0),
	N0.getOperand(1).getOperand(1));

	// fold ((A+(C+B))-B) -> A+C
	if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD &&
	N0.getOperand(1).getOperand(1) == N1)
	return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0),
	N0.getOperand(1).getOperand(0));

	// fold ((A-(B-C))-C) -> A-B
	if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB &&
	N0.getOperand(1).getOperand(1) == N1)
	return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
	N0.getOperand(1).getOperand(0));

	// If either operand of a sub is undef, the result is undef
	if (N0.isUndef())
	return N0;
	if (N1.isUndef())
	return N1;

	// If the relocation model supports it, consider symbol offsets.
	if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
	if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
	// fold (sub Sym, c) -> Sym-c
	// NOTE(review): N1C is always null at this point because the
	// (sub x, c) -> (add x, -c) fold above returns for every non-null N1C, so
	// this branch looks unreachable.
	if (N1C && GA->getOpcode() == ISD::GlobalAddress)
	return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
	GA->getOffset() -
	(uint64_t)N1C->getSExtValue());
	// fold (sub Sym+c1, Sym+c2) -> c1-c2
	if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
	if (GA->getGlobal() == GB->getGlobal())
	return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
	DL, VT);
	}

	// sub X, (sextinreg Y i1) -> add X, (and Y 1)
	if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
	VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
	if (TN->getVT() == MVT::i1) {
	SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
	DAG.getConstant(1, DL, VT));
	return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt);
	}
	}

	return SDValue();
	}

	/// Combine an ISD::SUBC node (subtract producing a glue borrow-out).
	///
	/// Folds tried, in order: dead borrow -> plain SUB, (subc x, x) -> 0,
	/// (subc x, 0) -> x, and (subc -1, x) -> (xor x, -1). Every replacement pairs
	/// the new value with CARRY_FALSE. Returns an empty SDValue when nothing
	/// applies.
	SDValue DAGCombiner::visitSUBC(SDNode *N) {
	  SDLoc DL(N);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = LHS.getValueType();

	  // Builds the "no borrow" glue value used by every replacement below.
	  auto NoBorrow = [&]() {
	    return DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue);
	  };

	  // Nobody reads the borrow: a plain SUB is enough.
	  if (!N->hasAnyUseOfValue(1))
	    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, LHS, RHS), NoBorrow());

	  // (subc x, x) -> 0 with no borrow.
	  if (LHS == RHS)
	    return CombineTo(N, DAG.getConstant(0, DL, VT), NoBorrow());

	  // (subc x, 0) -> x with no borrow.
	  if (isNullConstant(RHS))
	    return CombineTo(N, LHS, NoBorrow());

	  // (subc -1, x) -> ~x, i.e. (xor x, -1), with no borrow.
	  if (isAllOnesConstant(LHS))
	    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, RHS, LHS), NoBorrow());

	  return SDValue();
	}

	/// Combine an ISD::USUBO node (unsigned subtract with overflow result).
	///
	/// Vector types are not handled. Folds tried, in order: dead borrow -> plain
	/// SUB (borrow becomes UNDEF), (usubo x, x) -> 0, (usubo x, 0) -> x, and
	/// (usubo -1, x) -> (xor x, -1); the last three report a zero borrow.
	/// Returns an empty SDValue when nothing applies.
	SDValue DAGCombiner::visitUSUBO(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = LHS.getValueType();
	  if (VT.isVector())
	    return SDValue();

	  EVT CarryVT = N->getValueType(1);
	  SDLoc DL(N);

	  // Builds the constant-zero borrow used by the folds below.
	  auto NoBorrow = [&]() { return DAG.getConstant(0, DL, CarryVT); };

	  // Nobody reads the borrow: use a plain SUB and leave the flag undefined.
	  if (!N->hasAnyUseOfValue(1))
	    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, LHS, RHS),
	                     DAG.getUNDEF(CarryVT));

	  // (usubo x, x) -> 0 with no borrow.
	  if (LHS == RHS)
	    return CombineTo(N, DAG.getConstant(0, DL, VT), NoBorrow());

	  // (usubo x, 0) -> x with no borrow.
	  if (isNullConstant(RHS))
	    return CombineTo(N, LHS, NoBorrow());

	  // (usubo -1, x) -> ~x, i.e. (xor x, -1), with no borrow.
	  if (isAllOnesConstant(LHS))
	    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, RHS, LHS), NoBorrow());

	  return SDValue();
	}

	/// Combine an ISD::SUBE node (subtract with glue borrow-in and borrow-out).
	/// The only fold is (sube x, y, false) -> (subc x, y); otherwise an empty
	/// SDValue is returned.
	SDValue DAGCombiner::visitSUBE(SDNode *N) {
	  SDValue BorrowIn = N->getOperand(2);
	  if (BorrowIn.getOpcode() != ISD::CARRY_FALSE)
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), LHS, RHS);
	}

	/// Combine an ISD::SUBCARRY node.
	/// The only fold is (subcarry x, y, 0) -> (usubo x, y); otherwise an empty
	/// SDValue is returned.
	SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
	  SDValue BorrowIn = N->getOperand(2);
	  if (!isNullConstant(BorrowIn))
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), LHS, RHS);
	}

	/// Combine an ISD::MUL node (N0 * N1).
	///
	/// The folds are tried in order and the first match wins. For vectors the
	/// "constant" flags refer to constant splats; for scalars they refer to
	/// ConstantSDNode operands, with opaqueness tracked separately.
	/// Returns the replacement, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitMUL(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();

	  // fold (mul x, undef) -> 0
	  if (N0.isUndef() || N1.isUndef())
	    return DAG.getConstant(0, SDLoc(N), VT);

	  bool N0IsConst = false;
	  bool N1IsConst = false;
	  bool N1IsOpaqueConst = false;
	  bool N0IsOpaqueConst = false;
	  APInt ConstValue0, ConstValue1;
	  // fold vector ops
	  if (VT.isVector()) {
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	    N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0);
	    N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
	    assert((!N0IsConst ||
	            ConstValue0.getBitWidth() == VT.getScalarSizeInBits()) &&
	           "Splat APInt should be element width");
	    assert((!N1IsConst ||
	            ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
	           "Splat APInt should be element width");
	  } else {
	    N0IsConst = isa<ConstantSDNode>(N0);
	    if (N0IsConst) {
	      ConstValue0 = cast<ConstantSDNode>(N0)->getAPIntValue();
	      N0IsOpaqueConst = cast<ConstantSDNode>(N0)->isOpaque();
	    }
	    N1IsConst = isa<ConstantSDNode>(N1);
	    if (N1IsConst) {
	      ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
	      N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque();
	    }
	  }

	  // fold (mul c1, c2) -> c1*c2
	  if (N0IsConst && N1IsConst && !N0IsOpaqueConst && !N1IsOpaqueConst)
	    return DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT,
	                                      N0.getNode(), N1.getNode());

	  // canonicalize constant to RHS (vector doesn't have to splat)
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
	    return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);
	  // fold (mul x, 0) -> 0
	  if (N1IsConst && ConstValue1.isNullValue())
	    return N1;
	  // fold (mul x, 1) -> x
	  if (N1IsConst && ConstValue1.isOneValue())
	    return N0;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // fold (mul x, -1) -> 0-x
	  if (N1IsConst && ConstValue1.isAllOnesValue()) {
	    SDLoc DL(N);
	    return DAG.getNode(ISD::SUB, DL, VT,
	                       DAG.getConstant(0, DL, VT), N0);
	  }
	  // fold (mul x, (1 << c)) -> x << c
	  // The shift amount is log2 of the constant, converted to the shift-amount
	  // type of N0.
	  if (isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
	      DAG.isKnownToBeAPowerOfTwo(N1) &&
	      (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
	    SDLoc DL(N);
	    SDValue LogBase2 = BuildLogBase2(N1, DL);
	    AddToWorklist(LogBase2.getNode());

	    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
	    SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
	    AddToWorklist(Trunc.getNode());
	    return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
	  }
	  // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
	  if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {
	    unsigned Log2Val = (-ConstValue1).logBase2();
	    SDLoc DL(N);
	    // FIXME: If the input is something that is easily negated (e.g. a
	    // single-use add), we should put the negate there.
	    return DAG.getNode(ISD::SUB, DL, VT,
	                       DAG.getConstant(0, DL, VT),
	                       DAG.getNode(ISD::SHL, DL, VT, N0,
	                            DAG.getConstant(Log2Val, DL,
	                                      getShiftAmountTy(N0.getValueType()))));
	  }

	  // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
	  // Only used when the combined constant C3 is itself a constant (vector).
	  if (N0.getOpcode() == ISD::SHL &&
	      isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
	      isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
	    SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1));
	    if (isConstantOrConstantVector(C3))
	      return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
	  }

	  // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
	  // use.
	  {
	    SDValue Sh(nullptr, 0), Y(nullptr, 0);

	    // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
	    if (N0.getOpcode() == ISD::SHL &&
	        isConstantOrConstantVector(N0.getOperand(1)) &&
	        N0.getNode()->hasOneUse()) {
	      Sh = N0; Y = N1;
	    } else if (N1.getOpcode() == ISD::SHL &&
	               isConstantOrConstantVector(N1.getOperand(1)) &&
	               N1.getNode()->hasOneUse()) {
	      Sh = N1; Y = N0;
	    }

	    if (Sh.getNode()) {
	      SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y);
	      return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1));
	    }
	  }

	  // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
	      N0.getOpcode() == ISD::ADD &&
	      DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
	      isMulAddWithConstProfitable(N, N0, N1))
	    return DAG.getNode(ISD::ADD, SDLoc(N), VT,
	                       DAG.getNode(ISD::MUL, SDLoc(N0), VT,
	                                   N0.getOperand(0), N1),
	                       DAG.getNode(ISD::MUL, SDLoc(N1), VT,
	                                   N0.getOperand(1), N1));

	  // reassociate mul
	  if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1))
	    return RMUL;

	  return SDValue();
	}

	/// Return true if the target provides a combined div/rem library call for the
	/// result type of \p Node. \p isSigned selects the signed or unsigned
	/// variant. Only the simple integer types i8, i16, i32, i64 and i128 have a
	/// candidate libcall; every other type yields false.
	static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
	                                     const TargetLowering &TLI) {
	  EVT ResultVT = Node->getValueType(0);
	  if (!ResultVT.isSimple())
	    return false;

	  RTLIB::Libcall SignedLC, UnsignedLC;
	  switch (ResultVT.getSimpleVT().SimpleTy) {
	  case MVT::i8:
	    SignedLC = RTLIB::SDIVREM_I8;
	    UnsignedLC = RTLIB::UDIVREM_I8;
	    break;
	  case MVT::i16:
	    SignedLC = RTLIB::SDIVREM_I16;
	    UnsignedLC = RTLIB::UDIVREM_I16;
	    break;
	  case MVT::i32:
	    SignedLC = RTLIB::SDIVREM_I32;
	    UnsignedLC = RTLIB::UDIVREM_I32;
	    break;
	  case MVT::i64:
	    SignedLC = RTLIB::SDIVREM_I64;
	    UnsignedLC = RTLIB::UDIVREM_I64;
	    break;
	  case MVT::i128:
	    SignedLC = RTLIB::SDIVREM_I128;
	    UnsignedLC = RTLIB::UDIVREM_I128;
	    break;
	  default:
	    return false; // No libcall for vector types.
	  }

	  return TLI.getLibcallName(isSigned ? SignedLC : UnsignedLC) != nullptr;
	}

	/// Issue divrem if both quotient and remainder are needed.
	///
	/// \p Node is an SDIV/UDIV/SREM/UREM node. The users of its first operand are
	/// scanned for other div/rem/divrem nodes with the same two operands; when a
	/// complementary one exists, a single [SU]DIVREM node is created (or an
	/// existing one reused) and the matching users are replaced with its
	/// quotient (result #0) or remainder (result #1).
	/// Returns the DIVREM value, or an empty SDValue when nothing was combined.
	SDValue DAGCombiner::useDivRem(SDNode *Node) {
	  if (Node->use_empty())
	    return SDValue(); // This is a dead node, leave it alone.

	  unsigned Opcode = Node->getOpcode();
	  bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
	  unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;

	  // DivMod lib calls can still work on non-legal types if using lib-calls.
	  EVT VT = Node->getValueType(0);
	  if (VT.isVector() || !VT.isInteger())
	    return SDValue();

	  if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
	    return SDValue();

	  // If DIVREM is going to get expanded into a libcall,
	  // but there is no libcall available, then don't combine.
	  if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
	      !isDivRemLibcallAvailable(Node, isSigned, TLI))
	    return SDValue();

	  // If div is legal, it's better to do the normal expansion
	  unsigned OtherOpcode = 0;
	  if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) {
	    OtherOpcode = isSigned ? ISD::SREM : ISD::UREM;
	    if (TLI.isOperationLegalOrCustom(Opcode, VT))
	      return SDValue();
	  } else {
	    OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
	    if (TLI.isOperationLegalOrCustom(OtherOpcode, VT))
	      return SDValue();
	  }

	  SDValue Op0 = Node->getOperand(0);
	  SDValue Op1 = Node->getOperand(1);
	  SDValue combined;
	  for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
	         UE = Op0.getNode()->use_end(); UI != UE;) {
	    // Fetch the user and step the iterator before touching the user;
	    // presumably this keeps the iterator valid across CombineTo() below.
	    SDNode *User = *UI++;
	    if (User == Node || User->use_empty())
	      continue;
	    // Convert the other matching node(s), too;
	    // otherwise, the DIVREM may get target-legalized into something
	    // target-specific that we won't be able to recognize.
	    unsigned UserOpc = User->getOpcode();
	    if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) &&
	        User->getOperand(0) == Op0 &&
	        User->getOperand(1) == Op1) {
	      if (!combined) {
	        if (UserOpc == OtherOpcode) {
	          // Found the complementary operation: create the DIVREM node.
	          SDVTList VTs = DAG.getVTList(VT, VT);
	          combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1);
	        } else if (UserOpc == DivRemOpc) {
	          // A DIVREM with the same operands already exists: reuse it.
	          combined = SDValue(User, 0);
	        } else {
	          // Same opcode as Node and nothing to pair it with yet.
	          assert(UserOpc == Opcode);
	          continue;
	        }
	      }
	      // Division users take result #0, remainder users take result #1.
	      if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV)
	        CombineTo(User, combined);
	      else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM)
	        CombineTo(User, combined.getValue(1));
	    }
	  }
	  return combined;
	}

	/// Simplifications shared by the division and remainder visitors.
	///
	/// Returns UNDEF when SelectionDAG::isUndef() reports the operation on these
	/// operands as undefined, the constant 0 when the dividend is undef, and an
	/// empty SDValue otherwise.
	static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
	  SDValue Dividend = N->getOperand(0);
	  SDValue Divisor = N->getOperand(1);
	  EVT VT = N->getValueType(0);

	  if (DAG.isUndef(N->getOpcode(), {Dividend, Divisor}))
	    return DAG.getUNDEF(VT);

	  // undef / X -> 0
	  // undef % X -> 0
	  if (!Dividend.isUndef())
	    return SDValue();

	  return DAG.getConstant(0, SDLoc(N), VT);
	}

	// Combine an ISD::SDIV node (signed N0 / N1).
	// The folds are tried in order and the first match wins. Returns the
	// replacement, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitSDIV(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	// fold vector ops
	if (VT.isVector())
	if (SDValue FoldedVOp = SimplifyVBinOp(N))
	return FoldedVOp;

	SDLoc DL(N);

	// fold (sdiv c1, c2) -> c1/c2
	// Note: the result of FoldConstantArithmetic is returned unconditionally
	// here, unlike visitUDIV, which only returns it when it is non-empty.
	ConstantSDNode *N0C = isConstOrConstSplat(N0);
	ConstantSDNode *N1C = isConstOrConstSplat(N1);
	if (N0C && N1C && !N0C->isOpaque() && !N1C->isOpaque())
	return DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, N0C, N1C);
	// fold (sdiv X, 1) -> X
	if (N1C && N1C->isOne())
	return N0;
	// fold (sdiv X, -1) -> 0-X
	if (N1C && N1C->isAllOnesValue())
	return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);

	if (SDValue V = simplifyDivRem(N, DAG))
	return V;

	if (SDValue NewSel = foldBinOpIntoSelect(N))
	return NewSel;

	// If we know the sign bits of both operands are zero, strength reduce to a
	// udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
	if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
	return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);

	// fold (sdiv X, pow2) -> simple ops after legalize
	// FIXME: We check for the exact bit here because the generic lowering gives
	// better results in that case. The target-specific lowering should learn how
	// to handle exact sdivs efficiently.
	// The divisor may be +pow2 or -pow2; the negative case is handled by
	// negating the final result below.
	if (N1C && !N1C->isNullValue() && !N1C->isOpaque() &&
	!N->getFlags().hasExact() && (N1C->getAPIntValue().isPowerOf2() \|\|
	(-N1C->getAPIntValue()).isPowerOf2())) {
	// Target-specific implementation of sdiv x, pow2.
	if (SDValue Res = BuildSDIVPow2(N))
	return Res;

	// Number of trailing zero bits of the divisor, used as the shift amount.
	unsigned lg2 = N1C->getAPIntValue().countTrailingZeros();

	// Splat the sign bit into the register
	SDValue SGN =
	DAG.getNode(ISD::SRA, DL, VT, N0,
	DAG.getConstant(VT.getScalarSizeInBits() - 1, DL,
	getShiftAmountTy(N0.getValueType())));
	AddToWorklist(SGN.getNode());

	// Add (N0 < 0) ? abs2 - 1 : 0;
	SDValue SRL =
	DAG.getNode(ISD::SRL, DL, VT, SGN,
	DAG.getConstant(VT.getScalarSizeInBits() - lg2, DL,
	getShiftAmountTy(SGN.getValueType())));
	SDValue ADD = DAG.getNode(ISD::ADD, DL, VT, N0, SRL);
	AddToWorklist(SRL.getNode());
	AddToWorklist(ADD.getNode()); // Divide by pow2
	SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, ADD,
	DAG.getConstant(lg2, DL,
	getShiftAmountTy(ADD.getValueType())));

	// If we're dividing by a positive value, we're done. Otherwise, we must
	// negate the result.
	if (N1C->getAPIntValue().isNonNegative())
	return SRA;

	AddToWorklist(SRA.getNode());
	return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
	}

	// If integer divide is expensive and we satisfy the requirements, emit an
	// alternate sequence. Targets may check function attributes for size/speed
	// trade-offs.
	AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
	if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
	if (SDValue Op = BuildSDIV(N))
	return Op;

	// sdiv, srem -> sdivrem
	// If the divisor is constant, then return DIVREM only if isIntDivCheap() is
	// true. Otherwise, we break the simplification logic in visitREM().
	if (!N1C \|\| TLI.isIntDivCheap(N->getValueType(0), Attr))
	if (SDValue DivRem = useDivRem(N))
	return DivRem;

	return SDValue();
	}

	/// Combine an ISD::UDIV node (unsigned N0 / N1).
	///
	/// The folds are tried in order and the first match wins: constant folding,
	/// the shared div/rem simplifications, folding into a select, division by a
	/// power of two (also in (shl pow2, y) form) turned into a logical right
	/// shift, the division-by-constant expansion, and finally merging with a
	/// matching remainder into UDIVREM.
	/// Returns the replacement, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitUDIV(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);

	  // fold vector ops
	  if (VT.isVector())
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	  SDLoc DL(N);

	  // fold (udiv c1, c2) -> c1/c2
	  ConstantSDNode *N0C = isConstOrConstSplat(N0);
	  ConstantSDNode *N1C = isConstOrConstSplat(N1);
	  if (N0C && N1C)
	    if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT,
	                                                    N0C, N1C))
	      return Folded;

	  if (SDValue V = simplifyDivRem(N, DAG))
	    return V;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // fold (udiv x, (1 << c)) -> x >>u c
	  if (isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
	      DAG.isKnownToBeAPowerOfTwo(N1)) {
	    SDValue LogBase2 = BuildLogBase2(N1, DL);
	    AddToWorklist(LogBase2.getNode());

	    // Convert the shift amount to the shift-amount type of N0.
	    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
	    SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
	    AddToWorklist(Trunc.getNode());
	    return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
	  }

	  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
	  if (N1.getOpcode() == ISD::SHL) {
	    SDValue N10 = N1.getOperand(0);
	    if (isConstantOrConstantVector(N10, /* NoOpaques */ true) &&
	        DAG.isKnownToBeAPowerOfTwo(N10)) {
	      SDValue LogBase2 = BuildLogBase2(N10, DL);
	      AddToWorklist(LogBase2.getNode());

	      // The sum is computed in the type of the existing shift amount y.
	      EVT ADDVT = N1.getOperand(1).getValueType();
	      SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT);
	      AddToWorklist(Trunc.getNode());
	      SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc);
	      AddToWorklist(Add.getNode());
	      return DAG.getNode(ISD::SRL, DL, VT, N0, Add);
	    }
	  }

	  // fold (udiv x, c) -> alternate
	  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
	  if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
	    if (SDValue Op = BuildUDIV(N))
	      return Op;

	  // udiv, urem -> udivrem
	  // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
	  // true. Otherwise, we break the simplification logic in visitREM().
	  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
	    if (SDValue DivRem = useDivRem(N))
	      return DivRem;

	  return SDValue();
	}

	// handles ISD::SREM and ISD::UREM
	// The folds are tried in order and the first match wins. Returns the
	// replacement, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitREM(SDNode *N) {
	unsigned Opcode = N->getOpcode();
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);
	bool isSigned = (Opcode == ISD::SREM);
	SDLoc DL(N);

	// fold (rem c1, c2) -> c1%c2
	ConstantSDNode *N0C = isConstOrConstSplat(N0);
	ConstantSDNode *N1C = isConstOrConstSplat(N1);
	if (N0C && N1C)
	if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C))
	return Folded;

	if (SDValue V = simplifyDivRem(N, DAG))
	return V;

	if (SDValue NewSel = foldBinOpIntoSelect(N))
	return NewSel;

	if (isSigned) {
	// If we know the sign bits of both operands are zero, strength reduce to a
	// urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15
	if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
	return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
	} else {
	// Unsigned remainder by a power of two becomes a mask with (divisor - 1),
	// computed as (add divisor, -1).
	SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
	if (DAG.isKnownToBeAPowerOfTwo(N1)) {
	// fold (urem x, pow2) -> (and x, pow2-1)
	SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
	AddToWorklist(Add.getNode());
	return DAG.getNode(ISD::AND, DL, VT, N0, Add);
	}
	if (N1.getOpcode() == ISD::SHL &&
	DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
	// fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
	SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
	AddToWorklist(Add.getNode());
	return DAG.getNode(ISD::AND, DL, VT, N0, Add);
	}
	}

	AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();

	// If X/C can be simplified by the division-by-constant logic, lower
	// X%C to the equivalent of X-X/C*C.
	// To avoid mangling nodes, this simplification requires that the combine()
	// call for the speculative DIV must not cause a DIVREM conversion. We guard
	// against this by skipping the simplification if isIntDivCheap(). When
	// div is not cheap, combine will not return a DIVREM. Regardless,
	// checking cheapness here makes sense since the simplification results in
	// fatter code.
	if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap(VT, Attr)) {
	unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
	SDValue Div = DAG.getNode(DivOpcode, DL, VT, N0, N1);
	AddToWorklist(Div.getNode());
	SDValue OptimizedDiv = combine(Div.getNode());
	// Only proceed when combine() actually produced a different node.
	if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
	assert((OptimizedDiv.getOpcode() != ISD::UDIVREM) &&
	(OptimizedDiv.getOpcode() != ISD::SDIVREM));
	SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
	SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
	AddToWorklist(Mul.getNode());
	return Sub;
	}
	}

	// sdiv, srem -> sdivrem
	// (for UREM the same call pairs udiv/urem into udivrem)
	// The remainder is result #1 of the DIVREM node.
	if (SDValue DivRem = useDivRem(N))
	return DivRem.getValue(1);

	return SDValue();
	}

	/// Combine an ISD::MULHS node (high half of a signed multiply).
	///
	/// Handles multiplication by zero (scalar and all-zero build vectors), by
	/// one, and by undef, and for simple scalar types rewrites the node as a
	/// multiply in the integer type twice as wide followed by a shift and a
	/// truncate, when MUL is legal for that wider type.
	/// Returns the replacement, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitMULHS(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  if (VT.isVector()) {
	    // fold (mulhs x, 0) -> 0
	    if (ISD::isBuildVectorAllZeros(N1.getNode()))
	      return N1;
	    if (ISD::isBuildVectorAllZeros(N0.getNode()))
	      return N0;
	  }

	  // fold (mulhs x, 0) -> 0
	  if (isNullConstant(N1))
	    return N1;
	  // fold (mulhs x, 1) -> (sra x, size(x)-1)
	  if (isOneConstant(N1))
	    return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
	                       DAG.getConstant(N0.getValueSizeInBits() - 1, DL,
	                                       getShiftAmountTy(N0.getValueType())));

	  // fold (mulhs x, undef) -> 0
	  if (N0.isUndef() || N1.isUndef())
	    return DAG.getConstant(0, DL, VT);

	  // If the type twice as wide is legal, transform the mulhs to a wider multiply
	  // plus a shift.
	  if (VT.isSimple() && !VT.isVector()) {
	    MVT Simple = VT.getSimpleVT();
	    unsigned SimpleSize = Simple.getSizeInBits();
	    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize * 2);
	    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
	      // Sign-extend both operands, multiply, then shift the high half down
	      // by the original width and truncate back to VT.
	      N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
	      N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
	      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
	      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
	                       DAG.getConstant(SimpleSize, DL,
	                                       getShiftAmountTy(N1.getValueType())));
	      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
	    }
	  }

	  return SDValue();
	}

	/// Combine a MULHU node (high half of an unsigned multiply).
	///
	/// Tries, in order: all-zeros build-vector operands, (mulhu x, 0),
	/// (mulhu x, 1), undef operands, and finally rewriting a scalar mulhu as a
	/// double-width multiply followed by a shift and truncate when that wider
	/// MUL is legal. Returns an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitMULHU(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  if (VT.isVector()) {
	    // fold (mulhu x, 0) -> 0
	    if (ISD::isBuildVectorAllZeros(N1.getNode()))
	      return N1;
	    if (ISD::isBuildVectorAllZeros(N0.getNode()))
	      return N0;
	  }

	  // fold (mulhu x, 0) -> 0
	  if (isNullConstant(N1))
	    return N1;
	  // fold (mulhu x, 1) -> 0
	  if (isOneConstant(N1))
	    return DAG.getConstant(0, DL, N0.getValueType());
	  // fold (mulhu x, undef) -> 0
	  if (N0.isUndef() || N1.isUndef())
	    return DAG.getConstant(0, DL, VT);

	  // If the type twice as wide is legal, transform the mulhu to a wider multiply
	  // plus a shift.
	  if (VT.isSimple() && !VT.isVector()) {
	    MVT Simple = VT.getSimpleVT();
	    unsigned SimpleSize = Simple.getSizeInBits();
	    // The wide type has exactly twice as many bits as the original type.
	    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize * 2);
	    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
	      N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
	      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
	      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
	      // Shift the high half of the product down, then drop the upper bits.
	      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
	                       DAG.getConstant(SimpleSize, DL,
	                                       getShiftAmountTy(N1.getValueType())));
	      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
	    }
	  }

	  return SDValue();
	}

	/// Perform optimizations common to nodes that compute two values. LoOp and HiOp
	/// give the opcodes for the two computations that are being performed.
	///
	/// Returns the value produced by CombineTo() when a simplification was made,
	/// and an empty SDValue otherwise (including when both results are in use).
	SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
	                                                unsigned HiOp) {
	  // If the high half is not needed, just compute the low half.
	  bool HiExists = N->hasAnyUseOfValue(1);
	  if (!HiExists &&
	      (!LegalOperations ||
	       TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
	    SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
	    return CombineTo(N, Res, Res);
	  }

	  // If the low half is not needed, just compute the high half.
	  bool LoExists = N->hasAnyUseOfValue(0);
	  if (!LoExists &&
	      (!LegalOperations ||
	       TLI.isOperationLegal(HiOp, N->getValueType(1)))) {
	    SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
	    return CombineTo(N, Res, Res);
	  }

	  // If both halves are used, return as it is.
	  if (LoExists && HiExists)
	    return SDValue();

	  // If the two computed results can be simplified separately, separate them.
	  // Each half is built speculatively and only kept if combine() turned it
	  // into a different node that is acceptable at this legalization stage.
	  if (LoExists) {
	    SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
	    AddToWorklist(Lo.getNode());
	    SDValue LoOpt = combine(Lo.getNode());
	    if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
	        (!LegalOperations ||
	         TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType())))
	      return CombineTo(N, LoOpt, LoOpt);
	  }

	  if (HiExists) {
	    SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
	    AddToWorklist(Hi.getNode());
	    SDValue HiOpt = combine(Hi.getNode());
	    if (HiOpt.getNode() && HiOpt != Hi &&
	        (!LegalOperations ||
	         TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType())))
	      return CombineTo(N, HiOpt, HiOpt);
	  }

	  return SDValue();
	}

	/// Combine an SMUL_LOHI node (signed multiply producing low and high halves).
	///
	/// First tries the generic two-result simplification with MUL / MULHS. If
	/// that does nothing and the double-width MUL is legal for a simple scalar
	/// type, both halves are instead extracted from one wide multiply.
	SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
	  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
	    return Res;

	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  // If the type twice as wide is legal, transform the smul_lohi to a wider
	  // multiply plus a shift.
	  if (VT.isSimple() && !VT.isVector()) {
	    MVT Simple = VT.getSimpleVT();
	    unsigned SimpleSize = Simple.getSizeInBits();
	    // The wide type has exactly twice as many bits as the original type.
	    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize * 2);
	    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
	      SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0));
	      SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1));
	      Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
	      // Compute the high part as N1.
	      Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
	                       DAG.getConstant(SimpleSize, DL,
	                                       getShiftAmountTy(Lo.getValueType())));
	      Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
	      // Compute the low part as N0.
	      Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
	      return CombineTo(N, Lo, Hi);
	    }
	  }

	  return SDValue();
	}

	/// Combine a UMUL_LOHI node (unsigned multiply producing low and high halves).
	///
	/// First tries the generic two-result simplification with MUL / MULHU. If
	/// that does nothing and the double-width MUL is legal for a simple scalar
	/// type, both halves are instead extracted from one wide multiply.
	SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
	  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
	    return Res;

	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  // If the type twice as wide is legal, transform the umul_lohi to a wider
	  // multiply plus a shift.
	  if (VT.isSimple() && !VT.isVector()) {
	    MVT Simple = VT.getSimpleVT();
	    unsigned SimpleSize = Simple.getSizeInBits();
	    // The wide type has exactly twice as many bits as the original type.
	    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize * 2);
	    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
	      SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0));
	      SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1));
	      Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
	      // Compute the high part as N1.
	      Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
	                       DAG.getConstant(SimpleSize, DL,
	                                       getShiftAmountTy(Lo.getValueType())));
	      Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
	      // Compute the low part as N0.
	      Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
	      return CombineTo(N, Lo, Hi);
	    }
	  }

	  return SDValue();
	}

	/// Combine an SMULO node: a multiply by the constant 2 becomes an SADDO of
	/// the first operand with itself. Anything else is left untouched.
	SDValue DAGCombiner::visitSMULO(SDNode *N) {
	  auto *RHSConst = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!RHSConst)
	    return SDValue();
	  if (RHSConst->getAPIntValue() != 2)
	    return SDValue();

	  // (smulo x, 2) -> (saddo x, x)
	  SDValue X = N->getOperand(0);
	  return DAG.getNode(ISD::SADDO, SDLoc(N), N->getVTList(), X, X);
	}

	/// Combine a UMULO node: a multiply by the constant 2 becomes a UADDO of
	/// the first operand with itself. Anything else is left untouched.
	SDValue DAGCombiner::visitUMULO(SDNode *N) {
	  auto *RHSConst = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!RHSConst)
	    return SDValue();
	  if (RHSConst->getAPIntValue() != 2)
	    return SDValue();

	  // (umulo x, 2) -> (uaddo x, x)
	  SDValue X = N->getOperand(0);
	  return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), X, X);
	}

	/// Combine an integer min/max node: try the generic vector binop folds,
	/// constant-fold when both operands are non-opaque constants, and otherwise
	/// move a lone constant operand to the right-hand side.
	SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = LHS.getValueType();

	  // Vector operands get a chance at the shared vector binop folds first.
	  if (VT.isVector()) {
	    SDValue FoldedVOp = SimplifyVBinOp(N);
	    if (FoldedVOp)
	      return FoldedVOp;
	  }

	  // Both operands constant: evaluate the operation right away.
	  ConstantSDNode *LHSC = getAsNonOpaqueConstant(LHS);
	  ConstantSDNode *RHSC = getAsNonOpaqueConstant(RHS);
	  if (LHSC && RHSC)
	    return DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, LHSC,
	                                      RHSC);

	  // Only the left operand is constant: swap so the constant ends up on the
	  // right.
	  if (DAG.isConstantIntBuildVectorOrConstantInt(LHS))
	    if (!DAG.isConstantIntBuildVectorOrConstantInt(RHS))
	      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, RHS, LHS);

	  return SDValue();
	}

	/// If this is a binary operator with two operands of the same opcode, try to
	/// simplify it.
	///
	/// The operands of \p N must share an opcode (asserted). The folds move the
	/// logic operation through matching extends/bswap/truncates, shifts/ANDs
	/// with a common second operand, bitcasts/scalar_to_vector, and vector
	/// shuffles with identical masks. Returns an empty SDValue if none applies.
	SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
	  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();
	  assert(N0.getOpcode() == N1.getOpcode() && "Bad input!");

	  // Bail early if none of these transforms apply.
	  if (N0.getNumOperands() == 0) return SDValue();

	  // For each of OP in AND/OR/XOR:
	  // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
	  // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
	  // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
	  // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y))
	  // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
	  //
	  // Do not sink the logical op inside of a vector extend, since it may
	  // combine into a vsetcc.
	  EVT Op0VT = N0.getOperand(0).getValueType();
	  if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
	       N0.getOpcode() == ISD::SIGN_EXTEND ||
	       N0.getOpcode() == ISD::BSWAP ||
	       // Avoid infinite looping with PromoteIntBinOp.
	       (N0.getOpcode() == ISD::ANY_EXTEND &&
	        (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) ||
	       (N0.getOpcode() == ISD::TRUNCATE &&
	        (!TLI.isZExtFree(VT, Op0VT) ||
	         !TLI.isTruncateFree(Op0VT, VT)) &&
	        TLI.isTypeLegal(Op0VT))) &&
	      !VT.isVector() &&
	      Op0VT == N1.getOperand(0).getValueType() &&
	      (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) {
	    SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
	                                 N0.getOperand(0).getValueType(),
	                                 N0.getOperand(0), N1.getOperand(0));
	    AddToWorklist(ORNode.getNode());
	    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode);
	  }

	  // For each of OP in SHL/SRL/SRA/AND...
	  // fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z)
	  // fold (or (OP x, z), (OP y, z)) -> (OP (or x, y), z)
	  // fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z)
	  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL ||
	       N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) &&
	      N0.getOperand(1) == N1.getOperand(1)) {
	    SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
	                                 N0.getOperand(0).getValueType(),
	                                 N0.getOperand(0), N1.getOperand(0));
	    AddToWorklist(ORNode.getNode());
	    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
	                       ORNode, N0.getOperand(1));
	  }

	  // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
	  // Only perform this optimization up until type legalization, before
	  // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
	  // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
	  // we don't want to undo this promotion.
	  // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
	  // on scalars.
	  if ((N0.getOpcode() == ISD::BITCAST ||
	       N0.getOpcode() == ISD::SCALAR_TO_VECTOR) &&
	      Level <= AfterLegalizeTypes) {
	    SDValue In0 = N0.getOperand(0);
	    SDValue In1 = N1.getOperand(0);
	    EVT In0Ty = In0.getValueType();
	    EVT In1Ty = In1.getValueType();
	    SDLoc DL(N);
	    // If both incoming values are integers, and the original types are the
	    // same.
	    if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
	      SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1);
	      SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op);
	      AddToWorklist(Op.getNode());
	      return BC;
	    }
	  }

	  // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
	  // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
	  // If both shuffles use the same mask, and both shuffle within a single
	  // vector, then it is worthwhile to move the swizzle after the operation.
	  // The type-legalizer generates this pattern when loading illegal
	  // vector types from memory. In many cases this allows additional shuffle
	  // optimizations.
	  // There are other cases where moving the shuffle after the xor/and/or
	  // is profitable even if shuffles don't perform a swizzle.
	  // If both shuffles use the same mask, and both shuffles have the same first
	  // or second operand, then it might still be profitable to move the shuffle
	  // after the xor/and/or operation.
	  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
	    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0);
	    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1);

	    assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
	           "Inputs to shuffles are not the same type");

	    // Check that both shuffles use the same mask. The masks are known to be of
	    // the same length because the result vector type is the same.
	    // Check also that shuffles have only one use to avoid introducing extra
	    // instructions.
	    if (SVN0->hasOneUse() && SVN1->hasOneUse() &&
	        SVN0->getMask().equals(SVN1->getMask())) {
	      SDValue ShOp = N0->getOperand(1);

	      // Don't try to fold this node if it requires introducing a
	      // build vector of all zeros that might be illegal at this stage.
	      if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
	        if (!LegalTypes)
	          ShOp = DAG.getConstant(0, SDLoc(N), VT);
	        else
	          ShOp = SDValue();
	      }

	      // (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C)
	      // (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C)
	      // (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0)
	      if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
	        SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
	                                      N0->getOperand(0), N1->getOperand(0));
	        AddToWorklist(NewNode.getNode());
	        return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp,
	                                    SVN0->getMask());
	      }

	      // Don't try to fold this node if it requires introducing a
	      // build vector of all zeros that might be illegal at this stage.
	      ShOp = N0->getOperand(0);
	      if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
	        if (!LegalTypes)
	          ShOp = DAG.getConstant(0, SDLoc(N), VT);
	        else
	          ShOp = SDValue();
	      }

	      // (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B))
	      // (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B))
	      // (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B))
	      if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) {
	        SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
	                                      N0->getOperand(1), N1->getOperand(1));
	        AddToWorklist(NewNode.getNode());
	        return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode,
	                                    SVN0->getMask());
	      }
	    }
	  }

	  return SDValue();
	}

	/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
	///
	/// \p IsAnd selects between the 'and' and 'or' forms of each fold. \p N0 and
	/// \p N1 are the two logic operands; both must be setcc-equivalent or the
	/// function gives up. Returns the replacement value, or an empty SDValue if
	/// no fold applies.
	SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
	                                       const SDLoc &DL) {
	  SDValue LL, LR, RL, RR, N0CC, N1CC;
	  if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
	      !isSetCCEquivalent(N1, RL, RR, N1CC))
	    return SDValue();

	  assert(N0.getValueType() == N1.getValueType() &&
	         "Unexpected operand types for bitwise logic op");
	  assert(LL.getValueType() == LR.getValueType() &&
	         RL.getValueType() == RR.getValueType() &&
	         "Unexpected operand types for setcc");

	  // If we're here post-legalization or the logic op type is not i1, the logic
	  // op type must match a setcc result type. Also, all folds require new
	  // operations on the left and right operands, so those types must match.
	  EVT VT = N0.getValueType();
	  EVT OpVT = LL.getValueType();
	  if (LegalOperations || VT != MVT::i1)
	    if (VT != getSetCCResultType(OpVT))
	      return SDValue();
	  if (OpVT != RL.getValueType())
	    return SDValue();

	  ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
	  ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
	  bool IsInteger = OpVT.isInteger();
	  if (LR == RR && CC0 == CC1 && IsInteger) {
	    bool IsZero = isNullConstantOrNullSplatConstant(LR);
	    bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR);

	    // All bits clear?
	    bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
	    // All sign bits clear?
	    bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
	    // Any bits set?
	    bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
	    // Any sign bits set?
	    bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;

	    // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
	    // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
	    // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
	    // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
	    if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
	      SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
	      AddToWorklist(Or.getNode());
	      return DAG.getSetCC(DL, VT, Or, LR, CC1);
	    }

	    // All bits set?
	    bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
	    // All sign bits set?
	    bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
	    // Any bits clear?
	    bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
	    // Any sign bits clear?
	    bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;

	    // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
	    // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0)
	    // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
	    // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1)
	    if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
	      SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
	      AddToWorklist(And.getNode());
	      return DAG.getSetCC(DL, VT, And, LR, CC1);
	    }
	  }

	  // TODO: What is the 'or' equivalent of this fold?
	  // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
	  if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 &&
	      IsInteger && CC0 == ISD::SETNE &&
	      ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
	       (isAllOnesConstant(LR) && isNullConstant(RR)))) {
	    SDValue One = DAG.getConstant(1, DL, OpVT);
	    SDValue Two = DAG.getConstant(2, DL, OpVT);
	    SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
	    AddToWorklist(Add.getNode());
	    return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
	  }

	  // Try more general transforms if the predicates match and the only user of
	  // the compares is the 'and' or 'or'.
	  if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
	      N0.hasOneUse() && N1.hasOneUse()) {
	    // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
	    // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
	    if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
	      SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
	      SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
	      SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
	      SDValue Zero = DAG.getConstant(0, DL, OpVT);
	      return DAG.getSetCC(DL, VT, Or, Zero, CC1);
	    }
	  }

	  // Canonicalize equivalent operands to LL == RL.
	  if (LL == RR && LR == RL) {
	    CC1 = ISD::getSetCCSwappedOperands(CC1);
	    std::swap(RL, RR);
	  }

	  // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
	  // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
	  if (LL == RL && LR == RR) {
	    ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger)
	                                : ISD::getSetCCOrOperation(CC0, CC1, IsInteger);
	    if (NewCC != ISD::SETCC_INVALID &&
	        (!LegalOperations ||
	         (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
	          TLI.isOperationLegal(ISD::SETCC, OpVT))))
	      return DAG.getSetCC(DL, VT, LL, LR, NewCC);
	  }

	  return SDValue();
	}

	/// This contains all DAGCombine rules which reduce two values combined by
	/// an And operation to a single value. This makes them reusable in the context
	/// of visitSELECT(). Rules involving constants are not included as
	/// visitSELECT() already handles those cases.
	///
	/// \p N0 and \p N1 are the two values being ANDed; \p N supplies the debug
	/// location and is the value returned when an operand was rewritten in
	/// place. Returns an empty SDValue if no rule applies.
	SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
	  EVT VT = N1.getValueType();
	  SDLoc DL(N);

	  // fold (and x, undef) -> 0
	  if (N0.isUndef() || N1.isUndef())
	    return DAG.getConstant(0, DL, VT);

	  if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
	    return V;

	  if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
	      VT.getSizeInBits() <= 64) {
	    if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
	      if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
	        // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
	        // immediate for an add, but it is legal if its top c2 bits are set,
	        // transform the ADD so the immediate doesn't need to be materialized
	        // in a register.
	        APInt ADDC = ADDI->getAPIntValue();
	        APInt SRLC = SRLI->getAPIntValue();
	        if (ADDC.getMinSignedBits() <= 64 &&
	            SRLC.ult(VT.getSizeInBits()) &&
	            !TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
	          APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
	                                             SRLC.getZExtValue());
	          if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
	            ADDC |= Mask;
	            if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
	              SDLoc DL0(N0);
	              SDValue NewAdd =
	                DAG.getNode(ISD::ADD, DL0, VT,
	                            N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
	              CombineTo(N0.getNode(), NewAdd);
	              // Return N so it doesn't get rechecked!
	              return SDValue(N, 0);
	            }
	          }
	        }
	      }
	    }
	  }

	  // Reduce bit extract of low half of an integer to the narrower type.
	  // (and (srl i64:x, K), KMask) ->
	  // (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask)
	  if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
	    if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
	      if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
	        unsigned Size = VT.getSizeInBits();
	        const APInt &AndMask = CAnd->getAPIntValue();
	        unsigned ShiftBits = CShift->getZExtValue();

	        // Bail out, this node will probably disappear anyway.
	        if (ShiftBits == 0)
	          return SDValue();

	        unsigned MaskBits = AndMask.countTrailingOnes();
	        EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);

	        if (AndMask.isMask() &&
	            // Required bits must not span the two halves of the integer and
	            // must fit in the half size type.
	            (ShiftBits + MaskBits <= Size / 2) &&
	            TLI.isNarrowingProfitable(VT, HalfVT) &&
	            TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
	            TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
	            TLI.isTruncateFree(VT, HalfVT) &&
	            TLI.isZExtFree(HalfVT, VT)) {
	          // The isNarrowingProfitable is to avoid regressions on PPC and
	          // AArch64 which match a few 64-bit bit insert / bit extract patterns
	          // on downstream users of this. Those patterns could probably be
	          // extended to handle extensions mixed in.

	          // SL is only ever used as a debug location for the new nodes.
	          SDLoc SL(N0);
	          assert(MaskBits <= Size);

	          // Extracting the highest bit of the low half.
	          EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
	          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
	                                      N0.getOperand(0));

	          SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
	          SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
	          SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
	          SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
	          return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
	        }
	      }
	    }
	  }

	  return SDValue();
	}

	bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode AndC, LoadSDNode LoadN,
	EVT LoadResultTy, EVT &ExtVT) {
	if (!AndC->getAPIntValue().isMask())
	return false;

	unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();

	ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
	EVT LoadedVT = LoadN->getMemoryVT();

	if (ExtVT == LoadedVT &&
	(!LegalOperations \|\|
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
	// ZEXTLOAD will match without needing to change the size of the value being
	// loaded.
	return true;
	}

	// Do not change the width of a volatile load.
	if (LoadN->isVolatile())
	return false;

	// Do not generate loads of non-round integer types since these can
	// be expensive (and would be wrong if the type is not byte sized).
	if (!LoadedVT.bitsGT(ExtVT) \|\| !ExtVT.isRound())
	return false;

	if (LegalOperations &&
	!TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
	return false;

	if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
	return false;

	return true;
	}

	/// Check whether \p LoadN may be replaced by a narrower load of type
	/// \p ExtVT with extension kind \p ExtType.
	///
	/// \p ShAmt is added to the width of \p ExtVT when checking that an existing
	/// extending load still covers the requested bits. Returns true only if
	/// every check below passes.
	bool DAGCombiner::isLegalNarrowLoad(LoadSDNode *LoadN, ISD::LoadExtType ExtType,
	                                    EVT &ExtVT, unsigned ShAmt) {
	  // Don't transform one with multiple uses, this would require adding a new
	  // load.
	  if (!SDValue(LoadN, 0).hasOneUse())
	    return false;

	  if (LegalOperations &&
	      !TLI.isLoadExtLegal(ExtType, LoadN->getValueType(0), ExtVT))
	    return false;

	  // Do not generate loads of non-round integer types since these can
	  // be expensive (and would be wrong if the type is not byte sized).
	  if (!ExtVT.isRound())
	    return false;

	  // Don't change the width of a volatile load.
	  if (LoadN->isVolatile())
	    return false;

	  // Verify that we are actually reducing a load width here.
	  if (LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits())
	    return false;

	  // For the transform to be legal, the load must produce only two values
	  // (the value loaded and the chain). Don't transform a pre-increment
	  // load, for example, which produces an extra value. Otherwise the
	  // transformation is not equivalent, and the downstream logic to replace
	  // uses gets things wrong.
	  if (LoadN->getNumValues() > 2)
	    return false;

	  // If the load that we're shrinking is an extload and we're not just
	  // discarding the extension we can't simply shrink the load. Bail.
	  // TODO: It would be possible to merge the extensions in some cases.
	  if (LoadN->getExtensionType() != ISD::NON_EXTLOAD &&
	      LoadN->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
	    return false;

	  if (!TLI.shouldReduceLoadWidth(LoadN, ExtType, ExtVT))
	    return false;

	  // It's not possible to generate a constant of extended or untyped type.
	  EVT PtrType = LoadN->getOperand(1).getValueType();
	  if (PtrType == MVT::Untyped || PtrType.isExtended())
	    return false;

	  return true;
	}

	/// Walk the operands of \p N (recursing through OR/XOR/AND) looking for
	/// loads that can be narrowed according to \p Mask.
	///
	/// Narrowable loads are collected in \p Loads. OR/XOR nodes with a constant
	/// operand that has bits outside \p Mask are collected in
	/// \p NodesWithConsts. At most one other operand is tolerated and reported
	/// through \p NodeToMask. Returns false as soon as an operand is
	/// encountered that makes the transformation impossible.
	bool DAGCombiner::SearchForAndLoads(SDNode *N,
	                                    SmallPtrSetImpl<LoadSDNode*> &Loads,
	                                    SmallPtrSetImpl<SDNode*> &NodesWithConsts,
	                                    ConstantSDNode *Mask,
	                                    SDNode *&NodeToMask) {
	  // Recursively search for the operands, looking for loads which can be
	  // narrowed.
	  for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) {
	    SDValue Op = N->getOperand(i);

	    if (Op.getValueType().isVector())
	      return false;

	    // Some constants may need fixing up later if they are too large.
	    if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
	      if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
	          (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
	        NodesWithConsts.insert(N);
	      continue;
	    }

	    if (!Op.hasOneUse())
	      return false;

	    switch(Op.getOpcode()) {
	    case ISD::LOAD: {
	      auto *Load = cast<LoadSDNode>(Op);
	      EVT ExtVT;
	      if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
	          isLegalNarrowLoad(Load, ISD::ZEXTLOAD, ExtVT)) {

	        // ZEXTLOAD is already small enough.
	        if (Load->getExtensionType() == ISD::ZEXTLOAD &&
	            ExtVT.bitsGE(Load->getMemoryVT()))
	          continue;

	        // Use LE to convert equal sized loads to zext.
	        if (ExtVT.bitsLE(Load->getMemoryVT()))
	          Loads.insert(Load);

	        continue;
	      }
	      return false;
	    }
	    case ISD::ZERO_EXTEND:
	    case ISD::AssertZext: {
	      unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
	      EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
	      EVT VT = Op.getOpcode() == ISD::AssertZext ?
	        cast<VTSDNode>(Op.getOperand(1))->getVT() :
	        Op.getOperand(0).getValueType();

	      // We can accept extending nodes if the mask is wider or an equal
	      // width to the original type.
	      if (ExtVT.bitsGE(VT))
	        continue;
	      // Otherwise treat it like any other operand that needs masking.
	      break;
	    }
	    case ISD::OR:
	    case ISD::XOR:
	    case ISD::AND:
	      if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
	                             NodeToMask))
	        return false;
	      continue;
	    }

	    // Allow one node which will be masked along with any loads found.
	    if (NodeToMask)
	      return false;
	    NodeToMask = Op.getNode();
	  }
	  return true;
	}

	/// Try to push the constant mask of the AND node \p N back onto the loads
	/// that feed it, so those loads can be narrowed and the AND itself dropped.
	/// Returns true if the DAG was changed.
	bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
	  // The second operand has to be a constant low-bit mask.
	  auto *MaskC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!MaskC)
	    return false;
	  if (!MaskC->getAPIntValue().isMask())
	    return false;

	  // Nothing to do here when the AND consumes a load directly.
	  if (isa<LoadSDNode>(N->getOperand(0)))
	    return false;

	  SmallPtrSet<LoadSDNode*, 8> NarrowableLoads;
	  SmallPtrSet<SDNode*, 2> ConstUsers;
	  SDNode *ExtraNode = nullptr;
	  if (!SearchForAndLoads(N, NarrowableLoads, ConstUsers, MaskC, ExtraNode))
	    return false;
	  if (NarrowableLoads.empty())
	    return false;

	  DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
	  SDValue MaskOp = N->getOperand(1);

	  // The search tolerates a single non-load node; mask it explicitly.
	  if (ExtraNode) {
	    DEBUG(dbgs() << "First, need to fix up: "; ExtraNode->dump());
	    SDValue ExtraVal(ExtraNode, 0);
	    SDValue And = DAG.getNode(ISD::AND, SDLoc(ExtraNode),
	                              ExtraNode->getValueType(0), ExtraVal, MaskOp);
	    DAG.ReplaceAllUsesOfValueWith(ExtraVal, And);
	    // The replacement above also rewrote the new AND's own operand, so
	    // point it back at the original value.
	    DAG.UpdateNodeOperands(And.getNode(), ExtraVal, MaskOp);
	  }

	  // Mask the constant operand of every logic node that was flagged.
	  for (SDNode *LogicN : ConstUsers) {
	    SDValue Other = LogicN->getOperand(0);
	    SDValue ConstOp = LogicN->getOperand(1);

	    // Make sure the constant sits in ConstOp.
	    if (isa<ConstantSDNode>(Other))
	      std::swap(Other, ConstOp);

	    SDValue And = DAG.getNode(ISD::AND, SDLoc(ConstOp),
	                              ConstOp.getValueType(), ConstOp, MaskOp);

	    DAG.UpdateNodeOperands(LogicN, Other, And);
	  }

	  // Mask each collected load and let ReduceLoadWidth narrow it.
	  for (LoadSDNode *Load : NarrowableLoads) {
	    DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
	    SDValue LoadVal(Load, 0);
	    SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
	                              LoadVal, MaskOp);
	    DAG.ReplaceAllUsesOfValueWith(LoadVal, And);
	    DAG.UpdateNodeOperands(And.getNode(), LoadVal, MaskOp);
	    SDValue NewLoad = ReduceLoadWidth(And.getNode());
	    assert(NewLoad &&
	           "Shouldn't be masking the load if it can't be narrowed");
	    CombineTo(Load, NewLoad, NewLoad.getValue(1));
	  }

	  // Finally replace the original AND with its first operand.
	  DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
	  return true;
	}

	SDValue DAGCombiner::visitAND(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N1.getValueType();

	// x & x --> x
	if (N0 == N1)
	return N0;

	// fold vector ops
	if (VT.isVector()) {
	if (SDValue FoldedVOp = SimplifyVBinOp(N))
	return FoldedVOp;

	// fold (and x, 0) -> 0, vector edition
	if (ISD::isBuildVectorAllZeros(N0.getNode()))
	// do not return N0, because undef node may exist in N0
	return DAG.getConstant(APInt::getNullValue(N0.getScalarValueSizeInBits()),
	SDLoc(N), N0.getValueType());
	if (ISD::isBuildVectorAllZeros(N1.getNode()))
	// do not return N1, because undef node may exist in N1
	return DAG.getConstant(APInt::getNullValue(N1.getScalarValueSizeInBits()),
	SDLoc(N), N1.getValueType());

	// fold (and x, -1) -> x, vector edition
	if (ISD::isBuildVectorAllOnes(N0.getNode()))
	return N1;
	if (ISD::isBuildVectorAllOnes(N1.getNode()))
	return N0;
	}

	// fold (and c1, c2) -> c1&c2
	ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	ConstantSDNode *N1C = isConstOrConstSplat(N1);
	if (N0C && N1C && !N1C->isOpaque())
	return DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, N0C, N1C);
	// canonicalize constant to RHS
	if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	!DAG.isConstantIntBuildVectorOrConstantInt(N1))
	return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
	// fold (and x, -1) -> x
	if (isAllOnesConstant(N1))
	return N0;
	// if (and x, c) is known to be zero, return 0
	unsigned BitWidth = VT.getScalarSizeInBits();
	if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
	APInt::getAllOnesValue(BitWidth)))
	return DAG.getConstant(0, SDLoc(N), VT);

	if (SDValue NewSel = foldBinOpIntoSelect(N))
	return NewSel;

	// reassociate and
	if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1))
	return RAND;

	// Try to convert a constant mask AND into a shuffle clear mask.
	if (VT.isVector())
	if (SDValue Shuffle = XformToShuffleWithZero(N))
	return Shuffle;

	// fold (and (or x, C), D) -> D if (C & D) == D
	auto MatchSubset = [](ConstantSDNode LHS, ConstantSDNode RHS) {
	return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
	};
	if (N0.getOpcode() == ISD::OR &&
	matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
	return N1;
	// fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
	if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
	SDValue N0Op0 = N0.getOperand(0);
	APInt Mask = ~N1C->getAPIntValue();
	Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits());
	if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
	SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
	N0.getValueType(), N0Op0);

	// Replace uses of the AND with uses of the Zero extend node.
	CombineTo(N, Zext);

	// We actually want to replace all uses of the any_extend with the
	// zero_extend, to avoid duplicating things. This will later cause this
	// AND to be folded.
	CombineTo(N0.getNode(), Zext);
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}
	// similarly fold (and (X (load ([non_ext\|any_ext\|zero_ext] V))), c) ->
	// (X (load ([non_ext\|zero_ext] V))) if 'and' only clears top bits which must
	// already be zero by virtue of the width of the base type of the load.
	//
	// the 'X' node here can either be nothing or an extract_vector_elt to catch
	// more cases.
	if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
	N0.getOperand(0).getOpcode() == ISD::LOAD &&
	N0.getOperand(0).getResNo() == 0) \|\|
	(N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
	LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
	N0 : N0.getOperand(0) );

	// Get the constant (if applicable) the zero'th operand is being ANDed with.
	// This can be a pure constant or a vector splat, in which case we treat the
	// vector as a scalar and use the splat value.
	APInt Constant = APInt::getNullValue(1);
	if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
	Constant = C->getAPIntValue();
	} else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
	APInt SplatValue, SplatUndef;
	unsigned SplatBitSize;
	bool HasAnyUndefs;
	bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef,
	SplatBitSize, HasAnyUndefs);
	if (IsSplat) {
	// Undef bits can contribute to a possible optimisation if set, so
	// set them.
	SplatValue \|= SplatUndef;

	// The splat value may be something like "0x00FFFFFF", which means 0 for
	// the first vector value and FF for the rest, repeating. We need a mask
	// that will apply equally to all members of the vector, so AND all the
	// lanes of the constant together.
	EVT VT = Vector->getValueType(0);
	unsigned BitWidth = VT.getScalarSizeInBits();

	// If the splat value has been compressed to a bitlength lower
	// than the size of the vector lane, we need to re-expand it to
	// the lane size.
	if (BitWidth > SplatBitSize)
	for (SplatValue = SplatValue.zextOrTrunc(BitWidth);
	SplatBitSize < BitWidth;
	SplatBitSize = SplatBitSize * 2)
	SplatValue \|= SplatValue.shl(SplatBitSize);

	// Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
	// multiple of 'BitWidth'. Otherwise, we could propagate a wrong value.
	if (SplatBitSize % BitWidth == 0) {
	Constant = APInt::getAllOnesValue(BitWidth);
	for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i)
	Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth);
	}
	}
	}

	// If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is
	// actually legal and isn't going to get expanded, else this is a false
	// optimisation.
	bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
	Load->getValueType(0),
	Load->getMemoryVT());

	// Resize the constant to the same size as the original memory access before
	// extension. If it is still the AllOnesValue then this AND is completely
	// unneeded.
	Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits());

	bool B;
	switch (Load->getExtensionType()) {
	default: B = false; break;
	case ISD::EXTLOAD: B = CanZextLoadProfitably; break;
	case ISD::ZEXTLOAD:
	case ISD::NON_EXTLOAD: B = true; break;
	}

	if (B && Constant.isAllOnesValue()) {
	// If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
	// preserve semantics once we get rid of the AND.
	SDValue NewLoad(Load, 0);

	// Fold the AND away. NewLoad may get replaced immediately.
	CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);

	if (Load->getExtensionType() == ISD::EXTLOAD) {
	NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
	Load->getValueType(0), SDLoc(Load),
	Load->getChain(), Load->getBasePtr(),
	Load->getOffset(), Load->getMemoryVT(),
	Load->getMemOperand());
	// Replace uses of the EXTLOAD with the new ZEXTLOAD.
	if (Load->getNumValues() == 3) {
	// PRE/POST_INC loads have 3 values.
	SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1),
	NewLoad.getValue(2) };
	CombineTo(Load, To, 3, true);
	} else {
	CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
	}
	}

	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	// fold (and (load x), 255) -> (zextload x, i8)
	// fold (and (extload x, i16), 255) -> (zextload x, i8)
	// fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
	if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD \|\|
	(N0.getOpcode() == ISD::ANY_EXTEND &&
	N0.getOperand(0).getOpcode() == ISD::LOAD))) {
	if (SDValue Res = ReduceLoadWidth(N)) {
	LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND
	? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0);

	AddToWorklist(N);
	CombineTo(LN0, Res, Res.getValue(1));
	return SDValue(N, 0);
	}
	}

	if (Level >= AfterLegalizeTypes) {
	// Attempt to propagate the AND back up to the leaves which, if they're
	// loads, can be combined to narrow loads and the AND node can be removed.
	// Perform after legalization so that extend nodes will already be
	// combined into the loads.
	if (BackwardsPropagateMask(N, DAG)) {
	return SDValue(N, 0);
	}
	}

	if (SDValue Combined = visitANDLike(N0, N1, N))
	return Combined;

	// Simplify: (and (op x...), (op y...)) -> (op (and x, y))
	if (N0.getOpcode() == N1.getOpcode())
	if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
	return Tmp;

	// Masking the negated extension of a boolean is just the zero-extended
	// boolean:
	// and (sub 0, zext(bool X)), 1 --> zext(bool X)
	// and (sub 0, sext(bool X)), 1 --> zext(bool X)
	//
	// Note: the SimplifyDemandedBits fold below can make an information-losing
	// transform, and then we have no way to find this better fold.
	if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
	if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) {
	SDValue SubRHS = N0.getOperand(1);
	if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
	SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
	return SubRHS;
	if (SubRHS.getOpcode() == ISD::SIGN_EXTEND &&
	SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
	return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0));
	}
	}

	// fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
	// fold (and (sra)) -> (and (srl)) when possible.
	if (SimplifyDemandedBits(SDValue(N, 0)))
	return SDValue(N, 0);

	// fold (zext_inreg (extload x)) -> (zextload x)
	if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	EVT MemVT = LN0->getMemoryVT();
	// If we zero all the possible extended bits, then we can turn this into
	// a zextload if we are running before legalize or the operation is legal.
	unsigned BitWidth = N1.getScalarValueSizeInBits();
	if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
	BitWidth - MemVT.getScalarSizeInBits())) &&
	((!LegalOperations && !LN0->isVolatile()) \|\|
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
	SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
	LN0->getChain(), LN0->getBasePtr(),
	MemVT, LN0->getMemOperand());
	AddToWorklist(N);
	CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}
	// fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
	if (ISD::isSEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
	N0.hasOneUse()) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	EVT MemVT = LN0->getMemoryVT();
	// If we zero all the possible extended bits, then we can turn this into
	// a zextload if we are running before legalize or the operation is legal.
	unsigned BitWidth = N1.getScalarValueSizeInBits();
	if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
	BitWidth - MemVT.getScalarSizeInBits())) &&
	((!LegalOperations && !LN0->isVolatile()) \|\|
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
	SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
	LN0->getChain(), LN0->getBasePtr(),
	MemVT, LN0->getMemOperand());
	AddToWorklist(N);
	CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}
	// fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
	if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
	if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
	N0.getOperand(1), false))
	return BSwap;
	}

	return SDValue();
	}

	/// Match (a >> 8) | (a << 8) as (bswap a) >> 16.
	///
	/// \p N supplies the result type and debug location; \p N0 and \p N1 are the
	/// two halves of the OR being matched (in either order). Each half may be
	/// wrapped in an AND with 0xFF / 0xFF00, either outside or inside the shift.
	///
	/// \p DemandHighBits tells whether the bits above the low halfword of the OR
	/// are observed by the user. visitAND passes false because it only calls this
	/// under an outer 0xffff mask.
	///
	/// Returns the replacement value, or an empty SDValue if the pattern does not
	/// match, the type is not i16/i32/i64, BSWAP is not legal-or-custom for the
	/// type, or LegalOperations is not set.
	SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
	                                        bool DemandHighBits) {
	  if (!LegalOperations)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16)
	    return SDValue();
	  if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
	    return SDValue();

	  // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff)
	  // After the swaps below, N0 is the SHL side and N1 is the SRL side whenever
	  // the outer nodes are ANDs.
	  bool LookPassAnd0 = false;
	  bool LookPassAnd1 = false;
	  if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
	    std::swap(N0, N1);
	  if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL)
	    std::swap(N0, N1);
	  if (N0.getOpcode() == ISD::AND) {
	    if (!N0.getNode()->hasOneUse())
	      return SDValue();
	    ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	    if (!N01C || N01C->getZExtValue() != 0xFF00)
	      return SDValue();
	    N0 = N0.getOperand(0);
	    LookPassAnd0 = true;
	  }

	  if (N1.getOpcode() == ISD::AND) {
	    if (!N1.getNode()->hasOneUse())
	      return SDValue();
	    ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
	    if (!N11C || N11C->getZExtValue() != 0xFF)
	      return SDValue();
	    N1 = N1.getOperand(0);
	    LookPassAnd1 = true;
	  }

	  // With the outer masks stripped, N0 must be the SHL and N1 the SRL.
	  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
	    std::swap(N0, N1);
	  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
	    return SDValue();
	  if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse())
	    return SDValue();

	  // Both shifts must be by the constant 8.
	  ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	  ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
	  if (!N01C || !N11C)
	    return SDValue();
	  if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8)
	    return SDValue();

	  // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
	  SDValue N00 = N0->getOperand(0);
	  if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
	    if (!N00.getNode()->hasOneUse())
	      return SDValue();
	    ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
	    if (!N001C || N001C->getZExtValue() != 0xFF)
	      return SDValue();
	    N00 = N00.getOperand(0);
	    LookPassAnd0 = true;
	  }

	  SDValue N10 = N1->getOperand(0);
	  if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
	    if (!N10.getNode()->hasOneUse())
	      return SDValue();
	    ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
	    if (!N101C || N101C->getZExtValue() != 0xFF00)
	      return SDValue();
	    N10 = N10.getOperand(0);
	    LookPassAnd1 = true;
	  }

	  // Both shifts must operate on the same value.
	  if (N00 != N10)
	    return SDValue();

	  // Make sure everything beyond the low halfword gets set to zero since the SRL
	  // 16 will clear the top bits.
	  unsigned OpSizeInBits = VT.getSizeInBits();
	  if (OpSizeInBits > 16) {
	    // If the left-shift isn't masked out then the only way this is a bswap is
	    // if all bits beyond the low 8 are 0. In that case the entire pattern
	    // reduces to a left shift anyway: leave it for other parts of the combiner.
	    // This only matters when the high bits of the result are observed.
	    if (DemandHighBits && !LookPassAnd0)
	      return SDValue();

	    // However, if the right shift isn't masked out then it might be because
	    // it's not needed. See if we can spot that too.
	    //
	    // Even when the high bits are not demanded, an unmasked (srl a, 8) moves
	    // bits 23:16 of 'a' into bits 15:8 of the result, which lie inside the low
	    // halfword. The bswap form produces only bits 7:0 of 'a' there, so bits
	    // 23:16 must be known zero. When the high bits are demanded, every bit
	    // from 16 upwards must be known zero.
	    if (!LookPassAnd1) {
	      unsigned HighBit = DemandHighBits ? OpSizeInBits : 24;
	      if (!DAG.MaskedValueIsZero(
	              N10, APInt::getBitsSet(OpSizeInBits, 16, HighBit)))
	        return SDValue();
	    }
	  }

	  SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
	  if (OpSizeInBits > 16) {
	    // The swapped halfword sits in the top 16 bits; bring it back down.
	    SDLoc DL(N);
	    Res = DAG.getNode(ISD::SRL, DL, VT, Res,
	                      DAG.getConstant(OpSizeInBits - 16, DL,
	                                      getShiftAmountTy(VT)));
	  }
	  return Res;
	}

	/// Check whether \p N is one of the four terms of a 32-bit packed halfword
	/// byteswap:
	///   ((x & 0x000000ff) << 8) |
	///   ((x & 0x0000ff00) >> 8) |
	///   ((x & 0x00ff0000) << 8) |
	///   ((x & 0xff000000) >> 8)
	/// Both the mask-then-shift and the shift-then-mask spellings are accepted.
	/// On success the node feeding the inner operation is stored in the \p Parts
	/// slot selected by the mask constant, and true is returned. A slot that is
	/// already occupied makes the match fail.
	static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
	  if (!N.getNode()->hasOneUse())
	    return false;

	  auto IsMaskOrShift = [](unsigned Opcode) {
	    return Opcode == ISD::AND || Opcode == ISD::SHL || Opcode == ISD::SRL;
	  };

	  const unsigned OuterOpc = N.getOpcode();
	  if (!IsMaskOrShift(OuterOpc))
	    return false;

	  SDValue Inner = N.getOperand(0);
	  const unsigned InnerOpc = Inner.getOpcode();
	  if (!IsMaskOrShift(InnerOpc))
	    return false;

	  // The byte mask lives on the outer node if that is the AND, otherwise on
	  // the inner node when the outer node is a shift.
	  ConstantSDNode *MaskC = nullptr;
	  if (OuterOpc == ISD::AND)
	    MaskC = dyn_cast<ConstantSDNode>(N.getOperand(1));
	  else if (InnerOpc == ISD::AND)
	    MaskC = dyn_cast<ConstantSDNode>(Inner.getOperand(1));
	  if (!MaskC)
	    return false;

	  // Translate the mask constant into a slot index.
	  const uint64_t MaskVal = MaskC->getZExtValue();
	  unsigned ByteIdx;
	  if (MaskVal == 0xFF)
	    ByteIdx = 0;
	  else if (MaskVal == 0xFF00)
	    ByteIdx = 1;
	  else if (MaskVal == 0xFF0000)
	    ByteIdx = 2;
	  else if (MaskVal == 0xFF000000)
	    ByteIdx = 3;
	  else
	    return false;

	  // Slots 0 and 2 pair with SRL under an outer AND and with an outer SHL;
	  // slots 1 and 3 pair with SHL under an outer AND and with an outer SRL.
	  const bool EvenSlot = (ByteIdx == 0 || ByteIdx == 2);
	  SDValue ShiftAmt;
	  if (OuterOpc == ISD::AND) {
	    // (x >> 8) & 0xff, (x >> 8) & 0xff0000,
	    // (x << 8) & 0xff00, (x << 8) & 0xff000000
	    const unsigned WantedShift = EvenSlot ? ISD::SRL : ISD::SHL;
	    if (InnerOpc != WantedShift)
	      return false;
	    ShiftAmt = Inner.getOperand(1);
	  } else {
	    // (x & 0xff) << 8, (x & 0xff0000) << 8,
	    // (x & 0xff00) >> 8, (x & 0xff000000) >> 8
	    const bool OuterIsShl = (OuterOpc == ISD::SHL);
	    if (OuterIsShl != EvenSlot)
	      return false;
	    ShiftAmt = N.getOperand(1);
	  }

	  // Whatever the spelling, the shift amount must be the constant 8.
	  ConstantSDNode *AmtC = dyn_cast<ConstantSDNode>(ShiftAmt);
	  if (!AmtC || AmtC->getZExtValue() != 8)
	    return false;

	  if (Parts[ByteIdx])
	    return false;

	  Parts[ByteIdx] = Inner.getOperand(0).getNode();
	  return true;
	}

	/// Recognize a 32-bit packed halfword byteswap, i.e.
	///   ((x & 0x000000ff) << 8) |
	///   ((x & 0x0000ff00) >> 8) |
	///   ((x & 0x00ff0000) << 8) |
	///   ((x & 0xff000000) >> 8)
	/// and rewrite it as (rotl (bswap x), 16).
	///
	/// \p N is the root OR and \p N0 / \p N1 are its operands. Returns an empty
	/// SDValue when LegalOperations is not set, the type is not i32, BSWAP is not
	/// legal-or-custom, or the operand tree does not have the expected shape.
	SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
	  if (!LegalOperations)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i32 || !TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
	    return SDValue();

	  // Two tree shapes are accepted:
	  //   (or (or (and), (and)), (or (and), (and)))
	  //   (or (or (or (and), (and)), (and)), (and))
	  // Either way the left operand has to be an OR.
	  if (N0.getOpcode() != ISD::OR)
	    return SDValue();
	  SDValue N00 = N0.getOperand(0);
	  SDValue N01 = N0.getOperand(1);
	  SDNode *Parts[4] = {};

	  bool AllMatched;
	  if (N1.getOpcode() == ISD::OR &&
	      N00.getNumOperands() == 2 && N01.getNumOperands() == 2) {
	    // Balanced form: (or (or (and), (and)), (or (and), (and)))
	    AllMatched = isBSwapHWordElement(N00, Parts) &&
	                 isBSwapHWordElement(N01, Parts) &&
	                 isBSwapHWordElement(N1.getOperand(0), Parts) &&
	                 isBSwapHWordElement(N1.getOperand(1), Parts);
	  } else {
	    // Left-leaning form: (or (or (or (and), (and)), (and)), (and))
	    AllMatched = isBSwapHWordElement(N1, Parts) &&
	                 isBSwapHWordElement(N01, Parts) &&
	                 N00.getOpcode() == ISD::OR &&
	                 isBSwapHWordElement(N00.getOperand(0), Parts) &&
	                 isBSwapHWordElement(N00.getOperand(1), Parts);
	  }
	  if (!AllMatched)
	    return SDValue();

	  // Every term has to read the same source node.
	  SDNode *Src = Parts[0];
	  if (Src != Parts[1] || Src != Parts[2] || Src != Parts[3])
	    return SDValue();

	  SDLoc DL(N);
	  SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT,
	                              SDValue(Src, 0));

	  // The bswap result has to be rotated by 16. Prefer ROTL, then ROTR; if the
	  // target has neither, fall back to (x << 16) | (x >> 16).
	  SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT));
	  const unsigned RotateOpcodes[] = {ISD::ROTL, ISD::ROTR};
	  for (unsigned RotOpc : RotateOpcodes)
	    if (TLI.isOperationLegalOrCustom(RotOpc, VT))
	      return DAG.getNode(RotOpc, DL, VT, BSwap, ShAmt);

	  return DAG.getNode(ISD::OR, DL, VT,
	                     DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt),
	                     DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt));
	}

	/// Folds that collapse two values joined by an OR into a single value.
	/// \p N0 and \p N1 are the values being OR'ed; \p N only provides the debug
	/// location. Returns an empty SDValue when nothing applies.
	/// \see visitANDLike().
	SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
	  EVT VT = N1.getValueType();
	  SDLoc DL(N);

	  // fold (or x, undef) -> -1
	  if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
	    return DAG.getAllOnesConstant(DL, VT);

	  if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
	    return V;

	  // Both remaining folds need an AND on each side, and at least one of those
	  // ANDs must be single-use so that we do not increase the computation count.
	  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
	    return SDValue();
	  if (!N0.getNode()->hasOneUse() && !N1.getNode()->hasOneUse())
	    return SDValue();

	  SDValue X = N0.getOperand(0);
	  SDValue Y = N1.getOperand(0);

	  // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
	  const ConstantSDNode *C1 = getAsNonOpaqueConstant(N0.getOperand(1));
	  const ConstantSDNode *C2 =
	      C1 ? getAsNonOpaqueConstant(N1.getOperand(1)) : nullptr;
	  if (C1 && C2) {
	    const APInt &LHSMask = C1->getAPIntValue();
	    const APInt &RHSMask = C2->getAPIntValue();

	    // Only valid when the bits of X selected by C2 but not by C1 are known
	    // zero, and symmetrically for Y.
	    if (DAG.MaskedValueIsZero(X, RHSMask&~LHSMask) &&
	        DAG.MaskedValueIsZero(Y, LHSMask&~RHSMask)) {
	      SDValue Merged = DAG.getNode(ISD::OR, SDLoc(N0), VT, X, Y);
	      return DAG.getNode(ISD::AND, DL, VT, Merged,
	                         DAG.getConstant(LHSMask | RHSMask, DL, VT));
	    }
	  }

	  // (or (and X, M), (and X, N)) -> (and X, (or M, N))
	  if (X == Y) {
	    SDValue MergedMask = DAG.getNode(ISD::OR, SDLoc(N0), VT,
	                                     N0.getOperand(1), N1.getOperand(1));
	    return DAG.getNode(ISD::AND, DL, VT, X, MergedMask);
	  }

	  return SDValue();
	}

	/// Combine an ISD::OR node.
	///
	/// Tries, in order: trivial identities, vector-specific folds (including
	/// merging two zero-blended shuffles), constant folding and canonicalization,
	/// select folding, known-bits folds, the visitORLike folds, bswap idioms,
	/// reassociation, (or (and X, c1), c2) canonicalization, same-opcode hoisting,
	/// rotate idioms, load combining and finally demanded-bits simplification.
	///
	/// Returns the replacement value, SDValue(N, 0) if N was updated in place, or
	/// an empty SDValue if no combine applied.
	SDValue DAGCombiner::visitOR(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N1.getValueType();

	  // x | x --> x
	  if (N0 == N1)
	    return N0;

	  // fold vector ops
	  if (VT.isVector()) {
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	    // fold (or x, 0) -> x, vector edition
	    if (ISD::isBuildVectorAllZeros(N0.getNode()))
	      return N1;
	    if (ISD::isBuildVectorAllZeros(N1.getNode()))
	      return N0;

	    // fold (or x, -1) -> -1, vector edition
	    if (ISD::isBuildVectorAllOnes(N0.getNode()))
	      // do not return N0, because undef node may exist in N0
	      return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType());
	    if (ISD::isBuildVectorAllOnes(N1.getNode()))
	      // do not return N1, because undef node may exist in N1
	      return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());

	    // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
	    // Do this only if the resulting shuffle is legal.
	    if (isa<ShuffleVectorSDNode>(N0) &&
	        isa<ShuffleVectorSDNode>(N1) &&
	        // Avoid folding a node with illegal type.
	        TLI.isTypeLegal(VT)) {
	      bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
	      bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
	      bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
	      bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
	      // Ensure both shuffles have a zero input.
	      if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
	        assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
	        assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
	        const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
	        const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
	        bool CanFold = true;
	        int NumElts = VT.getVectorNumElements();
	        SmallVector<int, 4> Mask(NumElts);

	        for (int i = 0; i != NumElts; ++i) {
	          int M0 = SV0->getMaskElt(i);
	          int M1 = SV1->getMaskElt(i);

	          // Determine if either index is pointing to a zero vector.
	          bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
	          bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));

	          // If one element is zero and the otherside is undef, keep undef.
	          // This also handles the case that both are undef.
	          if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) {
	            Mask[i] = -1;
	            continue;
	          }

	          // Make sure only one of the elements is zero.
	          if (M0Zero == M1Zero) {
	            CanFold = false;
	            break;
	          }

	          assert((M0 >= 0 || M1 >= 0) && "Undef index!");

	          // We have a zero and non-zero element. If the non-zero came from
	          // SV0 make the index a LHS index. If it came from SV1, make it
	          // a RHS index. We need to mod by NumElts because we don't care
	          // which operand it came from in the original shuffles.
	          Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
	        }

	        if (CanFold) {
	          SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
	          SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);

	          // If the mask is not legal as built, try the commuted form once.
	          bool LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
	          if (!LegalMask) {
	            std::swap(NewLHS, NewRHS);
	            ShuffleVectorSDNode::commuteMask(Mask);
	            LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
	          }

	          if (LegalMask)
	            return DAG.getVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS, Mask);
	        }
	      }
	    }
	  }

	  // fold (or c1, c2) -> c1|c2
	  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
	  if (N0C && N1C && !N1C->isOpaque())
	    return DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, N0C, N1C);
	  // canonicalize constant to RHS
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
	    return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
	  // fold (or x, 0) -> x
	  if (isNullConstant(N1))
	    return N0;
	  // fold (or x, -1) -> -1
	  if (isAllOnesConstant(N1))
	    return N1;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // fold (or x, c) -> c iff (x & ~c) == 0
	  if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
	    return N1;

	  if (SDValue Combined = visitORLike(N0, N1, N))
	    return Combined;

	  // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
	  if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
	    return BSwap;
	  if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
	    return BSwap;

	  // reassociate or
	  if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1))
	    return ROR;

	  // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
	  // iff (c1 & c2) != 0.
	  // The predicate receives constant nodes by pointer, like the MatchRotateSum
	  // predicate that MatchRotate passes to matchBinaryPredicate.
	  auto MatchIntersect = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
	    return LHS->getAPIntValue().intersects(RHS->getAPIntValue());
	  };
	  if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
	      matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect)) {
	    if (SDValue COR = DAG.FoldConstantArithmetic(
	            ISD::OR, SDLoc(N1), VT, N1.getNode(), N0.getOperand(1).getNode())) {
	      SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
	      AddToWorklist(IOR.getNode());
	      return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
	    }
	  }

	  // Simplify: (or (op x...), (op y...)) -> (op (or x, y))
	  if (N0.getOpcode() == N1.getOpcode())
	    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
	      return Tmp;

	  // See if this is some rotate idiom.
	  if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
	    return SDValue(Rot, 0);

	  if (SDValue Load = MatchLoadCombine(N))
	    return Load;

	  // Simplify the operands using demanded-bits information.
	  if (SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  return SDValue();
	}

	/// Match one half of a rotate: "(X shl/srl V1) & V2", where the "& V2" part
	/// is optional. On success \p Shift receives the shift node and, if an AND
	/// with a constant (or constant build vector) was peeled off, \p Mask
	/// receives that constant. \p Mask is left untouched when there is no AND.
	bool DAGCombiner::MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
	  if (Op.getOpcode() == ISD::AND) {
	    // Only a constant mask can be looked through.
	    SDValue MaskCandidate = Op.getOperand(1);
	    if (!DAG.isConstantIntBuildVectorOrConstantInt(MaskCandidate))
	      return false;
	    Mask = MaskCandidate;
	    Op = Op.getOperand(0);
	  }

	  switch (Op.getOpcode()) {
	  case ISD::SRL:
	  case ISD::SHL:
	    Shift = Op;
	    return true;
	  default:
	    return false;
	  }
	}

	// Return true if we can prove that, whenever Neg and Pos are both in the
	// range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). When that holds,
	// for two opposing shifts shift1 and shift2 and a value X with OpBits bits,
	//
	//   (or (shift1 X, Neg), (shift2 X, Pos))
	//
	// is a rotate in direction shift2 by Pos or, equivalently, a rotate in
	// direction shift1 by Neg. Restricting to [0, EltSize) means only shift
	// amounts with defined behavior have to be considered.
	//
	// Two proof strategies are used:
	//
	// [A] If EltSize is a power of 2 and Neg is (and Neg', EltSize-1), then
	//     (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1) and
	//     Neg == Neg & (EltSize - 1) for in-range Neg, so it suffices to show
	//
	//       Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1)
	//
	//     for all Neg and Pos. Since the masked value of Neg equals the masked
	//     value of Neg', Neg' simply replaces Neg from then on.
	//
	// [B] Otherwise the stronger condition
	//
	//       Neg == EltSize - Pos
	//
	//     is checked for all Neg and Pos. The (or ...) then has undefined behavior
	//     when Pos == 0 (and so Neg == EltSize).
	//
	// [A] could be used for every power-of-2 EltSize, but the only additional
	// matches would be cases where Neg and Pos are never in range together. E.g.
	// with EltSize == 32 it would accept Neg == (sub 64, Pos) as well as
	// (sub 32, Pos), yet (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos)) always
	// invokes undefined behavior for 32-bit X.
	static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
	  // True if V is a constant (or constant splat) equal to EltSize - 1.
	  auto IsEltSizeMinusOne = [EltSize](SDValue V) {
	    ConstantSDNode *C = isConstOrConstSplat(V);
	    return C && C->getAPIntValue() == EltSize - 1;
	  };

	  // Number of low bits kept by Mask when strategy [A] is in effect; zero means
	  // Mask is all-ones, i.e. strategy [B].
	  unsigned MaskLoBits = 0;
	  if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize) &&
	      IsEltSizeMinusOne(Neg.getOperand(1))) {
	    Neg = Neg.getOperand(0);
	    MaskLoBits = Log2_64(EltSize);
	  }

	  // Neg must now look like (sub NegC, NegOp1) with a constant NegC.
	  if (Neg.getOpcode() != ISD::SUB)
	    return false;
	  ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
	  if (!NegC)
	    return false;
	  SDValue NegOp1 = Neg.getOperand(1);

	  // Under [A], a Pos of the form Pos' & (EltSize - 1) can be replaced by Pos':
	  // the truncation does not affect the masked equality.
	  if (MaskLoBits && Pos.getOpcode() == ISD::AND &&
	      IsEltSizeMinusOne(Pos.getOperand(1)))
	    Pos = Pos.getOperand(0);

	  // The condition to prove is now
	  //
	  //   (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
	  //
	  // "x & Mask" is a truncation and distributes through subtraction, so the
	  // check reduces to EltSize & Mask == Width & Mask for a constant Width.
	  APInt Width;
	  if (Pos == NegOp1) {
	    // Pos cancels against NegOp1: Width is NegC.
	    Width = NegC->getAPIntValue();
	  } else {
	    // Otherwise Pos must be (add NegOp1, PosC). The condition becomes
	    //
	    //   NegC & Mask == (EltSize - PosC) & Mask
	    //   EltSize & Mask == (NegC + PosC) & Mask
	    if (Pos.getOpcode() != ISD::ADD || Pos.getOperand(0) != NegOp1)
	      return false;
	    ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1));
	    if (!PosC)
	      return false;
	    Width = PosC->getAPIntValue() + NegC->getAPIntValue();
	  }

	  // Under [A], EltSize & Mask is 0 because Mask is EltSize - 1; under [B] the
	  // comparison is exact.
	  return MaskLoBits ? Width.getLoBits(MaskLoBits) == 0 : Width == EltSize;
	}

	// Helper for MatchRotate, used once an OR of two opposite shifts of Shifted
	// has been found. If Neg == <operand size> - Pos, the OR is equivalent to
	// both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg); the former is
	// chosen when the target supports it. InnerPos and InnerNeg are Pos and Neg
	// with any outer conversion stripped. Returns nullptr if the relation between
	// the two shift amounts cannot be proven.
	//
	// Patterns handled:
	//   (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y))))
	//     -> (rotl x, y) or (rotr x, (sub 32, y))
	//   (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y)))
	//     -> (rotr x, y) or (rotl x, (sub 32, y))
	SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
	                                       SDValue Neg, SDValue InnerPos,
	                                       SDValue InnerNeg, unsigned PosOpcode,
	                                       unsigned NegOpcode, const SDLoc &DL) {
	  EVT VT = Shifted.getValueType();
	  if (!matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits()))
	    return nullptr;

	  // Prefer the "positive" rotate when the target has it.
	  if (TLI.isOperationLegalOrCustom(PosOpcode, VT))
	    return DAG.getNode(PosOpcode, DL, VT, Shifted, Pos).getNode();
	  return DAG.getNode(NegOpcode, DL, VT, Shifted, Neg).getNode();
	}

	// MatchRotate - Given the two operands of an 'or', try to recognize one of
	// the many rotate idioms. If the target has a rotate instruction for the
	// type, build and return a rot[lr] node (possibly followed by an AND, or
	// wrapped in a TRUNCATE); otherwise return nullptr.
	SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
	  // Rotates only work on legal types; expanded or promoted values are out.
	  EVT VT = LHS.getValueType();
	  if (!TLI.isTypeLegal(VT))
	    return nullptr;

	  // At least one rotate direction must be available.
	  const bool HasROTL = TLI.isOperationLegalOrCustom(ISD::ROTL, VT);
	  const bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT);
	  if (!(HasROTL || HasROTR))
	    return nullptr;

	  // Truncated rotate: match on the wide operands and truncate the result.
	  if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
	      LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
	    assert(LHS.getValueType() == RHS.getValueType());
	    if (SDNode *Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
	      SDValue Wide(Rot, 0);
	      return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(),
	                         Wide).getNode();
	    }
	  }

	  // Each side must be "(X shl/srl V1) & V2", where the "& V2" is optional.
	  SDValue LHSShift, LHSMask; // The shift and the AND value, if any.
	  if (!MatchRotateHalf(LHS, LHSShift, LHSMask))
	    return nullptr;

	  SDValue RHSShift, RHSMask; // The shift and the AND value, if any.
	  if (!MatchRotateHalf(RHS, RHSShift, RHSMask))
	    return nullptr;

	  // Both halves must shift the same value, in opposite directions.
	  if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
	    return nullptr;
	  if (LHSShift.getOpcode() == RHSShift.getOpcode())
	    return nullptr;

	  // Put the shl on the left-hand side of the shl/srl pair.
	  if (RHSShift.getOpcode() == ISD::SHL) {
	    std::swap(LHS, RHS);
	    std::swap(LHSShift, RHSShift);
	    std::swap(LHSMask, RHSMask);
	  }

	  const unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  SDValue LHSShiftArg = LHSShift.getOperand(0);
	  SDValue LHSShiftAmt = LHSShift.getOperand(1);
	  SDValue RHSShiftArg = RHSShift.getOperand(0);
	  SDValue RHSShiftAmt = RHSShift.getOperand(1);

	  // Constant amounts that add up to the element size:
	  //   (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
	  //   (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
	  auto AmountsSumToEltSize = [EltSizeInBits](ConstantSDNode *A,
	                                             ConstantSDNode *B) {
	    return (A->getAPIntValue() + B->getAPIntValue()) == EltSizeInBits;
	  };
	  if (matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, AmountsSumToEltSize)) {
	    const unsigned RotOpc = HasROTL ? ISD::ROTL : ISD::ROTR;
	    SDValue Rot = DAG.getNode(RotOpc, DL, VT, LHSShiftArg,
	                              HasROTL ? LHSShiftAmt : RHSShiftAmt);

	    // If either shifted operand was masked, re-apply an equivalent mask to
	    // the rotate result.
	    const bool HasLHSMask = LHSMask.getNode() != nullptr;
	    const bool HasRHSMask = RHSMask.getNode() != nullptr;
	    if (HasLHSMask || HasRHSMask) {
	      SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
	      SDValue Mask = AllOnes;

	      if (HasLHSMask) {
	        SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt);
	        SDValue Keep = DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits);
	        Mask = DAG.getNode(ISD::AND, DL, VT, Mask, Keep);
	      }
	      if (HasRHSMask) {
	        SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt);
	        SDValue Keep = DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits);
	        Mask = DAG.getNode(ISD::AND, DL, VT, Mask, Keep);
	      }

	      Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask);
	    }

	    return Rot.getNode();
	  }

	  // With a variable shift amount we cannot tell whether a mask clears the
	  // right bits, so give up if either side is masked.
	  if (LHSMask.getNode() || RHSMask.getNode())
	    return nullptr;

	  // If both shift amounts are sign/zero/any-extended or truncated, look
	  // through the conversion.
	  auto IsExtOrTrunc = [](unsigned Opcode) {
	    return Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
	           Opcode == ISD::ANY_EXTEND || Opcode == ISD::TRUNCATE;
	  };
	  SDValue LExtOp0 = LHSShiftAmt;
	  SDValue RExtOp0 = RHSShiftAmt;
	  if (IsExtOrTrunc(LHSShiftAmt.getOpcode()) &&
	      IsExtOrTrunc(RHSShiftAmt.getOpcode())) {
	    LExtOp0 = LHSShiftAmt.getOperand(0);
	    RExtOp0 = RHSShiftAmt.getOperand(0);
	  }

	  // Try treating the shl amount as the rotate amount first, then the srl one.
	  if (SDNode *TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
	                                       LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR,
	                                       DL))
	    return TryL;

	  return MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
	                           RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL);
	}

	namespace {

	/// Describes where a single byte of a load combine candidate comes from: it is
	/// either a known constant zero or a byte read from memory by a load node.
	struct ByteProvider {
	  // Load is the node that reads the byte from memory; it stays nullptr for a
	  // constant zero provider. ByteOffset is the position of the byte inside the
	  // value produced by that load.
	  LoadSDNode *Load = nullptr;
	  unsigned ByteOffset = 0;

	  ByteProvider() = default;

	  /// Build a provider for byte \p ByteOffset of the value produced by \p Load.
	  static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
	    ByteProvider Provider(Load, ByteOffset);
	    return Provider;
	  }

	  /// Build a provider that stands for a constant zero byte (no load attached).
	  static ByteProvider getConstantZero() { return getMemory(nullptr, 0); }

	  /// True when no load is attached, i.e. the byte is a constant zero.
	  bool isConstantZero() const { return Load == nullptr; }

	  /// True when the byte is produced by a load.
	  bool isMemory() const { return Load != nullptr; }

	  /// Two providers are equal when they name the same load and byte offset.
	  bool operator==(const ByteProvider &Other) const {
	    return Load == Other.Load && ByteOffset == Other.ByteOffset;
	  }

	private:
	  // Only reachable through the named factory functions above.
	  ByteProvider(LoadSDNode *L, unsigned Offset) : Load(L), ByteOffset(Offset) {}
	};

	} // end anonymous namespace

	/// Recursively traverses the expression calculating the origin of the requested
	/// byte of the given value. Returns None if the provider can't be calculated.
	///
	/// For all the values except the root of the expression verifies that the value
	/// has exactly one use and if it's not true return None. This way if the origin
	/// of the byte is returned it's guaranteed that the values which contribute to
	/// the byte are not used outside of this expression.
	///
	/// Because the parts of the expression are not allowed to have more than one
	/// use this function iterates over trees, not DAGs. So it never visits the same
	/// node more than once.
	///
	/// \param Op    scalar integer value whose byte is being traced.
	/// \param Index index of the requested byte inside \p Op (0 is the least
	///              significant byte); must be smaller than the byte width of Op.
	/// \param Depth current recursion depth; the walk gives up at depth 10.
	/// \param Root  true only for the root of the expression, which is exempt from
	///              the single-use requirement.
	/// \returns the provider of the byte, or None when it cannot be determined.
	static const Optional<ByteProvider>
	calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
	                      bool Root = false) {
	  // Typical i64 by i8 pattern requires recursion up to 8 calls depth
	  if (Depth == 10)
	    return None;

	  if (!Root && !Op.hasOneUse())
	    return None;

	  assert(Op.getValueType().isScalarInteger() && "can't handle other types");
	  unsigned BitWidth = Op.getValueSizeInBits();
	  if (BitWidth % 8 != 0)
	    return None;
	  unsigned ByteWidth = BitWidth / 8;
	  assert(Index < ByteWidth && "invalid index requested");
	  (void) ByteWidth;

	  switch (Op.getOpcode()) {
	  case ISD::OR: {
	    // A byte of an OR is only traceable when one side contributes a constant
	    // zero for that byte; the other side is then the provider.
	    auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
	    if (!LHS)
	      return None;
	    auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
	    if (!RHS)
	      return None;

	    if (LHS->isConstantZero())
	      return RHS;
	    if (RHS->isConstantZero())
	      return LHS;
	    return None;
	  }
	  case ISD::SHL: {
	    // Only shifts by a constant whole number of bytes are handled.
	    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
	    if (!ShiftOp)
	      return None;

	    uint64_t BitShift = ShiftOp->getZExtValue();
	    if (BitShift % 8 != 0)
	      return None;
	    uint64_t ByteShift = BitShift / 8;

	    // Bytes below the shift amount are zero; the rest come from the shifted
	    // operand at a correspondingly lower index.
	    return Index < ByteShift
	               ? ByteProvider::getConstantZero()
	               : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
	                                       Depth + 1);
	  }
	  case ISD::ANY_EXTEND:
	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND: {
	    SDValue NarrowOp = Op->getOperand(0);
	    unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
	    if (NarrowBitWidth % 8 != 0)
	      return None;
	    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

	    // Bytes introduced by the extension are known (zero) only for zext.
	    if (Index >= NarrowByteWidth)
	      return Op.getOpcode() == ISD::ZERO_EXTEND
	                 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
	                 : None;
	    return calculateByteProvider(NarrowOp, Index, Depth + 1);
	  }
	  case ISD::BSWAP:
	    // Byte Index of the swapped value is the mirrored byte of the operand.
	    return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
	                                 Depth + 1);
	  case ISD::LOAD: {
	    auto L = cast<LoadSDNode>(Op.getNode());
	    if (L->isVolatile() || L->isIndexed())
	      return None;

	    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
	    if (NarrowBitWidth % 8 != 0)
	      return None;
	    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

	    // Bytes beyond the memory width are known (zero) only for a zextload.
	    if (Index >= NarrowByteWidth)
	      return L->getExtensionType() == ISD::ZEXTLOAD
	                 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
	                 : None;
	    return ByteProvider::getMemory(L, Index);
	  }
	  }

	  // Any other opcode is not understood by this analysis.
	  return None;
	}

	/// Match a pattern where a wide type scalar value is loaded by several narrow
	/// loads and combined by shifts and ors. Fold it into a single load or a load
	/// and a BSWAP if the targets supports it.
	///
	/// Assuming little endian target:
	///  i8 *a = ...
	///  i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
	/// =>
	///  i32 val = *((i32 *)a)
	///
	///  i8 *a = ...
	///  i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
	/// =>
	///  i32 val = BSWAP(*((i32 *)a))
	///
	/// Returns the replacement value (the new load, optionally wrapped in a BSWAP)
	/// or an empty SDValue when the pattern does not match or is not profitable.
	///
	/// TODO: This rule matches complex patterns with OR node roots and doesn't
	/// interact well with the worklist mechanism. When a part of the pattern is
	/// updated (e.g. one of the loads) its direct users are put into the worklist,
	/// but the root node of the pattern which triggers the load combine is not
	/// necessarily a direct user of the changed node. For example, once the address
	/// of t28 load is reassociated load combine won't be triggered:
	///             t25: i32 = add t4, Constant:i32<2>
	///           t26: i64 = sign_extend t25
	///         t27: i64 = add t2, t26
	///       t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
	///     t29: i32 = zero_extend t28
	///   t32: i32 = shl t29, Constant:i8<8>
	/// t33: i32 = or t23, t32
	/// As a possible fix visitLoad can check if the load can be a part of a load
	/// combine pattern and add corresponding OR roots to the worklist.
	SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
	  assert(N->getOpcode() == ISD::OR &&
	         "Can only match load combining against OR nodes");

	  // Handles simple types only
	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();
	  unsigned ByteWidth = VT.getSizeInBits() / 8;

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  // Before legalize we can introduce too wide illegal loads which will be later
	  // split into legal sized loads. This enables us to combine i64 load by i8
	  // patterns to a couple of i32 loads on 32 bit targets.
	  if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT))
	    return SDValue();

	  // Map the index of a byte inside a BW-byte value to its memory offset for
	  // each endianness.
	  std::function<unsigned(unsigned, unsigned)> LittleEndianByteAt = [](
	    unsigned BW, unsigned i) { return i; };
	  std::function<unsigned(unsigned, unsigned)> BigEndianByteAt = [](
	    unsigned BW, unsigned i) { return BW - i - 1; };

	  bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
	  // Memory offset, relative to the load address, of the byte described by P.
	  auto MemoryByteOffset = [&] (ByteProvider P) {
	    assert(P.isMemory() && "Must be a memory byte provider");
	    unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
	    assert(LoadBitWidth % 8 == 0 &&
	           "can only analyze providers for individual bytes not bit");
	    unsigned LoadByteWidth = LoadBitWidth / 8;
	    return IsBigEndianTarget
	            ? BigEndianByteAt(LoadByteWidth, P.ByteOffset)
	            : LittleEndianByteAt(LoadByteWidth, P.ByteOffset);
	  };

	  Optional<BaseIndexOffset> Base;
	  SDValue Chain;

	  SmallSet<LoadSDNode *, 8> Loads;
	  Optional<ByteProvider> FirstByteProvider;
	  int64_t FirstOffset = INT64_MAX;

	  // Check if all the bytes of the OR we are looking at are loaded from the same
	  // base address. Collect bytes offsets from Base address in ByteOffsets.
	  SmallVector<int64_t, 4> ByteOffsets(ByteWidth);
	  for (unsigned i = 0; i < ByteWidth; i++) {
	    auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
	    if (!P || !P->isMemory()) // All the bytes must be loaded from memory
	      return SDValue();

	    LoadSDNode *L = P->Load;
	    assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() &&
	           "Must be enforced by calculateByteProvider");
	    assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");

	    // All loads must share the same chain
	    SDValue LChain = L->getChain();
	    if (!Chain)
	      Chain = LChain;
	    else if (Chain != LChain)
	      return SDValue();

	    // Loads must share the same base address
	    BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
	    int64_t ByteOffsetFromBase = 0;
	    if (!Base)
	      Base = Ptr;
	    else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
	      return SDValue();

	    // Calculate the offset of the current byte from the base address
	    ByteOffsetFromBase += MemoryByteOffset(*P);
	    ByteOffsets[i] = ByteOffsetFromBase;

	    // Remember the first byte load
	    if (ByteOffsetFromBase < FirstOffset) {
	      FirstByteProvider = P;
	      FirstOffset = ByteOffsetFromBase;
	    }

	    Loads.insert(L);
	  }
	  assert(!Loads.empty() && "All the bytes of the value must be loaded from "
	         "memory, so there must be at least one load which produces the value");
	  assert(Base && "Base address of the accessed memory location must be set");
	  assert(FirstOffset != INT64_MAX && "First byte offset must be set");

	  // Check if the bytes of the OR we are looking at match with either big or
	  // little endian value load
	  bool BigEndian = true, LittleEndian = true;
	  for (unsigned i = 0; i < ByteWidth; i++) {
	    int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
	    LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i);
	    BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i);
	    if (!BigEndian && !LittleEndian)
	      return SDValue();
	  }
	  assert((BigEndian != LittleEndian) && "should be either or");
	  assert(FirstByteProvider && "must be set");

	  // Ensure that the first byte is loaded from zero offset of the first load.
	  // So the combined value can be loaded from the first load address.
	  if (MemoryByteOffset(*FirstByteProvider) != 0)
	    return SDValue();
	  LoadSDNode *FirstLoad = FirstByteProvider->Load;

	  // The node we are looking at matches with the pattern, check if we can
	  // replace it with a single load and bswap if needed.

	  // If the load needs byte swap check if the target supports it
	  bool NeedsBswap = IsBigEndianTarget != BigEndian;

	  // Before legalize we can introduce illegal bswaps which will be later
	  // converted to an explicit bswap sequence. This way we end up with a single
	  // load and byte shuffling instead of several loads and byte shuffling.
	  if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
	    return SDValue();

	  // Check that a load of the wide type is both allowed and fast on the target
	  bool Fast = false;
	  bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
	                                        VT, FirstLoad->getAddressSpace(),
	                                        FirstLoad->getAlignment(), &Fast);
	  if (!Allowed || !Fast)
	    return SDValue();

	  SDValue NewLoad =
	      DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
	                  FirstLoad->getPointerInfo(), FirstLoad->getAlignment());

	  // Transfer chain users from old loads to the new load.
	  for (LoadSDNode *L : Loads)
	    DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));

	  return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad;
	}

	/// Try to simplify the XOR node \p N.
	///
	/// Returns the value that should replace N, SDValue(N, 0) when
	/// SimplifyDemandedBits reports that it changed something, or an empty SDValue
	/// when none of the folds below applies.
	SDValue DAGCombiner::visitXOR(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();

	  // fold vector ops
	  if (VT.isVector()) {
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	    // fold (xor x, 0) -> x, vector edition
	    if (ISD::isBuildVectorAllZeros(N0.getNode()))
	      return N1;
	    if (ISD::isBuildVectorAllZeros(N1.getNode()))
	      return N0;
	  }

	  // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
	  if (N0.isUndef() && N1.isUndef())
	    return DAG.getConstant(0, SDLoc(N), VT);
	  // fold (xor x, undef) -> undef
	  if (N0.isUndef())
	    return N0;
	  if (N1.isUndef())
	    return N1;
	  // fold (xor c1, c2) -> c1^c2
	  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
	  if (N0C && N1C)
	    return DAG.FoldConstantArithmetic(ISD::XOR, SDLoc(N), VT, N0C, N1C);
	  // canonicalize constant to RHS
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
	     !DAG.isConstantIntBuildVectorOrConstantInt(N1))
	    return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0);
	  // fold (xor x, 0) -> x
	  if (isNullConstant(N1))
	    return N0;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // reassociate xor
	  if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1))
	    return RXOR;

	  // fold !(x cc y) -> (x !cc y)
	  SDValue LHS, RHS, CC;
	  if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) {
	    bool isInt = LHS.getValueType().isInteger();
	    ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
	                                               isInt);

	    if (!LegalOperations ||
	        TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
	      switch (N0.getOpcode()) {
	      default:
	        llvm_unreachable("Unhandled SetCC Equivalent!");
	      case ISD::SETCC:
	        return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
	      case ISD::SELECT_CC:
	        return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
	                               N0.getOperand(3), NotCC);
	      }
	    }
	  }

	  // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
	  if (isOneConstant(N1) && N0.getOpcode() == ISD::ZERO_EXTEND &&
	      N0.getNode()->hasOneUse() &&
	      isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
	    SDValue V = N0.getOperand(0);
	    SDLoc DL(N0);
	    V = DAG.getNode(ISD::XOR, DL, V.getValueType(), V,
	                    DAG.getConstant(1, DL, V.getValueType()));
	    AddToWorklist(V.getNode());
	    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V);
	  }

	  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
	  // (and likewise (not (and x, y)) -> (or (not x), (not y))).
	  if (isOneConstant(N1) && VT == MVT::i1 &&
	      (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
	    SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
	    if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) {
	      unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
	      LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
	      RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
	      AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
	      return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
	    }
	  }
	  // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
	  // (and likewise (not (and x, y)) -> (or (not x), (not y))).
	  if (isAllOnesConstant(N1) &&
	      (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
	    SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
	    if (isa<ConstantSDNode>(RHS) || isa<ConstantSDNode>(LHS)) {
	      unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
	      LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
	      RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
	      AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
	      return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
	    }
	  }
	  // fold (xor (and x, y), y) -> (and (not x), y)
	  if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
	      N0->getOperand(1) == N1) {
	    SDValue X = N0->getOperand(0);
	    SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
	    AddToWorklist(NotX.getNode());
	    return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1);
	  }

	  // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
	  unsigned OpSizeInBits = VT.getScalarSizeInBits();
	  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
	      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0) &&
	      TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
	    if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
	      if (C->getAPIntValue() == (OpSizeInBits - 1))
	        return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0.getOperand(0));
	  }

	  // fold (xor x, x) -> 0
	  if (N0 == N1)
	    return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);

	  // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
	  // Here is a concrete example of this equivalence:
	  // i16   x ==  14
	  // i16 shl ==   1 << 14  == 16384 == 0b0100000000000000
	  // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111
	  //
	  // =>
	  //
	  // i16     ~1      == 0b1111111111111110
	  // i16 rol(~1, 14) == 0b1011111111111111
	  //
	  // Some additional tips to help conceptualize this transform:
	  // - Try to see the operation as placing a single zero in a value of all ones.
	  // - There exists no value for x which would allow the result to contain zero.
	  // - Values of x larger than the bitwidth are undefined and do not require a
	  //   consistent result.
	  // - Pushing the zero left requires shifting one bits in from the right.
	  // A rotate left of ~1 is a nice way of achieving the desired result.
	  if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0.getOpcode() == ISD::SHL
	      && isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
	    SDLoc DL(N);
	    return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT),
	                       N0.getOperand(1));
	  }

	  // Simplify: xor (op x...), (op y...)  -> (op (xor x, y))
	  if (N0.getOpcode() == N1.getOpcode())
	    if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
	      return Tmp;

	  // Simplify the expression using non-local knowledge.
	  if (SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  return SDValue();
	}

	/// Handle transforms common to the three shifts, when the shift amount is a
	/// constant.
	///
	/// Pulls a binop with a constant RHS through the shift, i.e. rewrites
	/// (shift (binop x, c), amt) as (binop (shift x, amt), (shift c, amt)).
	///
	/// \param N   the shift node being combined.
	/// \param Amt the constant shift amount supplied by the caller; the body reads
	///            the amount through N->getOperand(1) instead.
	/// \returns the new binop, or an empty SDValue when the transform is not
	///          applied.
	SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) {
	  SDNode *LHS = N->getOperand(0).getNode();
	  if (!LHS->hasOneUse()) return SDValue();

	  // We want to pull some binops through shifts, so that we have (and (shift))
	  // instead of (shift (and)), likewise for add, or, xor, etc.  This sort of
	  // thing happens with address calculations, so it's important to canonicalize
	  // it.
	  bool HighBitSet = false;  // Can we transform this if the high bit is set?

	  switch (LHS->getOpcode()) {
	  default: return SDValue();
	  case ISD::OR:
	  case ISD::XOR:
	    HighBitSet = false; // We can only transform sra if the high bit is clear.
	    break;
	  case ISD::AND:
	    HighBitSet = true;  // We can only transform sra if the high bit is set.
	    break;
	  case ISD::ADD:
	    if (N->getOpcode() != ISD::SHL)
	      return SDValue(); // only shl(add) not sr[al](add).
	    HighBitSet = false; // We can only transform sra if the high bit is clear.
	    break;
	  }

	  // We require the RHS of the binop to be a constant and not opaque as well.
	  ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS->getOperand(1));
	  if (!BinOpCst) return SDValue();

	  // FIXME: disable this unless the input to the binop is a shift by a constant
	  // or is copy/select. Enable this in other cases once we figure out exactly
	  // when it is profitable.
	  SDNode *BinOpLHSVal = LHS->getOperand(0).getNode();
	  bool isShift = BinOpLHSVal->getOpcode() == ISD::SHL ||
	                 BinOpLHSVal->getOpcode() == ISD::SRA ||
	                 BinOpLHSVal->getOpcode() == ISD::SRL;
	  bool isCopyOrSelect = BinOpLHSVal->getOpcode() == ISD::CopyFromReg ||
	                        BinOpLHSVal->getOpcode() == ISD::SELECT;

	  if ((!isShift || !isa<ConstantSDNode>(BinOpLHSVal->getOperand(1))) &&
	      !isCopyOrSelect)
	    return SDValue();

	  if (isCopyOrSelect && N->hasOneUse())
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // If this is a signed shift right, and the high bit is modified by the
	  // logical operation, do not perform the transformation. The highBitSet
	  // boolean indicates the value of the high bit of the constant which would
	  // cause it to be modified for this operation.
	  if (N->getOpcode() == ISD::SRA) {
	    bool BinOpRHSSignSet = BinOpCst->getAPIntValue().isNegative();
	    if (BinOpRHSSignSet != HighBitSet)
	      return SDValue();
	  }

	  if (!TLI.isDesirableToCommuteWithShift(LHS))
	    return SDValue();

	  // Fold the constants, shifting the binop RHS by the shift amount.
	  SDValue NewRHS = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(1)),
	                               N->getValueType(0),
	                               LHS->getOperand(1), N->getOperand(1));
	  assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!");

	  // Create the new shift.
	  SDValue NewShift = DAG.getNode(N->getOpcode(),
	                                 SDLoc(LHS->getOperand(0)),
	                                 VT, LHS->getOperand(0), N->getOperand(1));

	  // Create the new binop.
	  return DAG.getNode(LHS->getOpcode(), SDLoc(N), VT, NewShift, NewRHS);
	}

	/// Push a truncate through an AND whose second operand is a constant:
	///   (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC)
	///
	/// \p N must be a TRUNCATE whose operand is an AND. The rewrite is done only
	/// when both the truncate and the AND have a single use and the AND's second
	/// operand is a non-opaque constant or constant vector; otherwise an empty
	/// SDValue is returned.
	SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
	  assert(N->getOpcode() == ISD::TRUNCATE);
	  assert(N->getOperand(0).getOpcode() == ISD::AND);

	  // Bail out unless both the truncate and the AND feeding it are single-use.
	  if (!N->hasOneUse() || !N->getOperand(0).hasOneUse())
	    return SDValue();

	  // The mask operand has to be a (non-opaque) constant.
	  SDValue MaskOp = N->getOperand(0).getOperand(1);
	  if (!isConstantOrConstantVector(MaskOp, /* NoOpaques */ true))
	    return SDValue();

	  SDLoc DL(N);
	  EVT NarrowVT = N->getValueType(0);
	  SDValue ValueOp = N->getOperand(0).getOperand(0);

	  // Truncate both AND operands separately and queue them for combining.
	  SDValue NarrowValue = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, ValueOp);
	  SDValue NarrowMask = DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, MaskOp);
	  AddToWorklist(NarrowValue.getNode());
	  AddToWorklist(NarrowMask.getNode());

	  return DAG.getNode(ISD::AND, DL, NarrowVT, NarrowValue, NarrowMask);
	}

	/// Try to simplify the rotate node \p N (its opcode is reused for every node
	/// created here, so the same code serves ROTL and ROTR).
	///
	/// Returns the replacement value, or an empty SDValue when no fold applies.
	SDValue DAGCombiner::visitRotate(SDNode *N) {
	  SDLoc dl(N);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  unsigned Bitsize = VT.getScalarSizeInBits();

	  // fold (rot x, 0) -> x
	  if (isNullConstantOrNullSplatConstant(N1))
	    return N0;

	  // fold (rot x, c) -> (rot x, c % BitSize)
	  if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) {
	    if (Cst->getAPIntValue().uge(Bitsize)) {
	      uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize);
	      return DAG.getNode(N->getOpcode(), dl, VT, N0,
	                         DAG.getConstant(RotAmt, dl, N1.getValueType()));
	    }
	  }

	  // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
	  if (N1.getOpcode() == ISD::TRUNCATE &&
	      N1.getOperand(0).getOpcode() == ISD::AND) {
	    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
	      return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1);
	  }

	  unsigned NextOp = N0.getOpcode();
	  // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize)
	  if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
	    SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
	    SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
	    if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
	      EVT ShiftVT = C1->getValueType(0);
	      // Rotates in the same direction add their amounts, opposite directions
	      // subtract them.
	      bool SameSide = (N->getOpcode() == NextOp);
	      unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
	      if (SDValue CombinedShift =
	              DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, C1, C2)) {
	        // Reduce the combined amount with a signed remainder by the bit size.
	        SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
	        SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic(
	            ISD::SREM, dl, ShiftVT, CombinedShift.getNode(),
	            BitsizeC.getNode());
	        return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),
	                           CombinedShiftNorm);
	      }
	    }
	  }
	  return SDValue();
	}

	/// Try to simplify the SHL node \p N.
	///
	/// Returns the value that should replace N, SDValue(N, 0) when
	/// SimplifyDemandedBits reports that it changed something, or an empty SDValue
	/// when none of the folds below applies.
	SDValue DAGCombiner::visitSHL(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();
	  unsigned OpSizeInBits = VT.getScalarSizeInBits();

	  // fold vector ops
	  if (VT.isVector()) {
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	    BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
	    // If setcc produces all-one true value then:
	    // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
	    if (N1CV && N1CV->isConstant()) {
	      if (N0.getOpcode() == ISD::AND) {
	        SDValue N00 = N0->getOperand(0);
	        SDValue N01 = N0->getOperand(1);
	        BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);

	        if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
	            TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
	                TargetLowering::ZeroOrNegativeOneBooleanContent) {
	          if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT,
	                                                     N01CV, N1CV))
	            return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
	        }
	      }
	    }
	  }

	  ConstantSDNode *N1C = isConstOrConstSplat(N1);

	  // fold (shl c1, c2) -> c1<<c2
	  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	  if (N0C && N1C && !N1C->isOpaque())
	    return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C);
	  // fold (shl 0, x) -> 0
	  if (isNullConstantOrNullSplatConstant(N0))
	    return N0;
	  // fold (shl x, c >= size(x)) -> undef
	  // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
	  auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
	    return Val->getAPIntValue().uge(OpSizeInBits);
	  };
	  if (matchUnaryPredicate(N1, MatchShiftTooBig))
	    return DAG.getUNDEF(VT);
	  // fold (shl x, 0) -> x
	  if (N1C && N1C->isNullValue())
	    return N0;
	  // fold (shl undef, x) -> 0
	  if (N0.isUndef())
	    return DAG.getConstant(0, SDLoc(N), VT);

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // if (shl x, c) is known to be zero, return 0
	  if (DAG.MaskedValueIsZero(SDValue(N, 0),
	                            APInt::getAllOnesValue(OpSizeInBits)))
	    return DAG.getConstant(0, SDLoc(N), VT);
	  // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
	  if (N1.getOpcode() == ISD::TRUNCATE &&
	      N1.getOperand(0).getOpcode() == ISD::AND) {
	    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
	      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
	  }

	  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
	  if (N0.getOpcode() == ISD::SHL) {
	    // The shift amounts are widened by one bit so that c1 + c2 cannot wrap.
	    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
	                                          ConstantSDNode *RHS) {
	      APInt c1 = LHS->getAPIntValue();
	      APInt c2 = RHS->getAPIntValue();
	      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	      return (c1 + c2).uge(OpSizeInBits);
	    };
	    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
	      return DAG.getConstant(0, SDLoc(N), VT);

	    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
	                                       ConstantSDNode *RHS) {
	      APInt c1 = LHS->getAPIntValue();
	      APInt c2 = RHS->getAPIntValue();
	      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	      return (c1 + c2).ult(OpSizeInBits);
	    };
	    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
	      SDLoc DL(N);
	      EVT ShiftVT = N1.getValueType();
	      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
	      return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
	    }
	  }

	  // fold (shl (ext (shl x, c1)), c2) -> (ext (shl x, (add c1, c2)))
	  // For this to be valid, the second form must not preserve any of the bits
	  // that are shifted out by the inner shift in the first form.  This means
	  // the outer shift size must be >= the number of bits added by the ext.
	  // As a corollary, we don't care what kind of ext it is.
	  if (N1C && (N0.getOpcode() == ISD::ZERO_EXTEND ||
	              N0.getOpcode() == ISD::ANY_EXTEND ||
	              N0.getOpcode() == ISD::SIGN_EXTEND) &&
	      N0.getOperand(0).getOpcode() == ISD::SHL) {
	    SDValue N0Op0 = N0.getOperand(0);
	    if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) {
	      APInt c1 = N0Op0C1->getAPIntValue();
	      APInt c2 = N1C->getAPIntValue();
	      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);

	      EVT InnerShiftVT = N0Op0.getValueType();
	      uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
	      if (c2.uge(OpSizeInBits - InnerShiftSize)) {
	        SDLoc DL(N0);
	        APInt Sum = c1 + c2;
	        if (Sum.uge(OpSizeInBits))
	          return DAG.getConstant(0, DL, VT);

	        return DAG.getNode(
	            ISD::SHL, DL, VT,
	            DAG.getNode(N0.getOpcode(), DL, VT, N0Op0->getOperand(0)),
	            DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType()));
	      }
	    }
	  }

	  // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
	  // Only fold this if the inner zext has no other uses to avoid increasing
	  // the total number of instructions.
	  if (N1C && N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
	      N0.getOperand(0).getOpcode() == ISD::SRL) {
	    SDValue N0Op0 = N0.getOperand(0);
	    if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) {
	      if (N0Op0C1->getAPIntValue().ult(VT.getScalarSizeInBits())) {
	        uint64_t c1 = N0Op0C1->getZExtValue();
	        uint64_t c2 = N1C->getZExtValue();
	        if (c1 == c2) {
	          SDValue NewOp0 = N0.getOperand(0);
	          EVT CountVT = NewOp0.getOperand(1).getValueType();
	          SDLoc DL(N);
	          SDValue NewSHL = DAG.getNode(ISD::SHL, DL, NewOp0.getValueType(),
	                                       NewOp0,
	                                       DAG.getConstant(c2, DL, CountVT));
	          AddToWorklist(NewSHL.getNode());
	          return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
	        }
	      }
	    }
	  }

	  // fold (shl (sr[la] exact X,  C1), C2) -> (shl    X, (C2-C1)) if C1 <= C2
	  // fold (shl (sr[la] exact X,  C1), C2) -> (sr[la] X, (C2-C1)) if C1  > C2
	  if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) &&
	      N0->getFlags().hasExact()) {
	    if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
	      uint64_t C1 = N0C1->getZExtValue();
	      uint64_t C2 = N1C->getZExtValue();
	      SDLoc DL(N);
	      if (C1 <= C2)
	        return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
	                           DAG.getConstant(C2 - C1, DL, N1.getValueType()));
	      return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0),
	                         DAG.getConstant(C1 - C2, DL, N1.getValueType()));
	    }
	  }

	  // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) or
	  //                               (and (srl x, (sub c1, c2)), MASK)
	  // Only fold this if the inner shift has no other uses -- if it does, folding
	  // this will increase the total number of instructions.
	  if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
	    if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
	      uint64_t c1 = N0C1->getZExtValue();
	      if (c1 < OpSizeInBits) {
	        uint64_t c2 = N1C->getZExtValue();
	        // Start from the high (OpSizeInBits - c1) bits and move the mask by
	        // the same net amount as the value.
	        APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1);
	        SDValue Shift;
	        if (c2 > c1) {
	          Mask <<= c2 - c1;
	          SDLoc DL(N);
	          Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
	                              DAG.getConstant(c2 - c1, DL, N1.getValueType()));
	        } else {
	          Mask.lshrInPlace(c1 - c2);
	          SDLoc DL(N);
	          Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
	                              DAG.getConstant(c1 - c2, DL, N1.getValueType()));
	        }
	        SDLoc DL(N0);
	        return DAG.getNode(ISD::AND, DL, VT, Shift,
	                           DAG.getConstant(Mask, DL, VT));
	      }
	    }
	  }

	  // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
	  if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
	      isConstantOrConstantVector(N1, /* No Opaques */ true)) {
	    SDLoc DL(N);
	    SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
	    SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
	    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
	  }

	  // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
	  // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
	  // Variant of version done on multiply, except mul by a power of 2 is turned
	  // into a shift.
	  if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
	      N0.getNode()->hasOneUse() &&
	      isConstantOrConstantVector(N1, /* No Opaques */ true) &&
	      isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
	    SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
	    SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
	    AddToWorklist(Shl0.getNode());
	    AddToWorklist(Shl1.getNode());
	    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
	  }

	  // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
	  if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() &&
	      isConstantOrConstantVector(N1, /* No Opaques */ true) &&
	      isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
	    SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
	    if (isConstantOrConstantVector(Shl))
	      return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
	  }

	  // Finally try the transforms shared by all shifts by a constant amount.
	  if (N1C && !N1C->isOpaque())
	    if (SDValue NewSHL = visitShiftByConstant(N, N1C))
	      return NewSHL;

	  return SDValue();
	}

	/// Combine an ISD::SRA (arithmetic shift right) node.
	///
	/// Operand 0 of \p N is the value being shifted and operand 1 is the shift
	/// amount. The folds below are tried in order and the first match wins.
	///
	/// \returns the replacement value, SDValue(N, 0) when SimplifyDemandedBits
	/// reported a change, or an empty SDValue when no fold applied.
	SDValue DAGCombiner::visitSRA(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N0.getValueType();
	unsigned OpSizeInBits = VT.getScalarSizeInBits();

	// Arithmetic shifting an all-sign-bit value is a no-op.
	// fold (sra 0, x) -> 0
	// fold (sra -1, x) -> -1
	if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
	return N0;

	// fold vector ops
	if (VT.isVector())
	if (SDValue FoldedVOp = SimplifyVBinOp(N))
	return FoldedVOp;

	// Non-null only when the shift amount is a constant or a constant splat.
	ConstantSDNode *N1C = isConstOrConstSplat(N1);

	// fold (sra c1, c2) -> c1 >>s c2
	ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	if (N0C && N1C && !N1C->isOpaque())
	return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
	// fold (sra x, c >= size(x)) -> undef
	// NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
	auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
	return Val->getAPIntValue().uge(OpSizeInBits);
	};
	if (matchUnaryPredicate(N1, MatchShiftTooBig))
	return DAG.getUNDEF(VT);
	// fold (sra x, 0) -> x
	if (N1C && N1C->isNullValue())
	return N0;

	if (SDValue NewSel = foldBinOpIntoSelect(N))
	return NewSel;

	// fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
	// sext_inreg.
	if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
	// The low (size - c1) bits of x survive the shl/sra pair; they form the
	// type that is sign-extended in-register.
	unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
	EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
	if (VT.isVector())
	ExtVT = EVT::getVectorVT(*DAG.getContext(),
	ExtVT, VT.getVectorNumElements());
	if ((!LegalOperations \|\|
	TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT)))
	return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
	N0.getOperand(0), DAG.getValueType(ExtVT));
	}

	// fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
	if (N0.getOpcode() == ISD::SRA) {
	SDLoc DL(N);
	EVT ShiftVT = N1.getValueType();

	// True when c1 + c2 reaches or exceeds the element width. The constants are
	// widened by one extra bit first so the addition itself cannot wrap.
	auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
	ConstantSDNode *RHS) {
	APInt c1 = LHS->getAPIntValue();
	APInt c2 = RHS->getAPIntValue();
	zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	return (c1 + c2).uge(OpSizeInBits);
	};
	// Out-of-range combined amount: clamp to a shift by (size - 1).
	if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
	return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0),
	DAG.getConstant(OpSizeInBits - 1, DL, ShiftVT));

	// True when c1 + c2 is still a valid shift amount for the element width.
	auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
	ConstantSDNode *RHS) {
	APInt c1 = LHS->getAPIntValue();
	APInt c2 = RHS->getAPIntValue();
	zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	return (c1 + c2).ult(OpSizeInBits);
	};
	if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
	SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
	return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), Sum);
	}
	}

	// fold (sra (shl X, m), (sub result_size, n))
	// -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
	// result_size - n != m.
	// If truncate is free for the target sext(shl) is likely to result in better
	// code.
	// Note: the node actually built below is an SRL of X by
	// (outer sra amount - inner shl amount), which is then truncated and
	// sign-extended.
	if (N0.getOpcode() == ISD::SHL && N1C) {
	// Get the two constants of the shifts: N01C is the inner shl amount (m),
	// N1C is the outer sra amount (result_size - n).
	const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1));
	if (N01C) {
	LLVMContext &Ctx = *DAG.getContext();
	// Determine what the truncate's result bitsize and type would be.
	EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue());

	if (VT.isVector())
	TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());

	// Determine the residual right-shift amount.
	int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();

	// If the shift is not a no-op (in which case this should be just a sign
	// extend already), the truncated to type is legal, sign_extend is legal
	// on that type, and the truncate to that type is both legal and free,
	// perform the transform.
	if ((ShiftAmt > 0) &&
	TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
	TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
	TLI.isTruncateFree(VT, TruncVT)) {
	SDLoc DL(N);
	SDValue Amt = DAG.getConstant(ShiftAmt, DL,
	getShiftAmountTy(N0.getOperand(0).getValueType()));
	SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
	N0.getOperand(0), Amt);
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
	Shift);
	return DAG.getNode(ISD::SIGN_EXTEND, DL,
	N->getValueType(0), Trunc);
	}
	}
	}

	// fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
	if (N1.getOpcode() == ISD::TRUNCATE &&
	N1.getOperand(0).getOpcode() == ISD::AND) {
	if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
	return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
	}

	// fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
	// if c1 is equal to the number of bits the trunc removes
	// The inner shift may be either SRL or SRA; both the inner shift and its
	// shift-amount operand must have a single use.
	if (N0.getOpcode() == ISD::TRUNCATE &&
	(N0.getOperand(0).getOpcode() == ISD::SRL \|\|
	N0.getOperand(0).getOpcode() == ISD::SRA) &&
	N0.getOperand(0).hasOneUse() &&
	N0.getOperand(0).getOperand(1).hasOneUse() &&
	N1C) {
	SDValue N0Op0 = N0.getOperand(0);
	if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) {
	unsigned LargeShiftVal = LargeShift->getZExtValue();
	EVT LargeVT = N0Op0.getValueType();

	// c1 must equal the width difference between the wide and narrow types.
	if (LargeVT.getScalarSizeInBits() - OpSizeInBits == LargeShiftVal) {
	SDLoc DL(N);
	SDValue Amt =
	DAG.getConstant(LargeShiftVal + N1C->getZExtValue(), DL,
	getShiftAmountTy(N0Op0.getOperand(0).getValueType()));
	SDValue SRA = DAG.getNode(ISD::SRA, DL, LargeVT,
	N0Op0.getOperand(0), Amt);
	return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA);
	}
	}
	}

	// Simplify, based on bits shifted out of the LHS.
	if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
	return SDValue(N, 0);

	// If the sign bit is known to be zero, switch this to a SRL.
	if (DAG.SignBitIsZero(N0))
	return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);

	// Last resort for a non-opaque constant amount: the generic
	// shift-by-constant combine.
	if (N1C && !N1C->isOpaque())
	if (SDValue NewSRA = visitShiftByConstant(N, N1C))
	return NewSRA;

	return SDValue();
	}

	/// Combine an ISD::SRL (logical shift right) node.
	///
	/// Operand 0 of \p N is the value being shifted and operand 1 is the shift
	/// amount. The folds below are tried in order and the first match wins.
	///
	/// \returns the replacement value, SDValue(N, 0) when SimplifyDemandedBits
	/// reported a change, or an empty SDValue when no fold applied.
	SDValue DAGCombiner::visitSRL(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N0.getValueType();
	  unsigned OpSizeInBits = VT.getScalarSizeInBits();

	  // fold vector ops
	  if (VT.isVector())
	    if (SDValue FoldedVOp = SimplifyVBinOp(N))
	      return FoldedVOp;

	  // Non-null only when the shift amount is a constant or a constant splat.
	  ConstantSDNode *N1C = isConstOrConstSplat(N1);

	  // fold (srl c1, c2) -> c1 >>u c2
	  ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
	  if (N0C && N1C && !N1C->isOpaque())
	    return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C);
	  // fold (srl 0, x) -> 0
	  if (isNullConstantOrNullSplatConstant(N0))
	    return N0;
	  // fold (srl x, c >= size(x)) -> undef
	  // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
	  auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
	    return Val->getAPIntValue().uge(OpSizeInBits);
	  };
	  if (matchUnaryPredicate(N1, MatchShiftTooBig))
	    return DAG.getUNDEF(VT);
	  // fold (srl x, 0) -> x
	  if (N1C && N1C->isNullValue())
	    return N0;

	  if (SDValue NewSel = foldBinOpIntoSelect(N))
	    return NewSel;

	  // if (srl x, c) is known to be zero, return 0
	  if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
	                                   APInt::getAllOnesValue(OpSizeInBits)))
	    return DAG.getConstant(0, SDLoc(N), VT);

	  // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
	  if (N0.getOpcode() == ISD::SRL) {
	    // True when c1 + c2 reaches or exceeds the element width. The constants
	    // are widened by one extra bit first so the addition cannot wrap.
	    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
	                                          ConstantSDNode *RHS) {
	      APInt c1 = LHS->getAPIntValue();
	      APInt c2 = RHS->getAPIntValue();
	      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	      return (c1 + c2).uge(OpSizeInBits);
	    };
	    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
	      return DAG.getConstant(0, SDLoc(N), VT);

	    // True when c1 + c2 is still a valid shift amount for the element width.
	    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
	                                       ConstantSDNode *RHS) {
	      APInt c1 = LHS->getAPIntValue();
	      APInt c2 = RHS->getAPIntValue();
	      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
	      return (c1 + c2).ult(OpSizeInBits);
	    };
	    if (matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
	      SDLoc DL(N);
	      EVT ShiftVT = N1.getValueType();
	      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
	      return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
	    }
	  }

	  // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2)))
	  if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
	      N0.getOperand(0).getOpcode() == ISD::SRL) {
	    if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) {
	      uint64_t c1 = N001C->getZExtValue();
	      uint64_t c2 = N1C->getZExtValue();
	      EVT InnerShiftVT = N0.getOperand(0).getValueType();
	      EVT ShiftCountVT = N0.getOperand(0).getOperand(1).getValueType();
	      uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
	      // This is only valid if the OpSizeInBits + c1 = size of inner shift.
	      if (c1 + OpSizeInBits == InnerShiftSize) {
	        SDLoc DL(N0);
	        if (c1 + c2 >= InnerShiftSize)
	          return DAG.getConstant(0, DL, VT);
	        return DAG.getNode(ISD::TRUNCATE, DL, VT,
	                           DAG.getNode(ISD::SRL, DL, InnerShiftVT,
	                                       N0.getOperand(0).getOperand(0),
	                                       DAG.getConstant(c1 + c2, DL,
	                                                       ShiftCountVT)));
	      }
	    }
	  }

	  // fold (srl (shl x, c), c) -> (and x, cst2)
	  // cst2 is built as (srl all-ones, c) and queued so it can be folded.
	  if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
	      isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
	    SDLoc DL(N);
	    SDValue Mask =
	        DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
	    AddToWorklist(Mask.getNode());
	    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
	  }

	  // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
	  if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
	    // Shifting in all undef bits?
	    EVT SmallVT = N0.getOperand(0).getValueType();
	    unsigned BitSize = SmallVT.getScalarSizeInBits();
	    if (N1C->getZExtValue() >= BitSize)
	      return DAG.getUNDEF(VT);

	    if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
	      uint64_t ShiftAmt = N1C->getZExtValue();
	      SDLoc DL0(N0);
	      SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
	                                       N0.getOperand(0),
	                                       DAG.getConstant(ShiftAmt, DL0,
	                                                       getShiftAmountTy(SmallVT)));
	      AddToWorklist(SmallShift.getNode());
	      // Keep only the low (size - c) bits of the re-extended value.
	      APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
	      SDLoc DL(N);
	      return DAG.getNode(ISD::AND, DL, VT,
	                         DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
	                         DAG.getConstant(Mask, DL, VT));
	    }
	  }

	  // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign
	  // bit, which is unmodified by sra.
	  if (N1C && N1C->getZExtValue() + 1 == OpSizeInBits) {
	    if (N0.getOpcode() == ISD::SRA)
	      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
	  }

	  // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
	  if (N1C && N0.getOpcode() == ISD::CTLZ &&
	      N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
	    KnownBits Known;
	    DAG.computeKnownBits(N0.getOperand(0), Known);

	    // If any of the input bits are KnownOne, then the input couldn't be all
	    // zeros, thus the result of the srl will always be zero.
	    if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);

	    // If all of the bits input to the ctlz node are known to be zero, then
	    // the result of the ctlz is "32" and the result of the shift is one.
	    APInt UnknownBits = ~Known.Zero;
	    if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);

	    // Otherwise, check to see if there is exactly one bit input to the ctlz.
	    if (UnknownBits.isPowerOf2()) {
	      // Okay, we know that only the single bit specified by UnknownBits
	      // could be set on input to the CTLZ node. If this bit is set, the SRL
	      // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
	      // to an SRL/XOR pair, which is likely to simplify more.
	      unsigned ShAmt = UnknownBits.countTrailingZeros();
	      SDValue Op = N0.getOperand(0);

	      if (ShAmt) {
	        SDLoc DL(N0);
	        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
	                         DAG.getConstant(ShAmt, DL,
	                                         getShiftAmountTy(Op.getValueType())));
	        AddToWorklist(Op.getNode());
	      }

	      SDLoc DL(N);
	      return DAG.getNode(ISD::XOR, DL, VT,
	                         Op, DAG.getConstant(1, DL, VT));
	    }
	  }

	  // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
	  if (N1.getOpcode() == ISD::TRUNCATE &&
	      N1.getOperand(0).getOpcode() == ISD::AND) {
	    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
	      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
	  }

	  // fold operands of srl based on knowledge that the low bits are not
	  // demanded.
	  if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  if (N1C && !N1C->isOpaque())
	    if (SDValue NewSRL = visitShiftByConstant(N, N1C))
	      return NewSRL;

	  // Attempt to convert a srl of a load into a narrower zero-extending load.
	  if (SDValue NarrowLoad = ReduceLoadWidth(N))
	    return NarrowLoad;

	  // Here is a common situation. We want to optimize:
	  //
	  //   %a = ...
	  //   %b = and i32 %a, 2
	  //   %c = srl i32 %b, 1
	  //   brcond i32 %c ...
	  //
	  // into
	  //
	  //   %a = ...
	  //   %b = and %a, 2
	  //   %c = setcc eq %b, 0
	  //   brcond %c ...
	  //
	  // However, after the source operand of SRL is optimized into AND, the SRL
	  // itself may not be optimized further. Look for it and add the BRCOND into
	  // the worklist.
	  if (N->hasOneUse()) {
	    // use_begin() yields an iterator; dereferencing it gives the user node.
	    SDNode *Use = *N->use_begin();
	    if (Use->getOpcode() == ISD::BRCOND)
	      AddToWorklist(Use);
	    else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
	      // Also look past the truncate.
	      Use = *Use->use_begin();
	      if (Use->getOpcode() == ISD::BRCOND)
	        AddToWorklist(Use);
	    }
	  }

	  return SDValue();
	}

	/// Combine an ISD::ABS node.
	///
	/// \returns the replacement value, or an empty SDValue when nothing applies.
	SDValue DAGCombiner::visitABS(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // fold (abs c1) -> c2
	  if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return DAG.getNode(ISD::ABS, SDLoc(N), N->getValueType(0), Src);

	  // fold (abs (abs x)) -> (abs x)
	  // fold (abs x) -> x iff not-negative
	  // In both cases the operand itself is already the answer.
	  if (Src.getOpcode() == ISD::ABS || DAG.SignBitIsZero(Src))
	    return Src;

	  return SDValue();
	}

	/// Combine an ISD::BSWAP node.
	///
	/// \returns the replacement value, or an empty SDValue when nothing applies.
	SDValue DAGCombiner::visitBSWAP(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // fold (bswap c1) -> c2
	  if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return DAG.getNode(ISD::BSWAP, SDLoc(N), N->getValueType(0), Src);

	  // fold (bswap (bswap x)) -> x
	  if (Src.getOpcode() != ISD::BSWAP)
	    return SDValue();
	  return Src->getOperand(0);
	}

	/// Combine an ISD::BITREVERSE node.
	///
	/// \returns the replacement value, or an empty SDValue when nothing applies.
	SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // fold (bitreverse c1) -> c2
	  if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return DAG.getNode(ISD::BITREVERSE, SDLoc(N), N->getValueType(0), Src);

	  // fold (bitreverse (bitreverse x)) -> x
	  if (Src.getOpcode() != ISD::BITREVERSE)
	    return SDValue();
	  return Src.getOperand(0);
	}

	/// Combine an ISD::CTLZ node; only a constant operand is handled.
	///
	/// \returns the replacement value, or an empty SDValue when the operand is
	/// not a constant integer or constant-integer build vector.
	SDValue DAGCombiner::visitCTLZ(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return SDValue();

	  // fold (ctlz c1) -> c2
	  return DAG.getNode(ISD::CTLZ, SDLoc(N), N->getValueType(0), Src);
	}

	/// Combine an ISD::CTLZ_ZERO_UNDEF node; only a constant operand is handled.
	///
	/// \returns the replacement value, or an empty SDValue when the operand is
	/// not a constant integer or constant-integer build vector.
	SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return SDValue();

	  // fold (ctlz_zero_undef c1) -> c2
	  return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), N->getValueType(0), Src);
	}

	/// Combine an ISD::CTTZ node; only a constant operand is handled.
	///
	/// \returns the replacement value, or an empty SDValue when the operand is
	/// not a constant integer or constant-integer build vector.
	SDValue DAGCombiner::visitCTTZ(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return SDValue();

	  // fold (cttz c1) -> c2
	  return DAG.getNode(ISD::CTTZ, SDLoc(N), N->getValueType(0), Src);
	}

	/// Combine an ISD::CTTZ_ZERO_UNDEF node; only a constant operand is handled.
	///
	/// \returns the replacement value, or an empty SDValue when the operand is
	/// not a constant integer or constant-integer build vector.
	SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return SDValue();

	  // fold (cttz_zero_undef c1) -> c2
	  return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), N->getValueType(0), Src);
	}

	/// Combine an ISD::CTPOP node; only a constant operand is handled.
	///
	/// \returns the replacement value, or an empty SDValue when the operand is
	/// not a constant integer or constant-integer build vector.
	SDValue DAGCombiner::visitCTPOP(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
	    return SDValue();

	  // fold (ctpop c1) -> c2
	  return DAG.getNode(ISD::CTPOP, SDLoc(N), N->getValueType(0), Src);
	}

	/// \brief Generate Min/Max node
	///
	/// Given a comparison (LHS CC RHS) and the two values selected on its result,
	/// build an FMINNUM or FMAXNUM of LHS and RHS when the selected values are
	/// exactly the compared values (in either order) and the chosen opcode is
	/// legal for \p VT. Returns an empty SDValue otherwise.
	static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
	                                   SDValue RHS, SDValue True, SDValue False,
	                                   ISD::CondCode CC, const TargetLowering &TLI,
	                                   SelectionDAG &DAG) {
	  // The select arms must be the compare operands, straight or swapped.
	  const bool SameOrder = LHS == True && RHS == False;
	  const bool SwappedOrder = LHS == False && RHS == True;
	  if (!SameOrder && !SwappedOrder)
	    return SDValue();

	  // Classify the predicate as a "less" or a "greater" comparison.
	  bool IsLessCompare;
	  switch (CC) {
	  case ISD::SETOLT:
	  case ISD::SETOLE:
	  case ISD::SETLT:
	  case ISD::SETLE:
	  case ISD::SETULT:
	  case ISD::SETULE:
	    IsLessCompare = true;
	    break;
	  case ISD::SETOGT:
	  case ISD::SETOGE:
	  case ISD::SETGT:
	  case ISD::SETGE:
	  case ISD::SETUGT:
	  case ISD::SETUGE:
	    IsLessCompare = false;
	    break;
	  default:
	    return SDValue();
	  }

	  // Picking LHS on "less" (or RHS on "greater") is a minimum; the other two
	  // combinations are a maximum.
	  const bool PicksLHS = LHS == True;
	  const unsigned Opcode =
	      (PicksLHS == IsLessCompare) ? ISD::FMINNUM : ISD::FMAXNUM;

	  if (!TLI.isOperationLegal(Opcode, VT))
	    return SDValue();
	  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
	}

	/// Try to replace a select whose value operands are both scalar integer
	/// constants with extend / not / add / xor arithmetic on the condition.
	///
	/// \returns the replacement value, or an empty SDValue when no pattern
	/// applies.
	SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
	  SDValue Cond = N->getOperand(0);
	  SDValue TrueOp = N->getOperand(1);
	  SDValue FalseOp = N->getOperand(2);
	  EVT VT = N->getValueType(0);
	  EVT CondVT = Cond.getValueType();
	  SDLoc DL(N);

	  if (!VT.isInteger())
	    return SDValue();

	  auto *TrueC = dyn_cast<ConstantSDNode>(TrueOp);
	  auto *FalseC = dyn_cast<ConstantSDNode>(FalseOp);
	  if (!(TrueC && FalseC))
	    return SDValue();

	  // Only do this before legalization to avoid conflicting with target-specific
	  // transforms in the other direction (create a select from a zext/sext). There
	  // is also a target-independent combine here in DAGCombiner in the other
	  // direction for (select Cond, -1, 0) when the condition is not i1.
	  if (CondVT == MVT::i1 && !LegalOperations) {
	    // Extend an i1 value to VT with the given extension opcode; a no-op when
	    // the result type is itself i1.
	    auto Widen = [&](unsigned ExtOpc, SDValue Bool) {
	      return VT == MVT::i1 ? Bool : DAG.getNode(ExtOpc, DL, VT, Bool);
	    };

	    // select Cond, 0, 1 --> zext (!Cond)
	    if (TrueC->isNullValue() && FalseC->isOne())
	      return Widen(ISD::ZERO_EXTEND, DAG.getNOT(DL, Cond, MVT::i1));

	    // select Cond, 0, -1 --> sext (!Cond)
	    if (TrueC->isNullValue() && FalseC->isAllOnesValue())
	      return Widen(ISD::SIGN_EXTEND, DAG.getNOT(DL, Cond, MVT::i1));

	    // select Cond, 1, 0 --> zext (Cond)
	    if (TrueC->isOne() && FalseC->isNullValue())
	      return Widen(ISD::ZERO_EXTEND, Cond);

	    // select Cond, -1, 0 --> sext (Cond)
	    if (TrueC->isAllOnesValue() && FalseC->isNullValue())
	      return Widen(ISD::SIGN_EXTEND, Cond);

	    // For any constants that differ by 1, we can transform the select into an
	    // extend and add. Use a target hook because some targets may prefer to
	    // transform in the other direction.
	    if (TLI.convertSelectOfConstantsToMath(VT)) {
	      // select Cond, C1, C1-1 --> add (zext Cond), C1-1
	      if (TrueC->getAPIntValue() - 1 == FalseC->getAPIntValue())
	        return DAG.getNode(ISD::ADD, DL, VT, Widen(ISD::ZERO_EXTEND, Cond),
	                           FalseOp);

	      // select Cond, C1, C1+1 --> add (sext Cond), C1+1
	      if (TrueC->getAPIntValue() + 1 == FalseC->getAPIntValue())
	        return DAG.getNode(ISD::ADD, DL, VT, Widen(ISD::SIGN_EXTEND, Cond),
	                           FalseOp);
	    }

	    // Nothing else is attempted for an i1 condition before legalization.
	    return SDValue();
	  }

	  // fold (select Cond, 0, 1) -> (xor Cond, 1)
	  // We can't do this reliably if integer based booleans have different contents
	  // to floating point based booleans. This is because we can't tell whether we
	  // have an integer-based boolean or a floating-point-based boolean unless we
	  // can find the SETCC that produced it and inspect its operands. This is
	  // fairly easy if C is the SETCC node, but it can potentially be
	  // undiscoverable (or not reasonably discoverable). For example, it could be
	  // in another basic block or it could require searching a complicated
	  // expression.
	  const bool CanUseXor =
	      CondVT.isInteger() &&
	      TLI.getBooleanContents(false, true) ==
	          TargetLowering::ZeroOrOneBooleanContent &&
	      TLI.getBooleanContents(false, false) ==
	          TargetLowering::ZeroOrOneBooleanContent &&
	      TrueC->isNullValue() && FalseC->isOne();
	  if (!CanUseXor)
	    return SDValue();

	  SDValue Inverted =
	      DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
	  // Resize only when the condition and result widths differ.
	  if (VT.bitsEq(CondVT))
	    return Inverted;
	  return DAG.getZExtOrTrunc(Inverted, DL, VT);
	}

	/// Combine an ISD::SELECT node.
	///
	/// Operand 0 of \p N is the condition, operand 1 the value chosen when it is
	/// true and operand 2 the value chosen when it is false. The folds below are
	/// tried in order and the first match wins.
	///
	/// \returns the replacement value, SDValue(N, 0) when SimplifySelectOps
	/// reported a change, or an empty SDValue when no fold applied.
	SDValue DAGCombiner::visitSELECT(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	SDValue N2 = N->getOperand(2);
	EVT VT = N->getValueType(0);
	EVT VT0 = N0.getValueType();
	SDLoc DL(N);

	// fold (select C, X, X) -> X
	if (N1 == N2)
	return N1;

	if (const ConstantSDNode *N0C = dyn_cast<const ConstantSDNode>(N0)) {
	// fold (select true, X, Y) -> X
	// fold (select false, X, Y) -> Y
	return !N0C->isNullValue() ? N1 : N2;
	}

	// fold (select X, X, Y) -> (or X, Y)
	// fold (select X, 1, Y) -> (or X, Y)
	if (VT == VT0 && VT == MVT::i1 && (N0 == N1 \|\| isOneConstant(N1)))
	return DAG.getNode(ISD::OR, DL, VT, N0, N2);

	if (SDValue V = foldSelectOfConstants(N))
	return V;

	// fold (select C, 0, X) -> (and (not C), X)
	if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) {
	SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
	AddToWorklist(NOTNode.getNode());
	return DAG.getNode(ISD::AND, DL, VT, NOTNode, N2);
	}
	// fold (select C, X, 1) -> (or (not C), X)
	if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) {
	SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
	AddToWorklist(NOTNode.getNode());
	return DAG.getNode(ISD::OR, DL, VT, NOTNode, N1);
	}
	// fold (select X, Y, X) -> (and X, Y)
	// fold (select X, Y, 0) -> (and X, Y)
	if (VT == VT0 && VT == MVT::i1 && (N0 == N2 \|\| isNullConstant(N2)))
	return DAG.getNode(ISD::AND, DL, VT, N0, N1);

	// If we can fold this based on the true/false value, do so.
	if (SimplifySelectOps(N, N1, N2))
	return SDValue(N, 0); // Don't revisit N.

	if (VT0 == MVT::i1) {
	// The code in this block deals with the following 2 equivalences:
	// select(C0\|C1, x, y) <=> select(C0, x, select(C1, x, y))
	// select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
	// The target can specify its preferred form with the
	// shouldNormalizeToSelectSequence() callback. However, we always transform
	// to the right-hand form if the inner select already exists in the DAG,
	// and we always transform to the left-hand form if we know that we can
	// further optimize the combination of the conditions.
	bool normalizeToSequence =
	TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
	// select (and Cond0, Cond1), X, Y
	// -> select Cond0, (select Cond1, X, Y), Y
	if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
	SDValue Cond0 = N0->getOperand(0);
	SDValue Cond1 = N0->getOperand(1);
	SDValue InnerSelect =
	DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2);
	// A non-empty use list means the inner select node already had users.
	if (normalizeToSequence \|\| !InnerSelect.use_empty())
	return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0,
	InnerSelect, N2);
	}
	// select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
	if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
	SDValue Cond0 = N0->getOperand(0);
	SDValue Cond1 = N0->getOperand(1);
	SDValue InnerSelect =
	DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2);
	// A non-empty use list means the inner select node already had users.
	if (normalizeToSequence \|\| !InnerSelect.use_empty())
	return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1,
	InnerSelect);
	}

	// select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
	if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
	SDValue N1_0 = N1->getOperand(0);
	SDValue N1_1 = N1->getOperand(1);
	SDValue N1_2 = N1->getOperand(2);
	// The inner false value must be this select's false value, and both
	// conditions must have the same type so they can be ANDed.
	if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
	// Create the actual and node if we can generate good code for it.
	if (!normalizeToSequence) {
	SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
	return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1, N2);
	}
	// Otherwise see if we can optimize the "and" to a better pattern.
	if (SDValue Combined = visitANDLike(N0, N1_0, N))
	return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1,
	N2);
	}
	}
	// select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
	if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
	SDValue N2_0 = N2->getOperand(0);
	SDValue N2_1 = N2->getOperand(1);
	SDValue N2_2 = N2->getOperand(2);
	// The inner true value must be this select's true value, and both
	// conditions must have the same type so they can be ORed.
	if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
	// Create the actual or node if we can generate good code for it.
	if (!normalizeToSequence) {
	SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
	return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1, N2_2);
	}
	// Otherwise see if we can optimize to a better pattern.
	if (SDValue Combined = visitORLike(N0, N2_0, N))
	return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1,
	N2_2);
	}
	}
	}

	// select (xor Cond, 1), X, Y -> select Cond, Y, X
	if (VT0 == MVT::i1) {
	if (N0->getOpcode() == ISD::XOR) {
	if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1))) {
	SDValue Cond0 = N0->getOperand(0);
	if (C->isOne())
	return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N2, N1);
	}
	}
	}

	// fold selects based on a setcc into other things, such as min/max/abs
	if (N0.getOpcode() == ISD::SETCC) {
	// select (fcmp lt x, y), x, y -> fminnum x, y
	// select (fcmp gt x, y), x, y -> fmaxnum x, y
	//
	// This is OK if we don't care about what happens if either operand is a
	// NaN.
	//

	// FIXME: Instead of testing for UnsafeFPMath, this should be checking for
	// no signed zeros as well as no nans.
	const TargetOptions &Options = DAG.getTarget().Options;
	if (Options.UnsafeFPMath && VT.isFloatingPoint() && N0.hasOneUse() &&
	DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) {
	ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();

	if (SDValue FMinMax = combineMinNumMaxNum(
	DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG))
	return FMinMax;
	}

	// Merge the setcc into a SELECT_CC when the target accepts it: legal or
	// custom before operation legalization, strictly legal afterwards.
	if ((!LegalOperations &&
	TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) \|\|
	TLI.isOperationLegal(ISD::SELECT_CC, VT))
	return DAG.getNode(ISD::SELECT_CC, DL, VT, N0.getOperand(0),
	N0.getOperand(1), N1, N2, N0.getOperand(2));
	return SimplifySelect(DL, N0, N1, N2);
	}

	return SDValue();
	}

	static
	std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, SelectionDAG &DAG) {
	SDLoc DL(N);
	EVT LoVT, HiVT;
	std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));

	// Split the inputs.
	SDValue Lo, Hi, LL, LH, RL, RH;
	std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
	std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);

	Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
	Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));

	return std::make_pair(Lo, Hi);
	}

	// This function assumes all the vselect's arguments are CONCAT_VECTOR
	// nodes and that the condition is a BV of ConstantSDNodes (or undefs).
	// When each half of the condition is uniform (ignoring undefs), the vselect
	// is rewritten as a CONCAT_VECTORS that takes each half from whichever
	// operand that half's condition selects.
	static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
	  SDLoc DL(N);
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  EVT VT = N->getValueType(0);
	  int NumElems = VT.getVectorNumElements();
	  assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
	         RHS.getOpcode() == ISD::CONCAT_VECTORS &&
	         Cond.getOpcode() == ISD::BUILD_VECTOR);

	  // CONCAT_VECTOR can take an arbitrary number of arguments. We only care about
	  // binary ones here.
	  if (!(LHS->getNumOperands() == 2 && RHS->getNumOperands() == 2))
	    return SDValue();

	  // Scan condition elements [Begin, End), skipping UNDEFs. The first
	  // non-UNDEF element is stored in Rep; the scan fails as soon as a later
	  // non-UNDEF element is a different node. Rep stays null if every element
	  // in the range is UNDEF.
	  auto ScanHalf = [&Cond](int Begin, int End, ConstantSDNode *&Rep) {
	    Rep = nullptr;
	    for (int Idx = Begin; Idx < End; ++Idx) {
	      SDValue Elt = Cond->getOperand(Idx);
	      if (Elt->isUndef())
	        continue;

	      if (!Rep)
	        Rep = cast<ConstantSDNode>(Elt);
	      else if (Elt.getNode() != Rep)
	        return false;
	    }
	    return true;
	  };

	  // We're sure we have an even number of elements due to the
	  // concat_vectors we have as arguments to vselect.
	  const int Half = NumElems / 2;

	  ConstantSDNode *BottomHalf = nullptr;
	  if (!ScanHalf(0, Half, BottomHalf))
	    return SDValue();

	  // Do the same for the second half of the BuildVector
	  ConstantSDNode *TopHalf = nullptr;
	  if (!ScanHalf(Half, NumElems, TopHalf))
	    return SDValue();

	  assert(TopHalf && BottomHalf &&
	         "One half of the selector was all UNDEFs and the other was all the "
	         "same value. This should have been addressed before this function.");

	  // A zero condition picks the corresponding half of RHS, anything else LHS.
	  SDValue LowPart =
	      BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0);
	  SDValue HighPart =
	      TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1);
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LowPart, HighPart);
	}

	/// Combine a masked scatter node.
	///
	/// Before type legalization only: when the mask is a SETCC and the data type
	/// must be split, the mask, data and index are each split in two and two
	/// masked scatters are emitted, joined by a TokenFactor.
	///
	/// \returns the TokenFactor of the two half scatters, or an empty SDValue
	/// when the transform does not apply.
	SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
	  if (Level >= AfterLegalizeTypes)
	    return SDValue();

	  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
	  SDValue Mask = MSC->getMask();
	  SDValue Data = MSC->getValue();
	  SDLoc DL(N);

	  // If the MSCATTER data type requires splitting and the mask is provided by a
	  // SETCC, then split both nodes and its operands before legalization. This
	  // prevents the type legalizer from unrolling SETCC into scalar comparisons
	  // and enables future optimizations (e.g. min/max pattern matching on X86).
	  if (Mask.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // Check if any splitting is required.
	  if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
	      TargetLowering::TypeSplitVector)
	    return SDValue();
	  SDValue MaskLo, MaskHi, Lo, Hi;
	  std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);

	  SDValue Chain = MSC->getChain();

	  EVT MemoryVT = MSC->getMemoryVT();
	  unsigned Alignment = MSC->getOriginalAlignment();

	  // Only the low half's memory type is needed: it sizes the memory operand
	  // below.
	  EVT LoMemVT = DAG.GetSplitDestVTs(MemoryVT).first;

	  SDValue DataLo, DataHi;
	  std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);

	  SDValue BasePtr = MSC->getBasePtr();
	  SDValue IndexLo, IndexHi;
	  std::tie(IndexLo, IndexHi) = DAG.SplitVector(MSC->getIndex(), DL);

	  // NOTE(review): one memory operand, sized for the low half, is shared by
	  // both half scatters -- confirm this is intended.
	  MachineMemOperand *MMO = DAG.getMachineFunction().
	    getMachineMemOperand(MSC->getPointerInfo(),
	                         MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
	                         Alignment, MSC->getAAInfo(), MSC->getRanges());

	  SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo };
	  Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
	                            DL, OpsLo, MMO);

	  SDValue OpsHi[] = {Chain, DataHi, MaskHi, BasePtr, IndexHi};
	  Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
	                            DL, OpsHi, MMO);

	  // Queue both halves so they are visited by the combiner.
	  AddToWorklist(Lo.getNode());
	  AddToWorklist(Hi.getNode());

	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
	}

	/// Combine a masked store node.
	///
	/// Before type legalization only: when the mask is a SETCC and the stored
	/// data type must be split, the mask and data are each split in two and two
	/// masked stores are emitted, joined by a TokenFactor.
	///
	/// \returns the TokenFactor of the two half stores, or an empty SDValue when
	/// the transform does not apply.
	SDValue DAGCombiner::visitMSTORE(SDNode *N) {
	  if (Level >= AfterLegalizeTypes)
	    return SDValue();

	  // N is dereferenced unconditionally below, so use the asserting cast<>
	  // rather than dyn_cast<>, whose null result was never checked.
	  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
	  SDValue Mask = MST->getMask();
	  SDValue Data = MST->getValue();
	  EVT VT = Data.getValueType();
	  SDLoc DL(N);

	  // If the MSTORE data type requires splitting and the mask is provided by a
	  // SETCC, then split both nodes and its operands before legalization. This
	  // prevents the type legalizer from unrolling SETCC into scalar comparisons
	  // and enables future optimizations (e.g. min/max pattern matching on X86).
	  if (Mask.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // Check if any splitting is required.
	  if (TLI.getTypeAction(*DAG.getContext(), VT) !=
	      TargetLowering::TypeSplitVector)
	    return SDValue();

	  SDValue MaskLo, MaskHi, Lo, Hi;
	  std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);

	  SDValue Chain = MST->getChain();
	  SDValue Ptr = MST->getBasePtr();

	  EVT MemoryVT = MST->getMemoryVT();
	  unsigned Alignment = MST->getOriginalAlignment();

	  // if Alignment is equal to the vector size,
	  // take the half of it for the second part
	  unsigned SecondHalfAlignment =
	    (Alignment == VT.getSizeInBits() / 8) ? Alignment / 2 : Alignment;

	  EVT LoMemVT, HiMemVT;
	  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);

	  SDValue DataLo, DataHi;
	  std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);

	  // Low half: stored at the original base pointer.
	  MachineMemOperand *MMO = DAG.getMachineFunction().
	    getMachineMemOperand(MST->getPointerInfo(),
	                         MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
	                         Alignment, MST->getAAInfo(), MST->getRanges());

	  Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
	                          MST->isTruncatingStore(),
	                          MST->isCompressingStore());

	  // High half: stored at the address returned by IncrementMemoryAddress for
	  // the low half.
	  Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
	                                   MST->isCompressingStore());

	  MMO = DAG.getMachineFunction().
	    getMachineMemOperand(MST->getPointerInfo(),
	                         MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
	                         SecondHalfAlignment, MST->getAAInfo(),
	                         MST->getRanges());

	  Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
	                          MST->isTruncatingStore(),
	                          MST->isCompressingStore());

	  // Queue both halves so they are visited by the combiner.
	  AddToWorklist(Lo.getNode());
	  AddToWorklist(Hi.getNode());

	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
	}

	SDValue DAGCombiner::visitMGATHER(SDNode *N) {
	  // This combine is only attempted before type legalization has finished.
	  if (Level >= AfterLegalizeTypes)
	    return SDValue();

	  MaskedGatherSDNode *Gather = cast<MaskedGatherSDNode>(N);
	  SDValue Mask = Gather->getMask();
	  SDLoc DL(N);

	  // When the gather result has to be split and its mask is produced by a
	  // SETCC, split the gather and the SETCC together ahead of legalization.
	  // Otherwise the type legalizer would unroll the SETCC into scalar
	  // comparisons and block later optimizations (e.g. min/max pattern matching
	  // on X86).
	  if (Mask.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // Bail out unless the result type is one that gets split.
	  EVT ResVT = N->getValueType(0);
	  if (TLI.getTypeAction(*DAG.getContext(), ResVT) !=
	      TargetLowering::TypeSplitVector)
	    return SDValue();

	  // Split the mask and the Src0 value operand into low and high halves.
	  SDValue MaskLo, MaskHi;
	  std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);

	  SDValue SrcLo, SrcHi;
	  std::tie(SrcLo, SrcHi) = DAG.SplitVector(Gather->getValue(), DL);

	  EVT LoVT, HiVT;
	  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(ResVT);

	  SDValue Chain = Gather->getChain();
	  unsigned Alignment = Gather->getOriginalAlignment();

	  // Only the low half of the split memory type is needed below.
	  EVT LoMemVT = DAG.GetSplitDestVTs(Gather->getMemoryVT()).first;

	  SDValue BasePtr = Gather->getBasePtr();
	  SDValue IndexLo, IndexHi;
	  std::tie(IndexLo, IndexHi) = DAG.SplitVector(Gather->getIndex(), DL);

	  // One memory operand, sized from the low memory type, is shared by both
	  // half-width gathers.
	  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
	      Gather->getPointerInfo(), MachineMemOperand::MOLoad,
	      LoMemVT.getStoreSize(), Alignment, Gather->getAAInfo(),
	      Gather->getRanges());

	  SDValue LoOperands[] = {Chain, SrcLo, MaskLo, BasePtr, IndexLo};
	  SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, DL,
	                                   LoOperands, MMO);

	  SDValue HiOperands[] = {Chain, SrcHi, MaskHi, BasePtr, IndexHi};
	  SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, DL,
	                                   HiOperands, MMO);

	  AddToWorklist(Lo.getNode());
	  AddToWorklist(Hi.getNode());

	  // Join the two output chains with a token factor so the half gathers stay
	  // independent of each other.
	  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
	                      Hi.getValue(1));

	  // Redirect every user of the original gather's chain result to the new
	  // chain.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Gather, 1), Chain);

	  // Reassemble the full-width value from the two halves and return it
	  // together with the new chain.
	  SDValue Merged[] = {DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi),
	                      Chain};
	  return DAG.getMergeValues(Merged, DL);
	}

	/// Combine a masked load before type legalization.
	///
	/// If the MLOAD result requires splitting and the mask is provided by a
	/// SETCC, then split both nodes and their operands before legalization. This
	/// prevents the type legalizer from unrolling SETCC into scalar comparisons
	/// and enables future optimizations (e.g. min/max pattern matching on X86).
	///
	/// Returns the merged {value, chain} pair of the split loads, or an empty
	/// SDValue when the combine does not apply.
	SDValue DAGCombiner::visitMLOAD(SDNode *N) {
	  if (Level >= AfterLegalizeTypes)
	    return SDValue();

	  // N is dereferenced unconditionally below, so use the asserting cast<>
	  // instead of an unchecked dyn_cast<> (which may yield null).
	  MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
	  SDValue Mask = MLD->getMask();
	  SDLoc DL(N);

	  if (Mask.getOpcode() != ISD::SETCC)
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // Check if any splitting is required.
	  if (TLI.getTypeAction(*DAG.getContext(), VT) !=
	      TargetLowering::TypeSplitVector)
	    return SDValue();

	  SDValue MaskLo, MaskHi, Lo, Hi;
	  std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);

	  SDValue Src0 = MLD->getSrc0();
	  SDValue Src0Lo, Src0Hi;
	  std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL);

	  EVT LoVT, HiVT;
	  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);

	  SDValue Chain = MLD->getChain();
	  SDValue Ptr = MLD->getBasePtr();
	  EVT MemoryVT = MLD->getMemoryVT();
	  unsigned Alignment = MLD->getOriginalAlignment();

	  // If the alignment is equal to the vector size, the second half can only
	  // be assumed to have half of that alignment.
	  unsigned SecondHalfAlignment =
	      (Alignment == VT.getSizeInBits() / 8) ? Alignment / 2 : Alignment;

	  EVT LoMemVT, HiMemVT;
	  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);

	  // Low half: loads from the original base pointer.
	  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
	      MLD->getPointerInfo(), MachineMemOperand::MOLoad,
	      LoMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges());

	  Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
	                         ISD::NON_EXTLOAD, MLD->isExpandingLoad());

	  // High half: advance the pointer past the low half first.
	  Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
	                                   MLD->isExpandingLoad());

	  MMO = DAG.getMachineFunction().getMachineMemOperand(
	      MLD->getPointerInfo(), MachineMemOperand::MOLoad,
	      HiMemVT.getStoreSize(), SecondHalfAlignment, MLD->getAAInfo(),
	      MLD->getRanges());

	  Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
	                         ISD::NON_EXTLOAD, MLD->isExpandingLoad());

	  AddToWorklist(Lo.getNode());
	  AddToWorklist(Hi.getNode());

	  // Build a factor node to remember that this load is independent of the
	  // other one.
	  Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
	                      Hi.getValue(1));

	  // Legalized the chain result - switch anything that used the old chain to
	  // use the new one.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);

	  SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);

	  SDValue RetOps[] = {LoadRes, Chain};
	  return DAG.getMergeValues(RetOps, DL);
	}

	/// A vector select of 2 constant vectors can be simplified to math/logic to
	/// avoid a variable select instruction and possibly avoid constant loads.
	///
	/// N is the vselect node: operand 0 is the condition, operands 1 and 2 are
	/// the two selected values. Returns the replacement value, or an empty
	/// SDValue when no fold applies.
	SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
	SDValue Cond = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	SDValue N2 = N->getOperand(2);
	EVT VT = N->getValueType(0);
	// Give up unless the condition has a single use and 1-bit elements, the
	// target hook allows the conversion, and both arms are build_vectors of
	// constants.
	if (!Cond.hasOneUse() \|\| Cond.getScalarValueSizeInBits() != 1 \|\|
	!TLI.convertSelectOfConstantsToMath(VT) \|\|
	!ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) \|\|
	!ISD::isBuildVectorOfConstantSDNodes(N2.getNode()))
	return SDValue();

	// Check if we can use the condition value to increment/decrement a single
	// constant value. This simplifies a select to an add and removes a constant
	// load/materialization from the general case.
	bool AllAddOne = true;
	bool AllSubOne = true;
	unsigned Elts = VT.getVectorNumElements();
	for (unsigned i = 0; i != Elts; ++i) {
	SDValue N1Elt = N1.getOperand(i);
	SDValue N2Elt = N2.getOperand(i);
	// Lanes where either element is undef do not constrain either pattern.
	if (N1Elt.isUndef() \|\| N2Elt.isUndef())
	continue;

	const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
	const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
	// Track whether every defined lane satisfies C1 == C2 + 1 and/or
	// C1 == C2 - 1.
	if (C1 != C2 + 1)
	AllAddOne = false;
	if (C1 != C2 - 1)
	AllSubOne = false;
	}

	// Further simplifications for the extra-special cases where the constants are
	// all 0 or all -1 should be implemented as folds of these patterns.
	SDLoc DL(N);
	if (AllAddOne \|\| AllSubOne) {
	// vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
	// vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
	// AllAddOne takes priority when both flags are still set.
	auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
	SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
	return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
	}

	// The general case for select-of-constants:
	// vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
	// ...but that only makes sense if a vselect is slower than 2 logic ops, so
	// leave that to a machine-specific pass.
	return SDValue();
	}

	// Combine a VSELECT node. N0 is the condition operand; N1 is the value
	// chosen when the condition is all ones and N2 the value chosen when it is
	// all zeros. Returns the replacement value, SDValue(N, 0) when N was
	// simplified in place, or an empty SDValue when nothing applies.
	SDValue DAGCombiner::visitVSELECT(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	SDValue N2 = N->getOperand(2);
	SDLoc DL(N);

	// fold (vselect C, X, X) -> X
	if (N1 == N2)
	return N1;

	// Canonicalize integer abs.
	// vselect (setg[te] X, 0), X, -X ->
	// vselect (setgt X, -1), X, -X ->
	// vselect (setl[te] X, 0), -X, X ->
	// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
	if (N0.getOpcode() == ISD::SETCC) {
	SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
	ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
	bool isAbs = false;
	bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());

	// In both forms the negated arm must be (sub X', X) with the same X that is
	// compared; isAbs then records whether X' is an all-zeros build_vector.
	if (((RHSIsAllZeros && (CC == ISD::SETGT \|\| CC == ISD::SETGE)) \|\|
	(ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) &&
	N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1))
	isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode());
	else if ((RHSIsAllZeros && (CC == ISD::SETLT \|\| CC == ISD::SETLE)) &&
	N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1))
	isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());

	if (isAbs) {
	EVT VT = LHS.getValueType();
	// Prefer a native ABS node when the target supports it.
	if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
	return DAG.getNode(ISD::ABS, DL, VT, LHS);

	// Otherwise expand to the sra/add/xor sequence described above.
	SDValue Shift = DAG.getNode(
	ISD::SRA, DL, VT, LHS,
	DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
	SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
	AddToWorklist(Shift.getNode());
	AddToWorklist(Add.getNode());
	return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
	}
	}

	if (SimplifySelectOps(N, N1, N2))
	return SDValue(N, 0); // Don't revisit N.

	// Fold (vselect (build_vector all_ones), N1, N2) -> N1
	if (ISD::isBuildVectorAllOnes(N0.getNode()))
	return N1;
	// Fold (vselect (build_vector all_zeros), N1, N2) -> N2
	if (ISD::isBuildVectorAllZeros(N0.getNode()))
	return N2;

	// The ConvertSelectToConcatVector function is assuming both the above
	// checks for (vselect (build_vector all{ones,zeros) ...) have been made
	// and addressed.
	if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
	N2.getOpcode() == ISD::CONCAT_VECTORS &&
	ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
	if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
	return CV;
	}

	// Finally, try the select-of-constant-vectors folds.
	if (SDValue V = foldVSelectOfConstants(N))
	return V;

	return SDValue();
	}

	// Combine a SELECT_CC node whose operands are (lhs, rhs, true-value,
	// false-value, condition code). Returns the replacement value, SDValue(N, 0)
	// when N was simplified in place, or whatever SimplifySelectCC produces.
	SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
	  SDValue CmpLHS = N->getOperand(0);
	  SDValue CmpRHS = N->getOperand(1);
	  SDValue TrueVal = N->getOperand(2);
	  SDValue FalseVal = N->getOperand(3);
	  SDValue CCOp = N->getOperand(4);
	  ISD::CondCode CC = cast<CondCodeSDNode>(CCOp)->get();

	  // fold select_cc lhs, rhs, x, x, cc -> x
	  if (TrueVal == FalseVal)
	    return TrueVal;

	  // See whether the comparison itself simplifies.
	  if (SDValue SCC =
	          SimplifySetCC(getSetCCResultType(CmpLHS.getValueType()), CmpLHS,
	                        CmpRHS, CC, SDLoc(N), false)) {
	    AddToWorklist(SCC.getNode());

	    // A constant condition picks one side outright: non-zero selects the
	    // true value, zero the false value.
	    if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode()))
	      return SCCC->isNullValue() ? FalseVal : TrueVal;

	    // When the condition is UNDEF, just return the first value operand. This
	    // is consistent with DAG creation, where no setcc node is created in
	    // this case.
	    if (SCC->isUndef())
	      return TrueVal;

	    // The comparison became a simpler setcc: rebuild the select_cc around
	    // its operands.
	    if (SCC.getOpcode() == ISD::SETCC)
	      return DAG.getNode(ISD::SELECT_CC, SDLoc(N), TrueVal.getValueType(),
	                         SCC.getOperand(0), SCC.getOperand(1), TrueVal,
	                         FalseVal, SCC.getOperand(2));
	  }

	  // If we can fold this based on the true/false value, do so.
	  if (SimplifySelectOps(N, TrueVal, FalseVal))
	    return SDValue(N, 0); // Don't revisit N.

	  // fold select_cc into other things, such as min/max/abs
	  return SimplifySelectCC(SDLoc(N), CmpLHS, CmpRHS, TrueVal, FalseVal, CC);
	}

	// Combine a SETCC node by handing its two compared operands and its
	// condition code to SimplifySetCC.
	SDValue DAGCombiner::visitSETCC(SDNode *N) {
	  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
	  return SimplifySetCC(N->getValueType(0), N->getOperand(0), N->getOperand(1),
	                       CC, SDLoc(N));
	}

	// Combine a SETCCE node with operands (lhs, rhs, carry, condition).
	SDValue DAGCombiner::visitSETCCE(SDNode *N) {
	  SDValue CmpLHS = N->getOperand(0);
	  SDValue CmpRHS = N->getOperand(1);
	  SDValue CarryIn = N->getOperand(2);
	  SDValue CondOp = N->getOperand(3);

	  // Nothing to fold unless the carry input is CARRY_FALSE.
	  if (CarryIn.getOpcode() != ISD::CARRY_FALSE)
	    return SDValue();

	  // With a false carry this is just a regular SETCC.
	  return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), CmpLHS, CmpRHS,
	                     CondOp);
	}

	// Combine a SETCCCARRY node with operands (lhs, rhs, carry, condition).
	SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
	  SDValue CmpLHS = N->getOperand(0);
	  SDValue CmpRHS = N->getOperand(1);
	  SDValue CarryIn = N->getOperand(2);
	  SDValue CondOp = N->getOperand(3);

	  // Nothing to fold unless the carry input is the constant zero.
	  if (!isNullConstant(CarryIn))
	    return SDValue();

	  // With a false carry this is just a regular SETCC.
	  return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), CmpLHS, CmpRHS,
	                     CondOp);
	}

	/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
	/// a build_vector of constants.
	/// This function is called by the DAGCombiner when visiting sext/zext/aext
	/// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
	/// Vector extends are not folded if operations are legal; this is to
	/// avoid introducing illegal build_vector dag nodes.
	///
	/// \param N the extend node to fold.
	/// \returns the folded node, or nullptr when no fold is possible.
	static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
	                                         SelectionDAG &DAG, bool LegalTypes,
	                                         bool LegalOperations) {
	  unsigned Opcode = N->getOpcode();
	  SDValue N0 = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
	          Opcode == ISD::ANY_EXTEND ||
	          Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
	          Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
	         "Expected EXTEND dag node in input!");

	  // fold (sext c1) -> c1
	  // fold (zext c1) -> c1
	  // fold (aext c1) -> c1
	  if (isa<ConstantSDNode>(N0))
	    return DAG.getNode(Opcode, SDLoc(N), VT, N0).getNode();

	  // fold (sext (build_vector AllConstants) -> (build_vector AllConstants)
	  // fold (zext (build_vector AllConstants) -> (build_vector AllConstants)
	  // fold (aext (build_vector AllConstants) -> (build_vector AllConstants)
	  EVT SVT = VT.getScalarType();
	  if (!(VT.isVector() &&
	        (!LegalTypes || (!LegalOperations && TLI.isTypeLegal(SVT))) &&
	        ISD::isBuildVectorOfConstantSDNodes(N0.getNode())))
	    return nullptr;

	  // We can fold this node into a build_vector.
	  unsigned VTBits = SVT.getSizeInBits();
	  unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits();
	  SmallVector<SDValue, 8> Elts;
	  unsigned NumElts = VT.getVectorNumElements();
	  SDLoc DL(N);

	  for (unsigned i = 0; i != NumElts; ++i) {
	    SDValue Op = N0->getOperand(i);
	    // Undef source elements stay undef in the result.
	    if (Op->isUndef()) {
	      Elts.push_back(DAG.getUNDEF(SVT));
	      continue;
	    }

	    // Each folded constant takes its location from the source element; a
	    // distinct name avoids shadowing the node-level DL used for the final
	    // build_vector.
	    SDLoc OpDL(Op);
	    // Get the constant value and if needed trunc it to the size of the type.
	    // Nodes like build_vector might have constants wider than the scalar type.
	    APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(EVTBits);
	    if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
	      Elts.push_back(DAG.getConstant(C.sext(VTBits), OpDL, SVT));
	    else
	      Elts.push_back(DAG.getConstant(C.zext(VTBits), OpDL, SVT));
	  }

	  return DAG.getBuildVector(VT, DL, Elts).getNode();
	}

	// ExtendUsesToFormExtLoad - Try to extend the other uses of a load to enable
	// this transformation:
	//   "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
	// N is the extend node, N0 the value being extended and ExtOpc the extend
	// opcode. SETCC users that would have to be rewritten are appended to
	// ExtendNodes (entries may already have been appended when false is
	// returned). Returns true if the extensions are possible and the
	// transformation above is profitable.
	static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
	                                    unsigned ExtOpc,
	                                    SmallVectorImpl<SDNode *> &ExtendNodes,
	                                    const TargetLowering &TLI) {
	  bool HasCopyToRegUses = false;
	  bool isTruncFree = TLI.isTruncateFree(N->getValueType(0), N0.getValueType());
	  for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
	                            UE = N0.getNode()->use_end();
	       UI != UE; ++UI) {
	    // Dereference the use iterator to get the using node.
	    SDNode *User = *UI;
	    if (User == N)
	      continue;
	    // Ignore uses of other result values of the same node.
	    if (UI.getUse().getResNo() != N0.getResNo())
	      continue;
	    // FIXME: Only extend SETCC N, N and SETCC N, c for now.
	    if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
	      ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
	      if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
	        // Sign bits will be lost after a zext.
	        return false;
	      // The SETCC only needs rewriting if it has an operand other than N0,
	      // and any such operand must be a constant.
	      bool Add = false;
	      for (unsigned i = 0; i != 2; ++i) {
	        SDValue UseOp = User->getOperand(i);
	        if (UseOp == N0)
	          continue;
	        if (!isa<ConstantSDNode>(UseOp))
	          return false;
	        Add = true;
	      }
	      if (Add)
	        ExtendNodes.push_back(User);
	      continue;
	    }
	    // If truncates aren't free and there are users we can't
	    // extend, it isn't worthwhile.
	    if (!isTruncFree)
	      return false;
	    // Remember if this value is live-out.
	    if (User->getOpcode() == ISD::CopyToReg)
	      HasCopyToRegUses = true;
	  }

	  if (HasCopyToRegUses) {
	    bool BothLiveOut = false;
	    for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
	         UI != UE; ++UI) {
	      SDUse &Use = UI.getUse();
	      if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
	        BothLiveOut = true;
	        break;
	      }
	    }
	    if (BothLiveOut)
	      // Both unextended and extended values are live out. There had better be
	      // a good reason for the transformation, i.e. at least one SETCC user to
	      // extend.
	      return !ExtendNodes.empty();
	  }
	  return true;
	}

	void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
	SDValue Trunc, SDValue ExtLoad,
	const SDLoc &DL, ISD::NodeType ExtType) {
	// Extend SetCC uses if necessary.
	for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
	SDNode *SetCC = SetCCs[i];
	SmallVector<SDValue, 4> Ops;

	for (unsigned j = 0; j != 2; ++j) {
	SDValue SOp = SetCC->getOperand(j);
	if (SOp == Trunc)
	Ops.push_back(ExtLoad);
	else
	Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
	}

	Ops.push_back(SetCC->getOperand(2));
	CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops));
	}
	}

	// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
	// Split a sign/zero extend of a vector load into several smaller extending
	// loads whose results are concatenated. N must be a SIGN_EXTEND or
	// ZERO_EXTEND node. Returns SDValue(N, 0) after rewriting N and the load, or
	// an empty SDValue when the combine does not apply.
	SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT DstVT = N->getValueType(0);
	EVT SrcVT = N0.getValueType();

	assert((N->getOpcode() == ISD::SIGN_EXTEND \|\|
	N->getOpcode() == ISD::ZERO_EXTEND) &&
	"Unexpected node type (not an extend)!");

	// fold (sext (load x)) to multiple smaller sextloads; same for zext.
	// For example, on a target with legal v4i32, but illegal v8i32, turn:
	// (v8i32 (sext (v8i16 (load x))))
	// into:
	// (v8i32 (concat_vectors (v4i32 (sextload x)),
	// (v4i32 (sextload (x + 16)))))
	// Where uses of the original load, i.e.:
	// (v8i16 (load x))
	// are replaced with:
	// (v8i16 (truncate
	// (v8i32 (concat_vectors (v4i32 (sextload x)),
	// (v4i32 (sextload (x + 16)))))))
	//
	// This combine is only applicable to illegal, but splittable, vectors.
	// All legal types, and illegal non-vector types, are handled elsewhere.
	// This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
	//
	if (N0->getOpcode() != ISD::LOAD)
	return SDValue();

	LoadSDNode *LN0 = cast<LoadSDNode>(N0);

	// Require a plain (non-extending, unindexed, non-volatile) load with a
	// single use, a power-of-2 vector destination type, and target approval.
	if (!ISD::isNON_EXTLoad(LN0) \|\| !ISD::isUNINDEXEDLoad(LN0) \|\|
	!N0.hasOneUse() \|\| LN0->isVolatile() \|\| !DstVT.isVector() \|\|
	!DstVT.isPow2VectorType() \|\| !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
	return SDValue();

	SmallVector<SDNode *, 4> SetCCs;
	if (!ExtendUsesToFormExtLoad(N, N0, N->getOpcode(), SetCCs, TLI))
	return SDValue();

	ISD::LoadExtType ExtType =
	N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;

	// Try to split the vector types to get down to legal types.
	EVT SplitSrcVT = SrcVT;
	EVT SplitDstVT = DstVT;
	while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
	SplitSrcVT.getVectorNumElements() > 1) {
	SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
	SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
	}

	// The loop may also stop at a single element without finding a legal or
	// custom extending load; give up in that case.
	if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
	return SDValue();

	SDLoc DL(N);
	const unsigned NumSplits =
	DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
	const unsigned Stride = SplitSrcVT.getStoreSize();
	SmallVector<SDValue, 4> Loads;
	SmallVector<SDValue, 4> Chains;

	SDValue BasePtr = LN0->getBasePtr();
	for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
	// Each split load starts Stride bytes after the previous one.
	const unsigned Offset = Idx * Stride;
	const unsigned Align = MinAlign(LN0->getAlignment(), Offset);

	SDValue SplitLoad = DAG.getExtLoad(
	ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr,
	LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
	LN0->getMemOperand()->getFlags(), LN0->getAAInfo());

	// Advance the base pointer for the next split load.
	BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
	DAG.getConstant(Stride, DL, BasePtr.getValueType()));

	// Result 0 is the loaded value, result 1 the output chain.
	Loads.push_back(SplitLoad.getValue(0));
	Chains.push_back(SplitLoad.getValue(1));
	}

	SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
	SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);

	// Simplify TF.
	AddToWorklist(NewChain.getNode());

	CombineTo(N, NewValue);

	// Replace uses of the original load (before extension)
	// with a truncate of the concatenated sextloaded vectors.
	SDValue Trunc =
	DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
	CombineTo(N0.getNode(), Trunc, NewChain);
	ExtendSetCCUses(SetCCs, Trunc, NewValue, DL,
	(ISD::NodeType)N->getOpcode());
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}

	/// If we're narrowing or widening the result of a vector select and the final
	/// size is the same size as a setcc (compare) feeding the select, then try to
	/// apply the cast operation to the select's operands because matching vector
	/// sizes for a select condition and other operands should be more efficient.
	///
	/// Cast is the extend/truncate/fp-extend/fp-round node being visited.
	/// Returns the new VSELECT, or an empty SDValue when the pattern does not
	/// match or the transform is not allowed.
	SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
	unsigned CastOpcode = Cast->getOpcode();
	assert((CastOpcode == ISD::SIGN_EXTEND \|\| CastOpcode == ISD::ZERO_EXTEND \|\|
	CastOpcode == ISD::TRUNCATE \|\| CastOpcode == ISD::FP_EXTEND \|\|
	CastOpcode == ISD::FP_ROUND) &&
	"Unexpected opcode for vector select narrowing/widening");

	// We only do this transform before legal ops because the pattern may be
	// obfuscated by target-specific operations after legalization. Do not create
	// an illegal select op, however, because that may be difficult to lower.
	EVT VT = Cast->getValueType(0);
	if (LegalOperations \|\| !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
	return SDValue();

	// The cast operand must be a single-use VSELECT whose condition is a SETCC.
	SDValue VSel = Cast->getOperand(0);
	if (VSel.getOpcode() != ISD::VSELECT \|\| !VSel.hasOneUse() \|\|
	VSel.getOperand(0).getOpcode() != ISD::SETCC)
	return SDValue();

	// Does the setcc have the same vector size as the casted select?
	SDValue SetCC = VSel.getOperand(0);
	EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
	if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
	return SDValue();

	// cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
	SDValue A = VSel.getOperand(1);
	SDValue B = VSel.getOperand(2);
	SDValue CastA, CastB;
	SDLoc DL(Cast);
	if (CastOpcode == ISD::FP_ROUND) {
	// FP_ROUND (fptrunc) has an extra flag operand to pass along.
	CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1));
	CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1));
	} else {
	CastA = DAG.getNode(CastOpcode, DL, VT, A);
	CastB = DAG.getNode(CastOpcode, DL, VT, B);
	}
	return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
	}

	// Combine a SIGN_EXTEND node. The folds are tried in this order: constant
	// folding, collapsing nested extends, sext(truncate ...) simplifications,
	// forming sign-extending loads, sext(logic-op(load, constant)),
	// sext(setcc ...), conversion to zext when the sign bit is known zero, and
	// matching vselect operand sizes. Returns the replacement value,
	// SDValue(N, 0) when N was already replaced via CombineTo, or an empty
	// SDValue when nothing applies.
	SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	SDLoc DL(N);

	if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
	LegalOperations))
	return SDValue(Res, 0);

	// fold (sext (sext x)) -> (sext x)
	// fold (sext (aext x)) -> (sext x)
	if (N0.getOpcode() == ISD::SIGN_EXTEND \|\| N0.getOpcode() == ISD::ANY_EXTEND)
	return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));

	if (N0.getOpcode() == ISD::TRUNCATE) {
	// fold (sext (truncate (load x))) -> (sext (smaller load x))
	// fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
	if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
	// Remember the truncate's operand before the truncate is replaced.
	SDNode *oye = N0.getOperand(0).getNode();
	if (NarrowLoad.getNode() != N0.getNode()) {
	CombineTo(N0.getNode(), NarrowLoad);
	// CombineTo deleted the truncate, if needed, but not what's under it.
	AddToWorklist(oye);
	}
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}

	// See if the value being truncated is already sign extended. If so, just
	// eliminate the trunc/sext pair.
	SDValue Op = N0.getOperand(0);
	unsigned OpBits = Op.getScalarValueSizeInBits();
	unsigned MidBits = N0.getScalarValueSizeInBits();
	unsigned DestBits = VT.getScalarSizeInBits();
	unsigned NumSignBits = DAG.ComputeNumSignBits(Op);

	if (OpBits == DestBits) {
	// Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign
	// bits, it is already ready.
	if (NumSignBits > DestBits-MidBits)
	return Op;
	} else if (OpBits < DestBits) {
	// Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
	// bits, just sext from i32.
	if (NumSignBits > OpBits-MidBits)
	return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
	} else {
	// Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
	// bits, just truncate to i32.
	if (NumSignBits > OpBits-MidBits)
	return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
	}

	// fold (sext (truncate x)) -> (sextinreg x).
	if (!LegalOperations \|\| TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
	N0.getValueType())) {
	// Bring Op to the destination width first, then sign-extend in register
	// from the truncated type.
	if (OpBits < DestBits)
	Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
	else if (OpBits > DestBits)
	Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
	return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
	DAG.getValueType(N0.getValueType()));
	}
	}

	// fold (sext (load x)) -> (sext (truncate (sextload x)))
	// Only generate vector extloads when 1) they're legal, and 2) they are
	// deemed desirable by the target.
	if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
	((!LegalOperations && !VT.isVector() &&
	!cast<LoadSDNode>(N0)->isVolatile()) \|\|
	TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) {
	bool DoXform = true;
	SmallVector<SDNode*, 4> SetCCs;
	// With other users of the load, check that they can be extended too.
	if (!N0.hasOneUse())
	DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
	if (VT.isVector())
	DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
	if (DoXform) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
	LN0->getBasePtr(), N0.getValueType(),
	LN0->getMemOperand());
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
	N0.getValueType(), ExtLoad);
	ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
	// If the load value is used only by N, replace it via CombineTo N.
	bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
	CombineTo(N, ExtLoad);
	if (NoReplaceTrunc)
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
	else
	CombineTo(LN0, Trunc, ExtLoad.getValue(1));
	return SDValue(N, 0);
	}
	}

	// fold (sext (load x)) to multiple smaller sextloads.
	// Only on illegal but splittable vectors.
	if (SDValue ExtLoad = CombineExtLoad(N))
	return ExtLoad;

	// fold (sext (sextload x)) -> (sext (truncate (sextload x)))
	// fold (sext ( extload x)) -> (sext (truncate (sextload x)))
	if ((ISD::isSEXTLoad(N0.getNode()) \|\| ISD::isEXTLoad(N0.getNode())) &&
	ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	EVT MemVT = LN0->getMemoryVT();
	if ((!LegalOperations && !LN0->isVolatile()) \|\|
	TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) {
	SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
	LN0->getBasePtr(), MemVT,
	LN0->getMemOperand());
	CombineTo(N, ExtLoad);
	CombineTo(N0.getNode(),
	DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
	N0.getValueType(), ExtLoad),
	ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	// fold (sext (and/or/xor (load x), cst)) ->
	// (and/or/xor (sextload x), (sext cst))
	if ((N0.getOpcode() == ISD::AND \|\| N0.getOpcode() == ISD::OR \|\|
	N0.getOpcode() == ISD::XOR) &&
	isa<LoadSDNode>(N0.getOperand(0)) &&
	N0.getOperand(1).getOpcode() == ISD::Constant &&
	TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()) &&
	(!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
	// A zero-extending load cannot be turned into a sign-extending one.
	if (LN0->getExtensionType() != ISD::ZEXTLOAD && LN0->isUnindexed()) {
	bool DoXform = true;
	SmallVector<SDNode*, 4> SetCCs;
	if (!N0.hasOneUse())
	DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::SIGN_EXTEND,
	SetCCs, TLI);
	if (DoXform) {
	SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN0), VT,
	LN0->getChain(), LN0->getBasePtr(),
	LN0->getMemoryVT(),
	LN0->getMemOperand());
	// Sign-extend the constant operand to the destination width.
	APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
	Mask = Mask.sext(VT.getSizeInBits());
	// Despite its name, 'And' holds whichever of and/or/xor N0 is.
	SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
	ExtLoad, DAG.getConstant(Mask, DL, VT));
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
	SDLoc(N0.getOperand(0)),
	N0.getOperand(0).getValueType(), ExtLoad);
	ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
	// NOTE(review): despite the "NoReplace" name, this flag is true when N0
	// has other uses and triggers the replacement of N0 below.
	bool NoReplaceTruncAnd = !N0.hasOneUse();
	bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
	CombineTo(N, And);
	// If N0 has multiple uses, change other uses as well.
	if (NoReplaceTruncAnd) {
	SDValue TruncAnd =
	DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
	CombineTo(N0.getNode(), TruncAnd);
	}
	if (NoReplaceTrunc)
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
	else
	CombineTo(LN0, Trunc, ExtLoad.getValue(1));
	return SDValue(N,0); // Return N so it doesn't get rechecked!
	}
	}
	}

	if (N0.getOpcode() == ISD::SETCC) {
	SDValue N00 = N0.getOperand(0);
	SDValue N01 = N0.getOperand(1);
	ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
	EVT N00VT = N0.getOperand(0).getValueType();

	// sext(setcc) -> sext_in_reg(vsetcc) for vectors.
	// Only do this before legalize for now.
	if (VT.isVector() && !LegalOperations &&
	TLI.getBooleanContents(N00VT) ==
	TargetLowering::ZeroOrNegativeOneBooleanContent) {
	// On some architectures (such as SSE/NEON/etc) the SETCC result type is
	// of the same size as the compared operands. Only optimize sext(setcc())
	// if this is the case.
	EVT SVT = getSetCCResultType(N00VT);

	// We know that the # elements of the results is the same as the
	// # elements of the compare (and the # elements of the compare result
	// for that matter). Check to see that they are the same size. If so,
	// we know that the element size of the sext'd result matches the
	// element size of the compare operands.
	if (VT.getSizeInBits() == SVT.getSizeInBits())
	return DAG.getSetCC(DL, VT, N00, N01, CC);

	// If the desired elements are smaller or larger than the source
	// elements, we can use a matching integer vector type and then
	// truncate/sign extend.
	EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
	if (SVT == MatchingVecType) {
	SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
	return DAG.getSExtOrTrunc(VsetCC, DL, VT);
	}
	}

	// sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0)
	// Here, T can be 1 or -1, depending on the type of the setcc and
	// getBooleanContents().
	unsigned SetCCWidth = N0.getScalarValueSizeInBits();

	// To determine the "true" side of the select, we need to know the high bit
	// of the value returned by the setcc if it evaluates to true.
	// If the type of the setcc is i1, then the true case of the select is just
	// sext(i1 1), that is, -1.
	// If the type of the setcc is larger (say, i8) then the value of the high
	// bit depends on getBooleanContents(), so ask TLI for a real "true" value
	// of the appropriate width.
	SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT)
	: TLI.getConstTrueVal(DAG, VT, DL);
	SDValue Zero = DAG.getConstant(0, DL, VT);
	if (SDValue SCC =
	SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
	return SCC;

	if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) {
	EVT SetCCVT = getSetCCResultType(N00VT);
	// Don't do this transform for i1 because there's a select transform
	// that would reverse it.
	// TODO: We should not do this transform at all without a target hook
	// because a sext is likely cheaper than a select?
	if (SetCCVT.getScalarSizeInBits() != 1 &&
	(!LegalOperations \|\| TLI.isOperationLegal(ISD::SETCC, N00VT))) {
	SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
	return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
	}
	}
	}

	// fold (sext x) -> (zext x) if the sign bit is known zero.
	if ((!LegalOperations \|\| TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
	DAG.SignBitIsZero(N0))
	return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);

	if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
	return NewVSel;

	return SDValue();
	}

	// isTruncateOf - If N is a truncate of some other value, return true, record
	// the value being truncated in Op and which of Op's bits are zero/one in Known.
	// This function computes KnownBits to avoid a duplicated call to
	// computeKnownBits in the caller.
	// Besides a real TRUNCATE, an i1 "setne X, 0" (zero on either side) is also
	// accepted when every bit of X other than bit 0 is known to be zero.
	// Op and Known may be modified even when false is returned.
	static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
	KnownBits &Known) {
	if (N->getOpcode() == ISD::TRUNCATE) {
	Op = N->getOperand(0);
	DAG.computeKnownBits(Op, Known);
	return true;
	}

	// Otherwise only an i1-typed SETCC with condition code SETNE qualifies.
	if (N->getOpcode() != ISD::SETCC \|\| N->getValueType(0) != MVT::i1 \|\|
	cast<CondCodeSDNode>(N->getOperand(2))->get() != ISD::SETNE)
	return false;

	SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);
	assert(Op0.getValueType() == Op1.getValueType());

	// One side of the comparison must be the constant zero; the other side is
	// the candidate value.
	if (isNullConstant(Op0))
	Op = Op1;
	else if (isNullConstant(Op1))
	Op = Op0;
	else
	return false;

	DAG.computeKnownBits(Op, Known);

	// All bits except possibly bit 0 must be known zero.
	if (!(Known.Zero \| 1).isAllOnesValue())
	return false;

	return true;
	}

	/// Combine a ZERO_EXTEND node.
	///
	/// The folds below are attempted in order and the first one that applies
	/// wins. The return value is either the replacement value, SDValue(N, 0)
	/// when the DAG was already updated in place (via CombineTo), or an empty
	/// SDValue when nothing applied (this includes the early bail-outs inside
	/// the vector setcc and the shift folds).
	SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);

	// Give the shared tryToFoldExtendOfConstant helper the first chance.
	if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
	LegalOperations))
	return SDValue(Res, 0);

	// fold (zext (zext x)) -> (zext x)
	// fold (zext (aext x)) -> (zext x)
	if (N0.getOpcode() == ISD::ZERO_EXTEND \|\| N0.getOpcode() == ISD::ANY_EXTEND)
	return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT,
	N0.getOperand(0));

	// fold (zext (truncate x)) -> (zext x) or
	// (zext (truncate x)) -> (truncate x)
	// This is valid when the truncated bits of x are already zero.
	// FIXME: We should extend this to work for vectors too.
	SDValue Op;
	KnownBits Known;
	if (!VT.isVector() && isTruncateOf(DAG, N0, Op, Known)) {
	// TruncatedBits covers the bits of Op in [width(N0), min(width(Op),
	// width(VT))); it is empty when Op and N0 have the same width.
	APInt TruncatedBits =
	(Op.getValueSizeInBits() == N0.getValueSizeInBits()) ?
	APInt(Op.getValueSizeInBits(), 0) :
	APInt::getBitsSet(Op.getValueSizeInBits(),
	N0.getValueSizeInBits(),
	std::min(Op.getValueSizeInBits(),
	VT.getSizeInBits()));
	if (TruncatedBits.isSubsetOf(Known.Zero))
	return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
	}

	// fold (zext (truncate x)) -> (and x, mask)
	if (N0.getOpcode() == ISD::TRUNCATE) {
	// fold (zext (truncate (load x))) -> (zext (smaller load x))
	// fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
	if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
	// Remember the truncate's operand before CombineTo may delete the truncate.
	SDNode *oye = N0.getOperand(0).getNode();
	if (NarrowLoad.getNode() != N0.getNode()) {
	CombineTo(N0.getNode(), NarrowLoad);
	// CombineTo deleted the truncate, if needed, but not what's under it.
	AddToWorklist(oye);
	}
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}

	EVT SrcVT = N0.getOperand(0).getValueType();
	EVT MinVT = N0.getValueType();

	// Try to mask before the extension to avoid having to generate a larger mask,
	// possibly over several sub-vectors.
	if (SrcVT.bitsLT(VT)) {
	if (!LegalOperations \|\| (TLI.isOperationLegal(ISD::AND, SrcVT) &&
	TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
	SDValue Op = N0.getOperand(0);
	Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
	AddToWorklist(Op.getNode());
	return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
	}
	}

	// Otherwise bring x to VT first and mask in the result type.
	if (!LegalOperations \|\| TLI.isOperationLegal(ISD::AND, VT)) {
	SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
	AddToWorklist(Op.getNode());
	SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
	// We may safely transfer the debug info describing the truncate node over
	// to the equivalent and operation.
	DAG.transferDbgValues(N0, And);
	return And;
	}
	}

	// Fold (zext (and (trunc x), cst)) -> (and x, cst),
	// if either of the casts is not free.
	if (N0.getOpcode() == ISD::AND &&
	N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
	N0.getOperand(1).getOpcode() == ISD::Constant &&
	(!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
	N0.getValueType()) \|\|
	!TLI.isZExtFree(N0.getValueType(), VT))) {
	SDValue X = N0.getOperand(0).getOperand(0);
	X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
	// The mask constant is zero-extended to the width of VT.
	APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
	Mask = Mask.zext(VT.getSizeInBits());
	SDLoc DL(N);
	return DAG.getNode(ISD::AND, DL, VT,
	X, DAG.getConstant(Mask, DL, VT));
	}

	// fold (zext (load x)) -> (zext (truncate (zextload x)))
	// Only generate vector extloads when 1) they're legal, and 2) they are
	// deemed desirable by the target.
	if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
	((!LegalOperations && !VT.isVector() &&
	!cast<LoadSDNode>(N0)->isVolatile()) \|\|
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) {
	bool DoXform = true;
	SmallVector<SDNode*, 4> SetCCs;
	// With several users of the load, ExtendUsesToFormExtLoad decides whether
	// the transform goes ahead and fills SetCCs for ExtendSetCCUses below.
	if (!N0.hasOneUse())
	DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
	if (VT.isVector())
	DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
	if (DoXform) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
	LN0->getChain(),
	LN0->getBasePtr(), N0.getValueType(),
	LN0->getMemOperand());

	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
	N0.getValueType(), ExtLoad);
	ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N), ISD::ZERO_EXTEND);
	// If the load value is used only by N, replace it via CombineTo N.
	// The use count is sampled before CombineTo(N, ExtLoad) is called.
	bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
	CombineTo(N, ExtLoad);
	if (NoReplaceTrunc)
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
	else
	CombineTo(LN0, Trunc, ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	// fold (zext (load x)) to multiple smaller zextloads.
	// Only on illegal but splittable vectors.
	if (SDValue ExtLoad = CombineExtLoad(N))
	return ExtLoad;

	// fold (zext (and/or/xor (load x), cst)) ->
	// (and/or/xor (zextload x), (zext cst))
	// Unless (and (load x) cst) will match as a zextload already and has
	// additional users.
	// Note that this fold only fires before operation legalization
	// (!LegalOperations) and only when the logic op is legal in VT.
	if ((N0.getOpcode() == ISD::AND \|\| N0.getOpcode() == ISD::OR \|\|
	N0.getOpcode() == ISD::XOR) &&
	isa<LoadSDNode>(N0.getOperand(0)) &&
	N0.getOperand(1).getOpcode() == ISD::Constant &&
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()) &&
	(!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
	if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) {
	bool DoXform = true;
	SmallVector<SDNode*, 4> SetCCs;
	if (!N0.hasOneUse()) {
	if (N0.getOpcode() == ISD::AND) {
	auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
	EVT LoadResultTy = AndC->getValueType(0);
	EVT ExtVT;
	// Skip the transform when isAndLoadExtLoad accepts this and+load pair.
	if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT))
	DoXform = false;
	}
	if (DoXform)
	DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0),
	ISD::ZERO_EXTEND, SetCCs, TLI);
	}
	if (DoXform) {
	SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT,
	LN0->getChain(), LN0->getBasePtr(),
	LN0->getMemoryVT(),
	LN0->getMemOperand());
	APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
	Mask = Mask.zext(VT.getSizeInBits());
	SDLoc DL(N);
	// Despite its name, And holds whichever of and/or/xor N0 is.
	SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
	ExtLoad, DAG.getConstant(Mask, DL, VT));
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
	SDLoc(N0.getOperand(0)),
	N0.getOperand(0).getValueType(), ExtLoad);
	ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::ZERO_EXTEND);
	// Both use counts are sampled before CombineTo(N, And) is called.
	bool NoReplaceTruncAnd = !N0.hasOneUse();
	bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
	CombineTo(N, And);
	// If N0 has multiple uses, change other uses as well.
	if (NoReplaceTruncAnd) {
	SDValue TruncAnd =
	DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
	CombineTo(N0.getNode(), TruncAnd);
	}
	if (NoReplaceTrunc)
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
	else
	CombineTo(LN0, Trunc, ExtLoad.getValue(1));
	return SDValue(N,0); // Return N so it doesn't get rechecked!
	}
	}
	}

	// fold (zext (zextload x)) -> (zext (truncate (zextload x)))
	// fold (zext ( extload x)) -> (zext (truncate (zextload x)))
	if ((ISD::isZEXTLoad(N0.getNode()) \|\| ISD::isEXTLoad(N0.getNode())) &&
	ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	EVT MemVT = LN0->getMemoryVT();
	if ((!LegalOperations && !LN0->isVolatile()) \|\|
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT)) {
	SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
	LN0->getChain(),
	LN0->getBasePtr(), MemVT,
	LN0->getMemOperand());
	CombineTo(N, ExtLoad);
	// The old load's value becomes a truncate of the wider load and its
	// chain result is redirected to the new load's chain.
	CombineTo(N0.getNode(),
	DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(),
	ExtLoad),
	ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	if (N0.getOpcode() == ISD::SETCC) {
	// Only do this before legalize for now.
	if (!LegalOperations && VT.isVector() &&
	N0.getValueType().getVectorElementType() == MVT::i1) {
	EVT N00VT = N0.getOperand(0).getValueType();
	// Nothing to do when the setcc already has the result type
	// getSetCCResultType reports for its operands.
	if (getSetCCResultType(N00VT) == N0.getValueType())
	return SDValue();

	// We know that the # elements of the results is the same as the #
	// elements of the compare (and the # elements of the compare result for
	// that matter). Check to see that they are the same size. If so, we know
	// that the element size of the extended result matches the element size
	// of the compare operands.
	SDLoc DL(N);
	SDValue VecOnes = DAG.getConstant(1, DL, VT);
	if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
	// zext(setcc) -> (and (vsetcc), (1, 1, ...) for vectors.
	SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
	N0.getOperand(1), N0.getOperand(2));
	return DAG.getNode(ISD::AND, DL, VT, VSetCC, VecOnes);
	}

	// If the desired elements are smaller or larger than the source
	// elements we can use a matching integer vector type and then
	// truncate/sign extend.
	EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
	SDValue VsetCC =
	DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
	N0.getOperand(1), N0.getOperand(2));
	return DAG.getNode(ISD::AND, DL, VT, DAG.getSExtOrTrunc(VsetCC, DL, VT),
	VecOnes);
	}

	// zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
	SDLoc DL(N);
	if (SDValue SCC = SimplifySelectCC(
	DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
	DAG.getConstant(0, DL, VT),
	cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
	return SCC;
	}

	// (zext (shl (zext x), cst)) -> (shl (zext x), cst)
	// The same rewrite is applied when the inner shift is an srl.
	if ((N0.getOpcode() == ISD::SHL \|\| N0.getOpcode() == ISD::SRL) &&
	isa<ConstantSDNode>(N0.getOperand(1)) &&
	N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
	N0.hasOneUse()) {
	SDValue ShAmt = N0.getOperand(1);
	unsigned ShAmtVal = cast<ConstantSDNode>(ShAmt)->getZExtValue();
	if (N0.getOpcode() == ISD::SHL) {
	SDValue InnerZExt = N0.getOperand(0);
	// If the original shl may be shifting out bits, do not perform this
	// transformation.
	// KnownZeroBits is the number of high bits the inner zext added.
	unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() -
	InnerZExt.getOperand(0).getValueSizeInBits();
	if (ShAmtVal > KnownZeroBits)
	return SDValue();
	}

	SDLoc DL(N);

	// Ensure that the shift amount is wide enough for the shifted value.
	if (VT.getSizeInBits() >= 256)
	ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);

	return DAG.getNode(N0.getOpcode(), DL, VT,
	DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)),
	ShAmt);
	}

	if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
	return NewVSel;

	return SDValue();
	}

	/// Combine an ANY_EXTEND node.
	///
	/// The folds below are attempted in order and the first one that applies
	/// wins. The return value is either the replacement value, SDValue(N, 0)
	/// when the DAG was already updated in place (via CombineTo), or an empty
	/// SDValue when nothing applied (this includes the early bail-out inside
	/// the vector setcc fold).
	SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);

	// Give the shared tryToFoldExtendOfConstant helper the first chance.
	if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
	LegalOperations))
	return SDValue(Res, 0);

	// fold (aext (aext x)) -> (aext x)
	// fold (aext (zext x)) -> (zext x)
	// fold (aext (sext x)) -> (sext x)
	if (N0.getOpcode() == ISD::ANY_EXTEND \|\|
	N0.getOpcode() == ISD::ZERO_EXTEND \|\|
	N0.getOpcode() == ISD::SIGN_EXTEND)
	return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));

	// fold (aext (truncate (load x))) -> (aext (smaller load x))
	// fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
	if (N0.getOpcode() == ISD::TRUNCATE) {
	if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
	// Remember the truncate's operand before CombineTo may delete the truncate.
	SDNode *oye = N0.getOperand(0).getNode();
	if (NarrowLoad.getNode() != N0.getNode()) {
	CombineTo(N0.getNode(), NarrowLoad);
	// CombineTo deleted the truncate, if needed, but not what's under it.
	AddToWorklist(oye);
	}
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	// fold (aext (truncate x))
	// Reached only when ReduceLoadWidth above did not narrow a load.
	if (N0.getOpcode() == ISD::TRUNCATE)
	return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);

	// Fold (aext (and (trunc x), cst)) -> (and x, cst)
	// if the trunc is not free.
	if (N0.getOpcode() == ISD::AND &&
	N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
	N0.getOperand(1).getOpcode() == ISD::Constant &&
	!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
	N0.getValueType())) {
	SDLoc DL(N);
	SDValue X = N0.getOperand(0).getOperand(0);
	X = DAG.getAnyExtOrTrunc(X, DL, VT);
	// The mask constant is zero-extended to the width of VT.
	APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
	Mask = Mask.zext(VT.getSizeInBits());
	return DAG.getNode(ISD::AND, DL, VT,
	X, DAG.getConstant(Mask, DL, VT));
	}

	// fold (aext (load x)) -> (aext (truncate (extload x)))
	// None of the supported targets knows how to perform load and any_ext
	// on vectors in one instruction. We only perform this transformation on
	// scalars.
	if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
	ISD::isUNINDEXEDLoad(N0.getNode()) &&
	TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
	bool DoXform = true;
	SmallVector<SDNode*, 4> SetCCs;
	// With several users of the load, ExtendUsesToFormExtLoad decides whether
	// the transform goes ahead and fills SetCCs for ExtendSetCCUses below.
	if (!N0.hasOneUse())
	DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
	if (DoXform) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
	LN0->getChain(),
	LN0->getBasePtr(), N0.getValueType(),
	LN0->getMemOperand());
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
	N0.getValueType(), ExtLoad);
	ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
	ISD::ANY_EXTEND);
	// If the load value is used only by N, replace it via CombineTo N.
	// The use count is sampled before CombineTo(N, ExtLoad) is called.
	bool NoReplaceTrunc = N0.hasOneUse();
	CombineTo(N, ExtLoad);
	if (NoReplaceTrunc)
	DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
	else
	CombineTo(LN0, Trunc, ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	// fold (aext (zextload x)) -> (aext (truncate (zextload x)))
	// fold (aext (sextload x)) -> (aext (truncate (sextload x)))
	// fold (aext ( extload x)) -> (aext (truncate (extload x)))
	if (N0.getOpcode() == ISD::LOAD &&
	!ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
	N0.hasOneUse()) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	// The new, wider load keeps the extension kind of the original load.
	ISD::LoadExtType ExtType = LN0->getExtensionType();
	EVT MemVT = LN0->getMemoryVT();
	if (!LegalOperations \|\| TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
	SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
	VT, LN0->getChain(), LN0->getBasePtr(),
	MemVT, LN0->getMemOperand());
	CombineTo(N, ExtLoad);
	// The old load's value becomes a truncate of the wider load and its
	// chain result is redirected to the new load's chain.
	CombineTo(N0.getNode(),
	DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
	N0.getValueType(), ExtLoad),
	ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}

	if (N0.getOpcode() == ISD::SETCC) {
	// For vectors:
	// aext(setcc) -> vsetcc
	// aext(setcc) -> truncate(vsetcc)
	// aext(setcc) -> aext(vsetcc)
	// Only do this before legalize for now.
	if (VT.isVector() && !LegalOperations) {
	EVT N00VT = N0.getOperand(0).getValueType();
	// Nothing to do when the setcc already has the result type
	// getSetCCResultType reports for its operands.
	if (getSetCCResultType(N00VT) == N0.getValueType())
	return SDValue();

	// We know that the # elements of the results is the same as the
	// # elements of the compare (and the # elements of the compare result
	// for that matter). Check to see that they are the same size. If so,
	// we know that the element size of the extended result matches the
	// element size of the compare operands.
	if (VT.getSizeInBits() == N00VT.getSizeInBits())
	return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
	N0.getOperand(1),
	cast<CondCodeSDNode>(N0.getOperand(2))->get());
	// If the desired elements are smaller or larger than the source
	// elements we can use a matching integer vector type and then
	// truncate/any extend
	else {
	EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
	SDValue VsetCC =
	DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
	N0.getOperand(1),
	cast<CondCodeSDNode>(N0.getOperand(2))->get());
	return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
	}
	}

	// aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
	SDLoc DL(N);
	if (SDValue SCC = SimplifySelectCC(
	DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
	DAG.getConstant(0, DL, VT),
	cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
	return SCC;
	}

	return SDValue();
	}

	/// Combine an assert?ext node (operand 0 is the asserted value, operand 1 a
	/// VTSDNode naming the asserted type).
	///
	/// Handles two patterns:
	///  - a directly repeated assertion with the same opcode and type, which is
	///    replaced by the inner assertion;
	///  - assert / truncate / assert with matching opcodes, which is rewritten
	///    as a single assertion on the narrower of the two asserted types,
	///    followed by the truncate.
	/// Returns an empty SDValue when neither pattern matches.
	SDValue DAGCombiner::visitAssertExt(SDNode *N) {
	  const unsigned AssertOpc = N->getOpcode();
	  SDValue Src = N->getOperand(0);
	  EVT OuterVT = cast<VTSDNode>(N->getOperand(1))->getVT();

	  // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
	  if (Src.getOpcode() == AssertOpc &&
	      OuterVT == cast<VTSDNode>(Src.getOperand(1))->getVT())
	    return Src;

	  // Everything below needs: assert (trunc (assert X, ...)), ... with the
	  // truncate having a single use and both asserts sharing an opcode.
	  const bool IsSandwich = Src.getOpcode() == ISD::TRUNCATE &&
	                          Src.hasOneUse() &&
	                          Src.getOperand(0).getOpcode() == AssertOpc;
	  if (!IsSandwich)
	    return SDValue();

	  // Merge the two asserts into one on the wide source value:
	  // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
	  // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
	  SDValue InnerAssert = Src.getOperand(0);
	  EVT InnerVT = cast<VTSDNode>(InnerAssert.getOperand(1))->getVT();
	  assert(InnerVT.bitsLE(Src.getValueType()) &&
	         "Asserting zero/sign-extended bits to a type larger than the "
	         "truncated destination does not provide information");

	  SDLoc DL(N);

	  // The smaller asserted type is the stronger statement; keep that one.
	  EVT NarrowestVT = InnerVT;
	  if (OuterVT.bitsLT(InnerVT))
	    NarrowestVT = OuterVT;

	  SDValue NarrowestVTVal = DAG.getValueType(NarrowestVT);
	  SDValue MergedAssert =
	      DAG.getNode(AssertOpc, DL, InnerAssert.getValueType(),
	                  InnerAssert.getOperand(0), NarrowestVTVal);
	  return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), MergedAssert);
	}

	/// If the result of a wider load is shifted to right of N bits and then
	/// truncated to a narrower type and where N is a multiple of number of bits of
	/// the narrower type, transform it to a narrower load from address + N / num of
	/// bits of new type. Also narrow the load if the result is masked with an AND
	/// to effectively produce a smaller type. If the result is to be extended, also
	/// fold the extension to form a extending load.
	///
	/// \p N is the node consuming the load. SIGN_EXTEND_INREG, SRL and AND get
	/// dedicated handling below; any other opcode is treated as narrowing
	/// operand 0 to N's own result type with a non-extending load.
	///
	/// Returns the narrowed load (shifted left again when a left shift was
	/// absorbed), or an empty SDValue when the transform does not apply. On
	/// success the users of the original load's chain result have already been
	/// redirected to the new load.
	SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
	unsigned Opc = N->getOpcode();

	ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT ExtVT = VT;

	// This transformation isn't valid for vector loads.
	if (VT.isVector())
	return SDValue();

	// Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
	// extended to VT.
	if (Opc == ISD::SIGN_EXTEND_INREG) {
	ExtType = ISD::SEXTLOAD;
	ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
	} else if (Opc == ISD::SRL) {
	// Another special-case: SRL is basically zero-extending a narrower value,
	// or it maybe shifting a higher subword, half or byte into the lowest
	// bits.
	ExtType = ISD::ZEXTLOAD;
	// Treat the SRL itself as N0 so the generic shift handling below sees it.
	N0 = SDValue(N, 0);

	auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));
	auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	if (!N01 \|\| !LN0)
	return SDValue();

	uint64_t ShiftAmt = N01->getZExtValue();
	uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits();
	// The narrow type is what remains of the memory width (or of VT) after
	// the shift.
	// NOTE(review): the else branch subtracts without checking that ShiftAmt is
	// smaller than the width of VT - confirm callers guarantee this.
	if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt)
	ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt);
	else
	ExtVT = EVT::getIntegerVT(*DAG.getContext(),
	VT.getSizeInBits() - ShiftAmt);
	} else if (Opc == ISD::AND) {
	// An AND with a constant mask is the same as a truncate + zero-extend.
	auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (!AndC \|\| !AndC->getAPIntValue().isMask())
	return SDValue();

	// The narrow type has as many bits as the mask has trailing ones.
	unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
	ExtType = ISD::ZEXTLOAD;
	ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
	}

	unsigned ShAmt = 0;
	if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
	if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
	ShAmt = N01->getZExtValue();
	unsigned EVTBits = ExtVT.getSizeInBits();
	// Is the shift amount a multiple of size of VT?
	// Note: the mask tests here are only true multiple-of checks when EVTBits
	// is a power of two.
	if ((ShAmt & (EVTBits-1)) == 0) {
	N0 = N0.getOperand(0);
	// Is the load width a multiple of size of VT?
	if ((N0.getValueSizeInBits() & (EVTBits-1)) != 0)
	return SDValue();
	}

	// At this point, we must have a load or else we can't do the transform.
	if (!isa<LoadSDNode>(N0)) return SDValue();

	// Because a SRL must be assumed to need to zero-extend the high bits
	// (as opposed to anyext the high bits), we can't combine the zextload
	// lowering of SRL and an sextload.
	if (cast<LoadSDNode>(N0)->getExtensionType() == ISD::SEXTLOAD)
	return SDValue();

	// If the shift amount is larger than the input type then we're not
	// accessing any of the loaded bytes. If the load was a zextload/extload
	// then the result of the shift+trunc is zero/undef (handled elsewhere).
	if (ShAmt >= cast<LoadSDNode>(N0)->getMemoryVT().getSizeInBits())
	return SDValue();
	}
	}

	// If the load is shifted left (and the result isn't shifted back right),
	// we can fold the truncate through the shift.
	unsigned ShLeftAmt = 0;
	if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
	ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
	if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
	ShLeftAmt = N01->getZExtValue();
	N0 = N0.getOperand(0);
	}
	}

	// If we haven't found a load, we can't narrow it.
	if (!isa<LoadSDNode>(N0))
	return SDValue();

	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	if (!isLegalNarrowLoad(LN0, ExtType, ExtVT, ShAmt))
	return SDValue();

	// For big endian targets, we need to adjust the offset to the pointer to
	// load the correct bytes.
	if (DAG.getDataLayout().isBigEndian()) {
	unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits();
	unsigned EVTStoreBits = ExtVT.getStoreSizeInBits();
	ShAmt = LVTStoreBits - EVTStoreBits - ShAmt;
	}

	EVT PtrType = N0.getOperand(1).getValueType();
	// ShAmt is in bits; the pointer offset is in bytes.
	uint64_t PtrOff = ShAmt / 8;
	unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
	SDLoc DL(LN0);
	// The original load itself didn't wrap, so an offset within it doesn't.
	SDNodeFlags Flags;
	Flags.setNoUnsignedWrap(true);
	SDValue NewPtr = DAG.getNode(ISD::ADD, DL,
	PtrType, LN0->getBasePtr(),
	DAG.getConstant(PtrOff, DL, PtrType),
	Flags);
	AddToWorklist(NewPtr.getNode());

	// Build the narrow load: a plain load of VT when no extension is needed,
	// otherwise an extending load of ExtVT producing VT.
	SDValue Load;
	if (ExtType == ISD::NON_EXTLOAD)
	Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr,
	LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
	LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
	else
	Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(), NewPtr,
	LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
	NewAlign, LN0->getMemOperand()->getFlags(),
	LN0->getAAInfo());

	// Replace the old load's chain with the new load's chain.
	WorklistRemover DeadNodes(*this);
	DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));

	// Shift the result left, if we've swallowed a left shift.
	SDValue Result = Load;
	if (ShLeftAmt != 0) {
	EVT ShImmTy = getShiftAmountTy(Result.getValueType());
	// Fall back to VT for the shift-amount constant if it does not fit.
	if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt))
	ShImmTy = VT;
	// If the shift amount is as large as the result size (but, presumably,
	// no larger than the source) then the useful bits of the result are
	// zero; we can't simply return the shortened shift, because the result
	// of that operation is undefined.
	SDLoc DL(N0);
	if (ShLeftAmt >= VT.getSizeInBits())
	Result = DAG.getConstant(0, DL, VT);
	else
	Result = DAG.getNode(ISD::SHL, DL, VT,
	Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
	}

	// Return the new loaded value.
	return Result;
	}

	/// Combine a SIGN_EXTEND_INREG node.
	///
	/// Operand 0 is the value being extended; operand 1 is a VTSDNode naming the
	/// narrow type (ExtVT below) whose top bit is replicated into the upper bits
	/// of the result.
	///
	/// The folds are attempted in order and the first one that applies wins.
	/// Returns the replacement value, SDValue(N, 0) when the DAG was already
	/// updated in place (SimplifyDemandedBits / CombineTo), or an empty SDValue
	/// when nothing applied.
	SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  // Deliberately not called "EVT": that would hide the type name for the
	  // rest of the function.
	  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
	  unsigned VTBits = VT.getScalarSizeInBits();
	  unsigned ExtVTBits = ExtVT.getScalarSizeInBits();

	  if (N0.isUndef())
	    return DAG.getUNDEF(VT);

	  // fold (sext_in_reg c1) -> c1
	  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
	    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);

	  // If the input is already sign extended, just drop the extension.
	  if (DAG.ComputeNumSignBits(N0) >= VTBits - ExtVTBits + 1)
	    return N0;

	  // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT)
	  // Only the case where the outer type is the narrower one is handled here.
	  if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
	      ExtVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
	    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
	                       N0.getOperand(0), N1);

	  // fold (sext_in_reg (sext x)) -> (sext x)
	  // fold (sext_in_reg (aext x)) -> (sext x)
	  // if x is small enough.
	  if (N0.getOpcode() == ISD::SIGN_EXTEND ||
	      N0.getOpcode() == ISD::ANY_EXTEND) {
	    SDValue N00 = N0.getOperand(0);
	    if (N00.getScalarValueSizeInBits() <= ExtVTBits &&
	        (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
	      // SIGN_EXTEND takes a single operand; the value-type operand (N1) of
	      // the sext_in_reg must not be forwarded to it.
	      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
	  }

	  // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_in_reg x)
	  if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
	       N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
	       N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
	      N0.getOperand(0).getScalarValueSizeInBits() == ExtVTBits) {
	    if (!LegalOperations ||
	        TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
	      return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT);
	  }

	  // fold (sext_in_reg (zext x)) -> (sext x)
	  // iff we are extending the source sign bit.
	  if (N0.getOpcode() == ISD::ZERO_EXTEND) {
	    SDValue N00 = N0.getOperand(0);
	    if (N00.getScalarValueSizeInBits() == ExtVTBits &&
	        (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
	      // As above: build a proper single-operand SIGN_EXTEND.
	      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
	  }

	  // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
	  if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1)))
	    return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT.getScalarType());

	  // fold operands of sext_in_reg based on knowledge that the top bits are not
	  // demanded.
	  if (SimplifyDemandedBits(SDValue(N, 0)))
	    return SDValue(N, 0);

	  // fold (sext_in_reg (load x)) -> (smaller sextload x)
	  // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
	  if (SDValue NarrowLoad = ReduceLoadWidth(N))
	    return NarrowLoad;

	  // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
	  // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
	  // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
	  if (N0.getOpcode() == ISD::SRL) {
	    if (ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
	      if (ShAmt->getZExtValue() + ExtVTBits <= VTBits) {
	        // We can turn this into an SRA iff the input to the SRL is already
	        // sign extended enough.
	        unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
	        if (VTBits - (ShAmt->getZExtValue() + ExtVTBits) < InSignBits)
	          return DAG.getNode(ISD::SRA, SDLoc(N), VT,
	                             N0.getOperand(0), N0.getOperand(1));
	      }
	  }

	  // fold (sext_inreg (extload x)) -> (sextload x)
	  // If sextload is not supported by target, we can only do the combine when
	  // load has one use. Doing otherwise can block folding the extload with other
	  // extends that the target does support.
	  if (ISD::isEXTLoad(N0.getNode()) &&
	      ISD::isUNINDEXEDLoad(N0.getNode()) &&
	      ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
	      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile() &&
	        N0.hasOneUse()) ||
	       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
	    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
	                                     LN0->getChain(),
	                                     LN0->getBasePtr(), ExtVT,
	                                     LN0->getMemOperand());
	    // Replace both N and the old load (value and chain) with the sextload.
	    CombineTo(N, ExtLoad);
	    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
	    AddToWorklist(ExtLoad.getNode());
	    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
	  }

	  // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
	  if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
	      N0.hasOneUse() &&
	      ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
	      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
	       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
	    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
	                                     LN0->getChain(),
	                                     LN0->getBasePtr(), ExtVT,
	                                     LN0->getMemOperand());
	    CombineTo(N, ExtLoad);
	    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
	    return SDValue(N, 0);   // Return N so it doesn't get rechecked!
	  }

	  // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
	  if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
	    if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
	                                           N0.getOperand(1), false))
	      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
	                         BSwap, N1);
	  }

	  return SDValue();
	}

	/// Combine a SIGN_EXTEND_VECTOR_INREG node.
	///
	/// An undef input yields an undef result of the node's type. Otherwise the
	/// node is handed to tryToFoldExtendOfConstant and its result, if any, is
	/// returned; an empty SDValue means no fold applied.
	SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
	  EVT ResultVT = N->getValueType(0);

	  // Extending undef is still undef.
	  if (N->getOperand(0).isUndef())
	    return DAG.getUNDEF(ResultVT);

	  SDNode *Folded =
	      tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations);
	  return Folded ? SDValue(Folded, 0) : SDValue();
	}

	/// Combine a ZERO_EXTEND_VECTOR_INREG node.
	///
	/// An undef input yields an undef result of the node's type. Otherwise the
	/// node is handed to tryToFoldExtendOfConstant and its result, if any, is
	/// returned; an empty SDValue means no fold applied.
	SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
	  EVT ResultVT = N->getValueType(0);

	  // Extending undef is still undef.
	  if (N->getOperand(0).isUndef())
	    return DAG.getUNDEF(ResultVT);

	  SDNode *Folded =
	      tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations);
	  return Folded ? SDValue(Folded, 0) : SDValue();
	}

	SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	bool isLE = DAG.getDataLayout().isLittleEndian();

	// noop truncate
	if (N0.getValueType() == N->getValueType(0))
	return N0;

	// fold (truncate (truncate x)) -> (truncate x)
	if (N0.getOpcode() == ISD::TRUNCATE)
	return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));

	// fold (truncate c1) -> c1
	if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
	SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
	if (C.getNode() != N)
	return C;
	}

	// fold (truncate (ext x)) -> (ext x) or (truncate x) or x
	if (N0.getOpcode() == ISD::ZERO_EXTEND \|\|
	N0.getOpcode() == ISD::SIGN_EXTEND \|\|
	N0.getOpcode() == ISD::ANY_EXTEND) {
	// if the source is smaller than the dest, we still need an extend.
	if (N0.getOperand(0).getValueType().bitsLT(VT))
	return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
	// if the source is larger than the dest, than we just need the truncate.
	if (N0.getOperand(0).getValueType().bitsGT(VT))
	return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
	// if the source and dest are the same type, we can drop both the extend
	// and the truncate.
	return N0.getOperand(0);
	}

	// If this is anyext(trunc), don't fold it, allow ourselves to be folded.
	if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))
	return SDValue();

	// Fold extract-and-trunc into a narrow extract. For example:
	// i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
	// i32 y = TRUNCATE(i64 x)
	// -- becomes --
	// v16i8 b = BITCAST (v2i64 val)
	// i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8)
	//
	// Note: We only run this optimization after type legalization (which often
	// creates this pattern) and before operation legalization after which
	// we need to be more careful about the vector instructions that we generate.
	if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
	EVT VecTy = N0.getOperand(0).getValueType();
	EVT ExTy = N0.getValueType();
	EVT TrTy = N->getValueType(0);

	unsigned NumElem = VecTy.getVectorNumElements();
	unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();

	EVT NVT = EVT::getVectorVT(DAG.getContext(), TrTy, SizeRatio NumElem);
	assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");

	SDValue EltNo = N0->getOperand(1);
	if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
	int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
	EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
	int Index = isLE ? (EltSizeRatio) : (EltSizeRatio + (SizeRatio-1));

	SDLoc DL(N);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
	DAG.getBitcast(NVT, N0.getOperand(0)),
	DAG.getConstant(Index, DL, IndexTy));
	}
	}

	// trunc (select c, a, b) -> select c, (trunc a), (trunc b)
	if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
	EVT SrcVT = N0.getValueType();
	if ((!LegalOperations \|\| TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
	TLI.isTruncateFree(SrcVT, VT)) {
	SDLoc SL(N0);
	SDValue Cond = N0.getOperand(0);
	SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
	SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
	return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
	}
	}

	// trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
	if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
	(!LegalOperations \|\| TLI.isOperationLegalOrCustom(ISD::SHL, VT)) &&
	TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
	SDValue Amt = N0.getOperand(1);
	KnownBits Known;
	DAG.computeKnownBits(Amt, Known);
	unsigned Size = VT.getScalarSizeInBits();
	if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) {
	SDLoc SL(N);
	EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());

	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
	if (AmtVT != Amt.getValueType()) {
	Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
	AddToWorklist(Amt.getNode());
	}
	return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
	}
	}

	// Fold a series of buildvector, bitcast, and truncate if possible.
	// For example fold
	// (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
	// (2xi32 (buildvector x, y)).
	if (Level == AfterLegalizeVectorOps && VT.isVector() &&
	N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
	N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
	N0.getOperand(0).hasOneUse()) {
	SDValue BuildVect = N0.getOperand(0);
	EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
	EVT TruncVecEltTy = VT.getVectorElementType();

	// Check that the element types match.
	if (BuildVectEltTy == TruncVecEltTy) {
	// Now we only need to compute the offset of the truncated elements.
	unsigned BuildVecNumElts = BuildVect.getNumOperands();
	unsigned TruncVecNumElts = VT.getVectorNumElements();
	unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts;

	assert((BuildVecNumElts % TruncVecNumElts) == 0 &&
	"Invalid number of elements");

	SmallVector<SDValue, 8> Opnds;
	for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
	Opnds.push_back(BuildVect.getOperand(i));

	return DAG.getBuildVector(VT, SDLoc(N), Opnds);
	}
	}

	// See if we can simplify the input to this truncate through knowledge that
	// only the low bits are being used.
	// For example "trunc (or (shl x, 8), y)" // -> trunc y
	// Currently we only perform this optimization on scalars because vectors
	// may have different active low bits.
	if (!VT.isVector()) {
	APInt Mask =
	APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits());
	if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask))
	return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
	}

	// fold (truncate (load x)) -> (smaller load x)
	// fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
	if (!LegalTypes \|\| TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
	if (SDValue Reduced = ReduceLoadWidth(N))
	return Reduced;

	// Handle the case where the load remains an extending load even
	// after truncation.
	if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	if (!LN0->isVolatile() &&
	LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) {
	SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
	VT, LN0->getChain(), LN0->getBasePtr(),
	LN0->getMemoryVT(),
	LN0->getMemOperand());
	DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));
	return NewLoad;
	}
	}
	}

	// fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
	// where ... are all 'undef'.
	if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
	SmallVector<EVT, 8> VTs;
	SDValue V;
	unsigned Idx = 0;
	unsigned NumDefs = 0;

	for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
	SDValue X = N0.getOperand(i);
	if (!X.isUndef()) {
	V = X;
	Idx = i;
	NumDefs++;
	}
	// Stop if more than one members are non-undef.
	if (NumDefs > 1)
	break;
	VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
	VT.getVectorElementType(),
	X.getValueType().getVectorNumElements()));
	}

	if (NumDefs == 0)
	return DAG.getUNDEF(VT);

	if (NumDefs == 1) {
	assert(V.getNode() && "The single defined operand is empty!");
	SmallVector<SDValue, 8> Opnds;
	for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
	if (i != Idx) {
	Opnds.push_back(DAG.getUNDEF(VTs[i]));
	continue;
	}
	SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
	AddToWorklist(NV.getNode());
	Opnds.push_back(NV);
	}
	return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
	}
	}

	// Fold truncate of a bitcast of a vector to an extract of the low vector
	// element.
	//
	// e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
	if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
	SDValue VecSrc = N0.getOperand(0);
	EVT SrcVT = VecSrc.getValueType();
	if (SrcVT.isVector() && SrcVT.getScalarType() == VT &&
	(!LegalOperations \|\|
	TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, SrcVT))) {
	SDLoc SL(N);

	EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
	unsigned Idx = isLE ? 0 : SrcVT.getVectorNumElements() - 1;
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT,
	VecSrc, DAG.getConstant(Idx, SL, IdxVT));
	}
	}

	// Simplify the operands using demanded-bits information.
	if (!VT.isVector() &&
	SimplifyDemandedBits(SDValue(N, 0)))
	return SDValue(N, 0);

	// (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
	// (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry)
	// When the adde's carry is not used.
	if ((N0.getOpcode() == ISD::ADDE \|\| N0.getOpcode() == ISD::ADDCARRY) &&
	N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) &&
	(!LegalOperations \|\| TLI.isOperationLegal(N0.getOpcode(), VT))) {
	SDLoc SL(N);
	auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
	auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
	auto VTs = DAG.getVTList(VT, N0->getValueType(1));
	return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
	}

	if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
	return NewVSel;

	return SDValue();
	}

	/// Return the node that produces operand \p i of \p N.
	///
	/// When that operand is a MERGE_VALUES node, look through it and return the
	/// node of the merged operand selected by the result number in use.
	static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
	  SDValue Elt = N->getOperand(i);
	  if (Elt.getOpcode() != ISD::MERGE_VALUES)
	    return Elt.getNode();
	  // Result number R of a MERGE_VALUES is read from its operand R.
	  return Elt.getOperand(Elt.getResNo()).getNode();
	}

	/// build_pair (load, load) -> load
	/// if load locations are consecutive.
	///
	/// \p N must be a BUILD_PAIR node. \p VT is the value type of the single
	/// load that is created. Returns an empty SDValue when the two operands are
	/// not suitable loads, or when the wider load would need more alignment than
	/// the first load provides or is not legal.
	SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
	  assert(N->getOpcode() == ISD::BUILD_PAIR);

	  LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
	  LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));

	  // A BUILD_PAIR is always having the least significant part in elt 0 and the
	  // most significant part in elt 1. So when combining into one large load, we
	  // need to consider the endianness.
	  if (DAG.getDataLayout().isBigEndian())
	    std::swap(LD1, LD2);

	  // Both operands must be loads in the same address space, and the first one
	  // must be a single-use, non-extending load.
	  if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
	      LD1->getAddressSpace() != LD2->getAddressSpace())
	    return SDValue();

	  EVT LD1VT = LD1->getValueType(0);
	  unsigned LD1Bytes = LD1VT.getStoreSize();
	  if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
	      DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
	    unsigned Align = LD1->getAlignment();
	    unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
	        VT.getTypeForEVT(*DAG.getContext()));

	    // Only merge when the first load is already aligned well enough for the
	    // wider type and, after operation legalization, a load of VT is legal.
	    if (NewAlign <= Align &&
	        (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
	      return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
	                         LD1->getPointerInfo(), Align);
	  }

	  return SDValue();
	}

	/// Pick the element index used to extract the "Hi" half of a ppcf128 value
	/// that has been bitcast to i128: 1 on big-endian data layouts, 0 otherwise.
	static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
	  // Bitcasting ppcf128 to i128 swaps the Hi and Lo parts on little-endian
	  // machines only, so the selector depends on the endianness.
	  if (DAG.getDataLayout().isBigEndian())
	    return 1;
	  return 0;
	}

	/// Turn a sign-bit integer logic op that sits between two bitcasts into the
	/// matching FP node:
	///   (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
	///   (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
	///
	/// \p N is the outer bitcast. Returns an empty SDValue when the result type
	/// is not FP, the target lacks bit-preserving FP logic for it, the source is
	/// a vector, or the pattern does not match.
	static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
	                                    const TargetLowering &TLI) {
	  // If this is not a bitcast to an FP type or if the target doesn't have
	  // IEEE754-compliant FP logic, we're done.
	  EVT VT = N->getValueType(0);
	  if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))
	    return SDValue();

	  // TODO: Use splat values for the constant-checking below and remove this
	  // restriction.
	  SDValue N0 = N->getOperand(0);
	  EVT SourceVT = N0.getValueType();
	  if (SourceVT.isVector())
	    return SDValue();

	  // Select the FP opcode and the integer constant that the logic op must use.
	  unsigned FPOpcode;
	  APInt SignMask;
	  switch (N0.getOpcode()) {
	  case ISD::AND:
	    FPOpcode = ISD::FABS;
	    SignMask = ~APInt::getSignMask(SourceVT.getSizeInBits());
	    break;
	  case ISD::XOR:
	    FPOpcode = ISD::FNEG;
	    SignMask = APInt::getSignMask(SourceVT.getSizeInBits());
	    break;
	  // TODO: ISD::OR --> ISD::FNABS?
	  default:
	    return SDValue();
	  }

	  // The logic op must combine a bitcast from the result FP type with exactly
	  // the expected sign-mask constant.
	  SDValue LogicOp0 = N0.getOperand(0);
	  ConstantSDNode *LogicOp1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	  if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
	      LogicOp0.getOpcode() == ISD::BITCAST &&
	      LogicOp0->getOperand(0).getValueType() == VT)
	    return DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0->getOperand(0));

	  return SDValue();
	}

	/// Try to simplify the BITCAST node \p N.
	///
	/// The folds below are tried in source order and the first one that applies
	/// returns its replacement value. An empty SDValue is returned when nothing
	/// applies.
	SDValue DAGCombiner::visitBITCAST(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);

	// A bitcast of undef is undef of the result type.
	if (N0.isUndef())
	return DAG.getUNDEF(VT);

	// If the input is a BUILD_VECTOR with all constant elements, fold this now.
	// Only do this before legalize, since afterward the target may be depending
	// on the bitconvert.
	// First check to see if this is all constant.
	if (!LegalTypes &&
	N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
	VT.isVector()) {
	bool isSimple = cast<BuildVectorSDNode>(N0)->isConstant();

	EVT DestEltVT = N->getValueType(0).getVectorElementType();
	assert(!DestEltVT.isVector() &&
	"Element type of vector ValueType must not be vector!");
	if (isSimple)
	return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(), DestEltVT);
	}

	// If the input is a constant, let getNode fold it.
	if (isa<ConstantSDNode>(N0) \|\| isa<ConstantFPSDNode>(N0)) {
	// If we can't allow illegal operations, we need to check that this is just
	// a fp -> int or int -> conversion and that the resulting operation will
	// be legal.
	if (!LegalOperations \|\|
	(isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
	TLI.isOperationLegal(ISD::ConstantFP, VT)) \|\|
	(isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
	TLI.isOperationLegal(ISD::Constant, VT)))
	return DAG.getBitcast(VT, N0);
	}

	// (conv (conv x, t1), t2) -> (conv x, t2)
	if (N0.getOpcode() == ISD::BITCAST)
	return DAG.getBitcast(VT, N0.getOperand(0));

	// fold (conv (load x)) -> (load (conv*)x)
	// If the resultant load doesn't need a higher alignment than the original!
	if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
	// Do not change the width of a volatile load.
	!cast<LoadSDNode>(N0)->isVolatile() &&
	// Do not remove the cast if the types differ in endian layout.
	TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
	TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
	(!LegalOperations \|\| TLI.isOperationLegal(ISD::LOAD, VT)) &&
	TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	unsigned OrigAlign = LN0->getAlignment();

	// Only fold when the target accepts a VT access at the original alignment
	// and also reports it as fast.
	bool Fast = false;
	if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
	LN0->getAddressSpace(), OrigAlign, &Fast) &&
	Fast) {
	SDValue Load =
	DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
	LN0->getPointerInfo(), OrigAlign,
	LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
	// Redirect users of the old load's result #1 to result #1 of the new load.
	DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
	return Load;
	}
	}

	if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
	return V;

	// fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
	// fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
	//
	// For ppc_fp128:
	// fold (bitcast (fneg x)) ->
	// flipbit = signbit
	// (xor (bitcast x) (build_pair flipbit, flipbit))
	//
	// fold (bitcast (fabs x)) ->
	// flipbit = (and (extract_element (bitcast x), 0), signbit)
	// (xor (bitcast x) (build_pair flipbit, flipbit))
	// This often reduces constant pool loads.
	if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) \|\|
	(N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
	N0.getNode()->hasOneUse() && VT.isInteger() &&
	!VT.isVector() && !N0.getValueType().isVector()) {
	SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
	AddToWorklist(NewConv.getNode());

	SDLoc DL(N);
	// ppcf128 path: the same 64-bit flip mask is applied to both halves.
	if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
	assert(VT.getSizeInBits() == 128);
	SDValue SignBit = DAG.getConstant(
	APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
	SDValue FlipBit;
	if (N0.getOpcode() == ISD::FNEG) {
	FlipBit = SignBit;
	AddToWorklist(FlipBit.getNode());
	} else {
	assert(N0.getOpcode() == ISD::FABS);
	SDValue Hi =
	DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv,
	DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
	SDLoc(NewConv)));
	AddToWorklist(Hi.getNode());
	FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit);
	AddToWorklist(FlipBit.getNode());
	}
	SDValue FlipBits =
	DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
	AddToWorklist(FlipBits.getNode());
	return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits);
	}
	// Generic path: flip (fneg) or clear (fabs) the top bit of the integer.
	APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
	if (N0.getOpcode() == ISD::FNEG)
	return DAG.getNode(ISD::XOR, DL, VT,
	NewConv, DAG.getConstant(SignBit, DL, VT));
	assert(N0.getOpcode() == ISD::FABS);
	return DAG.getNode(ISD::AND, DL, VT,
	NewConv, DAG.getConstant(~SignBit, DL, VT));
	}

	// fold (bitconvert (fcopysign cst, x)) ->
	// (or (and (bitconvert x), sign), (and cst, (not sign)))
	// Note that we don't handle (copysign x, cst) because this can always be
	// folded to an fneg or fabs.
	//
	// For ppc_fp128:
	// fold (bitcast (fcopysign cst, x)) ->
	// flipbit = (and (extract_element
	// (xor (bitcast cst), (bitcast x)), 0),
	// signbit)
	// (xor (bitcast cst) (build_pair flipbit, flipbit))
	if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() &&
	isa<ConstantFPSDNode>(N0.getOperand(0)) &&
	VT.isInteger() && !VT.isVector()) {
	unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits();
	EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
	if (isTypeLegal(IntXVT)) {
	SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1));
	AddToWorklist(X.getNode());

	// If X has a different width than the result/lhs, sext it or truncate it.
	unsigned VTWidth = VT.getSizeInBits();
	if (OrigXWidth < VTWidth) {
	X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
	AddToWorklist(X.getNode());
	} else if (OrigXWidth > VTWidth) {
	// To get the sign bit in the right place, we have to shift it right
	// before truncating.
	SDLoc DL(X);
	X = DAG.getNode(ISD::SRL, DL,
	X.getValueType(), X,
	DAG.getConstant(OrigXWidth-VTWidth, DL,
	X.getValueType()));
	AddToWorklist(X.getNode());
	X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
	AddToWorklist(X.getNode());
	}

	if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
	APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2);
	SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
	AddToWorklist(Cst.getNode());
	// Note: this X shadows the outer X and is a fresh bitcast of the sign
	// source to VT.
	SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
	AddToWorklist(X.getNode());
	SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X);
	AddToWorklist(XorResult.getNode());
	SDValue XorResult64 = DAG.getNode(
	ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult,
	DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
	SDLoc(XorResult)));
	AddToWorklist(XorResult64.getNode());
	SDValue FlipBit =
	DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64,
	DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64));
	AddToWorklist(FlipBit.getNode());
	SDValue FlipBits =
	DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
	AddToWorklist(FlipBits.getNode());
	return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits);
	}
	// Generic path: keep only the sign bit of X and everything but the sign
	// bit of the constant, then OR the two together.
	APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
	X = DAG.getNode(ISD::AND, SDLoc(X), VT,
	X, DAG.getConstant(SignBit, SDLoc(X), VT));
	AddToWorklist(X.getNode());

	SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
	Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
	Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT));
	AddToWorklist(Cst.getNode());

	return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
	}
	}

	// bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
	if (N0.getOpcode() == ISD::BUILD_PAIR)
	if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
	return CombineLD;

	// Remove double bitcasts from shuffles - this is often a legacy of
	// XformToShuffleWithZero being used to combine bitmaskings (of
	// float vectors bitcast to integer vectors) into shuffles.
	// bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1)
	if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() &&
	N0->getOpcode() == ISD::VECTOR_SHUFFLE &&
	VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() &&
	!(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) {
	ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);

	// If operands are a bitcast, peek through if it casts the original VT.
	// If operands are a constant, just bitcast back to original VT.
	// Any other operand yields an empty SDValue, which aborts the fold below.
	auto PeekThroughBitcast = [&](SDValue Op) {
	if (Op.getOpcode() == ISD::BITCAST &&
	Op.getOperand(0).getValueType() == VT)
	return SDValue(Op.getOperand(0));
	if (Op.isUndef() \|\| ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) \|\|
	ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
	return DAG.getBitcast(VT, Op);
	return SDValue();
	};

	// FIXME: If either input vector is bitcast, try to convert the shuffle to
	// the result type of this bitcast. This would eliminate at least one
	// bitcast. See the transform in InstCombine.
	SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
	SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
	if (!(SV0 && SV1))
	return SDValue();

	// Each original mask entry M expands to MaskScale consecutive entries
	// starting at M * MaskScale; negative (undef) entries expand to -1.
	int MaskScale =
	VT.getVectorNumElements() / N0.getValueType().getVectorNumElements();
	SmallVector<int, 8> NewMask;
	for (int M : SVN->getMask())
	for (int i = 0; i != MaskScale; ++i)
	NewMask.push_back(M < 0 ? -1 : M * MaskScale + i);

	// If the scaled mask is not legal, retry once with the operands swapped
	// and the mask commuted.
	bool LegalMask = TLI.isShuffleMaskLegal(NewMask, VT);
	if (!LegalMask) {
	std::swap(SV0, SV1);
	ShuffleVectorSDNode::commuteMask(NewMask);
	LegalMask = TLI.isShuffleMaskLegal(NewMask, VT);
	}

	if (LegalMask)
	return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask);
	}

	return SDValue();
	}

	/// Combine a BUILD_PAIR node: the only fold attempted is merging two
	/// consecutive loads into one load of the pair's own value type.
	SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
	  return CombineConsecutiveLoads(N, N->getValueType(0));
	}

	/// We know that BV is a build_vector node with Constant, ConstantFP or Undef
	/// operands. DstEltVT indicates the destination element value type.
	///
	/// Returns a vector value whose elements have type DstEltVT. When the source
	/// elements already have that type, BV itself is returned. FP element types
	/// of a different width are handled by recursing through same-width integer
	/// element types first.
	SDValue DAGCombiner::
	ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
	EVT SrcEltVT = BV->getValueType(0).getVectorElementType();

	// If this is already the right type, we're done.
	if (SrcEltVT == DstEltVT) return SDValue(BV, 0);

	unsigned SrcBitSize = SrcEltVT.getSizeInBits();
	unsigned DstBitSize = DstEltVT.getSizeInBits();

	// If this is a conversion of N elements of one type to N elements of another
	// type, convert each element. This handles FP<->INT cases.
	if (SrcBitSize == DstBitSize) {
	EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
	BV->getValueType(0).getVectorNumElements());

	// Due to the FP element handling below calling this routine recursively,
	// we can end up with a scalar-to-vector node here.
	if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR)
	return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BV), VT,
	DAG.getBitcast(DstEltVT, BV->getOperand(0)));

	SmallVector<SDValue, 8> Ops;
	for (SDValue Op : BV->op_values()) {
	// If the vector element type is not legal, the BUILD_VECTOR operands
	// are promoted and implicitly truncated. Make that explicit here.
	if (Op.getValueType() != SrcEltVT)
	Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
	Ops.push_back(DAG.getBitcast(DstEltVT, Op));
	AddToWorklist(Ops.back().getNode());
	}
	return DAG.getBuildVector(VT, SDLoc(BV), Ops);
	}

	// Otherwise, we're growing or shrinking the elements. To avoid having to
	// handle annoying details of growing/shrinking FP values, we convert them to
	// int first.
	if (SrcEltVT.isFloatingPoint()) {
	// Convert the input float vector to a int vector where the elements are the
	// same sizes.
	EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
	BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
	SrcEltVT = IntVT;
	}

	// Now we know the input is an integer vector. If the output is a FP type,
	// convert to integer first, then to FP of the right size.
	if (DstEltVT.isFloatingPoint()) {
	EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
	SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();

	// Next, convert to FP elements of the same size.
	return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
	}

	SDLoc DL(BV);

	// Okay, we know the src/dst types are both integers of differing types.
	// Handling growing first.
	assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
	if (SrcBitSize < DstBitSize) {
	unsigned NumInputsPerOutput = DstBitSize/SrcBitSize;

	SmallVector<SDValue, 8> Ops;
	// Walk the inputs in groups of NumInputsPerOutput; each group is packed
	// into one wider constant.
	for (unsigned i = 0, e = BV->getNumOperands(); i != e;
	i += NumInputsPerOutput) {
	bool isLE = DAG.getDataLayout().isLittleEndian();
	APInt NewBits = APInt(DstBitSize, 0);
	bool EltIsUndef = true;
	// Pieces are visited from most to least significant: on little-endian the
	// highest-indexed input of the group comes first, on big-endian the
	// lowest-indexed one. Undef pieces contribute zero bits.
	for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
	// Shift the previously computed bits over.
	NewBits <<= SrcBitSize;
	SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
	if (Op.isUndef()) continue;
	EltIsUndef = false;

	NewBits \|= cast<ConstantSDNode>(Op)->getAPIntValue().
	zextOrTrunc(SrcBitSize).zext(DstBitSize);
	}

	// The output element is undef only if every piece of the group was undef.
	if (EltIsUndef)
	Ops.push_back(DAG.getUNDEF(DstEltVT));
	else
	Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT));
	}

	EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
	return DAG.getBuildVector(VT, DL, Ops);
	}

	// Finally, this must be the case where we are shrinking elements: each input
	// turns into multiple outputs.
	unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
	EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
	NumOutputsPerInput*BV->getNumOperands());
	SmallVector<SDValue, 8> Ops;

	for (const SDValue &Op : BV->op_values()) {
	// An undef input becomes NumOutputsPerInput undef outputs.
	if (Op.isUndef()) {
	Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT));
	continue;
	}

	APInt OpVal = cast<ConstantSDNode>(Op)->
	getAPIntValue().zextOrTrunc(SrcBitSize);

	// Emit the pieces starting with the low DstBitSize bits and shifting right
	// after each one.
	for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
	APInt ThisVal = OpVal.trunc(DstBitSize);
	Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT));
	OpVal.lshrInPlace(DstBitSize);
	}

	// For big endian targets, swap the order of the pieces of each element.
	if (DAG.getDataLayout().isBigEndian())
	std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
	}

	return DAG.getBuildVector(VT, DL, Ops);
	}

	/// Return true if the flags on \p N permit fusing it into an FMA-style node,
	/// i.e. either the allow-contract or the unsafe-algebra flag is set.
	static bool isContractable(SDNode *N) {
	  SDNodeFlags F = N->getFlags();
	  return F.hasAllowContract() || F.hasUnsafeAlgebra();
	}

	/// Try to perform FMA combining on a given FADD node.
	///
	/// Returns the fused FMAD/FMA node for the first pattern below that matches,
	/// or an empty SDValue when fusion is not available, not permitted, or no
	/// pattern matches.
	SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);
	SDLoc SL(N);

	const TargetOptions &Options = DAG.getTarget().Options;

	// Floating-point multiply-add with intermediate rounding.
	bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));

	// Floating-point multiply-add without intermediate rounding.
	bool HasFMA =
	TLI.isFMAFasterThanFMulAndFAdd(VT) &&
	(!LegalOperations \|\| TLI.isOperationLegalOrCustom(ISD::FMA, VT));

	// No valid opcode, do not combine.
	if (!HasFMAD && !HasFMA)
	return SDValue();

	// Fusion is allowed for every node when FP op fusion is set to Fast, unsafe
	// FP math is enabled, or FMAD is available.
	bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast \|\|
	Options.UnsafeFPMath \|\| HasFMAD);
	// If the addition is not contractable, do not combine.
	if (!AllowFusionGlobally && !isContractable(N))
	return SDValue();

	// No FMA is formed here when the subtarget's SelectionDAG info hook returns
	// true for this optimization level.
	const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
	if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
	return SDValue();

	// Always prefer FMAD to FMA for precision.
	unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
	bool Aggressive = TLI.enableAggressiveFMAFusion(VT);

	// Is the node an FMUL and contractable either due to global flags or
	// SDNodeFlags.
	auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
	if (N.getOpcode() != ISD::FMUL)
	return false;
	return AllowFusionGlobally \|\| isContractable(N.getNode());
	};
	// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
	// prefer to fold the multiply with fewer uses.
	if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
	if (N0.getNode()->use_size() > N1.getNode()->use_size())
	std::swap(N0, N1);
	}

	// fold (fadd (fmul x, y), z) -> (fma x, y, z)
	// Unless the target is aggressive, the multiply must have a single use.
	if (isContractableFMUL(N0) && (Aggressive \|\| N0->hasOneUse())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	N0.getOperand(0), N0.getOperand(1), N1);
	}

	// fold (fadd x, (fmul y, z)) -> (fma y, z, x)
	// Note: Commutes FADD operands.
	if (isContractableFMUL(N1) && (Aggressive \|\| N1->hasOneUse())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	N1.getOperand(0), N1.getOperand(1), N0);
	}

	// Look through FP_EXTEND nodes to do more combining.

	// fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
	if (N0.getOpcode() == ISD::FP_EXTEND) {
	SDValue N00 = N0.getOperand(0);
	if (isContractableFMUL(N00) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N00.getOperand(0)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N00.getOperand(1)), N1);
	}
	}

	// fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
	// Note: Commutes FADD operands.
	if (N1.getOpcode() == ISD::FP_EXTEND) {
	SDValue N10 = N1.getOperand(0);
	if (isContractableFMUL(N10) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N10.getOperand(0)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N10.getOperand(1)), N0);
	}
	}

	// More folding opportunities when target permits.
	if (Aggressive) {
	// fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z))
	// FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
	// are currently only supported on binary nodes.
	if (Options.UnsafeFPMath &&
	N0.getOpcode() == PreferredFusedOpcode &&
	N0.getOperand(2).getOpcode() == ISD::FMUL &&
	N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	N0.getOperand(0), N0.getOperand(1),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	N0.getOperand(2).getOperand(0),
	N0.getOperand(2).getOperand(1),
	N1));
	}

	// fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x))
	// FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
	// are currently only supported on binary nodes.
	if (Options.UnsafeFPMath &&
	N1->getOpcode() == PreferredFusedOpcode &&
	N1.getOperand(2).getOpcode() == ISD::FMUL &&
	N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	N1.getOperand(0), N1.getOperand(1),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	N1.getOperand(2).getOperand(0),
	N1.getOperand(2).getOperand(1),
	N0));
	}


	// fold (fadd (fma x, y, (fpext (fmul u, v))), z)
	// -> (fma x, y, (fma (fpext u), (fpext v), z))
	// Helper shared by this fold and its commuted form further down.
	auto FoldFAddFMAFPExtFMul = [&] (
	SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
	DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
	Z));
	};
	if (N0.getOpcode() == PreferredFusedOpcode) {
	SDValue N02 = N0.getOperand(2);
	if (N02.getOpcode() == ISD::FP_EXTEND) {
	SDValue N020 = N02.getOperand(0);
	if (isContractableFMUL(N020) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) {
	return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
	N020.getOperand(0), N020.getOperand(1),
	N1);
	}
	}
	}

	// fold (fadd (fpext (fma x, y, (fmul u, v))), z)
	// -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
	// FIXME: This turns two single-precision and one double-precision
	// operation into two double-precision operations, which might not be
	// interesting for all targets, especially GPUs.
	// Helper shared by this fold and its commuted form further down.
	auto FoldFAddFPExtFMAFMul = [&] (
	SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
	DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
	DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
	Z));
	};
	if (N0.getOpcode() == ISD::FP_EXTEND) {
	SDValue N00 = N0.getOperand(0);
	if (N00.getOpcode() == PreferredFusedOpcode) {
	SDValue N002 = N00.getOperand(2);
	if (isContractableFMUL(N002) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
	return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
	N002.getOperand(0), N002.getOperand(1),
	N1);
	}
	}
	}

	// fold (fadd x, (fma y, z, (fpext (fmul u, v)))
	// -> (fma y, z, (fma (fpext u), (fpext v), x))
	if (N1.getOpcode() == PreferredFusedOpcode) {
	SDValue N12 = N1.getOperand(2);
	if (N12.getOpcode() == ISD::FP_EXTEND) {
	SDValue N120 = N12.getOperand(0);
	if (isContractableFMUL(N120) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) {
	return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
	N120.getOperand(0), N120.getOperand(1),
	N0);
	}
	}
	}

	// fold (fadd x, (fpext (fma y, z, (fmul u, v)))
	// -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
	// FIXME: This turns two single-precision and one double-precision
	// operation into two double-precision operations, which might not be
	// interesting for all targets, especially GPUs.
	if (N1.getOpcode() == ISD::FP_EXTEND) {
	SDValue N10 = N1.getOperand(0);
	if (N10.getOpcode() == PreferredFusedOpcode) {
	SDValue N102 = N10.getOperand(2);
	if (isContractableFMUL(N102) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
	return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
	N102.getOperand(0), N102.getOperand(1),
	N0);
	}
	}
	}
	}

	return SDValue();
	}

	/// Try to perform FMA combining on a given FSUB node.
	///
	/// Looks at (fsub N0, N1) and, when one operand is (or wraps, through FNEG
	/// and/or FP_EXTEND) a contractable FMUL, rewrites the expression using the
	/// preferred fused opcode: ISD::FMAD when HasFMAD is set, ISD::FMA otherwise.
	/// The patterns are tried in the order they appear below and the first match
	/// wins.
	///
	/// \param N the FSUB node to combine.
	/// \returns the replacement value, or an empty SDValue when no fused opcode
	/// is usable, fusion is not permitted for this node, or no pattern matches.
	SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);
	SDLoc SL(N);

	const TargetOptions &Options = DAG.getTarget().Options;
	// Floating-point multiply-add with intermediate rounding.
	// Only considered once LegalOperations is set and FMAD is legal for VT.
	bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));

	// Floating-point multiply-add without intermediate rounding.
	bool HasFMA =
	TLI.isFMAFasterThanFMulAndFAdd(VT) &&
	(!LegalOperations \|\| TLI.isOperationLegalOrCustom(ISD::FMA, VT));

	// No valid opcode, do not combine.
	if (!HasFMAD && !HasFMA)
	return SDValue();

	// Fusion is allowed for every node when fast FP-op fusion or unsafe FP math
	// is requested, or whenever FMAD is usable; otherwise it depends on the
	// per-node isContractable() query.
	bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast \|\|
	Options.UnsafeFPMath \|\| HasFMAD);
	// If the subtraction is not contractable, do not combine.
	if (!AllowFusionGlobally && !isContractable(N))
	return SDValue();

	// Do nothing here when the subtarget's generateFMAsInMachineCombiner() hook
	// returns true for the current optimization level.
	const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
	if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
	return SDValue();

	// Always prefer FMAD to FMA for precision.
	unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
	// When the target enables aggressive fusion, the one-use restrictions on
	// the simple folds below are lifted and the nested-FMA folds are tried too.
	bool Aggressive = TLI.enableAggressiveFMAFusion(VT);

	// Is the node an FMUL and contractable either due to global flags or
	// SDNodeFlags.
	auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
	if (N.getOpcode() != ISD::FMUL)
	return false;
	return AllowFusionGlobally \|\| isContractable(N.getNode());
	};

	// fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
	if (isContractableFMUL(N0) && (Aggressive \|\| N0->hasOneUse())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	N0.getOperand(0), N0.getOperand(1),
	DAG.getNode(ISD::FNEG, SL, VT, N1));
	}

	// fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
	// Note: Commutes FSUB operands.
	if (isContractableFMUL(N1) && (Aggressive \|\| N1->hasOneUse()))
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FNEG, SL, VT,
	N1.getOperand(0)),
	N1.getOperand(1), N0);

	// fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
	// Unless Aggressive, both the FNEG and the FMUL it wraps must be single-use.
	if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
	(Aggressive \|\| (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
	SDValue N00 = N0.getOperand(0).getOperand(0);
	SDValue N01 = N0.getOperand(0).getOperand(1);
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
	DAG.getNode(ISD::FNEG, SL, VT, N1));
	}

	// Look through FP_EXTEND nodes to do more combining.
	// Each of these folds additionally asks the target, via isFPExtFoldable(),
	// whether the extension from the narrower type can be folded into the fused
	// opcode at type VT.

	// fold (fsub (fpext (fmul x, y)), z)
	// -> (fma (fpext x), (fpext y), (fneg z))
	if (N0.getOpcode() == ISD::FP_EXTEND) {
	SDValue N00 = N0.getOperand(0);
	if (isContractableFMUL(N00) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N00.getOperand(0)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N00.getOperand(1)),
	DAG.getNode(ISD::FNEG, SL, VT, N1));
	}
	}

	// fold (fsub x, (fpext (fmul y, z)))
	// -> (fma (fneg (fpext y)), (fpext z), x)
	// Note: Commutes FSUB operands.
	if (N1.getOpcode() == ISD::FP_EXTEND) {
	SDValue N10 = N1.getOperand(0);
	if (isContractableFMUL(N10) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FNEG, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N10.getOperand(0))),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N10.getOperand(1)),
	N0);
	}
	}

	// fold (fsub (fpext (fneg (fmul x, y))), z)
	// -> (fneg (fma (fpext x), (fpext y), z))
	// Note: This could be removed with appropriate canonicalization of the
	// input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
	// orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
	// us from implementing the canonicalization in visitFSUB.
	if (N0.getOpcode() == ISD::FP_EXTEND) {
	SDValue N00 = N0.getOperand(0);
	if (N00.getOpcode() == ISD::FNEG) {
	SDValue N000 = N00.getOperand(0);
	// The source type passed to isFPExtFoldable() is taken from the FNEG
	// (N00) rather than from the FMUL (N000) it wraps.
	if (isContractableFMUL(N000) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
	return DAG.getNode(ISD::FNEG, SL, VT,
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N000.getOperand(0)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N000.getOperand(1)),
	N1));
	}
	}
	}

	// fold (fsub (fneg (fpext (fmul x, y))), z)
	// -> (fneg (fma (fpext x), (fpext y), z))
	// Note: This could be removed with appropriate canonicalization of the
	// input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
	// orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
	// us from implementing the canonicalization in visitFSUB.
	if (N0.getOpcode() == ISD::FNEG) {
	SDValue N00 = N0.getOperand(0);
	if (N00.getOpcode() == ISD::FP_EXTEND) {
	SDValue N000 = N00.getOperand(0);
	if (isContractableFMUL(N000) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N000.getValueType())) {
	return DAG.getNode(ISD::FNEG, SL, VT,
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N000.getOperand(0)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N000.getOperand(1)),
	N1));
	}
	}
	}

	// More folding opportunities when target permits.
	if (Aggressive) {
	// fold (fsub (fma x, y, (fmul u, v)), z)
	// -> (fma x, y, (fma u, v, (fneg z)))
	// FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
	// are currently only supported on binary nodes.
	if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode &&
	isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() &&
	N0.getOperand(2)->hasOneUse()) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	N0.getOperand(0), N0.getOperand(1),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	N0.getOperand(2).getOperand(0),
	N0.getOperand(2).getOperand(1),
	DAG.getNode(ISD::FNEG, SL, VT,
	N1)));
	}

	// fold (fsub x, (fma y, z, (fmul u, v)))
	// -> (fma (fneg y), z, (fma (fneg u), v, x))
	// FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
	// are currently only supported on binary nodes.
	// NOTE(review): unlike the mirrored fold just above, this one has no
	// hasOneUse() checks on N1 or on its FMUL operand - confirm that is
	// intended.
	if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode &&
	isContractableFMUL(N1.getOperand(2))) {
	SDValue N20 = N1.getOperand(2).getOperand(0);
	SDValue N21 = N1.getOperand(2).getOperand(1);
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FNEG, SL, VT,
	N1.getOperand(0)),
	N1.getOperand(1),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FNEG, SL, VT, N20),

	N21, N0));
	}


	// fold (fsub (fma x, y, (fpext (fmul u, v))), z)
	// -> (fma x, y, (fma (fpext u), (fpext v), (fneg z)))
	if (N0.getOpcode() == PreferredFusedOpcode) {
	SDValue N02 = N0.getOperand(2);
	if (N02.getOpcode() == ISD::FP_EXTEND) {
	SDValue N020 = N02.getOperand(0);
	if (isContractableFMUL(N020) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	N0.getOperand(0), N0.getOperand(1),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N020.getOperand(0)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N020.getOperand(1)),
	DAG.getNode(ISD::FNEG, SL, VT,
	N1)));
	}
	}
	}

	// fold (fsub (fpext (fma x, y, (fmul u, v))), z)
	// -> (fma (fpext x), (fpext y),
	// (fma (fpext u), (fpext v), (fneg z)))
	// FIXME: This turns two single-precision and one double-precision
	// operation into two double-precision operations, which might not be
	// interesting for all targets, especially GPUs.
	if (N0.getOpcode() == ISD::FP_EXTEND) {
	SDValue N00 = N0.getOperand(0);
	if (N00.getOpcode() == PreferredFusedOpcode) {
	SDValue N002 = N00.getOperand(2);
	if (isContractableFMUL(N002) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N00.getOperand(0)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N00.getOperand(1)),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N002.getOperand(0)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N002.getOperand(1)),
	DAG.getNode(ISD::FNEG, SL, VT,
	N1)));
	}
	}
	}

	// fold (fsub x, (fma y, z, (fpext (fmul u, v))))
	// -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
	if (N1.getOpcode() == PreferredFusedOpcode &&
	N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) {
	SDValue N120 = N1.getOperand(2).getOperand(0);
	if (isContractableFMUL(N120) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) {
	SDValue N1200 = N120.getOperand(0);
	SDValue N1201 = N120.getOperand(1);
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)),
	N1.getOperand(1),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FNEG, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL,
	VT, N1200)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N1201),
	N0));
	}
	}

	// fold (fsub x, (fpext (fma y, z, (fmul u, v))))
	// -> (fma (fneg (fpext y)), (fpext z),
	// (fma (fneg (fpext u)), (fpext v), x))
	// FIXME: This turns two single-precision and one double-precision
	// operation into two double-precision operations, which might not be
	// interesting for all targets, especially GPUs.
	if (N1.getOpcode() == ISD::FP_EXTEND &&
	N1.getOperand(0).getOpcode() == PreferredFusedOpcode) {
	SDValue CvtSrc = N1.getOperand(0);
	SDValue N100 = CvtSrc.getOperand(0);
	SDValue N101 = CvtSrc.getOperand(1);
	SDValue N102 = CvtSrc.getOperand(2);
	if (isContractableFMUL(N102) &&
	TLI.isFPExtFoldable(PreferredFusedOpcode, VT, CvtSrc.getValueType())) {
	SDValue N1020 = N102.getOperand(0);
	SDValue N1021 = N102.getOperand(1);
	return DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FNEG, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N100)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT, N101),
	DAG.getNode(PreferredFusedOpcode, SL, VT,
	DAG.getNode(ISD::FNEG, SL, VT,
	DAG.getNode(ISD::FP_EXTEND, SL,
	VT, N1020)),
	DAG.getNode(ISD::FP_EXTEND, SL, VT,
	N1021),
	N0));
	}
	}
	}

	// Nothing matched.
	return SDValue();
	}

	/// Use the distributive law to turn an FMUL whose operand is an FADD/FSUB
	/// involving the constant +1.0 or -1.0 into one fused multiply-add, e.g.
	/// (x + 1) * y -> fma(x, y, y). Both operand orders of the FMUL are tried,
	/// additions before subtractions.
	///
	/// \param N the FMUL node to combine (asserted).
	/// \returns the fused node, or an empty SDValue when nothing was rewritten.
	SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
	  assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc SL(N);
	  const TargetOptions &Options = DAG.getTarget().Options;

	  // With x == 0 and y == inf the rewritten form multiplies 0 by inf and
	  // yields a NaN, so the transforms are only valid under no-infs math.
	  if (!Options.NoInfsFPMath)
	    return SDValue();

	  // FMA: no intermediate rounding. Needs fusion to be requested, to be
	  // profitable on the target, and to be legal (or custom) once operations
	  // have been legalized.
	  const bool FusionRequested =
	      Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath;
	  const bool HasFMA =
	      FusionRequested && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
	      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));

	  // FMAD: rounds the intermediate product, which changes the rounding order
	  // and may be less precise, hence gated on unsafe math.
	  const bool HasFMAD = Options.UnsafeFPMath && LegalOperations &&
	                       TLI.isOperationLegal(ISD::FMAD, VT);

	  if (!(HasFMA || HasFMAD))
	    return SDValue();

	  // Always prefer FMAD to FMA for precision.
	  const unsigned FusedOpc = HasFMAD ? ISD::FMAD : ISD::FMA;
	  const bool Aggressive = TLI.enableAggressiveFMAFusion(VT);

	  // Small node builders shared by the two matchers below.
	  auto Negated = [&](SDValue V) -> SDValue {
	    return DAG.getNode(ISD::FNEG, SL, VT, V);
	  };
	  auto MakeFused = [&](SDValue A, SDValue B, SDValue C) -> SDValue {
	    return DAG.getNode(FusedOpc, SL, VT, A, B, C);
	  };
	  // An operand qualifies if it has the wanted opcode and is either
	  // single-use or the target asked for aggressive fusion.
	  auto MayFuse = [&](SDValue V, unsigned Opc) -> bool {
	    return V.getOpcode() == Opc && (Aggressive || V->hasOneUse());
	  };

	  // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y)
	  // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y))
	  auto FuseFADD = [&](SDValue Sum, SDValue Other) -> SDValue {
	    if (!MayFuse(Sum, ISD::FADD))
	      return SDValue();
	    ConstantFPSDNode *Addend = isConstOrConstSplatFP(Sum.getOperand(1));
	    if (!Addend)
	      return SDValue();
	    if (Addend->isExactlyValue(+1.0))
	      return MakeFused(Sum.getOperand(0), Other, Other);
	    if (Addend->isExactlyValue(-1.0))
	      return MakeFused(Sum.getOperand(0), Other, Negated(Other));
	    return SDValue();
	  };

	  // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y)
	  // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y))
	  // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y))
	  // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y)
	  auto FuseFSUB = [&](SDValue Diff, SDValue Other) -> SDValue {
	    if (!MayFuse(Diff, ISD::FSUB))
	      return SDValue();

	    // Constant on the left-hand side of the subtraction.
	    if (ConstantFPSDNode *Minuend =
	            isConstOrConstSplatFP(Diff.getOperand(0))) {
	      if (Minuend->isExactlyValue(+1.0))
	        return MakeFused(Negated(Diff.getOperand(1)), Other, Other);
	      if (Minuend->isExactlyValue(-1.0))
	        return MakeFused(Negated(Diff.getOperand(1)), Other, Negated(Other));
	    }

	    // Constant on the right-hand side of the subtraction.
	    if (ConstantFPSDNode *Subtrahend =
	            isConstOrConstSplatFP(Diff.getOperand(1))) {
	      if (Subtrahend->isExactlyValue(+1.0))
	        return MakeFused(Diff.getOperand(0), Other, Negated(Other));
	      if (Subtrahend->isExactlyValue(-1.0))
	        return MakeFused(Diff.getOperand(0), Other, Other);
	    }
	    return SDValue();
	  };

	  // Try each matcher with the FMUL operands in both orders; stop at the
	  // first one that produces a node.
	  SDValue Result = FuseFADD(LHS, RHS);
	  if (!Result)
	    Result = FuseFADD(RHS, LHS);
	  if (!Result)
	    Result = FuseFSUB(LHS, RHS);
	  if (!Result)
	    Result = FuseFSUB(RHS, LHS);
	  return Result;
	}

	/// Return true if \p N is an FMUL whose second operand is recognised by
	/// isConstOrConstSplatFP() and is exactly -2.0.
	static bool isFMulNegTwo(SDValue &N) {
	  if (N.getOpcode() == ISD::FMUL) {
	    ConstantFPSDNode *Scale = isConstOrConstSplatFP(N.getOperand(1));
	    return Scale && Scale->isExactlyValue(-2.0);
	  }
	  return false;
	}

	/// Combine an ISD::FADD node.
	///
	/// The folds are tried in the order written: vector simplification, the
	/// all-constant case, moving a constant to the right-hand side, folding into
	/// a select operand, turning additions of negatable values into FSUB, the
	/// (x * -2.0) special case, (x + 0) under no-signed-zeros, the unsafe-math
	/// folds, and finally FADD -> FMA fusion.
	///
	/// \param N the FADD node to combine.
	/// \returns the replacement value, or an empty SDValue if nothing applied.
	SDValue DAGCombiner::visitFADD(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
	bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
	EVT VT = N->getValueType(0);
	SDLoc DL(N);
	const TargetOptions &Options = DAG.getTarget().Options;
	const SDNodeFlags Flags = N->getFlags();

	// fold vector ops
	if (VT.isVector())
	if (SDValue FoldedVOp = SimplifyVBinOp(N))
	return FoldedVOp;

	// fold (fadd c1, c2) -> c1 + c2
	if (N0CFP && N1CFP)
	return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags);

	// canonicalize constant to RHS
	if (N0CFP && !N1CFP)
	return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags);

	if (SDValue NewSel = foldBinOpIntoSelect(N))
	return NewSel;

	// fold (fadd A, (fneg B)) -> (fsub A, B)
	// Only taken when isNegatibleForFree() reports 2 for the operand, and FSUB
	// is still allowed to be created at this stage.
	if ((!LegalOperations \|\| TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
	isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2)
	return DAG.getNode(ISD::FSUB, DL, VT, N0,
	GetNegatedExpression(N1, DAG, LegalOperations), Flags);

	// fold (fadd (fneg A), B) -> (fsub B, A)
	if ((!LegalOperations \|\| TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
	isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2)
	return DAG.getNode(ISD::FSUB, DL, VT, N1,
	GetNegatedExpression(N0, DAG, LegalOperations), Flags);

	// fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B))
	// fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B))
	// The multiply must be single-use. When both operands qualify, N1 is the
	// one treated as the multiply.
	if ((isFMulNegTwo(N0) && N0.hasOneUse()) \|\|
	(isFMulNegTwo(N1) && N1.hasOneUse())) {
	bool N1IsFMul = isFMulNegTwo(N1);
	SDValue AddOp = N1IsFMul ? N1.getOperand(0) : N0.getOperand(0);
	SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags);
	return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? N0 : N1, Add, Flags);
	}

	// FIXME: Auto-upgrade the target/function-level option.
	if (Options.NoSignedZerosFPMath \|\| N->getFlags().hasNoSignedZeros()) {
	// fold (fadd A, 0) -> A
	if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1))
	if (N1C->isZero())
	return N0;
	}

	// If 'unsafe math' is enabled, fold lots of things.
	if (Options.UnsafeFPMath) {
	// No FP constant should be created after legalization as Instruction
	// Selection pass has a hard time dealing with FP constants.
	bool AllowNewConst = (Level < AfterLegalizeDAG);

	// fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2))
	if (N1CFP && N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() &&
	isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)))
	return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0),
	DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1,
	Flags),
	Flags);

	// If allowed, fold (fadd (fneg x), x) -> 0.0
	if (AllowNewConst && N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
	return DAG.getConstantFP(0.0, DL, VT);

	// If allowed, fold (fadd x, (fneg x)) -> 0.0
	if (AllowNewConst && N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
	return DAG.getConstantFP(0.0, DL, VT);

	// We can fold chains of FADD's of the same value into multiplications.
	// This transform is not safe in general because we are reducing the number
	// of rounding steps.
	if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
	if (N0.getOpcode() == ISD::FMUL) {
	bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
	bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));

	// (fadd (fmul x, c), x) -> (fmul x, c+1)
	// NOTE(review): this and the other "c+1"/"c+2" folds call
	// getConstantFP() without consulting AllowNewConst, unlike the 3.0/4.0
	// folds further down - confirm that is intended.
	if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
	SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
	DAG.getConstantFP(1.0, DL, VT), Flags);
	return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags);
	}

	// (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
	if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
	N1.getOperand(0) == N1.getOperand(1) &&
	N0.getOperand(0) == N1.getOperand(0)) {
	SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
	DAG.getConstantFP(2.0, DL, VT), Flags);
	return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags);
	}
	}

	if (N1.getOpcode() == ISD::FMUL) {
	bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
	bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));

	// (fadd x, (fmul x, c)) -> (fmul x, c+1)
	if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
	SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
	DAG.getConstantFP(1.0, DL, VT), Flags);
	return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags);
	}

	// (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
	if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
	N0.getOperand(0) == N0.getOperand(1) &&
	N1.getOperand(0) == N0.getOperand(0)) {
	SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
	DAG.getConstantFP(2.0, DL, VT), Flags);
	return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags);
	}
	}

	if (N0.getOpcode() == ISD::FADD && AllowNewConst) {
	bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
	// (fadd (fadd x, x), x) -> (fmul x, 3.0)
	if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
	(N0.getOperand(0) == N1)) {
	return DAG.getNode(ISD::FMUL, DL, VT,
	N1, DAG.getConstantFP(3.0, DL, VT), Flags);
	}
	}

	if (N1.getOpcode() == ISD::FADD && AllowNewConst) {
	bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
	// (fadd x, (fadd x, x)) -> (fmul x, 3.0)
	if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
	N1.getOperand(0) == N0) {
	return DAG.getNode(ISD::FMUL, DL, VT,
	N0, DAG.getConstantFP(3.0, DL, VT), Flags);
	}
	}

	// (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
	if (AllowNewConst &&
	N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
	N0.getOperand(0) == N0.getOperand(1) &&
	N1.getOperand(0) == N1.getOperand(1) &&
	N0.getOperand(0) == N1.getOperand(0)) {
	return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0),
	DAG.getConstantFP(4.0, DL, VT), Flags);
	}
	}
	} // enable-unsafe-fp-math

	// FADD -> FMA combines:
	// The fused node is queued on the worklist so it gets visited in turn.
	if (SDValue Fused = visitFADDForFMACombine(N)) {
	AddToWorklist(Fused.getNode());
	return Fused;
	}
	return SDValue();
	}

	/// Combine an ISD::FSUB node.
	///
	/// Folds are attempted in a fixed order: vector simplification, the
	/// all-constant case, folding into a select operand, subtraction of a
	/// negatable value, (0 - B) under no-signed-zeros, the unsafe-math
	/// identities, and finally FSUB -> FMA fusion.
	///
	/// \param N the FSUB node to combine.
	/// \returns the replacement value, or an empty SDValue if nothing applied.
	SDValue DAGCombiner::visitFSUB(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  ConstantFPSDNode *LHSConst = isConstOrConstSplatFP(LHS);
	  ConstantFPSDNode *RHSConst = isConstOrConstSplatFP(RHS);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);
	  const TargetOptions &Options = DAG.getTarget().Options;
	  const SDNodeFlags Flags = N->getFlags();

	  // fold vector ops
	  if (VT.isVector()) {
	    SDValue FoldedVOp = SimplifyVBinOp(N);
	    if (FoldedVOp)
	      return FoldedVOp;
	  }

	  // fold (fsub c1, c2) -> c1-c2
	  if (LHSConst != nullptr && RHSConst != nullptr)
	    return DAG.getNode(ISD::FSUB, DL, VT, LHS, RHS, Flags);

	  SDValue NewSel = foldBinOpIntoSelect(N);
	  if (NewSel)
	    return NewSel;

	  // fold (fsub A, (fneg B)) -> (fadd A, B)
	  if (isNegatibleForFree(RHS, LegalOperations, TLI, &Options) != 0) {
	    SDValue NegatedRHS = GetNegatedExpression(RHS, DAG, LegalOperations);
	    return DAG.getNode(ISD::FADD, DL, VT, LHS, NegatedRHS, Flags);
	  }

	  // FIXME: Auto-upgrade the target/function-level option.
	  const bool IgnoreSignedZeros =
	      Options.NoSignedZerosFPMath || N->getFlags().hasNoSignedZeros();
	  if (IgnoreSignedZeros && LHSConst && LHSConst->isZero()) {
	    // (fsub 0, B) -> -B
	    // NOTE(review): the same isNegatibleForFree(RHS) query already caused a
	    // return above, so this first branch looks unreachable; it is kept to
	    // preserve the original logic.
	    if (isNegatibleForFree(RHS, LegalOperations, TLI, &Options))
	      return GetNegatedExpression(RHS, DAG, LegalOperations);
	    if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
	      return DAG.getNode(ISD::FNEG, DL, VT, RHS, Flags);
	  }

	  // If 'unsafe math' is enabled, fold lots of things.
	  if (Options.UnsafeFPMath) {
	    // (fsub A, 0) -> A
	    if (RHSConst && RHSConst->isZero())
	      return LHS;

	    // (fsub x, x) -> 0.0
	    if (LHS == RHS)
	      return DAG.getConstantFP(0.0f, DL, VT);

	    // (fsub x, (fadd x, y)) -> (fneg y)
	    // (fsub x, (fadd y, x)) -> (fneg y)
	    if (RHS.getOpcode() == ISD::FADD) {
	      SDValue AddLHS = RHS->getOperand(0);
	      SDValue AddRHS = RHS->getOperand(1);

	      if (AddLHS == LHS &&
	          isNegatibleForFree(AddRHS, LegalOperations, TLI, &Options))
	        return GetNegatedExpression(AddRHS, DAG, LegalOperations);

	      if (AddRHS == LHS &&
	          isNegatibleForFree(AddLHS, LegalOperations, TLI, &Options))
	        return GetNegatedExpression(AddLHS, DAG, LegalOperations);
	    }
	  }

	  // FSUB -> FMA combines; a fused node is queued for another visit.
	  SDValue Fused = visitFSUBForFMACombine(N);
	  if (!Fused)
	    return SDValue();
	  AddToWorklist(Fused.getNode());
	  return Fused;
	}

	/// Combine an ISD::FMUL node.
	///
	/// The folds are tried in the order written: vector simplification, the
	/// all-constant case, moving a constant to the right-hand side, (x * 1.0),
	/// folding into a select operand, the unsafe-math reassociations, (x * 2.0),
	/// (x * -1.0), cancelling negations on both operands, the select-based
	/// fabs pattern, and finally the distributive FMUL -> FMA combine.
	///
	/// \param N the FMUL node to combine.
	/// \returns the replacement value, or an empty SDValue if nothing applied.
	SDValue DAGCombiner::visitFMUL(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
	ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
	EVT VT = N->getValueType(0);
	SDLoc DL(N);
	const TargetOptions &Options = DAG.getTarget().Options;
	const SDNodeFlags Flags = N->getFlags();

	// fold vector ops
	if (VT.isVector()) {
	// This just handles C1 * C2 for vectors. Other vector folds are below.
	if (SDValue FoldedVOp = SimplifyVBinOp(N))
	return FoldedVOp;
	}

	// fold (fmul c1, c2) -> c1*c2
	if (N0CFP && N1CFP)
	return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags);

	// canonicalize constant to RHS
	if (isConstantFPBuildVectorOrConstantFP(N0) &&
	!isConstantFPBuildVectorOrConstantFP(N1))
	return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags);

	// fold (fmul A, 1.0) -> A
	if (N1CFP && N1CFP->isExactlyValue(1.0))
	return N0;

	if (SDValue NewSel = foldBinOpIntoSelect(N))
	return NewSel;

	if (Options.UnsafeFPMath) {
	// fold (fmul A, 0) -> 0
	if (N1CFP && N1CFP->isZero())
	return N1;

	// fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2))
	if (N0.getOpcode() == ISD::FMUL) {
	// Fold scalars or any vector constants (not just splats).
	// This fold is done in general by InstCombine, but extra fmul insts
	// may have been generated during lowering.
	SDValue N00 = N0.getOperand(0);
	SDValue N01 = N0.getOperand(1);
	auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
	auto *BV00 = dyn_cast<BuildVectorSDNode>(N00);
	auto *BV01 = dyn_cast<BuildVectorSDNode>(N01);

	// Check 1: Make sure that the first operand of the inner multiply is NOT
	// a constant. Otherwise, we may induce infinite looping.
	if (!(isConstOrConstSplatFP(N00) \|\| (BV00 && BV00->isConstant()))) {
	// Check 2: Make sure that the second operand of the inner multiply and
	// the second operand of the outer multiply are constants.
	if ((N1CFP && isConstOrConstSplatFP(N01)) \|\|
	(BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) {
	SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags);
	return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags);
	}
	}
	}

	// fold (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c))
	// Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs
	// during an early run of DAGCombiner can prevent folding with fmuls
	// inserted during lowering.
	// NOTE(review): the condition does not check that N1 is a constant,
	// although the pattern above calls it "c" - confirm that is intended.
	if (N0.getOpcode() == ISD::FADD &&
	(N0.getOperand(0) == N0.getOperand(1)) &&
	N0.hasOneUse()) {
	const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
	SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags);
	return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags);
	}
	}

	// fold (fmul X, 2.0) -> (fadd X, X)
	if (N1CFP && N1CFP->isExactlyValue(+2.0))
	return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags);

	// fold (fmul X, -1.0) -> (fneg X)
	if (N1CFP && N1CFP->isExactlyValue(-1.0))
	if (!LegalOperations \|\| TLI.isOperationLegal(ISD::FNEG, VT))
	return DAG.getNode(ISD::FNEG, DL, VT, N0);

	// fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y)
	if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
	if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
	// Both can be negated for free, check to see if at least one is cheaper
	// negated.
	if (LHSNeg == 2 \|\| RHSNeg == 2)
	return DAG.getNode(ISD::FMUL, DL, VT,
	GetNegatedExpression(N0, DAG, LegalOperations),
	GetNegatedExpression(N1, DAG, LegalOperations),
	Flags);
	}
	}

	// fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
	// fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
	// Requires the no-NaNs and no-signed-zeros flags on this node and a legal
	// FABS for VT.
	if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
	(N0.getOpcode() == ISD::SELECT \|\| N1.getOpcode() == ISD::SELECT) &&
	TLI.isOperationLegal(ISD::FABS, VT)) {
	// Arrange for Select to name the SELECT operand and X the other one.
	SDValue Select = N0, X = N1;
	if (Select.getOpcode() != ISD::SELECT)
	std::swap(Select, X);

	SDValue Cond = Select.getOperand(0);
	auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
	auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));

	// Both select arms must be FP constants, and the condition must be a
	// SETCC comparing X against the constant 0.0.
	if (TrueOpnd && FalseOpnd &&
	Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
	isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
	cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
	ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	switch (CC) {
	default: break;
	// Less-than style predicates: swap the select arms so the
	// greater-than handling below can be shared.
	case ISD::SETOLT:
	case ISD::SETULT:
	case ISD::SETOLE:
	case ISD::SETULE:
	case ISD::SETLT:
	case ISD::SETLE:
	std::swap(TrueOpnd, FalseOpnd);
	LLVM_FALLTHROUGH;
	case ISD::SETOGT:
	case ISD::SETUGT:
	case ISD::SETOGE:
	case ISD::SETUGE:
	case ISD::SETGT:
	case ISD::SETGE:
	// The negated form additionally needs a legal FNEG.
	if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
	TLI.isOperationLegal(ISD::FNEG, VT))
	return DAG.getNode(ISD::FNEG, DL, VT,
	DAG.getNode(ISD::FABS, DL, VT, X));
	if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
	return DAG.getNode(ISD::FABS, DL, VT, X);

	break;
	}
	}
	}

	// FMUL -> FMA combines:
	// The fused node is queued on the worklist so it gets visited in turn.
	if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
	AddToWorklist(Fused.getNode());
	return Fused;
	}

	return SDValue();
	}

	/// Combine an ISD::FMA node computing (N0 * N1) + N2.
	///
	/// Folds are tried in the order written: the all-constant case, a zero
	/// multiplicand (unsafe math only), a multiplicand of exactly 1.0, moving a
	/// constant multiplicand to the second operand, constant reassociation
	/// (unsafe math only), a multiplicand of exactly -1.0, pushing an FNEG from
	/// the first operand into a constant second operand, and the (x * c) +/- x
	/// folds (unsafe math only).
	///
	/// \param N the FMA node to combine.
	/// \returns the replacement value, or an empty SDValue if nothing applied.
	SDValue DAGCombiner::visitFMA(SDNode *N) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  SDValue N2 = N->getOperand(2);
	  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
	  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);
	  const TargetOptions &Options = DAG.getTarget().Options;

	  // Constant fold FMA.
	  if (isa<ConstantFPSDNode>(N0) &&
	      isa<ConstantFPSDNode>(N1) &&
	      isa<ConstantFPSDNode>(N2)) {
	    return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2);
	  }

	  // (fma 0, x, y) -> y and (fma x, 0, y) -> y
	  if (Options.UnsafeFPMath) {
	    if (N0CFP && N0CFP->isZero())
	      return N2;
	    if (N1CFP && N1CFP->isZero())
	      return N2;
	  }

	  // (fma 1, x, y) -> (fadd x, y)
	  // (fma x, 1, y) -> (fadd x, y)
	  // TODO: The FMA node should have flags that propagate to these nodes.
	  if (N0CFP && N0CFP->isExactlyValue(1.0))
	    return DAG.getNode(ISD::FADD, DL, VT, N1, N2);
	  if (N1CFP && N1CFP->isExactlyValue(1.0))
	    return DAG.getNode(ISD::FADD, DL, VT, N0, N2);

	  // Canonicalize (fma c, x, y) -> (fma x, c, y)
	  if (isConstantFPBuildVectorOrConstantFP(N0) &&
	      !isConstantFPBuildVectorOrConstantFP(N1))
	    return DAG.getNode(ISD::FMA, DL, VT, N1, N0, N2);

	  // TODO: FMA nodes should have flags that propagate to the created nodes.
	  // For now, create a Flags object for use with all unsafe math transforms.
	  SDNodeFlags Flags;
	  Flags.setUnsafeAlgebra(true);

	  if (Options.UnsafeFPMath) {
	    // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
	    if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
	        isConstantFPBuildVectorOrConstantFP(N1) &&
	        isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
	      return DAG.getNode(ISD::FMUL, DL, VT, N0,
	                         DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1),
	                                     Flags), Flags);
	    }

	    // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
	    if (N0.getOpcode() == ISD::FMUL &&
	        isConstantFPBuildVectorOrConstantFP(N1) &&
	        isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
	      return DAG.getNode(ISD::FMA, DL, VT,
	                         N0.getOperand(0),
	                         DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1),
	                                     Flags),
	                         N2);
	    }
	  }

	  // The (fma x, 1, y) case has already been handled above, so only the
	  // remaining constant-multiplicand folds are checked here.
	  if (N1CFP) {
	    // (fma x, -1, y) -> (fadd y, (fneg x))
	    if (N1CFP->isExactlyValue(-1.0) &&
	        (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
	      SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0);
	      AddToWorklist(RHSNeg.getNode());
	      // TODO: The FMA node should have flags that propagate to this node.
	      return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
	    }

	    // fma (fneg x), K, y -> fma x, -K, y
	    // Done when ConstantFP is a legal operation for VT, or when K has a
	    // single use and is not a legal FP immediate.
	    if (N0.getOpcode() == ISD::FNEG &&
	        (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
	         (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT)))) {
	      return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
	                         DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2);
	    }
	  }

	  if (Options.UnsafeFPMath) {
	    // (fma x, c, x) -> (fmul x, (c+1))
	    if (N1CFP && N0 == N2) {
	      return DAG.getNode(ISD::FMUL, DL, VT, N0,
	                         DAG.getNode(ISD::FADD, DL, VT, N1,
	                                     DAG.getConstantFP(1.0, DL, VT), Flags),
	                         Flags);
	    }

	    // (fma x, c, (fneg x)) -> (fmul x, (c-1))
	    if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
	      return DAG.getNode(ISD::FMUL, DL, VT, N0,
	                         DAG.getNode(ISD::FADD, DL, VT, N1,
	                                     DAG.getConstantFP(-1.0, DL, VT), Flags),
	                         Flags);
	    }
	  }

	  return SDValue();
	}

	// Replace several FDIVs that share one divisor by a single reciprocal and
	// one FMUL per former division:
	//   (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
	// This is not always a win: targets differ in the relative cost of FDIV and
	// FMUL, so two FDIVs can be cheaper than one FDIV plus two FMULs, and the
	// critical path grows from "one FDIV" to "one FDIV + one FMUL". The target
	// therefore supplies the minimum number of qualifying divisions through
	// TLI.combineRepeatedFPDivisors(); a value of zero disables the transform.
	//
	// Returns SDValue(N, 0) when the divisions were rewritten, and an empty
	// SDValue otherwise.
	SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
	  const bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
	  const SDNodeFlags Flags = N->getFlags();

	  // Reciprocal formation must be permitted globally or on this node.
	  if (!(UnsafeMath || Flags.hasAllowReciprocal()))
	    return SDValue();

	  // Skip if current node is a reciprocal.
	  if (auto *DividendC = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
	    if (DividendC->isExactlyValue(1.0))
	      return SDValue();

	  // Exit early if the target does not want this transform or if there can't
	  // possibly be enough uses of the divisor to make the transform worthwhile.
	  SDValue Divisor = N->getOperand(1);
	  const unsigned MinUses = TLI.combineRepeatedFPDivisors();
	  if (MinUses == 0 || Divisor->use_size() < MinUses)
	    return SDValue();

	  // Collect every FDIV that divides by Divisor. A set is used because the
	  // use list may contain duplicates.
	  SetVector<SDNode *> Users;
	  for (auto *User : Divisor->uses()) {
	    if (User->getOpcode() != ISD::FDIV || User->getOperand(1) != Divisor)
	      continue;
	    // This division is eligible for optimization only if global unsafe math
	    // is enabled or if this division allows reciprocal formation.
	    if (!UnsafeMath && !User->getFlags().hasAllowReciprocal())
	      continue;
	    Users.insert(User);
	  }

	  // Now that we have the actual number of divisor uses, make sure it meets
	  // the minimum threshold specified by the target.
	  if (Users.size() < MinUses)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);
	  SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
	  SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, Divisor, Flags);

	  // Dividend / Divisor -> Dividend * Reciprocal
	  for (auto *User : Users) {
	    SDValue Dividend = User->getOperand(0);
	    if (Dividend == FPOne) {
	      // In the absence of fast-math-flags, this user node is always the
	      // same node as Reciprocal, but with FMF they may be different nodes.
	      if (User != Reciprocal.getNode())
	        CombineTo(User, Reciprocal);
	      continue;
	    }
	    SDValue Product = DAG.getNode(ISD::FMUL, SDLoc(User), VT, Dividend,
	                                  Reciprocal, Flags);
	    CombineTo(User, Product);
	  }
	  return SDValue(N, 0); // N was replaced.
	}

	/// Try to combine the FDIV node \p N. The folds are attempted in the order
	/// written below; the first one that applies supplies the return value.
	/// Returns an empty SDValue when no combine applies.
	SDValue DAGCombiner::visitFDIV(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
	ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
	EVT VT = N->getValueType(0);
	SDLoc DL(N);
	const TargetOptions &Options = DAG.getTarget().Options;
	SDNodeFlags Flags = N->getFlags();

	// fold vector ops
	if (VT.isVector())
	if (SDValue FoldedVOp = SimplifyVBinOp(N))
	return FoldedVOp;

	// fold (fdiv c1, c2) -> c1/c2
	if (N0CFP && N1CFP)
	return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);

	if (SDValue NewSel = foldBinOpIntoSelect(N))
	return NewSel;

	// Everything in this block is gated on the global UnsafeFPMath option.
	if (Options.UnsafeFPMath) {
	// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
	if (N1CFP) {
	// Compute the reciprocal 1.0 / c2.
	const APFloat &N1APF = N1CFP->getValueAPF();
	APFloat Recip(N1APF.getSemantics(), 1); // 1.0
	APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
	// Only do the transform if the reciprocal is a legal fp immediate that
	// isn't too nasty (eg NaN, denormal, ...).
	if ((st == APFloat::opOK \|\| st == APFloat::opInexact) && // Not too nasty
	(!LegalOperations \|\|
	// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
	// backend)... we should handle this gracefully after Legalize.
	// TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) \|\|
	TLI.isOperationLegal(ISD::ConstantFP, VT) \|\|
	TLI.isFPImmLegal(Recip, VT)))
	return DAG.getNode(ISD::FMUL, DL, VT, N0,
	DAG.getConstantFP(Recip, DL, VT), Flags);
	}

	// If this FDIV is part of a reciprocal square root, it may be folded
	// into a target-specific square root estimate instruction.
	if (N1.getOpcode() == ISD::FSQRT) {
	if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) {
	return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
	}
	} else if (N1.getOpcode() == ISD::FP_EXTEND &&
	N1.getOperand(0).getOpcode() == ISD::FSQRT) {
	// Divisor is fp_extend(sqrt(z)): estimate rsqrt(z), then re-apply the
	// extension to the estimate.
	if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
	Flags)) {
	RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
	AddToWorklist(RV.getNode());
	return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
	}
	} else if (N1.getOpcode() == ISD::FP_ROUND &&
	N1.getOperand(0).getOpcode() == ISD::FSQRT) {
	// Divisor is fp_round(sqrt(z)): estimate rsqrt(z), then re-apply the
	// rounding, reusing the original FP_ROUND's second operand.
	if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
	Flags)) {
	RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
	AddToWorklist(RV.getNode());
	return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
	}
	} else if (N1.getOpcode() == ISD::FMUL) {
	// Look through an FMUL. Even though this won't remove the FDIV directly,
	// it's still worthwhile to get rid of the FSQRT if possible.
	SDValue SqrtOp;
	SDValue OtherOp;
	// Operand 0 is preferred when both multiplicands are FSQRT nodes.
	if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
	SqrtOp = N1.getOperand(0);
	OtherOp = N1.getOperand(1);
	} else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
	SqrtOp = N1.getOperand(1);
	OtherOp = N1.getOperand(0);
	}
	if (SqrtOp.getNode()) {
	// We found a FSQRT, so try to make this fold:
	// x / (y * sqrt(z)) -> x * (rsqrt(z) / y)
	if (SDValue RV = buildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) {
	RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags);
	AddToWorklist(RV.getNode());
	return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
	}
	}
	}

	// Fold into a reciprocal estimate and multiply instead of a real divide.
	if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) {
	AddToWorklist(RV.getNode());
	return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
	}
	}

	// (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
	// NOTE(review): a result of 2 from isNegatibleForFree presumably means
	// "cheaper when negated" -- confirm against its definition.
	if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
	if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
	// Both can be negated for free, check to see if at least one is cheaper
	// negated.
	if (LHSNeg == 2 \|\| RHSNeg == 2)
	return DAG.getNode(ISD::FDIV, SDLoc(N), VT,
	GetNegatedExpression(N0, DAG, LegalOperations),
	GetNegatedExpression(N1, DAG, LegalOperations),
	Flags);
	}
	}

	// Last resort: share one reciprocal between several divisions by N1.
	if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
	return CombineRepeatedDivisors;

	return SDValue();
	}

	/// Try to combine the FREM node \p N. Returns an empty SDValue when no
	/// combine applies.
	SDValue DAGCombiner::visitFREM(SDNode *N) {
	  SDValue Dividend = N->getOperand(0);
	  SDValue Divisor = N->getOperand(1);

	  // fold (frem c1, c2) -> fmod(c1,c2)
	  const bool BothConstant = dyn_cast<ConstantFPSDNode>(Dividend) &&
	                            dyn_cast<ConstantFPSDNode>(Divisor);
	  if (BothConstant)
	    return DAG.getNode(ISD::FREM, SDLoc(N), N->getValueType(0), Dividend,
	                       Divisor, N->getFlags());

	  // Otherwise the only remaining attempt is foldBinOpIntoSelect.
	  if (SDValue IntoSelect = foldBinOpIntoSelect(N))
	    return IntoSelect;

	  return SDValue();
	}

	/// Replace the FSQRT node \p N with the result of buildSqrtEstimate. This
	/// is attempted only when the global UnsafeFPMath option is set and the
	/// target does not report the square root as cheap.
	SDValue DAGCombiner::visitFSQRT(SDNode *N) {
	  const bool UnsafeFPMath = DAG.getTarget().Options.UnsafeFPMath;
	  if (!UnsafeFPMath)
	    return SDValue();

	  SDValue Operand = N->getOperand(0);
	  const bool SqrtIsCheap = TLI.isFsqrtCheap(Operand, DAG);
	  if (SqrtIsCheap)
	    return SDValue();

	  // TODO: FSQRT nodes should have flags that propagate to the created nodes.
	  // Until then, hand the estimate builder a fresh flag set with only
	  // unsafe-algebra enabled.
	  SDNodeFlags EstimateFlags;
	  EstimateFlags.setUnsafeAlgebra(true);
	  return buildSqrtEstimate(Operand, EstimateFlags);
	}

	/// Returns true when operand 1 of \p N is an FP_EXTEND or FP_ROUND that may
	/// be looked through, enabling:
	///   copysign(x, fp_extend(y)) -> copysign(x, y)
	///   copysign(x, fp_round(y)) -> copysign(x, y)
	static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
	  SDValue Sign = N->getOperand(1);
	  const unsigned SignOpc = Sign.getOpcode();
	  if (SignOpc != ISD::FP_EXTEND && SignOpc != ISD::FP_ROUND)
	    return false;

	  // Do not optimize out type conversion of f128 type yet.
	  // For some targets like x86_64, configuration is changed to keep one f128
	  // value in one SSE register, but instruction selection cannot handle
	  // FCOPYSIGN on SSE registers yet.
	  EVT ConvertedVT = Sign->getValueType(0);
	  EVT SourceVT = Sign->getOperand(0).getValueType();
	  if (SourceVT != MVT::f128)
	    return true;
	  // The source is f128: allowed only if the conversion does not change type.
	  return ConvertedVT == SourceVT;
	}

	/// Try to combine the FCOPYSIGN node \p N (written copysign(x, y) below,
	/// with x = operand 0 and y = operand 1). Returns an empty SDValue when no
	/// combine applies.
	SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
	  SDValue Mag = N->getOperand(0);
	  SDValue Sign = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  ConstantFPSDNode *MagC = dyn_cast<ConstantFPSDNode>(Mag);
	  ConstantFPSDNode *SignC = dyn_cast<ConstantFPSDNode>(Sign);

	  // Constant fold
	  if (MagC && SignC)
	    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, Mag, Sign);

	  if (SignC) {
	    const bool SignIsNegative = SignC->getValueAPF().isNegative();

	    // copysign(x, c1) -> fabs(x) iff ispos(c1)
	    if (!SignIsNegative &&
	        (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT)))
	      return DAG.getNode(ISD::FABS, SDLoc(N), VT, Mag);

	    // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
	    if (SignIsNegative &&
	        (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
	      SDValue Abs = DAG.getNode(ISD::FABS, SDLoc(Mag), VT, Mag);
	      return DAG.getNode(ISD::FNEG, SDLoc(N), VT, Abs);
	    }
	  }

	  // The sign of x is overwritten anyway, so sign-only operations on x can
	  // be dropped:
	  //   copysign(fabs(x), y) -> copysign(x, y)
	  //   copysign(fneg(x), y) -> copysign(x, y)
	  //   copysign(copysign(x,z), y) -> copysign(x, y)
	  const unsigned MagOpc = Mag.getOpcode();
	  if (MagOpc == ISD::FABS || MagOpc == ISD::FNEG ||
	      MagOpc == ISD::FCOPYSIGN)
	    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, Mag.getOperand(0), Sign);

	  switch (Sign.getOpcode()) {
	  case ISD::FABS:
	    // copysign(x, abs(y)) -> abs(x)
	    return DAG.getNode(ISD::FABS, SDLoc(N), VT, Mag);
	  case ISD::FCOPYSIGN:
	    // copysign(x, copysign(y,z)) -> copysign(x, z)
	    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, Mag, Sign.getOperand(1));
	  default:
	    break;
	  }

	  // copysign(x, fp_extend(y)) -> copysign(x, y)
	  // copysign(x, fp_round(y)) -> copysign(x, y)
	  if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
	    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, Mag, Sign.getOperand(0));

	  return SDValue();
	}

	/// Try to combine the SINT_TO_FP node \p N. Returns an empty SDValue when
	/// no combine applies.
	SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();

	  // True when an FP constant of type VT may be created: either
	  // LegalOperations is not set, or the target reports ConstantFP as legal
	  // or custom for VT.
	  auto CanUseFPImm = [&]() {
	    return !LegalOperations ||
	           TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT);
	  };

	  // Builds (select_cc x, y, TrueVal, 0.0, cc) from (setcc x, y, cc).
	  auto BuildSelectCC = [&](SDValue SetCC, double TrueVal) {
	    SDLoc DL(N);
	    SDValue Ops[] = {SetCC.getOperand(0), SetCC.getOperand(1),
	                     DAG.getConstantFP(TrueVal, DL, VT),
	                     DAG.getConstantFP(0.0, DL, VT), SetCC.getOperand(2)};
	    return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
	  };

	  // fold (sint_to_fp c1) -> c1fp
	  // ...but only if the target supports immediate floating-point values
	  if (DAG.isConstantIntBuildVectorOrConstantInt(Src) && CanUseFPImm())
	    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Src);

	  // If SINT_TO_FP is not legal on this target but UINT_TO_FP is, and the
	  // sign bit of the input is known to be zero, the unsigned conversion
	  // gives the same result.
	  if (!TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, SrcVT) &&
	      TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, SrcVT) &&
	      DAG.SignBitIsZero(Src))
	    return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, Src);

	  // The remaining optimizations are desirable only if SELECT_CC can be
	  // lowered.
	  if (!TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) && LegalOperations)
	    return SDValue();

	  // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc)
	  if (Src.getOpcode() == ISD::SETCC && Src.getValueType() == MVT::i1 &&
	      !VT.isVector() && CanUseFPImm())
	    return BuildSelectCC(Src, -1.0);

	  // fold (sint_to_fp (zext (setcc x, y, cc))) ->
	  // (select_cc x, y, 1.0, 0.0,, cc)
	  if (Src.getOpcode() == ISD::ZERO_EXTEND &&
	      Src.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
	      CanUseFPImm())
	    return BuildSelectCC(Src.getOperand(0), 1.0);

	  return SDValue();
	}

	/// Try to combine the UINT_TO_FP node \p N. Returns an empty SDValue when
	/// no combine applies.
	SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();

	  // True when an FP constant of type VT may be created: either
	  // LegalOperations is not set, or the target reports ConstantFP as legal
	  // or custom for VT.
	  auto CanUseFPImm = [&]() {
	    return !LegalOperations ||
	           TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT);
	  };

	  // fold (uint_to_fp c1) -> c1fp
	  // ...but only if the target supports immediate floating-point values
	  if (DAG.isConstantIntBuildVectorOrConstantInt(Src) && CanUseFPImm())
	    return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, Src);

	  // If UINT_TO_FP is not legal on this target but SINT_TO_FP is, and the
	  // sign bit of the input is known to be zero, the signed conversion gives
	  // the same result.
	  if (!TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, SrcVT) &&
	      TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, SrcVT) &&
	      DAG.SignBitIsZero(Src))
	    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Src);

	  // The remaining optimization is desirable only if SELECT_CC can be
	  // lowered.
	  if (!TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) && LegalOperations)
	    return SDValue();

	  // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, 1.0, 0.0,, cc)
	  if (Src.getOpcode() == ISD::SETCC && !VT.isVector() && CanUseFPImm()) {
	    SDLoc DL(N);
	    SDValue Ops[] = {Src.getOperand(0), Src.getOperand(1),
	                     DAG.getConstantFP(1.0, DL, VT),
	                     DAG.getConstantFP(0.0, DL, VT), Src.getOperand(2)};
	    return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
	  }

	  return SDValue();
	}

	// Fold (fp_to_{s/u}int ({s/u}int_to_fp x)) -> zext x, sext x, trunc x, or x
	//
	// N is the fp-to-int node; the fold applies when its operand is an
	// int-to-fp node and the intermediate FP type can represent every relevant
	// integer exactly. Returns an empty SDValue otherwise.
	static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
	  SDValue IntToFP = N->getOperand(0);
	  const unsigned InnerOpc = IntToFP.getOpcode();
	  const bool IsInputSigned = InnerOpc == ISD::SINT_TO_FP;
	  if (!IsInputSigned && InnerOpc != ISD::UINT_TO_FP)
	    return SDValue();

	  const bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;
	  EVT DstVT = N->getValueType(0);
	  SDValue Src = IntToFP.getOperand(0);
	  EVT SrcVT = Src.getValueType();
	  const auto SrcBits = SrcVT.getScalarSizeInBits();
	  const auto DstBits = DstVT.getScalarSizeInBits();

	  // We can safely assume the conversion won't overflow the output range,
	  // because (for example) (uint8_t)18293.f is undefined behavior.
	  //
	  // Given that, whether the value fits in the float depends on the smaller
	  // of the input range and the output range. This also covers a signed
	  // input with an unsigned output, since a negative input would lead to
	  // undefined behavior.
	  //
	  // A signed type contributes one bit less, hence the subtraction.
	  unsigned InputSize = (int)SrcBits - IsInputSigned;
	  unsigned OutputSize = (int)DstBits - IsOutputSigned;
	  unsigned ActualSize = std::min(InputSize, OutputSize);
	  const fltSemantics &Sem = DAG.EVTToAPFloatSemantics(IntToFP.getValueType());

	  // We can only fold away the float conversion if the input range can be
	  // represented exactly in the float range.
	  if (APFloat::semanticsPrecision(Sem) < ActualSize)
	    return SDValue();

	  if (DstBits > SrcBits) {
	    // Sign-extend only when both conversions are signed.
	    unsigned ExtOp = ISD::ZERO_EXTEND;
	    if (IsInputSigned && IsOutputSigned)
	      ExtOp = ISD::SIGN_EXTEND;
	    return DAG.getNode(ExtOp, SDLoc(N), DstVT, Src);
	  }

	  if (DstBits < SrcBits)
	    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), DstVT, Src);

	  // Same scalar width: reuse the source value as the result type.
	  return DAG.getBitcast(DstVT, Src);
	}

	/// Try to combine the FP_TO_SINT node \p N: re-request the node for a
	/// constant FP operand, otherwise defer to FoldIntToFPToInt.
	SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
	  SDValue Operand = N->getOperand(0);

	  if (!isConstantFPBuildVectorOrConstantFP(Operand))
	    return FoldIntToFPToInt(N, DAG);

	  // fold (fp_to_sint c1fp) -> c1
	  return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), N->getValueType(0), Operand);
	}

	/// Try to combine the FP_TO_UINT node \p N: re-request the node for a
	/// constant FP operand, otherwise defer to FoldIntToFPToInt.
	SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
	  SDValue Operand = N->getOperand(0);

	  if (!isConstantFPBuildVectorOrConstantFP(Operand))
	    return FoldIntToFPToInt(N, DAG);

	  // fold (fp_to_uint c1fp) -> c1
	  return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), N->getValueType(0), Operand);
	}

	/// Try to combine the FP_ROUND node \p N. Operand 1 is a constant flag; the
	/// value 1 marks the rounding as a value-preserving truncation. Returns an
	/// empty SDValue when no combine applies.
	SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  SDValue TruncFlag = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  const unsigned SrcOpc = Src.getOpcode();

	  // fold (fp_round c1fp) -> c1fp
	  if (dyn_cast<ConstantFPSDNode>(Src))
	    return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, Src, TruncFlag);

	  // fold (fp_round (fp_extend x)) -> x
	  if (SrcOpc == ISD::FP_EXTEND) {
	    SDValue Extended = Src.getOperand(0);
	    if (VT == Extended.getValueType())
	      return Extended;
	  }

	  // fold (fp_round (fp_round x)) -> (fp_round x)
	  if (SrcOpc == ISD::FP_ROUND) {
	    const bool OuterIsTrunc = N->getConstantOperandVal(1) == 1;
	    const bool InnerIsTrunc = Src.getConstantOperandVal(1) == 1;
	    SDValue Innermost = Src.getOperand(0);

	    // Skip this folding if it results in an fp_round from f80 to f16.
	    //
	    // f80 to f16 always generates an expensive (and as yet, unimplemented)
	    // libcall to __truncxfhf2 instead of selecting native f16 conversion
	    // instructions from f32 or f64. Moreover, the first (value-preserving)
	    // fp_round from f80 to either f32 or f64 may become a NOP in platforms
	    // like x86.
	    if (Innermost.getValueType() == MVT::f80 && VT == MVT::f16)
	      return SDValue();

	    // Double rounding is not the same as rounding once: unless the inner
	    // fp_round is value preserving, it might introduce a tie in the outer
	    // one that a single-step fp_round would not see. The merged node is
	    // value preserving iff both of the original fp_rounds are.
	    if (DAG.getTarget().Options.UnsafeFPMath || InnerIsTrunc) {
	      SDLoc DL(N);
	      return DAG.getNode(
	          ISD::FP_ROUND, DL, VT, Innermost,
	          DAG.getIntPtrConstant(OuterIsTrunc && InnerIsTrunc, DL));
	    }
	  }

	  // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
	  if (SrcOpc == ISD::FCOPYSIGN && Src.getNode()->hasOneUse()) {
	    SDValue RoundedMag = DAG.getNode(ISD::FP_ROUND, SDLoc(Src), VT,
	                                     Src.getOperand(0), TruncFlag);
	    AddToWorklist(RoundedMag.getNode());
	    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, RoundedMag,
	                       Src.getOperand(1));
	  }

	  if (SDValue ResizedVSel = matchVSelectOpSizesWithSetCC(N))
	    return ResizedVSel;

	  return SDValue();
	}

	/// Try to combine the FP_ROUND_INREG node \p N. Operand 1 is a VTSDNode
	/// naming the type to round to. Only a constant operand 0 with a legal
	/// rounding type is folded.
	SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) {
	  SDValue Operand = N->getOperand(0);
	  EVT ResultVT = N->getValueType(0);
	  EVT RoundVT = cast<VTSDNode>(N->getOperand(1))->getVT();
	  ConstantFPSDNode *OperandC = dyn_cast<ConstantFPSDNode>(Operand);

	  if (!OperandC || !isTypeLegal(RoundVT))
	    return SDValue();

	  // fold (fp_round_inreg c1fp) -> c1fp
	  // The constant is re-created in the rounding type and then extended back
	  // to the node's result type.
	  SDLoc DL(N);
	  SDValue Rounded =
	      DAG.getConstantFP(*OperandC->getConstantFPValue(), DL, RoundVT);
	  return DAG.getNode(ISD::FP_EXTEND, DL, ResultVT, Rounded);
	}

	/// Try to combine the FP_EXTEND node \p N. Returns an empty SDValue when no
	/// combine applies; the load fold below instead returns N itself after
	/// replacing it through CombineTo.
	SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);

	// If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
	if (N->hasOneUse() &&
	N->use_begin()->getOpcode() == ISD::FP_ROUND)
	return SDValue();

	// fold (fp_extend c1fp) -> c1fp
	if (isConstantFPBuildVectorOrConstantFP(N0))
	return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);

	// fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
	// Only when FP16_TO_FP is Legal (not merely Custom) for the wider type.
	if (N0.getOpcode() == ISD::FP16_TO_FP &&
	TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
	return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));

	// Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
	// value of X.
	if (N0.getOpcode() == ISD::FP_ROUND
	&& N0.getConstantOperandVal(1) == 1) {
	SDValue In = N0.getOperand(0);
	// Three cases, depending on how X's type compares with the result type:
	// same type, X wider (round down to VT), or X narrower (extend to VT).
	if (In.getValueType() == VT) return In;
	if (VT.bitsLT(In.getValueType()))
	return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
	In, N0.getOperand(1));
	return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
	}

	// fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
	if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
	TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
	LoadSDNode *LN0 = cast<LoadSDNode>(N0);
	SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
	LN0->getChain(),
	LN0->getBasePtr(), N0.getValueType(),
	LN0->getMemOperand());
	// N becomes the extending load. The original load's first result becomes
	// an FP_ROUND (flag 1) of the extended value, and its second result
	// becomes ExtLoad.getValue(1).
	CombineTo(N, ExtLoad);
	CombineTo(N0.getNode(),
	DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
	N0.getValueType(), ExtLoad,
	DAG.getIntPtrConstant(1, SDLoc(N0))),
	ExtLoad.getValue(1));
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}

	if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
	return NewVSel;

	return SDValue();
	}

	/// Try to combine the FCEIL node \p N; only a constant FP operand is
	/// handled.
	SDValue DAGCombiner::visitFCEIL(SDNode *N) {
	  SDValue Operand = N->getOperand(0);

	  if (!isConstantFPBuildVectorOrConstantFP(Operand))
	    return SDValue();

	  // fold (fceil c1) -> fceil(c1)
	  return DAG.getNode(ISD::FCEIL, SDLoc(N), N->getValueType(0), Operand);
	}

	/// Try to combine the FTRUNC node \p N: re-request the node for a constant
	/// FP operand, and drop the truncation when the operand is itself produced
	/// by a rounding operation.
	SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
	  SDValue Operand = N->getOperand(0);

	  // fold (ftrunc c1) -> ftrunc(c1)
	  if (isConstantFPBuildVectorOrConstantFP(Operand))
	    return DAG.getNode(ISD::FTRUNC, SDLoc(N), N->getValueType(0), Operand);

	  // fold ftrunc (known rounded int x) -> x
	  // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is
	  // likely to be generated to extract integer from a rounded floating value.
	  const unsigned Opc = Operand.getOpcode();
	  const bool OperandIsRounded = Opc == ISD::FRINT || Opc == ISD::FTRUNC ||
	                                Opc == ISD::FNEARBYINT ||
	                                Opc == ISD::FFLOOR || Opc == ISD::FCEIL;
	  if (OperandIsRounded)
	    return Operand;

	  return SDValue();
	}

	/// Try to combine the FFLOOR node \p N; only a constant FP operand is
	/// handled.
	SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
	  SDValue Operand = N->getOperand(0);

	  if (!isConstantFPBuildVectorOrConstantFP(Operand))
	    return SDValue();

	  // fold (ffloor c1) -> ffloor(c1)
	  return DAG.getNode(ISD::FFLOOR, SDLoc(N), N->getValueType(0), Operand);
	}

	// FIXME: FNEG and FABS have a lot in common; refactor.
	/// Try to combine the FNEG node \p N. Returns an empty SDValue when no
	/// combine applies.
	SDValue DAGCombiner::visitFNEG(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);

	// Constant fold FNEG.
	if (isConstantFPBuildVectorOrConstantFP(N0))
	return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);

	// If the operand can be negated for free, use its negated form directly.
	if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),
	&DAG.getTarget().Options))
	return GetNegatedExpression(N0, DAG, LegalOperations);

	// Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
	// constant pool values.
	if (!TLI.isFNegFree(VT) &&
	N0.getOpcode() == ISD::BITCAST &&
	N0.getNode()->hasOneUse()) {
	SDValue Int = N0.getOperand(0);
	EVT IntVT = Int.getValueType();
	// Only a scalar integer source is handled; the FP type it is cast to may
	// still be a vector.
	if (IntVT.isInteger() && !IntVT.isVector()) {
	APInt SignMask;
	if (N0.getValueType().isVector()) {
	// For a vector, get a mask such as 0x80... per scalar element
	// and splat it.
	SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits());
	SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
	} else {
	// For a scalar, just generate 0x80...
	SignMask = APInt::getSignMask(IntVT.getSizeInBits());
	}
	SDLoc DL0(N0);
	Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int,
	DAG.getConstant(SignMask, DL0, IntVT));
	AddToWorklist(Int.getNode());
	return DAG.getBitcast(VT, Int);
	}
	}

	// (fneg (fmul c, x)) -> (fmul -c, x)
	// Only a constant in operand 1 of the FMUL is matched.
	if (N0.getOpcode() == ISD::FMUL &&
	(N0.getNode()->hasOneUse() \|\| !TLI.isFNegFree(VT))) {
	ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
	if (CFP1) {
	// CVal (the negated constant) is used only for the legality query; the
	// new node is built from an FNEG of the original constant operand.
	APFloat CVal = CFP1->getValueAPF();
	CVal.changeSign();
	// Performed only from the AfterLegalizeDAG level onward, and only if the
	// negated constant is a legal FP immediate or ConstantFP is legal for VT.
	if (Level >= AfterLegalizeDAG &&
	(TLI.isFPImmLegal(CVal, VT) \|\|
	TLI.isOperationLegal(ISD::ConstantFP, VT)))
	return DAG.getNode(
	ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
	DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)),
	N0->getFlags());
	}
	}

	return SDValue();
	}

	/// Try to combine the FMINNUM node \p N: fold two constant (or constant
	/// splat) operands, and move a lone constant operand to the right-hand
	/// side.
	SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = N->getValueType(0);

	  // Both operands constant: compute the result with APFloat's minnum.
	  if (const ConstantFPSDNode *LHSC = isConstOrConstSplatFP(LHS))
	    if (const ConstantFPSDNode *RHSC = isConstOrConstSplatFP(RHS))
	      return DAG.getConstantFP(
	          minnum(LHSC->getValueAPF(), RHSC->getValueAPF()), SDLoc(N), VT);

	  // Canonicalize to constant on RHS.
	  const bool LHSIsConst = isConstantFPBuildVectorOrConstantFP(LHS);
	  if (LHSIsConst && !isConstantFPBuildVectorOrConstantFP(RHS))
	    return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, RHS, LHS);

	  return SDValue();
	}

	/// Try to combine the FMAXNUM node \p N: fold two constant (or constant
	/// splat) operands, and move a lone constant operand to the right-hand
	/// side.
	SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = N->getValueType(0);

	  // Both operands constant: compute the result with APFloat's maxnum.
	  if (const ConstantFPSDNode *LHSC = isConstOrConstSplatFP(LHS))
	    if (const ConstantFPSDNode *RHSC = isConstOrConstSplatFP(RHS))
	      return DAG.getConstantFP(
	          maxnum(LHSC->getValueAPF(), RHSC->getValueAPF()), SDLoc(N), VT);

	  // Canonicalize to constant on RHS.
	  const bool LHSIsConst = isConstantFPBuildVectorOrConstantFP(LHS);
	  if (LHSIsConst && !isConstantFPBuildVectorOrConstantFP(RHS))
	    return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, RHS, LHS);

	  return SDValue();
	}

	/// Try to combine the FABS node \p N. Returns an empty SDValue when no
	/// combine applies.
	SDValue DAGCombiner::visitFABS(SDNode *N) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);

	// fold (fabs c1) -> fabs(c1)
	if (isConstantFPBuildVectorOrConstantFP(N0))
	return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);

	// fold (fabs (fabs x)) -> (fabs x)
	if (N0.getOpcode() == ISD::FABS)
	return N->getOperand(0);

	// fold (fabs (fneg x)) -> (fabs x)
	// fold (fabs (fcopysign x, y)) -> (fabs x)
	if (N0.getOpcode() == ISD::FNEG \|\| N0.getOpcode() == ISD::FCOPYSIGN)
	return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));

	// Transform fabs(bitconvert(x)) -> bitconvert(x & ~sign) to avoid loading
	// constant pool values.
	if (!TLI.isFAbsFree(VT) &&
	N0.getOpcode() == ISD::BITCAST &&
	N0.getNode()->hasOneUse()) {
	SDValue Int = N0.getOperand(0);
	EVT IntVT = Int.getValueType();
	// Only a scalar integer source is handled; the FP type it is cast to may
	// still be a vector.
	if (IntVT.isInteger() && !IntVT.isVector()) {
	// Despite its name, SignMask holds the inverted sign mask here: every
	// bit set except the sign bit(s).
	APInt SignMask;
	if (N0.getValueType().isVector()) {
	// For a vector, get a mask such as 0x7f... per scalar element
	// and splat it.
	SignMask = ~APInt::getSignMask(N0.getScalarValueSizeInBits());
	SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
	} else {
	// For a scalar, just generate 0x7f...
	SignMask = ~APInt::getSignMask(IntVT.getSizeInBits());
	}
	SDLoc DL(N0);
	Int = DAG.getNode(ISD::AND, DL, IntVT, Int,
	DAG.getConstant(SignMask, DL, IntVT));
	AddToWorklist(Int.getNode());
	return DAG.getBitcast(N->getValueType(0), Int);
	}
	}

	return SDValue();
	}

	/// Try to combine the BRCOND node \p N, whose operands are read below as
	/// (chain, condition, N2), with N2 passed through unchanged to every
	/// replacement branch. Returns an empty SDValue when no combine applies,
	/// or N itself when N was already updated in place.
	SDValue DAGCombiner::visitBRCOND(SDNode *N) {
	SDValue Chain = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	SDValue N2 = N->getOperand(2);

	// If N is a constant we could fold this into a fallthrough or unconditional
	// branch. However that doesn't happen very often in normal code, because
	// Instcombine/SimplifyCFG should have handled the available opportunities.
	// If we did this folding here, it would be necessary to update the
	// MachineBasicBlock CFG, which is awkward.

	// fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
	// on the target.
	if (N1.getOpcode() == ISD::SETCC &&
	TLI.isOperationLegalOrCustom(ISD::BR_CC,
	N1.getOperand(0).getValueType())) {
	return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
	Chain, N1.getOperand(2),
	N1.getOperand(0), N1.getOperand(1), N2);
	}

	// The condition is a single-use SRL, or a single-use TRUNCATE of a
	// single-use SRL.
	if ((N1.hasOneUse() && N1.getOpcode() == ISD::SRL) \|\|
	((N1.getOpcode() == ISD::TRUNCATE && N1.hasOneUse()) &&
	(N1.getOperand(0).hasOneUse() &&
	N1.getOperand(0).getOpcode() == ISD::SRL))) {
	SDNode *Trunc = nullptr;
	if (N1.getOpcode() == ISD::TRUNCATE) {
	// Look past the truncate.
	Trunc = N1.getNode();
	N1 = N1.getOperand(0);
	}

	// Match this pattern so that we can generate simpler code:
	//
	// %a = ...
	// %b = and i32 %a, 2
	// %c = srl i32 %b, 1
	// brcond i32 %c ...
	//
	// into
	//
	// %a = ...
	// %b = and i32 %a, 2
	// %c = setcc eq %b, 0
	// brcond %c ...
	//
	// This applies only when the AND constant value has one bit set and the
	// SRL constant is equal to the log2 of the AND constant. The back-end is
	// smart enough to convert the result into a TEST/JMP sequence.
	// NOTE(review): the sketch above says "setcc eq", but the code below
	// builds the comparison with ISD::SETNE -- confirm which is intended.
	SDValue Op0 = N1.getOperand(0);
	SDValue Op1 = N1.getOperand(1);

	if (Op0.getOpcode() == ISD::AND &&
	Op1.getOpcode() == ISD::Constant) {
	SDValue AndOp1 = Op0.getOperand(1);

	if (AndOp1.getOpcode() == ISD::Constant) {
	const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();

	if (AndConst.isPowerOf2() &&
	cast<ConstantSDNode>(Op1)->getAPIntValue()==AndConst.logBase2()) {
	SDLoc DL(N);
	SDValue SetCC =
	DAG.getSetCC(DL,
	getSetCCResultType(Op0.getValueType()),
	Op0, DAG.getConstant(0, DL, Op0.getValueType()),
	ISD::SETNE);

	SDValue NewBRCond = DAG.getNode(ISD::BRCOND, DL,
	MVT::Other, Chain, SetCC, N2);
	// Don't add the new BRCond into the worklist or else SimplifySelectCC
	// will convert it back to (X & C1) >> C2.
	CombineTo(N, NewBRCond, false);
	// Truncate is dead.
	if (Trunc)
	deleteAndRecombine(Trunc);
	// Replace the uses of SRL with SETCC
	WorklistRemover DeadNodes(*this);
	DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
	deleteAndRecombine(N1.getNode());
	return SDValue(N, 0); // Return N so it doesn't get rechecked!
	}
	}
	}

	if (Trunc)
	// Restore N1 if the above transformation doesn't match.
	N1 = N->getOperand(1);
	}

	// Transform br(xor(x, y)) -> br(x != y)
	// Transform br(xor(xor(x,y), 1)) -> br (x == y)
	if (N1.hasOneUse() && N1.getOpcode() == ISD::XOR) {
	SDNode *TheXor = N1.getNode();
	SDValue Op0 = TheXor->getOperand(0);
	SDValue Op1 = TheXor->getOperand(1);
	if (Op0.getOpcode() == Op1.getOpcode()) {
	// Avoid missing important xor optimizations.
	if (SDValue Tmp = visitXOR(TheXor)) {
	if (Tmp.getNode() != TheXor) {
	DEBUG(dbgs() << "\nReplacing.8 ";
	TheXor->dump(&DAG);
	dbgs() << "\nWith: ";
	Tmp.getNode()->dump(&DAG);
	dbgs() << '\n');
	WorklistRemover DeadNodes(*this);
	DAG.ReplaceAllUsesOfValueWith(N1, Tmp);
	deleteAndRecombine(TheXor);
	return DAG.getNode(ISD::BRCOND, SDLoc(N),
	MVT::Other, Chain, Tmp, N2);
	}

	// visitXOR has changed XOR's operands or replaced the XOR completely,
	// bail out.
	return SDValue(N, 0);
	}
	}

	if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
	bool Equal = false;
	// NOTE(review): this requires Op0 to be both the constant one and an XOR
	// node, which look mutually exclusive, so Equal may never become true and
	// the "xor(xor(x,y), 1)" form may never be matched here. Confirm whether
	// isOneConstant(Op1) was intended (Op0/Op1 would then also need updating
	// before the setcc is built).
	if (isOneConstant(Op0) && Op0.hasOneUse() &&
	Op0.getOpcode() == ISD::XOR) {
	TheXor = Op0.getNode();
	Equal = true;
	}

	// Use the XOR's own type for the setcc unless LegalTypes is set, in which
	// case ask getSetCCResultType for the result type.
	EVT SetCCVT = N1.getValueType();
	if (LegalTypes)
	SetCCVT = getSetCCResultType(SetCCVT);
	SDValue SetCC = DAG.getSetCC(SDLoc(TheXor),
	SetCCVT,
	Op0, Op1,
	Equal ? ISD::SETEQ : ISD::SETNE);
	// Replace the uses of XOR with SETCC
	WorklistRemover DeadNodes(*this);
	DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
	deleteAndRecombine(N1.getNode());
	return DAG.getNode(ISD::BRCOND, SDLoc(N),
	MVT::Other, Chain, SetCC, N2);
	}
	}

	return SDValue();
	}

	// Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
	//
	// Tries to simplify the comparison embedded in the BR_CC node N; when the
	// simplified form is still a SETCC, a new BR_CC is built from its operands.
	SDValue DAGCombiner::visitBR_CC(SDNode *N) {
	  CondCodeSDNode *CondCode = cast<CondCodeSDNode>(N->getOperand(1));
	  SDValue LHS = N->getOperand(2);
	  SDValue RHS = N->getOperand(3);

	  // If N is a constant we could fold this into a fallthrough or unconditional
	  // branch. However that doesn't happen very often in normal code, because
	  // Instcombine/SimplifyCFG should have handled the available opportunities.
	  // If we did this folding here, it would be necessary to update the
	  // MachineBasicBlock CFG, which is awkward.

	  // Use SimplifySetCC to simplify SETCC's.
	  EVT SetCCVT = getSetCCResultType(LHS.getValueType());
	  SDValue Simplified =
	      SimplifySetCC(SetCCVT, LHS, RHS, CondCode->get(), SDLoc(N), false);
	  if (!Simplified.getNode())
	    return SDValue();

	  AddToWorklist(Simplified.getNode());

	  // Only a result that is itself a SETCC can be folded back into a BR_CC.
	  if (Simplified.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // fold to a simpler setcc
	  return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, N->getOperand(0),
	                     Simplified.getOperand(2), Simplified.getOperand(0),
	                     Simplified.getOperand(1), N->getOperand(4));
	}

	/// Return true if 'Use' is a load or a store that uses N as its base pointer
	/// and that N may be folded in the load / store addressing mode.
	static bool canFoldInAddressingMode(SDNode N, SDNode Use,
	SelectionDAG &DAG,
	const TargetLowering &TLI) {
	EVT VT;
	unsigned AS;

	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
	if (LD->isIndexed() \|\| LD->getBasePtr().getNode() != N)
	return false;
	VT = LD->getMemoryVT();
	AS = LD->getAddressSpace();
	} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
	if (ST->isIndexed() \|\| ST->getBasePtr().getNode() != N)
	return false;
	VT = ST->getMemoryVT();
	AS = ST->getAddressSpace();
	} else
	return false;

	TargetLowering::AddrMode AM;
	if (N->getOpcode() == ISD::ADD) {
	ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (Offset)
	// [reg +/- imm]
	AM.BaseOffs = Offset->getSExtValue();
	else
	// [reg +/- reg]
	AM.Scale = 1;
	} else if (N->getOpcode() == ISD::SUB) {
	ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (Offset)
	// [reg +/- imm]
	AM.BaseOffs = -Offset->getSExtValue();
	else
	// [reg +/- reg]
	AM.Scale = 1;
	} else
	return false;

	return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
	VT.getTypeForEVT(*DAG.getContext()), AS);
	}

	/// Try turning a load/store into a pre-indexed load/store when the base
	/// pointer is an add or subtract and it has other uses besides the load/store.
	/// After the transformation, the new indexed load/store has effectively folded
	/// the add/subtract in and all of its other uses are redirected to the
	/// new load/store.
	bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
	  // This combine is only attempted once the DAG has been legalized.
	  if (Level < AfterLegalizeDAG)
	    return false;

	  // Accept only unindexed loads/stores whose memory type has a legal pre-inc
	  // or pre-dec form, and pick up the address operand.
	  bool isLoad = true;
	  SDValue Ptr;
	  EVT VT;
	  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	    if (LD->isIndexed())
	      return false;
	    VT = LD->getMemoryVT();
	    if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) &&
	        !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT))
	      return false;
	    Ptr = LD->getBasePtr();
	  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	    if (ST->isIndexed())
	      return false;
	    VT = ST->getMemoryVT();
	    if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) &&
	        !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT))
	      return false;
	    Ptr = ST->getBasePtr();
	    isLoad = false;
	  } else {
	    return false;
	  }

	  // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
	  // out. There is no reason to make this a preinc/predec.
	  if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
	      Ptr.getNode()->hasOneUse())
	    return false;

	  // Ask the target to do addressing mode selection.
	  SDValue BasePtr;
	  SDValue Offset;
	  ISD::MemIndexedMode AM = ISD::UNINDEXED;
	  if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
	    return false;

	  // Backends without true r+i pre-indexed forms may need to pass a
	  // constant base with a variable offset so that constant coercion
	  // will work with the patterns in canonical form.
	  // While Swapped is set, BasePtr holds the non-constant operand; the swap is
	  // undone before the indexed node is built and redone for the OtherUses
	  // rewrite below.
	  bool Swapped = false;
	  if (isa<ConstantSDNode>(BasePtr)) {
	    std::swap(BasePtr, Offset);
	    Swapped = true;
	  }

	  // Don't create an indexed load / store with zero offset.
	  if (isNullConstant(Offset))
	    return false;

	  // Try turning it into a pre-indexed load / store except when:
	  // 1) The new base ptr is a frame index.
	  // 2) If N is a store and the new base ptr is either the same as or is a
	  //    predecessor of the value being stored.
	  // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
	  //    that would create a cycle.
	  // 4) All uses are load / store ops that use it as old base ptr.

	  // Check #1. Preinc'ing a frame index would require copying the stack pointer
	  // (plus the implicit offset) to a register to preinc anyway.
	  if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
	    return false;

	  // Check #2.
	  if (!isLoad) {
	    SDValue Val = cast<StoreSDNode>(N)->getValue();
	    if (Val == BasePtr || BasePtr.getNode()->isPredecessorOf(Val.getNode()))
	      return false;
	  }

	  // Caches for hasPredecessorHelper. They are shared by the OtherUses scan
	  // and the #3/#4 scan below; both search starting from N.
	  SmallPtrSet<const SDNode *, 32> Visited;
	  SmallVector<const SDNode *, 16> Worklist;
	  Worklist.push_back(N);

	  // If the offset is a constant, there may be other adds of constants that
	  // can be folded with this one. We should do this to avoid having to keep
	  // a copy of the original base pointer.
	  SmallVector<SDNode *, 16> OtherUses;
	  if (isa<ConstantSDNode>(Offset))
	    for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(),
	                              UE = BasePtr.getNode()->use_end();
	         UI != UE; ++UI) {
	      SDUse &Use = UI.getUse();
	      // Skip the use that is Ptr and uses of other results from BasePtr's
	      // node (important for nodes that return multiple results).
	      if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
	        continue;

	      if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
	        continue;

	      // Any remaining user that is not "add/sub BasePtr, constant" with a
	      // matching constant type abandons the whole OtherUses rewrite.
	      if (Use.getUser()->getOpcode() != ISD::ADD &&
	          Use.getUser()->getOpcode() != ISD::SUB) {
	        OtherUses.clear();
	        break;
	      }

	      // The operand of the binary user that is not this use of BasePtr.
	      SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1);
	      if (!isa<ConstantSDNode>(Op1)) {
	        OtherUses.clear();
	        break;
	      }

	      // FIXME: In some cases, we can be smarter about this.
	      if (Op1.getValueType() != Offset.getValueType()) {
	        OtherUses.clear();
	        break;
	      }

	      OtherUses.push_back(Use.getUser());
	    }

	  // Restore the operand order the target returned before building the node.
	  if (Swapped)
	    std::swap(BasePtr, Offset);

	  // Now check for #3 and #4.
	  bool RealUse = false;

	  for (SDNode *Use : Ptr.getNode()->uses()) {
	    if (Use == N)
	      continue;
	    if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
	      return false;

	    // If Ptr may be folded in addressing mode of other use, then it's
	    // not profitable to do this transformation.
	    if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
	      RealUse = true;
	  }

	  if (!RealUse)
	    return false;

	  SDValue Result;
	  if (isLoad)
	    Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
	                                BasePtr, Offset, AM);
	  else
	    Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
	                                 BasePtr, Offset, AM);
	  ++PreIndexedNodes;
	  ++NodesCombined;
	  DEBUG(dbgs() << "\nReplacing.4 ";
	        N->dump(&DAG);
	        dbgs() << "\nWith: ";
	        Result.getNode()->dump(&DAG);
	        dbgs() << '\n');
	  WorklistRemover DeadNodes(*this);
	  // Result numbering as used here: an indexed load yields (value, updated
	  // pointer, chain); an indexed store yields (updated pointer, chain).
	  if (isLoad) {
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
	  } else {
	    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
	  }

	  // Finally, since the node is now dead, remove it from the graph.
	  deleteAndRecombine(N);

	  // Swap again so that BasePtr is the non-constant operand and Offset the
	  // constant one for the rewrite of the other uses.
	  if (Swapped)
	    std::swap(BasePtr, Offset);

	  // Replace other uses of BasePtr that can be updated to use Ptr
	  for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
	    unsigned OffsetIdx = 1;
	    if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
	      OffsetIdx = 0;
	    assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
	           BasePtr.getNode() && "Expected BasePtr operand");

	    // We need to replace ptr0 in the following expression:
	    //   x0 * offset0 + y0 * ptr0 = t0
	    // knowing that
	    //   x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
	    //
	    // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
	    // indexed load/store and the expression that needs to be re-written.
	    //
	    // Therefore, we have:
	    //   t0 = (x0 * offset0 - x1 * y0 * y1 * offset1) + (y0 * y1) * t1

	    ConstantSDNode *CN =
	      cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
	    int X0, X1, Y0, Y1;
	    const APInt &Offset0 = CN->getAPIntValue();
	    APInt Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();

	    X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
	    Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
	    X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
	    Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;

	    // The sign of (y0 * y1) decides whether t1 is added or subtracted.
	    unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD;

	    // CNV = x0 * offset0 - x1 * y0 * y1 * offset1.
	    APInt CNV = Offset0;
	    if (X0 < 0) CNV = -CNV;
	    if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
	    else CNV = CNV - Offset1;

	    SDLoc DL(OtherUses[i]);

	    // We can now generate the new expression.
	    SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
	    SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0);

	    SDValue NewUse = DAG.getNode(Opcode,
	                                 DL,
	                                 OtherUses[i]->getValueType(0), NewOp1, NewOp2);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
	    deleteAndRecombine(OtherUses[i]);
	  }

	  // Replace the uses of Ptr with uses of the updated base value.
	  DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
	  deleteAndRecombine(Ptr.getNode());
	  AddToWorklist(Result.getNode());

	  return true;
	}

	/// Try to combine a load/store with an add/sub of the base pointer node into a
	/// post-indexed load/store. The transformation folds the add/subtract into the
	/// new indexed load/store and redirects all of its uses to the new load/store.
	///
	/// \param N the load or store node to transform.
	/// \return true if \p N was replaced by a post-indexed node (and \p N and
	/// the folded add/sub were handed to deleteAndRecombine), false if the DAG
	/// was left untouched.
	bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
	  // This combine is only attempted once the DAG has been legalized.
	  if (Level < AfterLegalizeDAG)
	    return false;

	  // Accept only unindexed loads/stores whose memory type has a legal post-inc
	  // or post-dec form, and pick up the address operand.
	  bool isLoad = true;
	  SDValue Ptr;
	  EVT VT;
	  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	    if (LD->isIndexed())
	      return false;
	    VT = LD->getMemoryVT();
	    if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) &&
	        !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT))
	      return false;
	    Ptr = LD->getBasePtr();
	  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	    if (ST->isIndexed())
	      return false;
	    VT = ST->getMemoryVT();
	    if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) &&
	        !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT))
	      return false;
	    Ptr = ST->getBasePtr();
	    isLoad = false;
	  } else {
	    return false;
	  }

	  // With N as its only user there is no add/sub of Ptr to fold.
	  if (Ptr.getNode()->hasOneUse())
	    return false;

	  // Consider every add/sub user of the pointer (other than N itself) as the
	  // candidate increment to fold.
	  for (SDNode *Op : Ptr.getNode()->uses()) {
	    if (Op == N ||
	        (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB))
	      continue;

	    SDValue BasePtr;
	    SDValue Offset;
	    ISD::MemIndexedMode AM = ISD::UNINDEXED;
	    if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) {
	      // Don't create an indexed load / store with zero offset.
	      if (isNullConstant(Offset))
	        continue;

	      // Try turning it into a post-indexed load / store except when
	      // 1) All uses are load / store ops that use it as base ptr (and
	      //    it may be folded as addressing mode).
	      // 2) Op must be independent of N, i.e. Op is neither a predecessor
	      //    nor a successor of N. Otherwise, if Op is folded that would
	      //    create a cycle.

	      if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
	        continue;

	      // Check for #1.
	      bool TryNext = false;
	      for (SDNode *Use : BasePtr.getNode()->uses()) {
	        if (Use == Ptr.getNode())
	          continue;

	        // If all the uses are load / store addresses, then don't do the
	        // transformation.
	        if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){
	          bool RealUse = false;
	          for (SDNode *UseUse : Use->uses()) {
	            if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI))
	              RealUse = true;
	          }

	          if (!RealUse) {
	            TryNext = true;
	            break;
	          }
	        }
	      }

	      if (TryNext)
	        continue;

	      // Check for #2
	      if (!Op->isPredecessorOf(N) && !N->isPredecessorOf(Op)) {
	        SDValue Result = isLoad
	          ? DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
	                               BasePtr, Offset, AM)
	          : DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
	                                BasePtr, Offset, AM);
	        ++PostIndexedNodes;
	        ++NodesCombined;
	        DEBUG(dbgs() << "\nReplacing.5 ";
	              N->dump(&DAG);
	              dbgs() << "\nWith: ";
	              Result.getNode()->dump(&DAG);
	              dbgs() << '\n');
	        WorklistRemover DeadNodes(*this);
	        // Result numbering as used here: an indexed load yields (value,
	        // updated pointer, chain); an indexed store yields (updated pointer,
	        // chain).
	        if (isLoad) {
	          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
	          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
	        } else {
	          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
	        }

	        // Finally, since the node is now dead, remove it from the graph.
	        deleteAndRecombine(N);

	        // Replace the uses of Op with uses of the updated base value.
	        DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
	                                      Result.getValue(isLoad ? 1 : 0));
	        deleteAndRecombine(Op);
	        return true;
	      }
	    }
	  }

	  return false;
	}

	/// \brief Return the base-pointer arithmetic from an indexed \p LD.
	///
	/// Builds a standalone ADD (for PRE_INC / POST_INC) or SUB (for the other
	/// indexed modes) of the load's base pointer (operand 1) and its increment
	/// (operand 2).
	///
	/// \pre \p LD is indexed and its increment is not an opaque TargetConstant;
	/// both conditions are asserted.
	SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
	  ISD::MemIndexedMode AM = LD->getAddressingMode();
	  assert(AM != ISD::UNINDEXED);
	  SDValue BP = LD->getOperand(1);
	  SDValue Inc = LD->getOperand(2);

	  // Some backends use TargetConstants for load offsets, but don't expect
	  // TargetConstants in general ADD nodes. We can convert these constants into
	  // regular Constants (if the constant is not opaque).
	  assert((Inc.getOpcode() != ISD::TargetConstant ||
	          !cast<ConstantSDNode>(Inc)->isOpaque()) &&
	         "Cannot split out indexing using opaque target constants");
	  if (Inc.getOpcode() == ISD::TargetConstant) {
	    ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc);
	    Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc),
	                          ConstInc->getValueType(0));
	  }

	  unsigned Opc =
	      (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB);
	  return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
	}

	/// Combine a load node. In order, this tries to: delete a dead non-volatile
	/// load, forward the value of a directly preceding store, raise the load's
	/// alignment, move the load onto a better chain, form a pre/post-indexed
	/// load, and slice the load into smaller loads.
	///
	/// \return SDValue(N, 0) or the result of CombineTo when something changed,
	/// or an empty SDValue when no combine applied.
	SDValue DAGCombiner::visitLOAD(SDNode *N) {
	  LoadSDNode *LD  = cast<LoadSDNode>(N);
	  SDValue Chain = LD->getChain();
	  SDValue Ptr   = LD->getBasePtr();

	  // If load is not volatile and there are no uses of the loaded value (and
	  // the updated indexed value in case of indexed loads), change uses of the
	  // chain value into uses of the chain input (i.e. delete the dead load).
	  if (!LD->isVolatile()) {
	    if (N->getValueType(1) == MVT::Other) {
	      // Unindexed loads.
	      if (!N->hasAnyUseOfValue(0)) {
	        // It's not safe to use the two value CombineTo variant here. e.g.
	        // v1, chain2 = load chain1, loc
	        // v2, chain3 = load chain2, loc
	        // v3         = add v2, c
	        // Now we replace use of chain2 with chain1.  This makes the second load
	        // isomorphic to the one we are deleting, and thus makes this load live.
	        DEBUG(dbgs() << "\nReplacing.6 ";
	              N->dump(&DAG);
	              dbgs() << "\nWith chain: ";
	              Chain.getNode()->dump(&DAG);
	              dbgs() << "\n");
	        WorklistRemover DeadNodes(*this);
	        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
	        AddUsersToWorklist(Chain.getNode());
	        if (N->use_empty())
	          deleteAndRecombine(N);

	        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
	      }
	    } else {
	      // Indexed loads.
	      assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");

	      // If this load has an opaque TargetConstant offset, then we cannot split
	      // the indexing into an add/sub directly (that TargetConstant may not be
	      // valid for a different type of node, and we cannot convert an opaque
	      // target constant into a regular constant).
	      bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant &&
	                       cast<ConstantSDNode>(LD->getOperand(2))->isOpaque();

	      // The loaded value is dead. The load can go if the updated pointer is
	      // dead too, or if the pointer update can be split into its own node.
	      if (!N->hasAnyUseOfValue(0) &&
	          ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) {
	        SDValue Undef = DAG.getUNDEF(N->getValueType(0));
	        SDValue Index;
	        if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) {
	          Index = SplitIndexingFromLoad(LD);
	          // Try to fold the base pointer arithmetic into subsequent loads and
	          // stores.
	          AddUsersToWorklist(N);
	        } else
	          Index = DAG.getUNDEF(N->getValueType(1));
	        DEBUG(dbgs() << "\nReplacing.7 ";
	              N->dump(&DAG);
	              dbgs() << "\nWith: ";
	              Undef.getNode()->dump(&DAG);
	              dbgs() << " and 2 other values\n");
	        WorklistRemover DeadNodes(*this);
	        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
	        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
	        DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
	        deleteAndRecombine(N);
	        return SDValue(N, 0);   // Return N so it doesn't get rechecked!
	      }
	    }
	  }

	  // If this load is directly stored, replace the load value with the stored
	  // value.
	  // TODO: Handle store large -> read small portion.
	  // TODO: Handle TRUNCSTORE/LOADEXT
	  if (OptLevel != CodeGenOpt::None &&
	      ISD::isNormalLoad(N) && !LD->isVolatile()) {
	    if (ISD::isNON_TRUNCStore(Chain.getNode())) {
	      StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
	      if (PrevST->getBasePtr() == Ptr &&
	          PrevST->getValue().getValueType() == N->getValueType(0))
	        return CombineTo(N, PrevST->getOperand(1), Chain);
	    }
	  }

	  // Try to infer better alignment information than the load already has.
	  if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
	    if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
	      if (Align > LD->getMemOperand()->getBaseAlignment()) {
	        SDValue NewLoad = DAG.getExtLoad(
	            LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
	            LD->getPointerInfo(), LD->getMemoryVT(), Align,
	            LD->getMemOperand()->getFlags(), LD->getAAInfo());
	        // Only replace N when a different node came back.
	        if (NewLoad.getNode() != N)
	          return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true);
	      }
	    }
	  }

	  if (LD->isUnindexed()) {
	    // Walk up chain skipping non-aliasing memory nodes.
	    SDValue BetterChain = FindBetterChain(N, Chain);

	    // If there is a better chain.
	    if (Chain != BetterChain) {
	      SDValue ReplLoad;

	      // Replace the chain to avoid the dependency.
	      if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
	        ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
	                               BetterChain, Ptr, LD->getMemOperand());
	      } else {
	        ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
	                                  LD->getValueType(0),
	                                  BetterChain, Ptr, LD->getMemoryVT(),
	                                  LD->getMemOperand());
	      }

	      // Create token factor to keep old chain connected.
	      SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
	                                  MVT::Other, Chain, ReplLoad.getValue(1));

	      // Replace uses with load result and token factor
	      return CombineTo(N, ReplLoad.getValue(0), Token);
	    }
	  }

	  // Try transforming N to an indexed load.
	  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
	    return SDValue(N, 0);

	  // Try to slice up N to more direct loads if the slices are mapped to
	  // different register banks or pairing can take place.
	  if (SliceUpLoad(N))
	    return SDValue(N, 0);

	  return SDValue();
	}

	namespace {

	/// \brief Helper structure used to slice a load in smaller loads.
	/// Basically a slice is obtained from the following sequence:
	/// Origin = load Ty1, Base
	/// Shift = srl Ty1 Origin, CstTy Amount
	/// Inst = trunc Shift to Ty2
	///
	/// Then, it will be rewritten into:
	/// Slice = load SliceTy, Base + SliceOffset
	/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
	///
	/// SliceTy is deduced from the number of bits that are actually used to
	/// build Inst.
	struct LoadedSlice {
	/// \brief Helper structure used to compute the cost of a slice.
	struct Cost {
	/// Are we optimizing for code size.
	bool ForCodeSize;

	/// Various cost.
	unsigned Loads = 0;
	unsigned Truncates = 0;
	unsigned CrossRegisterBanksCopies = 0;
	unsigned ZExts = 0;
	unsigned Shift = 0;

	Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize) {}

	/// \brief Get the cost of one isolated slice.
	Cost(const LoadedSlice &LS, bool ForCodeSize = false)
	: ForCodeSize(ForCodeSize), Loads(1) {
	EVT TruncType = LS.Inst->getValueType(0);
	EVT LoadedType = LS.getLoadedType();
	if (TruncType != LoadedType &&
	!LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
	ZExts = 1;
	}

	/// \brief Account for slicing gain in the current cost.
	/// Slicing provide a few gains like removing a shift or a
	/// truncate. This method allows to grow the cost of the original
	/// load with the gain from this slice.
	void addSliceGain(const LoadedSlice &LS) {
	// Each slice saves a truncate.
	const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
	if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
	LS.Inst->getValueType(0)))
	++Truncates;
	// If there is a shift amount, this slice gets rid of it.
	if (LS.Shift)
	++Shift;
	// If this slice can merge a cross register bank copy, account for it.
	if (LS.canMergeExpensiveCrossRegisterBankCopy())
	++CrossRegisterBanksCopies;
	}

	Cost &operator+=(const Cost &RHS) {
	Loads += RHS.Loads;
	Truncates += RHS.Truncates;
	CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
	ZExts += RHS.ZExts;
	Shift += RHS.Shift;
	return *this;
	}

	bool operator==(const Cost &RHS) const {
	return Loads == RHS.Loads && Truncates == RHS.Truncates &&
	CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
	ZExts == RHS.ZExts && Shift == RHS.Shift;
	}

	bool operator!=(const Cost &RHS) const { return !(*this == RHS); }

	bool operator<(const Cost &RHS) const {
	// Assume cross register banks copies are as expensive as loads.
	// FIXME: Do we want some more target hooks?
	unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
	unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
	// Unless we are optimizing for code size, consider the
	// expensive operation first.
	if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
	return ExpensiveOpsLHS < ExpensiveOpsRHS;
	return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
	(RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
	}

	bool operator>(const Cost &RHS) const { return RHS < *this; }

	bool operator<=(const Cost &RHS) const { return !(RHS < *this); }

	bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
	};

	// The last instruction that represent the slice. This should be a
	// truncate instruction.
	SDNode *Inst;

	// The original load instruction.
	LoadSDNode *Origin;

	// The right shift amount in bits from the original load.
	unsigned Shift;

	// The DAG from which Origin came from.
	// This is used to get some contextual information about legal types, etc.
	SelectionDAG *DAG;

	LoadedSlice(SDNode Inst = nullptr, LoadSDNode Origin = nullptr,
	unsigned Shift = 0, SelectionDAG *DAG = nullptr)
	: Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}

	/// \brief Get the bits used in a chunk of bits \p BitWidth large.
	/// \return Result is \p BitWidth and has used bits set to 1 and
	/// not used bits set to 0.
	APInt getUsedBits() const {
	// Reproduce the trunc(lshr) sequence:
	// - Start from the truncated value.
	// - Zero extend to the desired bit width.
	// - Shift left.
	assert(Origin && "No original load to compare against.");
	unsigned BitWidth = Origin->getValueSizeInBits(0);
	assert(Inst && "This slice is not bound to an instruction");
	assert(Inst->getValueSizeInBits(0) <= BitWidth &&
	"Extracted slice is bigger than the whole type!");
	APInt UsedBits(Inst->getValueSizeInBits(0), 0);
	UsedBits.setAllBits();
	UsedBits = UsedBits.zext(BitWidth);
	UsedBits <<= Shift;
	return UsedBits;
	}

	/// \brief Get the size of the slice to be loaded in bytes.
	unsigned getLoadedSize() const {
	unsigned SliceSize = getUsedBits().countPopulation();
	assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
	return SliceSize / 8;
	}

	/// \brief Get the type that will be loaded for this slice.
	/// Note: This may not be the final type for the slice.
	EVT getLoadedType() const {
	assert(DAG && "Missing context");
	LLVMContext &Ctxt = *DAG->getContext();
	return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
	}

	/// \brief Get the alignment of the load used for this slice.
	unsigned getAlignment() const {
	unsigned Alignment = Origin->getAlignment();
	unsigned Offset = getOffsetFromBase();
	if (Offset != 0)
	Alignment = MinAlign(Alignment, Alignment + Offset);
	return Alignment;
	}

	/// \brief Check if this slice can be rewritten with legal operations.
	bool isLegal() const {
	// An invalid slice is not legal.
	if (!Origin \|\| !Inst \|\| !DAG)
	return false;

	// Offsets are for indexed load only, we do not handle that.
	if (!Origin->getOffset().isUndef())
	return false;

	const TargetLowering &TLI = DAG->getTargetLoweringInfo();

	// Check that the type is legal.
	EVT SliceType = getLoadedType();
	if (!TLI.isTypeLegal(SliceType))
	return false;

	// Check that the load is legal for this type.
	if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
	return false;

	// Check that the offset can be computed.
	// 1. Check its type.
	EVT PtrType = Origin->getBasePtr().getValueType();
	if (PtrType == MVT::Untyped \|\| PtrType.isExtended())
	return false;

	// 2. Check that it fits in the immediate.
	if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
	return false;

	// 3. Check that the computation is legal.
	if (!TLI.isOperationLegal(ISD::ADD, PtrType))
	return false;

	// Check that the zext is legal if it needs one.
	EVT TruncateType = Inst->getValueType(0);
	if (TruncateType != SliceType &&
	!TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
	return false;

	return true;
	}

	/// \brief Get the offset in bytes of this slice in the original chunk of
	/// bits.
	/// \pre DAG != nullptr.
	uint64_t getOffsetFromBase() const {
	assert(DAG && "Missing context.");
	bool IsBigEndian = DAG->getDataLayout().isBigEndian();
	assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
	uint64_t Offset = Shift / 8;
	unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
	assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
	"The size of the original loaded type is not a multiple of a"
	" byte.");
	// If Offset is bigger than TySizeInBytes, it means we are loading all
	// zeros. This should have been optimized before in the process.
	assert(TySizeInBytes > Offset &&
	"Invalid shift amount for given loaded size");
	if (IsBigEndian)
	Offset = TySizeInBytes - Offset - getLoadedSize();
	return Offset;
	}

	/// \brief Generate the sequence of instructions to load the slice
	/// represented by this object and redirect the uses of this slice to
	/// this new sequence of instructions.
	/// \pre this->Inst && this->Origin are valid Instructions and this
	/// object passed the legal check: LoadedSlice::isLegal returned true.
	/// \return The last instruction of the sequence used to load the slice.
	SDValue loadSlice() const {
	assert(Inst && Origin && "Unable to replace a non-existing slice.");
	const SDValue &OldBaseAddr = Origin->getBasePtr();
	SDValue BaseAddr = OldBaseAddr;
	// Get the offset in that chunk of bytes w.r.t. the endianness.
	int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
	assert(Offset >= 0 && "Offset too big to fit in int64_t!");
	if (Offset) {
	// BaseAddr = BaseAddr + Offset.
	EVT ArithType = BaseAddr.getValueType();
	SDLoc DL(Origin);
	BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr,
	DAG->getConstant(Offset, DL, ArithType));
	}

	// Create the type of the loaded slice according to its size.
	EVT SliceType = getLoadedType();

	// Create the load for the slice.
	SDValue LastInst =
	DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
	Origin->getPointerInfo().getWithOffset(Offset),
	getAlignment(), Origin->getMemOperand()->getFlags());
	// If the final type is not the same as the loaded type, this means that
	// we have to pad with zero. Create a zero extend for that.
	EVT FinalType = Inst->getValueType(0);
	if (SliceType != FinalType)
	LastInst =
	DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
	return LastInst;
	}

	/// \brief Check if this slice can be merged with an expensive cross register
	/// bank copy. E.g.,
	/// i = load i32
	/// f = bitcast i32 i to float
	bool canMergeExpensiveCrossRegisterBankCopy() const {
	if (!Inst \|\| !Inst->hasOneUse())
	return false;
	SDNode Use = Inst->use_begin();
	if (Use->getOpcode() != ISD::BITCAST)
	return false;
	assert(DAG && "Missing context");
	const TargetLowering &TLI = DAG->getTargetLoweringInfo();
	EVT ResVT = Use->getValueType(0);
	const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
	const TargetRegisterClass *ArgRC =
	TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
	if (ArgRC == ResRC \|\| !TLI.isOperationLegal(ISD::LOAD, ResVT))
	return false;

	// At this point, we know that we perform a cross-register-bank copy.
	// Check if it is expensive.
	const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo();
	// Assume bitcasts are cheap, unless both register classes do not
	// explicitly share a common sub class.
	if (!TRI \|\| TRI->getCommonSubClass(ArgRC, ResRC))
	return false;

	// Check if it will be merged with the load.
	// 1. Check the alignment constraint.
	unsigned RequiredAlignment = DAG->getDataLayout().getABITypeAlignment(
	ResVT.getTypeForEVT(*DAG->getContext()));

	if (RequiredAlignment > getAlignment())
	return false;

	// 2. Check that the load is a legal operation for that type.
	if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
	return false;

	// 3. Check that we do not have a zext in the way.
	if (Inst->getValueType(0) != getLoadedType())
	return false;

	return true;
	}
	};

	} // end anonymous namespace

	/// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
	/// \p UsedBits looks like 0..0 1..1 0..0.
	static bool areUsedBitsDense(const APInt &UsedBits) {
	// If all the bits are one, this is dense!
	if (UsedBits.isAllOnesValue())
	return true;

	// Get rid of the unused bits on the right.
	APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
	// Get rid of the unused bits on the left.
	if (NarrowedUsedBits.countLeadingZeros())
	NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
	// Check that the chunk of bits is completely used.
	return NarrowedUsedBits.isAllOnesValue();
	}

	/// \brief Check whether or not \p First and \p Second are next to each other
	/// in memory. This means that there is no hole between the bits loaded
	/// by \p First and the bits loaded by \p Second.
	/// \pre Both slices share the same non-null Origin and do not overlap; both
	/// conditions are asserted.
	static bool areSlicesNextToEachOther(const LoadedSlice &First,
	                                     const LoadedSlice &Second) {
	  assert(First.Origin == Second.Origin && First.Origin &&
	         "Unable to match different memory origins.");
	  APInt UsedBits = First.getUsedBits();
	  assert((UsedBits & Second.getUsedBits()) == 0 &&
	         "Slices are not supposed to overlap.");
	  // The two slices are adjacent iff the union of their bits has no hole.
	  UsedBits |= Second.getUsedBits();
	  return areUsedBitsDense(UsedBits);
	}

	/// \brief Adjust the \p GlobalLSCost according to the target
	/// pairing capabilities and the layout of the slices.
	/// \pre \p GlobalLSCost should account for at least as many loads as
	/// there is in the slices in \p LoadedSlices.
	///
	/// \p LoadedSlices is sorted in place by offset from the base. One load is
	/// removed from \p GlobalLSCost for every pair of neighbouring slices that
	/// the target can load with a single paired load.
	static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
	                                 LoadedSlice::Cost &GlobalLSCost) {
	  unsigned NumberOfSlices = LoadedSlices.size();
	  // If there is less than 2 elements, no pairing is possible.
	  if (NumberOfSlices < 2)
	    return;

	  // Sort the slices so that elements that are likely to be next to each
	  // other in memory are next to each other in the list.
	  std::sort(LoadedSlices.begin(), LoadedSlices.end(),
	            [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
	    assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
	    return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
	  });
	  const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
	  // First (resp. Second) is the first (resp. Second) potentially candidate
	  // to be placed in a paired load.
	  const LoadedSlice *First = nullptr;
	  const LoadedSlice *Second = nullptr;
	  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
	                // Set the beginning of the pair.
	                                                           First = Second) {
	    Second = &LoadedSlices[CurrSlice];

	    // If First is NULL, it means we start a new pair.
	    // Get to the next slice.
	    if (!First)
	      continue;

	    EVT LoadedType = First->getLoadedType();

	    // If the types of the slices are different, we cannot pair them.
	    if (LoadedType != Second->getLoadedType())
	      continue;

	    // Check if the target supplies paired loads for this type.
	    unsigned RequiredAlignment = 0;
	    if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
	      // move to the next pair, this type is hopeless.
	      Second = nullptr;
	      continue;
	    }
	    // Check if we meet the alignment requirement.
	    if (RequiredAlignment > First->getAlignment())
	      continue;

	    // Check that both loads are next to each other in memory.
	    if (!areSlicesNextToEachOther(*First, *Second))
	      continue;

	    assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
	    --GlobalLSCost.Loads;
	    // Move to the next pair. Clearing Second makes the loop increment reset
	    // First, so a slice is never counted in two pairs.
	    Second = nullptr;
	  }
	}

	/// \brief Decide whether rewriting the load as the given slices pays off.
	/// Slicing is currently accepted only when (1) exactly two slices are
	/// involved, (2) together they cover a hole-free range of bits, and (3)
	/// their accumulated cost (\see LoadedSlice::Cost) is lower than the cost
	/// of the original load.
	///
	/// Note: The order of the elements in \p LoadedSlices may be modified, but not
	/// the elements themselves.
	///
	/// FIXME: When the cost model will be mature enough, we can relax
	/// constraints (1) and (2).
	static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
	                                const APInt &UsedBits, bool ForCodeSize) {
	  const unsigned SliceCount = LoadedSlices.size();

	  // Under stress testing every multi-slice split is accepted.
	  if (StressLoadSlicing)
	    return SliceCount > 1;

	  // Constraint (1): exactly two slices.
	  // Constraint (2): the bits they use are contiguous.
	  if (SliceCount != 2 || !areUsedBitsDense(UsedBits))
	    return false;

	  // Constraint (3): compare the two cost models.
	  LoadedSlice::Cost SlicedCost(ForCodeSize);
	  LoadedSlice::Cost UnslicedCost(ForCodeSize);
	  // Without slicing there is a single big load.
	  UnslicedCost.Loads = 1;
	  for (const LoadedSlice &Slice : LoadedSlices) {
	    // Sum up what each slice costs on its own.
	    SlicedCost += LoadedSlice::Cost(Slice, ForCodeSize);

	    // Whatever this slice saves is charged to the unsliced configuration.
	    UnslicedCost.addSliceGain(Slice);
	  }

	  // Targets with paired loads may fold neighbouring slices into one load.
	  adjustCostForPairing(LoadedSlices, SlicedCost);
	  return SlicedCost < UnslicedCost;
	}

	/// \brief If the given load, \p N, is used only by trunc or trunc(lshr)
	/// operations, split it in the various pieces being extracted.
	///
	/// This sort of thing is introduced by SROA.
	/// This slicing takes care not to insert overlapping loads.
	/// \pre \p N is a LoadSDNode. Volatile, non-normal and non-integer loads are
	/// rejected.
	/// \return true if the users of the load were rewritten to use independent
	/// slice loads, false if nothing was changed.
	bool DAGCombiner::SliceUpLoad(SDNode *N) {
	  // This combine is only attempted once the DAG has been legalized.
	  if (Level < AfterLegalizeDAG)
	    return false;

	  LoadSDNode *LD = cast<LoadSDNode>(N);
	  if (LD->isVolatile() || !ISD::isNormalLoad(LD) ||
	      !LD->getValueType(0).isInteger())
	    return false;

	  // Keep track of already used bits to detect overlapping values.
	  // In that case, we will just abort the transformation.
	  APInt UsedBits(LD->getValueSizeInBits(0), 0);

	  SmallVector<LoadedSlice, 4> LoadedSlices;

	  // Check if this load is used as several smaller chunks of bits.
	  // Basically, look for uses in trunc or trunc(lshr) and record a new chain
	  // of computation for each trunc.
	  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
	       UI != UIEnd; ++UI) {
	    // Skip the uses of the chain.
	    if (UI.getUse().getResNo() != 0)
	      continue;

	    SDNode *User = *UI;
	    unsigned Shift = 0;

	    // Check if this is a trunc(lshr).
	    if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
	        isa<ConstantSDNode>(User->getOperand(1))) {
	      Shift = User->getConstantOperandVal(1);
	      User = *User->use_begin();
	    }

	    // At this point, User is a Truncate, iff we encountered, trunc or
	    // trunc(lshr).
	    if (User->getOpcode() != ISD::TRUNCATE)
	      return false;

	    // The width of the type must be a power of 2 and greater than 8-bits.
	    // Otherwise the load cannot be represented in LLVM IR.
	    // Moreover, if we shifted with a non-8-bits multiple, the slice
	    // will be across several bytes. We do not support that.
	    unsigned Width = User->getValueSizeInBits(0);
	    if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
	      return false;

	    // Build the slice for this chain of computations.
	    LoadedSlice LS(User, LD, Shift, &DAG);
	    APInt CurrentUsedBits = LS.getUsedBits();

	    // Check if this slice overlaps with another.
	    if ((CurrentUsedBits & UsedBits) != 0)
	      return false;
	    // Update the bits used globally.
	    UsedBits |= CurrentUsedBits;

	    // Check if the new slice would be legal.
	    if (!LS.isLegal())
	      return false;

	    // Record the slice.
	    LoadedSlices.push_back(LS);
	  }

	  // Abort slicing if it does not seem to be profitable.
	  if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
	    return false;

	  ++SlicedLoads;

	  // Rewrite each chain to use an independent load.
	  // By construction, each chain can be represented by a unique load.

	  // Prepare the argument for the new token factor for all the slices.
	  SmallVector<SDValue, 8> ArgChains;
	  for (const LoadedSlice &LS : LoadedSlices) {
	    SDValue SliceInst = LS.loadSlice();
	    CombineTo(LS.Inst, SliceInst, true);
	    // loadSlice may have wrapped the load in a zero extend; step through it
	    // to reach the load and its chain result.
	    if (SliceInst.getOpcode() != ISD::LOAD)
	      SliceInst = SliceInst.getOperand(0);
	    assert(SliceInst->getOpcode() == ISD::LOAD &&
	           "It takes more than a zext to get to the loaded slice!!");
	    ArgChains.push_back(SliceInst.getValue(1));
	  }

	  // Users of the original load's chain now depend on all the slice loads.
	  SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
	                              ArgChains);
	  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
	  AddToWorklist(Chain.getNode());
	  return true;
	}

	/// Check to see if V is (and load (ptr), imm), where the load is having
	/// specific bytes cleared out. If so, return the byte size being masked out
	/// and the shift amount.
	///
	/// \param V     The value to inspect.
	/// \param Ptr   The pointer the load must read from.
	/// \param Chain The chain of the store; the load must be this node or one of
	///              the operands of this TokenFactor.
	/// \returns {number of masked bytes, byte offset of the masked run}, or
	///          {0, 0} when V does not match.
	static std::pair<unsigned, unsigned>
	CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
	  std::pair<unsigned, unsigned> Result(0, 0);

	  // Check for the structure we're looking for.
	  if (V->getOpcode() != ISD::AND ||
	      !isa<ConstantSDNode>(V->getOperand(1)) ||
	      !ISD::isNormalLoad(V->getOperand(0).getNode()))
	    return Result;

	  // Check the chain and pointer.
	  LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
	  if (LD->getBasePtr() != Ptr)
	    return Result; // Not from same pointer.

	  // The store should be chained directly to the load or be an operand of a
	  // tokenfactor.
	  if (LD != Chain.getNode()) {
	    if (Chain->getOpcode() != ISD::TokenFactor)
	      return Result; // Fail.

	    bool IsChainOperand = false;
	    for (const SDValue &ChainOp : Chain->op_values()) {
	      if (ChainOp.getNode() == LD) {
	        IsChainOperand = true;
	        break;
	      }
	    }
	    if (!IsChainOperand)
	      return Result;
	  }

	  // This only handles simple types.
	  if (V.getValueType() != MVT::i16 &&
	      V.getValueType() != MVT::i32 &&
	      V.getValueType() != MVT::i64)
	    return Result;

	  // Check the constant mask. Invert it so that the bits being masked out are
	  // 0 and the bits being kept are 1. Use getSExtValue so that leading bits
	  // follow the sign bit for uniformity.
	  uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
	  unsigned NotMaskLZ = countLeadingZeros(NotMask);
	  if (NotMaskLZ & 7)
	    return Result; // Must be multiple of a byte.
	  unsigned NotMaskTZ = countTrailingZeros(NotMask);
	  if (NotMaskTZ & 7)
	    return Result; // Must be multiple of a byte.
	  if (NotMaskLZ == 64)
	    return Result; // All zero mask.

	  // See if we have a continuous run of bits. If so, we have 0*1+0*
	  if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
	    return Result;

	  // Adjust NotMaskLZ down to be from the actual size of the int instead of
	  // i64.
	  if (V.getValueType() != MVT::i64 && NotMaskLZ)
	    NotMaskLZ -= 64 - V.getValueSizeInBits();

	  unsigned MaskedBytes = (V.getValueSizeInBits() - NotMaskLZ - NotMaskTZ) / 8;
	  switch (MaskedBytes) {
	  case 1:
	  case 2:
	  case 4:
	    break;
	  default:
	    return Result; // All one mask, or 5-byte mask.
	  }

	  // Verify that the first bit starts at a multiple of mask so that the access
	  // is aligned the same as the access width.
	  if (NotMaskTZ && NotMaskTZ / 8 % MaskedBytes)
	    return Result;

	  Result.first = MaskedBytes;
	  Result.second = NotMaskTZ / 8;
	  return Result;
	}

	/// Check to see if IVal is something that provides a value as specified by
	/// MaskInfo. If so, replace the specified store with a narrower store of
	/// truncated IVal.
	///
	/// \param MaskInfo {number of bytes, byte shift} describing the bytes that
	///                 the narrow store must write.
	/// \param IVal     The value being or'ed into the masked load.
	/// \param St       The wide store to narrow.
	/// \param DC       The combiner, used for its DAG and legality queries.
	/// \returns the node of the new narrow store, or nullptr if IVal may have
	///          bits set outside the masked bytes or the narrow type is not
	///          legal.
	static SDNode *
	ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
	                                SDValue IVal, StoreSDNode *St,
	                                DAGCombiner *DC) {
	  unsigned NumBytes = MaskInfo.first;
	  unsigned ByteShift = MaskInfo.second;
	  SelectionDAG &DAG = DC->getDAG();

	  // Check to see if IVal is all zeros in the part being masked in by the 'or'
	  // that uses this. If not, this is not a replacement.
	  APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
	                                  ByteShift * 8, (ByteShift + NumBytes) * 8);
	  if (!DAG.MaskedValueIsZero(IVal, Mask))
	    return nullptr;

	  // Check that it is legal on the target to do this. It is legal if the new
	  // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
	  // legalization.
	  MVT VT = MVT::getIntegerVT(NumBytes * 8);
	  if (!DC->isTypeLegal(VT))
	    return nullptr;

	  // Okay, we can do this! Replace the 'St' store with a store of IVal that is
	  // shifted by ByteShift and truncated down to NumBytes.
	  if (ByteShift) {
	    SDLoc DL(IVal);
	    IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
	                       DAG.getConstant(ByteShift * 8, DL,
	                                    DC->getShiftAmountTy(IVal.getValueType())));
	  }

	  // Figure out the offset for the store and the alignment of the access.
	  unsigned StOffset;
	  unsigned NewAlign = St->getAlignment();

	  // On big-endian layouts the byte offset is counted from the other end of
	  // the stored value.
	  if (DAG.getDataLayout().isLittleEndian())
	    StOffset = ByteShift;
	  else
	    StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;

	  SDValue Ptr = St->getBasePtr();
	  if (StOffset) {
	    SDLoc DL(IVal);
	    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(),
	                      Ptr, DAG.getConstant(StOffset, DL, Ptr.getValueType()));
	    NewAlign = MinAlign(NewAlign, StOffset);
	  }

	  // Truncate down to the new size.
	  IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);

	  ++OpsNarrowed;
	  return DAG
	      .getStore(St->getChain(), SDLoc(St), IVal, Ptr,
	                St->getPointerInfo().getWithOffset(StOffset), NewAlign)
	      .getNode();
	}

	/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
	/// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
	/// narrowing the load and store if it would end up being a win for performance
	/// or code size.
	///
	/// \param N The store node to examine.
	/// \returns the replacement (narrower) store, or an empty SDValue when no
	///          narrowing was performed.
	SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
	  StoreSDNode *ST = cast<StoreSDNode>(N);
	  if (ST->isVolatile())
	    return SDValue();

	  SDValue Chain = ST->getChain();
	  SDValue Value = ST->getValue();
	  SDValue Ptr = ST->getBasePtr();
	  EVT VT = Value.getValueType();

	  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
	    return SDValue();

	  unsigned Opc = Value.getOpcode();

	  // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
	  // is a byte mask indicating a consecutive number of bytes, check to see if
	  // Y is known to provide just those bytes. If so, we try to replace the
	  // load + replace + store sequence with a single (narrower) store, which makes
	  // the load dead.
	  if (Opc == ISD::OR) {
	    std::pair<unsigned, unsigned> MaskedLoad;
	    MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
	    if (MaskedLoad.first)
	      if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(
	              MaskedLoad, Value.getOperand(1), ST, this))
	        return SDValue(NewST, 0);

	    // Or is commutative, so try swapping X and Y.
	    MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
	    if (MaskedLoad.first)
	      if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(
	              MaskedLoad, Value.getOperand(0), ST, this))
	        return SDValue(NewST, 0);
	  }

	  // From here on only "op (load P), constant" is handled.
	  if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
	      Value.getOperand(1).getOpcode() != ISD::Constant)
	    return SDValue();

	  // The first operand must be a normal load whose only user is this op and
	  // whose chain result feeds the store directly.
	  SDValue N0 = Value.getOperand(0);
	  if (!ISD::isNormalLoad(N0.getNode()) || !N0.hasOneUse() ||
	      Chain != SDValue(N0.getNode(), 1))
	    return SDValue();

	  LoadSDNode *LD = cast<LoadSDNode>(N0);
	  if (LD->getBasePtr() != Ptr ||
	      LD->getPointerInfo().getAddrSpace() !=
	          ST->getPointerInfo().getAddrSpace())
	    return SDValue();

	  // Find the type to narrow it the load / op / store to.
	  SDValue N1 = Value.getOperand(1);
	  unsigned BitWidth = N1.getValueSizeInBits();
	  APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
	  // For 'and' the bits that change are the zero bits of the immediate, so
	  // work on the inverted constant.
	  if (Opc == ISD::AND)
	    Imm ^= APInt::getAllOnesValue(BitWidth);
	  if (Imm == 0 || Imm.isAllOnesValue())
	    return SDValue();
	  unsigned ShAmt = Imm.countTrailingZeros();
	  unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
	  unsigned NewBW = NextPowerOf2(MSB - ShAmt);
	  EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
	  // The narrowing should be profitable, the load/store operation should be
	  // legal (or custom) and the store size should be equal to the NewVT width.
	  while (NewBW < BitWidth &&
	         (NewVT.getStoreSizeInBits() != NewBW ||
	          !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
	          !TLI.isNarrowingProfitable(VT, NewVT))) {
	    NewBW = NextPowerOf2(NewBW);
	    NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
	  }
	  if (NewBW >= BitWidth)
	    return SDValue();

	  // If the lsb changed does not start at the type bitwidth boundary,
	  // start at the previous one.
	  if (ShAmt % NewBW)
	    ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
	  APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
	                                 std::min(BitWidth, ShAmt + NewBW));
	  // Give up unless every changed bit falls inside the narrow window.
	  if ((Imm & Mask) != Imm)
	    return SDValue();

	  APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
	  if (Opc == ISD::AND)
	    NewImm ^= APInt::getAllOnesValue(NewBW);
	  uint64_t PtrOff = ShAmt / 8;
	  // For big endian targets, we need to adjust the offset to the pointer to
	  // load the correct bytes.
	  if (DAG.getDataLayout().isBigEndian())
	    PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;

	  // Do not create an access that is less aligned than the ABI alignment of
	  // the narrow type.
	  unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
	  Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
	  if (NewAlign < DAG.getDataLayout().getABITypeAlignment(NewVTTy))
	    return SDValue();

	  SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LD),
	                               Ptr.getValueType(), Ptr,
	                               DAG.getConstant(PtrOff, SDLoc(LD),
	                                               Ptr.getValueType()));
	  SDValue NewLD =
	      DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
	                  LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
	                  LD->getMemOperand()->getFlags(), LD->getAAInfo());
	  SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
	                               DAG.getConstant(NewImm, SDLoc(Value),
	                                               NewVT));
	  SDValue NewST =
	      DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
	                   ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);

	  AddToWorklist(NewPtr.getNode());
	  AddToWorklist(NewLD.getNode());
	  AddToWorklist(NewVal.getNode());
	  WorklistRemover DeadNodes(*this);
	  // Redirect users of the old load's chain to the chain of the narrow load.
	  DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
	  ++OpsNarrowed;
	  return NewST;
	}

	/// For a given floating point load / store pair, if the load value isn't used
	/// by any other operations, then consider transforming the pair to integer
	/// load / store operations if the target deems the transformation profitable.
	///
	/// \param N The store node to examine.
	/// \returns the new integer store, or an empty SDValue when the pair was
	///          left untouched.
	SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
	  StoreSDNode *ST = cast<StoreSDNode>(N);
	  SDValue Chain = ST->getChain();
	  SDValue Value = ST->getValue();

	  // The stored value must be a normal load used only by this normal store,
	  // and the store must be chained directly to that load.
	  if (!ISD::isNormalStore(ST) || !ISD::isNormalLoad(Value.getNode()) ||
	      !Value.hasOneUse() ||
	      Chain != SDValue(Value.getNode(), 1))
	    return SDValue();

	  LoadSDNode *LD = cast<LoadSDNode>(Value);
	  EVT VT = LD->getMemoryVT();
	  if (!VT.isFloatingPoint() ||
	      VT != ST->getMemoryVT() ||
	      LD->isNonTemporal() ||
	      ST->isNonTemporal() ||
	      LD->getPointerInfo().getAddrSpace() != 0 ||
	      ST->getPointerInfo().getAddrSpace() != 0)
	    return SDValue();

	  // The same-sized integer load and store must be legal, and the target
	  // must consider the conversion desirable for both operations.
	  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
	  if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
	      !TLI.isOperationLegal(ISD::STORE, IntVT) ||
	      !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
	      !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT))
	    return SDValue();

	  // Both accesses must be at least ABI-aligned for the integer type.
	  unsigned LDAlign = LD->getAlignment();
	  unsigned STAlign = ST->getAlignment();
	  Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
	  unsigned ABIAlign = DAG.getDataLayout().getABITypeAlignment(IntVTTy);
	  if (LDAlign < ABIAlign || STAlign < ABIAlign)
	    return SDValue();

	  SDValue NewLD =
	      DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
	                  LD->getPointerInfo(), LDAlign);

	  SDValue NewST =
	      DAG.getStore(NewLD.getValue(1), SDLoc(N), NewLD, ST->getBasePtr(),
	                   ST->getPointerInfo(), STAlign);

	  AddToWorklist(NewLD.getNode());
	  AddToWorklist(NewST.getNode());
	  WorklistRemover DeadNodes(*this);
	  // Redirect users of the old load's chain to the chain of the new load.
	  DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
	  ++LdStFP2Int;
	  return NewST;
	}

	// This is a helper function for visitMUL to check the profitability
	// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
	// MulNode is the original multiply, AddNode is (add x, c1),
	// and ConstNode is c2.
	//
	// If the (add x, c1) has multiple uses, we could increase
	// the number of adds if we make this transformation.
	// It would only be worth doing this if we can remove a
	// multiply in the process. Check for that here.
	// To illustrate:
	//     (A + c1) * c3
	//     (A + c2) * c3
	// We're checking for cases where we have common "c3 * A" expressions.
	//
	// Returns true when the fold is considered profitable, false otherwise.
	bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
	                                              SDValue &AddNode,
	                                              SDValue &ConstNode) {
	  // If the add only has one use, this would be OK to do.
	  if (AddNode.getNode()->hasOneUse())
	    return true;

	  // Walk all the users of the constant with which we're multiplying.
	  for (SDNode *Use : ConstNode->uses()) {
	    if (Use == MulNode) // This use is the one we're on right now. Skip it.
	      continue;

	    if (Use->getOpcode() == ISD::MUL) { // We have another multiply use.
	      SDNode *OtherOp;
	      SDNode *MulVar = AddNode.getOperand(0).getNode();

	      // OtherOp is what we're multiplying against the constant.
	      if (Use->getOperand(0) == ConstNode)
	        OtherOp = Use->getOperand(1).getNode();
	      else
	        OtherOp = Use->getOperand(0).getNode();

	      // Check to see if multiply is with the same operand of our "add".
	      //
	      //     ConstNode  = CONST
	      //     Use = ConstNode * A  <-- visiting Use. OtherOp is A.
	      //     ...
	      //     AddNode  = (A + c1)  <-- MulVar is A.
	      //         = AddNode * ConstNode   <-- current visiting instruction.
	      //
	      // If we make this transformation, we will have a common
	      // multiply (ConstNode * A) that we can save.
	      if (OtherOp == MulVar)
	        return true;

	      // Now check to see if a future expansion will give us a common
	      // multiply.
	      //
	      //     ConstNode  = CONST
	      //     AddNode    = (A + c1)
	      //     ...   = AddNode * ConstNode <-- current visiting instruction.
	      //     ...
	      //     OtherOp = (A + c2)
	      //     Use     = OtherOp * ConstNode <-- visiting Use.
	      //
	      // If we make this transformation, we will have a common
	      // multiply (CONST * A) after we also do the same transformation
	      // to the "Use" instruction.
	      if (OtherOp->getOpcode() == ISD::ADD &&
	          DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
	          OtherOp->getOperand(0).getNode() == MulVar)
	        return true;
	    }
	  }

	  // Didn't find a case where this would be profitable.
	  return false;
	}

	/// Strip any chain of BITCAST nodes from \p V and return the first value
	/// underneath that is not itself a bitcast.
	static SDValue peekThroughBitcast(SDValue V) {
	  for (; V.getOpcode() == ISD::BITCAST; V = V.getOperand(0)) {
	    // Keep descending through the bitcast's single source operand.
	  }
	  return V;
	}

	/// Build a TokenFactor joining the incoming chains of the first \p NumStores
	/// entries of \p StoreNodes, leaving out any chain that comes from another
	/// store in that same group.
	SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
	                                         unsigned NumStores) {
	  SmallVector<SDValue, 8> Chains;
	  SmallPtrSet<const SDNode *, 8> MergedStores;
	  SDLoc StoreDL(StoreNodes[0].MemNode);

	  // Remember every store in the group so chains that point back into the
	  // group can be recognized below.
	  for (unsigned Idx = 0; Idx != NumStores; ++Idx)
	    MergedStores.insert(StoreNodes[Idx].MemNode);

	  // don't include nodes that are children
	  for (unsigned Idx = 0; Idx != NumStores; ++Idx) {
	    SDValue InChain = StoreNodes[Idx].MemNode->getChain();
	    if (!MergedStores.count(InChain.getNode()))
	      Chains.push_back(InChain);
	  }

	  assert(Chains.size() > 0 && "Chain should have generated a chain");
	  return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains);
	}

	/// Replace the first \p NumStores entries of \p StoreNodes with one wider
	/// store.
	///
	/// \param StoreNodes    The candidate stores; entry 0 supplies the base
	///                      pointer, pointer info and alignment of the new store.
	/// \param MemVT         The memory type of each individual store.
	/// \param NumStores     How many leading entries of StoreNodes to merge.
	/// \param IsConstantSrc True when the stored values are constants.
	/// \param UseVector     Build a vector value (BUILD_VECTOR / CONCAT_VECTORS)
	///                      instead of a single wide integer constant.
	/// \param UseTrunc      Emit a truncating store of the constant extended to
	///                      the type the target legalizes it to.
	/// \returns false if there are fewer than two stores or an FP constant would
	///          need truncation; true once the stores have been replaced.
	bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
	    SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
	    bool IsConstantSrc, bool UseVector, bool UseTrunc) {
	  // Make sure we have something to merge.
	  if (NumStores < 2)
	    return false;

	  // The latest Node in the DAG.
	  SDLoc DL(StoreNodes[0].MemNode);

	  int64_t ElementSizeBits = MemVT.getStoreSizeInBits();
	  unsigned SizeInBits = NumStores * ElementSizeBits;
	  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;

	  EVT StoreTy;
	  if (UseVector) {
	    unsigned Elts = NumStores * NumMemElts;
	    // Get the type for the merged vector store.
	    StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
	  } else
	    StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);

	  SDValue StoredVal;
	  if (UseVector) {
	    if (IsConstantSrc) {
	      SmallVector<SDValue, 8> BuildVector;
	      for (unsigned I = 0; I != NumStores; ++I) {
	        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
	        SDValue Val = St->getValue();
	        // If constant is of the wrong type, convert it now.
	        if (MemVT != Val.getValueType()) {
	          Val = peekThroughBitcast(Val);
	          // Deal with constants of wrong size.
	          if (ElementSizeBits != Val.getValueSizeInBits()) {
	            EVT IntMemVT =
	                EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
	            if (isa<ConstantFPSDNode>(Val)) {
	              // Not clear how to truncate FP values.
	              return false;
	            } else if (auto *C = dyn_cast<ConstantSDNode>(Val))
	              Val = DAG.getConstant(C->getAPIntValue()
	                                        .zextOrTrunc(Val.getValueSizeInBits())
	                                        .zextOrTrunc(ElementSizeBits),
	                                    SDLoc(C), IntMemVT);
	          }
	          // Make sure correctly size type is the correct type.
	          Val = DAG.getBitcast(MemVT, Val);
	        }
	        BuildVector.push_back(Val);
	      }
	      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
	                                               : ISD::BUILD_VECTOR,
	                              DL, StoreTy, BuildVector);
	    } else {
	      SmallVector<SDValue, 8> Ops;
	      for (unsigned i = 0; i < NumStores; ++i) {
	        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
	        SDValue Val = peekThroughBitcast(St->getValue());
	        // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of
	        // type MemVT. If the underlying value is not the correct
	        // type, but it is an extraction of an appropriate vector we
	        // can recast Val to be of the correct type. This may require
	        // converting between EXTRACT_VECTOR_ELT and
	        // EXTRACT_SUBVECTOR.
	        if ((MemVT != Val.getValueType()) &&
	            (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
	             Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) {
	          SDValue Vec = Val.getOperand(0);
	          EVT MemVTScalarTy = MemVT.getScalarType();
	          // We may need to add a bitcast here to get types to line up.
	          if (MemVTScalarTy != Vec.getValueType()) {
	            unsigned Elts = Vec.getValueType().getSizeInBits() /
	                            MemVTScalarTy.getSizeInBits();
	            EVT NewVecTy =
	                EVT::getVectorVT(*DAG.getContext(), MemVTScalarTy, Elts);
	            Vec = DAG.getBitcast(NewVecTy, Vec);
	          }
	          auto OpC = (MemVT.isVector()) ? ISD::EXTRACT_SUBVECTOR
	                                        : ISD::EXTRACT_VECTOR_ELT;
	          Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Val.getOperand(1));
	        }
	        Ops.push_back(Val);
	      }

	      // Build the extracted vector elements back into a vector.
	      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
	                                               : ISD::BUILD_VECTOR,
	                              DL, StoreTy, Ops);
	    }
	  } else {
	    // We should always use a vector store when merging extracted vector
	    // elements, so this path implies a store of constants.
	    assert(IsConstantSrc && "Merged vector elements should use vector store");

	    APInt StoreInt(SizeInBits, 0);

	    // Construct a single integer constant which is made of the smaller
	    // constant inputs. Each iteration shifts the accumulator left by one
	    // element, so on little-endian layouts the stores are visited last to
	    // first.
	    bool IsLE = DAG.getDataLayout().isLittleEndian();
	    for (unsigned i = 0; i < NumStores; ++i) {
	      unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
	      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);

	      SDValue Val = St->getValue();
	      StoreInt <<= ElementSizeBits;
	      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
	        StoreInt |= C->getAPIntValue()
	                        .zextOrTrunc(ElementSizeBits)
	                        .zextOrTrunc(SizeInBits);
	      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
	        StoreInt |= C->getValueAPF()
	                        .bitcastToAPInt()
	                        .zextOrTrunc(ElementSizeBits)
	                        .zextOrTrunc(SizeInBits);
	        // If fp truncation is necessary give up for now.
	        if (MemVT.getSizeInBits() != ElementSizeBits)
	          return false;
	      } else {
	        llvm_unreachable("Invalid constant element type");
	      }
	    }

	    // Create the new Load and Store operations.
	    StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
	  }

	  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
	  SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);

	  // make sure we use trunc store if it's necessary to be legal.
	  SDValue NewStore;
	  if (!UseTrunc) {
	    NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
	                            FirstInChain->getPointerInfo(),
	                            FirstInChain->getAlignment());
	  } else { // Must be realized as a trunc store
	    EVT LegalizedStoredValueTy =
	        TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
	    unsigned LegalizedStoreSize = LegalizedStoredValueTy.getSizeInBits();
	    ConstantSDNode *C = cast<ConstantSDNode>(StoredVal);
	    SDValue ExtendedStoreVal =
	        DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL,
	                        LegalizedStoredValueTy);
	    NewStore = DAG.getTruncStore(
	        NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
	        FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/,
	        FirstInChain->getAlignment(),
	        FirstInChain->getMemOperand()->getFlags());
	  }

	  // Replace all merged stores with the new store.
	  for (unsigned i = 0; i < NumStores; ++i)
	    CombineTo(StoreNodes[i].MemNode, NewStore);

	  AddToWorklist(NewChain.getNode());
	  return true;
	}

	/// Collect into \p StoreNodes the stores that may be merged with \p St,
	/// together with their offset from St's base pointer.
	///
	/// Candidates are searched among the chain users of St's root node (see the
	/// diagram in the body). Nothing is added when St has no base pointer, an
	/// undef base pointer, or stores a loaded value whose memory type differs
	/// from the store's.
	void DAGCombiner::getStoreMergeCandidates(
	    StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) {
	  // This holds the base pointer, index, and the offset in bytes from the base
	  // pointer.
	  BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
	  EVT MemVT = St->getMemoryVT();

	  SDValue Val = peekThroughBitcast(St->getValue());
	  // We must have a base and an offset.
	  if (!BasePtr.getBase().getNode())
	    return;

	  // Do not handle stores to undef base pointers.
	  if (BasePtr.getBase().isUndef())
	    return;

	  bool IsConstantSrc = isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val);
	  bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
	                          Val.getOpcode() == ISD::EXTRACT_SUBVECTOR);
	  bool IsLoadSrc = isa<LoadSDNode>(Val);
	  BaseIndexOffset LBasePtr;
	  // Match on loadbaseptr if relevant.
	  EVT LoadVT;
	  if (IsLoadSrc) {
	    auto *Ld = cast<LoadSDNode>(Val);
	    LBasePtr = BaseIndexOffset::match(Ld, DAG);
	    LoadVT = Ld->getMemoryVT();
	    // Load and store should be the same type.
	    if (MemVT != LoadVT)
	      return;
	  }

	  // Decide whether Other can be merged with St. On success Ptr holds Other's
	  // decomposed address and Offset its distance from St's base pointer.
	  auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
	                            int64_t &Offset) -> bool {
	    if (Other->isVolatile() || Other->isIndexed())
	      return false;
	    SDValue OtherVal = peekThroughBitcast(Other->getValue());
	    // Allow merging constants of different types as integers.
	    bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
	                                           : Other->getMemoryVT() != MemVT;
	    if (IsLoadSrc) {
	      if (NoTypeMatch)
	        return false;
	      // The Load's Base Ptr must also match
	      if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(OtherVal)) {
	        auto LPtr = BaseIndexOffset::match(OtherLd, DAG);
	        if (LoadVT != OtherLd->getMemoryVT())
	          return false;
	        if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
	          return false;
	      } else
	        return false;
	    }
	    if (IsConstantSrc) {
	      if (NoTypeMatch)
	        return false;
	      if (!(isa<ConstantSDNode>(OtherVal) || isa<ConstantFPSDNode>(OtherVal)))
	        return false;
	    }
	    if (IsExtractVecSrc) {
	      // Do not merge truncated stores here.
	      if (Other->isTruncatingStore())
	        return false;
	      if (!MemVT.bitsEq(OtherVal.getValueType()))
	        return false;
	      if (OtherVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
	          OtherVal.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	        return false;
	    }
	    Ptr = BaseIndexOffset::match(Other, DAG);
	    return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
	  };

	  // Record User as a candidate if it is a store accepted by CandidateMatch.
	  auto AddIfCandidate = [&](SDNode *User) {
	    if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(User)) {
	      BaseIndexOffset Ptr;
	      int64_t PtrDiff;
	      if (CandidateMatch(OtherST, Ptr, PtrDiff))
	        StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
	    }
	  };

	  // We are looking for a root node which is an ancestor to all mergeable
	  // stores. We search up through a load, to our root and then down
	  // through all children. For instance we will find Store{1,2,3} if
	  // St is Store1, Store2, or Store3 where the root is not a load,
	  // which is always true for nonvolatile ops. TODO: Expand
	  // the search to find all valid candidates through multiple layers of loads.
	  //
	  //               Root
	  //        |-------|-------|
	  //       Load    Load    Store3
	  //        |       |
	  //      Store1   Store2
	  //
	  // FIXME: We should be able to climb and
	  // descend TokenFactors to find candidates as well.

	  SDNode *RootNode = (St->getChain()).getNode();

	  if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
	    // St is chained to a load: climb to the load's chain and visit the
	    // stores chained to any load hanging off that root.
	    RootNode = Ldn->getChain().getNode();
	    for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
	      if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
	        for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
	          if (I2.getOperandNo() == 0)
	            AddIfCandidate(*I2);
	  } else {
	    // Otherwise visit the stores that use the root as their chain operand.
	    for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
	      if (I.getOperandNo() == 0)
	        AddIfCandidate(*I);
	  }
	}

	// Merging stores must not introduce a cycle into the DAG. A candidate
	// may depend on another candidate indirectly through one of its
	// non-chain operands (dependencies through the chain are already
	// accounted for). All candidates are checked together by searching
	// upwards from their non-chain operands.
	//
	// Returns true when the first NumStores entries of StoreNodes can be
	// merged, and false when a dependency is found or the search gives up.
	bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
	    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) {
	  // FIXME: We should be able to truncate a full search of
	  // predecessors by doing a BFS and keeping tabs the originating
	  // stores from which worklist nodes come from in a similar way to
	  // TokenFactor simplification.

	  SmallPtrSet<const SDNode *, 16> Visited;
	  SmallVector<const SDNode *, 8> Worklist;
	  unsigned int MaxSteps = 8192;

	  // Seed the search with the operands of every store candidate.
	  // Potential loops may happen only through non-chain operands, so
	  // operand 0 is left out.
	  for (unsigned StoreIdx = 0; StoreIdx < NumStores; ++StoreIdx) {
	    SDNode *Candidate = StoreNodes[StoreIdx].MemNode;
	    for (unsigned OpNo = 1; OpNo < Candidate->getNumOperands(); ++OpNo)
	      Worklist.push_back(Candidate->getOperand(OpNo).getNode());
	  }

	  // Search through DAG. We can stop early if we find a store node, and
	  // we fail conservatively if the search ended early because the visited
	  // set reached its limit.
	  for (unsigned StoreIdx = 0; StoreIdx < NumStores; ++StoreIdx) {
	    if (SDNode::hasPredecessorHelper(StoreNodes[StoreIdx].MemNode, Visited,
	                                     Worklist, MaxSteps) ||
	        Visited.size() >= MaxSteps)
	      return false;
	  }
	  return true;
	}

	bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
	if (OptLevel == CodeGenOpt::None)
	return false;

	EVT MemVT = St->getMemoryVT();
	int64_t ElementSizeBytes = MemVT.getStoreSize();
	unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;

	if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
	return false;

	bool NoVectors = DAG.getMachineFunction().getFunction().hasFnAttribute(
	Attribute::NoImplicitFloat);

	// This function cannot currently deal with non-byte-sized memory sizes.
	if (ElementSizeBytes * 8 != MemVT.getSizeInBits())
	return false;

	if (!MemVT.isSimple())
	return false;

	// Perform an early exit check. Do not bother looking at stored values that
	// are not constants, loads, or extracted vector elements.
	SDValue StoredVal = peekThroughBitcast(St->getValue());
	bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
	bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) \|\|
	isa<ConstantFPSDNode>(StoredVal);
	bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT \|\|
	StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);

	if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
	return false;

	SmallVector<MemOpLink, 8> StoreNodes;
	// Find potential store merge candidates by searching through chain sub-DAG
	getStoreMergeCandidates(St, StoreNodes);

	// Check if there is anything to merge.
	if (StoreNodes.size() < 2)
	return false;

	// Sort the memory operands according to their distance from the
	// base pointer.
	std::sort(StoreNodes.begin(), StoreNodes.end(),
	[](MemOpLink LHS, MemOpLink RHS) {
	return LHS.OffsetFromBase < RHS.OffsetFromBase;
	});

	// Store Merge attempts to merge the lowest stores. This generally
	// works out as if successful, as the remaining stores are checked
	// after the first collection of stores is merged. However, in the
	// case that a non-mergeable store is found first, e.g., {p[-2],
	// p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent
	// mergeable cases. To prevent this, we prune such stores from the
	// front of StoreNodes here.

	bool RV = false;
	while (StoreNodes.size() > 1) {
	unsigned StartIdx = 0;
	while ((StartIdx + 1 < StoreNodes.size()) &&
	StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
	StoreNodes[StartIdx + 1].OffsetFromBase)
	++StartIdx;

	// Bail if we don't have enough candidates to merge.
	if (StartIdx + 1 >= StoreNodes.size())
	return RV;

	if (StartIdx)
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);

	// Scan the memory operations on the chain and find the first
	// non-consecutive store memory address.
	unsigned NumConsecutiveStores = 1;
	int64_t StartAddress = StoreNodes[0].OffsetFromBase;
	// Check that the addresses are consecutive starting from the second
	// element in the list of stores.
	for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
	int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
	if (CurrAddress - StartAddress != (ElementSizeBytes * i))
	break;
	NumConsecutiveStores = i + 1;
	}

	if (NumConsecutiveStores < 2) {
	StoreNodes.erase(StoreNodes.begin(),
	StoreNodes.begin() + NumConsecutiveStores);
	continue;
	}

	// Check that we can merge these candidates without causing a cycle
	if (!checkMergeStoreCandidatesForDependencies(StoreNodes,
	NumConsecutiveStores)) {
	StoreNodes.erase(StoreNodes.begin(),
	StoreNodes.begin() + NumConsecutiveStores);
	continue;
	}

	// The node with the lowest store address.
	LLVMContext &Context = *DAG.getContext();
	const DataLayout &DL = DAG.getDataLayout();

	// Store the constants into memory as one consecutive store.
	if (IsConstantSrc) {
	LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
	unsigned FirstStoreAS = FirstInChain->getAddressSpace();
	unsigned FirstStoreAlign = FirstInChain->getAlignment();
	unsigned LastLegalType = 1;
	unsigned LastLegalVectorType = 1;
	bool LastIntegerTrunc = false;
	bool NonZero = false;
	unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
	for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
	StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
	SDValue StoredVal = ST->getValue();
	bool IsElementZero = false;
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
	IsElementZero = C->isNullValue();
	else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
	IsElementZero = C->getConstantFPValue()->isNullValue();
	if (IsElementZero) {
	if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
	FirstZeroAfterNonZero = i;
	}
	NonZero \|= !IsElementZero;

	// Find a legal type for the constant store.
	unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
	EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
	bool IsFast = false;
	if (TLI.isTypeLegal(StoreTy) &&
	TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
	FirstStoreAlign, &IsFast) &&
	IsFast) {
	LastIntegerTrunc = false;
	LastLegalType = i + 1;
	// Or check whether a truncstore is legal.
	} else if (TLI.getTypeAction(Context, StoreTy) ==
	TargetLowering::TypePromoteInteger) {
	EVT LegalizedStoredValueTy =
	TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
	if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
	TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
	FirstStoreAlign, &IsFast) &&
	IsFast) {
	LastIntegerTrunc = true;
	LastLegalType = i + 1;
	}
	}

	// We only use vectors if the constant is known to be zero or the target
	// allows it and the function is not marked with the noimplicitfloat
	// attribute.
	if ((!NonZero \|\|
	TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
	!NoVectors) {
	// Find a legal type for the vector store.
	unsigned Elts = (i + 1) * NumMemElts;
	EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
	if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
	TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
	FirstStoreAlign, &IsFast) &&
	IsFast)
	LastLegalVectorType = i + 1;
	}
	}

	bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
	unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;

	// Check if we found a legal integer type that creates a meaningful merge.
	if (NumElem < 2) {
	// We know that candidate stores are in order and of correct
	// shape. While there is no mergeable sequence from the
	// beginning one may start later in the sequence. The only
	// reason a merge of size N could have failed where another of
	// the same size would not have, is if the alignment has
	// improved or we've dropped a non-zero value. Drop as many
	// candidates as we can here.
	unsigned NumSkip = 1;
	while (
	(NumSkip < NumConsecutiveStores) &&
	(NumSkip < FirstZeroAfterNonZero) &&
	(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) {
	NumSkip++;
	}
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
	continue;
	}

	bool Merged = MergeStoresOfConstantsOrVecElts(
	StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc);
	RV \|= Merged;

	// Remove merged stores for next iteration.
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
	continue;
	}

	// When extracting multiple vector elements, try to store them
	// in one vector store rather than a sequence of scalar stores.
	if (IsExtractVecSrc) {
	LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
	unsigned FirstStoreAS = FirstInChain->getAddressSpace();
	unsigned FirstStoreAlign = FirstInChain->getAlignment();
	unsigned NumStoresToMerge = 1;
	for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
	StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
	SDValue StVal = peekThroughBitcast(St->getValue());
	// This restriction could be loosened.
	// Bail out if any stored values are not elements extracted from a
	// vector. It should be possible to handle mixed sources, but load
	// sources need more careful handling (see the block of code below that
	// handles consecutive loads).
	if (StVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
	StVal.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	return RV;

	// Find a legal type for the vector store.
	unsigned Elts = (i + 1) * NumMemElts;
	EVT Ty =
	EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
	bool IsFast;
	if (TLI.isTypeLegal(Ty) &&
	TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
	FirstStoreAlign, &IsFast) &&
	IsFast)
	NumStoresToMerge = i + 1;
	}

	// Check if we found a legal integer type that creates a meaningful merge.
	if (NumStoresToMerge < 2) {
	// We know that candidate stores are in order and of correct
	// shape. While there is no mergeable sequence from the
	// beginning one may start later in the sequence. The only
	// reason a merge of size N could have failed where another of
	// the same size would not have, is if the alignment has
	// improved. Drop as many candidates as we can here.
	unsigned NumSkip = 1;
	while ((NumSkip < NumConsecutiveStores) &&
	(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
	NumSkip++;

	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
	continue;
	}

	bool Merged = MergeStoresOfConstantsOrVecElts(
	StoreNodes, MemVT, NumStoresToMerge, false, true, false);
	if (!Merged) {
	StoreNodes.erase(StoreNodes.begin(),
	StoreNodes.begin() + NumStoresToMerge);
	continue;
	}
	// Remove merged stores for next iteration.
	StoreNodes.erase(StoreNodes.begin(),
	StoreNodes.begin() + NumStoresToMerge);
	RV = true;
	continue;
	}

	// Below we handle the case of multiple consecutive stores that
	// come from multiple consecutive loads. We merge them into a single
	// wide load and a single wide store.

	// Look for load nodes which are used by the stored values.
	SmallVector<MemOpLink, 8> LoadNodes;

	// Find acceptable loads. Loads need to have the same chain (token factor),
	// must not be zext, volatile, indexed, and they must be consecutive.
	BaseIndexOffset LdBasePtr;
	for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
	StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
	SDValue Val = peekThroughBitcast(St->getValue());
	LoadSDNode *Ld = dyn_cast<LoadSDNode>(Val);
	if (!Ld)
	break;

	// Loads must only have one use.
	if (!Ld->hasNUsesOfValue(1, 0))
	break;

	// The memory operands must not be volatile.
	if (Ld->isVolatile() \|\| Ld->isIndexed())
	break;

	// The stored memory type must be the same.
	if (Ld->getMemoryVT() != MemVT)
	break;

	BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
	// If this is not the first ptr that we check.
	int64_t LdOffset = 0;
	if (LdBasePtr.getBase().getNode()) {
	// The base ptr must be the same.
	if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
	break;
	} else {
	// Check that all other base pointers are the same as this one.
	LdBasePtr = LdPtr;
	}

	// We found a potential memory operand to merge.
	LoadNodes.push_back(MemOpLink(Ld, LdOffset));
	}

	if (LoadNodes.size() < 2) {
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
	continue;
	}

	// If we have load/store pair instructions and we only have two values,
	// don't bother merging.
	unsigned RequiredAlignment;
	if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
	StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) {
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
	continue;
	}
	LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
	unsigned FirstStoreAS = FirstInChain->getAddressSpace();
	unsigned FirstStoreAlign = FirstInChain->getAlignment();
	LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
	unsigned FirstLoadAS = FirstLoad->getAddressSpace();
	unsigned FirstLoadAlign = FirstLoad->getAlignment();

	// Scan the memory operations on the chain and find the first
	// non-consecutive load memory address. These variables hold the index in
	// the store node array.
	unsigned LastConsecutiveLoad = 1;
	// This variable refers to the size and not index in the array.
	unsigned LastLegalVectorType = 1;
	unsigned LastLegalIntegerType = 1;
	bool isDereferenceable = true;
	bool DoIntegerTruncate = false;
	StartAddress = LoadNodes[0].OffsetFromBase;
	SDValue FirstChain = FirstLoad->getChain();
	for (unsigned i = 1; i < LoadNodes.size(); ++i) {
	// All loads must share the same chain.
	if (LoadNodes[i].MemNode->getChain() != FirstChain)
	break;

	int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
	if (CurrAddress - StartAddress != (ElementSizeBytes * i))
	break;
	LastConsecutiveLoad = i;

	if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
	isDereferenceable = false;

	// Find a legal type for the vector store.
	unsigned Elts = (i + 1) * NumMemElts;
	EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);

	bool IsFastSt, IsFastLd;
	if (TLI.isTypeLegal(StoreTy) &&
	TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
	FirstStoreAlign, &IsFastSt) &&
	IsFastSt &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
	FirstLoadAlign, &IsFastLd) &&
	IsFastLd) {
	LastLegalVectorType = i + 1;
	}

	// Find a legal type for the integer store.
	unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
	StoreTy = EVT::getIntegerVT(Context, SizeInBits);
	if (TLI.isTypeLegal(StoreTy) &&
	TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
	FirstStoreAlign, &IsFastSt) &&
	IsFastSt &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
	FirstLoadAlign, &IsFastLd) &&
	IsFastLd) {
	LastLegalIntegerType = i + 1;
	DoIntegerTruncate = false;
	// Or check whether a truncstore and extload is legal.
	} else if (TLI.getTypeAction(Context, StoreTy) ==
	TargetLowering::TypePromoteInteger) {
	EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoreTy);
	if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
	TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy, DAG) &&
	TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy,
	StoreTy) &&
	TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy,
	StoreTy) &&
	TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
	FirstStoreAlign, &IsFastSt) &&
	IsFastSt &&
	TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
	FirstLoadAlign, &IsFastLd) &&
	IsFastLd) {
	LastLegalIntegerType = i + 1;
	DoIntegerTruncate = true;
	}
	}
	}

	// Only use vector types if the vector type is larger than the integer type.
	// If they are the same, use integers.
	bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors;
	unsigned LastLegalType =
	std::max(LastLegalVectorType, LastLegalIntegerType);

	// We add +1 here because the LastXXX variables refer to location while
	// the NumElem refers to array/index size.
	unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
	NumElem = std::min(LastLegalType, NumElem);

	if (NumElem < 2) {
	// We know that candidate stores are in order and of correct
	// shape. While there is no mergeable sequence from the
	// beginning one may start later in the sequence. The only
	// reason a merge of size N could have failed where another of
	// the same size would not have is if the alignment or either
	// the load or store has improved. Drop as many candidates as we
	// can here.
	unsigned NumSkip = 1;
	while ((NumSkip < LoadNodes.size()) &&
	(LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
	(StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
	NumSkip++;
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
	continue;
	}

	// Find if it is better to use vectors or integers to load and store
	// to memory.
	EVT JointMemOpVT;
	if (UseVectorTy) {
	// Find a legal type for the vector store.
	unsigned Elts = NumElem * NumMemElts;
	JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
	} else {
	unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
	JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
	}

	SDLoc LoadDL(LoadNodes[0].MemNode);
	SDLoc StoreDL(StoreNodes[0].MemNode);

	// The merged loads are required to have the same incoming chain, so
	// using the first's chain is acceptable.

	SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
	AddToWorklist(NewStoreChain.getNode());

	MachineMemOperand::Flags MMOFlags = isDereferenceable ?
	MachineMemOperand::MODereferenceable:
	MachineMemOperand::MONone;

	SDValue NewLoad, NewStore;
	if (UseVectorTy \|\| !DoIntegerTruncate) {
	NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
	FirstLoad->getBasePtr(),
	FirstLoad->getPointerInfo(), FirstLoadAlign,
	MMOFlags);
	NewStore = DAG.getStore(NewStoreChain, StoreDL, NewLoad,
	FirstInChain->getBasePtr(),
	FirstInChain->getPointerInfo(), FirstStoreAlign);
	} else { // This must be the truncstore/extload case
	EVT ExtendedTy =
	TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
	NewLoad =
	DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(),
	FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),
	JointMemOpVT, FirstLoadAlign, MMOFlags);
	NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
	FirstInChain->getBasePtr(),
	FirstInChain->getPointerInfo(), JointMemOpVT,
	FirstInChain->getAlignment(),
	FirstInChain->getMemOperand()->getFlags());
	}

	// Transfer chain users from old loads to the new load.
	for (unsigned i = 0; i < NumElem; ++i) {
	LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
	SDValue(NewLoad.getNode(), 1));
	}

	// Replace the all stores with the new store. Recursively remove
	// corresponding value if its no longer used.
	for (unsigned i = 0; i < NumElem; ++i) {
	SDValue Val = StoreNodes[i].MemNode->getOperand(1);
	CombineTo(StoreNodes[i].MemNode, NewStore);
	if (Val.getNode()->use_empty())
	recursivelyDeleteUnusedNodes(Val.getNode());
	}

	RV = true;
	StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
	}
	return RV;
	}

	/// Re-issue the store \p ST on top of \p BetterChain.
	///
	/// A new store (truncating if \p ST is truncating) with the same value, base
	/// pointer and memory operand is created on \p BetterChain. It is then joined
	/// with the original chain of \p ST through a TokenFactor, and \p ST is
	/// replaced by that TokenFactor via CombineTo. Returns the CombineTo result.
	SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
	  SDLoc Loc(ST);
	  SDValue StoredVal = ST->getValue();
	  SDValue Addr = ST->getBasePtr();
	  auto *MMO = ST->getMemOperand();

	  // Same store, hung off the better chain so the old dependency goes away.
	  SDValue Rechained =
	      ST->isTruncatingStore()
	          ? DAG.getTruncStore(BetterChain, Loc, StoredVal, Addr,
	                              ST->getMemoryVT(), MMO)
	          : DAG.getStore(BetterChain, Loc, StoredVal, Addr, MMO);

	  // The token factor keeps both the original chain and the new store alive.
	  SDValue Joined = DAG.getNode(ISD::TokenFactor, Loc, MVT::Other,
	                               ST->getChain(), Rechained);

	  // Queue the token so that the new and old chains get cleaned up.
	  AddToWorklist(Joined.getNode());

	  // Third argument false: users are not added to the work list.
	  return CombineTo(ST, Joined, false);
	}

	/// Turn a store of a floating-point constant into a store of the integer
	/// with the same bit pattern.
	///
	/// The stored value of \p ST must be a ConstantFPSDNode (it is cast<>ed).
	/// TargetConstantFP values are left alone. f32 becomes an i32 store, f64
	/// becomes an i64 store or, failing that, two chained-in-parallel i32 stores
	/// joined by a TokenFactor. f16, f80, f128 and ppcf128 are not handled.
	///
	/// Returns the replacement node, or an empty SDValue when nothing was done.
	SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
	  SDValue Value = ST->getValue();
	  if (Value.getOpcode() == ISD::TargetConstantFP)
	    return SDValue();

	  SDLoc DL(ST);
	  SDValue Chain = ST->getChain();
	  SDValue Ptr = ST->getBasePtr();
	  const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value);

	  // NOTE: If the original store is volatile, this transform must not increase
	  // the number of stores. For example, on x86-32 an f64 can be stored in one
	  // processor operation but an i64 (which is not legal) requires two. So the
	  // transform should not be done in this case.

	  switch (CFP->getSimpleValueType(0).SimpleTy) {
	  default:
	    llvm_unreachable("Unknown FP type");
	  case MVT::f16: // We don't do this for these yet.
	  case MVT::f80:
	  case MVT::f128:
	  case MVT::ppcf128:
	    return SDValue();
	  case MVT::f32: {
	    const bool StoreAsI32 =
	        (isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) ||
	        TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32);
	    if (!StoreAsI32)
	      return SDValue();

	    SDValue Bits = DAG.getConstant(
	        (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(),
	        SDLoc(CFP), MVT::i32);
	    return DAG.getStore(Chain, DL, Bits, Ptr, ST->getMemOperand());
	  }
	  case MVT::f64: {
	    const bool StoreAsI64 =
	        (TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
	         !ST->isVolatile()) ||
	        TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64);
	    if (StoreAsI64) {
	      SDValue Bits = DAG.getConstant(
	          CFP->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(CFP),
	          MVT::i64);
	      return DAG.getStore(Chain, DL, Bits, Ptr, ST->getMemOperand());
	    }

	    // The two-store fallback is only for non-volatile stores on targets that
	    // can store an i32.
	    if (ST->isVolatile() ||
	        !TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32))
	      return SDValue();

	    // Many FP stores are not made apparent until after legalize, e.g. for
	    // argument passing. Since this is so common, custom legalize the
	    // 64-bit integer store into two 32-bit stores.
	    const uint64_t RawBits =
	        CFP->getValueAPF().bitcastToAPInt().getZExtValue();
	    SDValue LowAddrWord =
	        DAG.getConstant(RawBits & 0xFFFFFFFF, SDLoc(CFP), MVT::i32);
	    SDValue HighAddrWord =
	        DAG.getConstant(RawBits >> 32, SDLoc(CFP), MVT::i32);
	    // On big-endian layouts the high 32 bits live at the lower address.
	    if (DAG.getDataLayout().isBigEndian())
	      std::swap(LowAddrWord, HighAddrWord);

	    unsigned Alignment = ST->getAlignment();
	    MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
	    AAMDNodes AAInfo = ST->getAAInfo();

	    SDValue FirstSt =
	        DAG.getStore(Chain, DL, LowAddrWord, Ptr, ST->getPointerInfo(),
	                     Alignment, MMOFlags, AAInfo);
	    SDValue NextPtr =
	        DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
	                    DAG.getConstant(4, DL, Ptr.getValueType()));
	    // The second word sits 4 bytes in, so cap its alignment accordingly.
	    SDValue SecondSt = DAG.getStore(
	        Chain, DL, HighAddrWord, NextPtr,
	        ST->getPointerInfo().getWithOffset(4), MinAlign(Alignment, 4U),
	        MMOFlags, AAInfo);
	    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, FirstSt, SecondSt);
	  }
	  }
	}

	/// Combine a store node. \p N is cast<> to StoreSDNode, so it must be a store.
	///
	/// The folds below are attempted in the order written. Depending on the path
	/// taken this returns a freshly built replacement node, the incoming chain
	/// (when the store is dropped), SDValue(N, 0) (when the work was already done
	/// through CombineTo / worklist updates), an empty SDValue, or finally the
	/// result of ReduceLoadOpStoreWidth(N).
	SDValue DAGCombiner::visitSTORE(SDNode *N) {
	StoreSDNode *ST = cast<StoreSDNode>(N);
	SDValue Chain = ST->getChain();
	SDValue Value = ST->getValue();
	SDValue Ptr = ST->getBasePtr();

	// If this is a store of a bit convert, store the input value if the
	// resultant store does not need a higher alignment than the original.
	if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
	ST->isUnindexed()) {
	// SVT is the type of the value before the bitcast.
	EVT SVT = Value.getOperand(0).getValueType();
	if (((!LegalOperations && !ST->isVolatile()) \|\|
	TLI.isOperationLegalOrCustom(ISD::STORE, SVT)) &&
	TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
	unsigned OrigAlign = ST->getAlignment();
	bool Fast = false;
	// Only do it when the target both allows the access at the original
	// alignment and reports it as fast.
	if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
	ST->getAddressSpace(), OrigAlign, &Fast) &&
	Fast) {
	return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
	ST->getPointerInfo(), OrigAlign,
	ST->getMemOperand()->getFlags(), ST->getAAInfo());
	}
	}
	}

	// Turn 'store undef, Ptr' -> nothing.
	if (Value.isUndef() && ST->isUnindexed())
	return Chain;

	// Try to infer better alignment information than the store already has.
	if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
	if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
	if (Align > ST->getAlignment()) {
	// Rebuild the store with the larger alignment; getTruncStore is given
	// the store's own memory VT.
	SDValue NewStore =
	DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
	ST->getMemoryVT(), Align,
	ST->getMemOperand()->getFlags(), ST->getAAInfo());
	// Only replace when a different node came back.
	if (NewStore.getNode() != N)
	return CombineTo(ST, NewStore, true);
	}
	}
	}

	// Try transforming a pair floating point load / store ops to integer
	// load / store ops.
	if (SDValue NewST = TransformFPLoadStorePair(N))
	return NewST;

	if (ST->isUnindexed()) {
	// Walk up chain skipping non-aliasing memory nodes, on this store and any
	// adjacent stores.
	if (findBetterNeighborChains(ST)) {
	// replaceStoreChain uses CombineTo, which handled all of the worklist
	// manipulation. Return the original node to not do anything else.
	return SDValue(ST, 0);
	}
	// Refresh the local copy of the chain from the node.
	Chain = ST->getChain();
	}

	// FIXME: is there such a thing as a truncating indexed store?
	if (ST->isTruncatingStore() && ST->isUnindexed() &&
	Value.getValueType().isInteger()) {
	// See if we can simplify the input to this truncstore with knowledge that
	// only the low bits are being used. For example:
	// "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
	SDValue Shorter = DAG.GetDemandedBits(
	Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
	ST->getMemoryVT().getScalarSizeInBits()));
	// Queue the stored value's node for another visit.
	AddToWorklist(Value.getNode());
	if (Shorter.getNode())
	return DAG.getTruncStore(Chain, SDLoc(N), Shorter,
	Ptr, ST->getMemoryVT(), ST->getMemOperand());

	// Otherwise, see if we can simplify the operation with
	// SimplifyDemandedBits, which only works if the value has a single use.
	if (SimplifyDemandedBits(
	Value,
	APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
	ST->getMemoryVT().getScalarSizeInBits()))) {
	// Re-visit the store if anything changed and the store hasn't been merged
	// with another node (N is deleted) SimplifyDemandedBits will add Value's
	// node back to the worklist if necessary, but we also need to re-visit
	// the Store node itself.
	if (N->getOpcode() != ISD::DELETED_NODE)
	AddToWorklist(N);
	return SDValue(N, 0);
	}
	}

	// If this is a load followed by a store to the same location, then the store
	// is dead/noop.
	if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
	if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
	ST->isUnindexed() && !ST->isVolatile() &&
	// There can't be any side effects between the load and store, such as
	// a call or store.
	Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
	// The store is dead, remove it.
	return Chain;
	}
	}

	// Look at the case where the chain operand is itself a store (ST1) to the
	// same pointer with the same memory type.
	if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
	if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
	!ST1->isVolatile() && ST1->getBasePtr() == Ptr &&
	ST->getMemoryVT() == ST1->getMemoryVT()) {
	// If this is a store followed by a store with the same value to the same
	// location, then the store is dead/noop.
	if (ST1->getValue() == Value) {
	// The store is dead, remove it.
	return Chain;
	}

	// If this store's preceding store is to the same location and no other
	// node is chained to that preceding store, we can effectively drop the
	// preceding store. Do not remove stores to undef as they may be used as
	// data sinks.
	if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
	!ST1->getBasePtr().isUndef()) {
	// ST1 is fully overwritten and can be elided. Combine with its chain
	// value.
	CombineTo(ST1, ST1->getChain());
	return SDValue();
	}
	}
	}

	// If this is an FP_ROUND or TRUNC followed by a store, fold this into a
	// truncating store. We can do this even if this is already a truncstore.
	if ((Value.getOpcode() == ISD::FP_ROUND \|\| Value.getOpcode() == ISD::TRUNCATE)
	&& Value.getNode()->hasOneUse() && ST->isUnindexed() &&
	TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
	ST->getMemoryVT())) {
	return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
	Ptr, ST->getMemoryVT(), ST->getMemOperand());
	}

	// Always perform this optimization before types are legal. If the target
	// prefers, also try this after legalization to catch stores that were created
	// by intrinsics or other nodes.
	if (!LegalTypes \|\| (TLI.mergeStoresAfterLegalization())) {
	while (true) {
	// There can be multiple store sequences on the same chain.
	// Keep trying to merge store sequences until we are unable to do so
	// or until we merge the last store on the chain.
	bool Changed = MergeConsecutiveStores(ST);
	if (!Changed) break;
	// Return N as merge only uses CombineTo and no worklist clean
	// up is necessary.
	if (N->getOpcode() == ISD::DELETED_NODE \|\| !isa<StoreSDNode>(N))
	return SDValue(N, 0);
	}
	}

	// Try transforming N to an indexed store.
	if (CombineToPreIndexedLoadStore(N) \|\| CombineToPostIndexedLoadStore(N))
	return SDValue(N, 0);

	// Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
	//
	// Make sure to do this only after attempting to merge stores in order to
	// avoid changing the types of some subset of stores due to visit order,
	// preventing their merging.
	if (isa<ConstantFPSDNode>(ST->getValue())) {
	if (SDValue NewSt = replaceStoreOfFPConstant(ST))
	return NewSt;
	}

	// Try splitting a store of two values merged with or/shl/zext into two
	// separate stores.
	if (SDValue NewSt = splitMergedValStore(ST))
	return NewSt;

	// Last resort: whatever ReduceLoadOpStoreWidth produces is the result.
	return ReduceLoadOpStoreWidth(N);
	}

	/// For the instruction sequence of store below, F and I values
	/// are bundled together as an i64 value before being stored into memory.
	/// Sometimes it is more efficient to generate separate stores for F and I,
	/// which can remove the bitwise instructions or sink them to colder places.
	///
	///   (store (or (zext (bitcast F to i32) to i64),
	///              (shl (zext I to i64), 32)), addr)  -->
	///   (store F, addr) and (store I, addr+4)
	///
	/// Similarly, splitting for other merged store can also be beneficial, like:
	/// For pair of {i32, i32}, i64 store --> two i32 stores.
	/// For pair of {i32, i16}, i64 store --> two i32 stores.
	/// For pair of {i16, i16}, i32 store --> two i16 stores.
	/// For pair of {i16, i8},  i32 store --> two i16 stores.
	/// For pair of {i8, i8},   i16 store --> two i8 stores.
	///
	/// We allow each target to determine specifically which kind of splitting is
	/// supported.
	///
	/// The store patterns are commonly seen from the simple code snippet below
	/// if only std::make_pair(...) is sroa transformed before inlined into hoo.
	///   void goo(const std::pair<int, float> &);
	///   hoo() {
	///     ...
	///     goo(std::make_pair(tmp, ftmp));
	///     ...
	///   }
	///
	/// Nothing is done at CodeGenOpt::None or for volatile stores. On success the
	/// high-half store is returned; it is chained on the low-half store. An empty
	/// SDValue is returned when the pattern does not match or the target declines.
	SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
	  if (OptLevel == CodeGenOpt::None)
	    return SDValue();

	  // Splitting turns one memory access into two, which must not be done to a
	  // volatile store.
	  if (ST->isVolatile())
	    return SDValue();

	  SDValue Val = ST->getValue();
	  SDLoc DL(ST);

	  // Match OR operand.
	  if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
	    return SDValue();

	  // Match SHL operand and get Lower and Higher parts of Val.
	  SDValue Op1 = Val.getOperand(0);
	  SDValue Op2 = Val.getOperand(1);
	  SDValue Lo, Hi;
	  if (Op1.getOpcode() != ISD::SHL) {
	    std::swap(Op1, Op2);
	    if (Op1.getOpcode() != ISD::SHL)
	      return SDValue();
	  }
	  Lo = Op2;
	  Hi = Op1.getOperand(0);
	  if (!Op1.hasOneUse())
	    return SDValue();

	  // Match shift amount to HalfValBitSize.
	  unsigned HalfValBitSize = Val.getValueSizeInBits() / 2;
	  ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
	  if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
	    return SDValue();

	  // Lo and Hi must each be a single-use zero-extension of a scalar integer
	  // that is no wider than half of the stored value.
	  if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
	      !Lo.getOperand(0).getValueType().isScalarInteger() ||
	      Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize ||
	      Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
	      !Hi.getOperand(0).getValueType().isScalarInteger() ||
	      Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize)
	    return SDValue();

	  // Use the EVT of low and high parts before bitcast as the input
	  // of target query.
	  EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST)
	                  ? Lo.getOperand(0).getValueType()
	                  : Lo.getValueType();
	  EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST)
	                   ? Hi.getOperand(0).getValueType()
	                   : Hi.getValueType();
	  if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
	    return SDValue();

	  // Start to split store.
	  unsigned Alignment = ST->getAlignment();
	  unsigned HalfBytes = HalfValBitSize / 8;
	  MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
	  AAMDNodes AAInfo = ST->getAAInfo();

	  // Change the sizes of Lo and Hi's value types to HalfValBitSize.
	  EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
	  Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
	  Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));

	  SDValue Chain = ST->getChain();
	  SDValue Ptr = ST->getBasePtr();
	  // Lower value store.
	  SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
	                             Alignment, MMOFlags, AAInfo);
	  Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
	                    DAG.getConstant(HalfBytes, DL, Ptr.getValueType()));
	  // Higher value store. Its address is HalfBytes past the original one, so
	  // the alignment that can be claimed is MinAlign(Alignment, HalfBytes).
	  // Simply halving the alignment would produce 0 for a 1-byte aligned store
	  // and would over-claim when the store is aligned to more than its size.
	  SDValue St1 =
	      DAG.getStore(St0, DL, Hi, Ptr,
	                   ST->getPointerInfo().getWithOffset(HalfBytes),
	                   MinAlign(Alignment, HalfBytes), MMOFlags, AAInfo);
	  return St1;
	}

	/// Rewrite an element insertion whose inserted scalar is really a bitcast
	/// vector as a vector shuffle:
	///   insert_vector_elt V, (bitcast X from vector type), IdxC -->
	///   bitcast(shuffle (bitcast V), (extended X), Mask)
	/// An insert_subvector node is deliberately not used because that would
	/// require a legal subvector type.
	///
	/// \p N is the insert_vector_elt node and \p InsIndex the constant element
	/// index. Returns an empty SDValue when the inserted value is not a single-use
	/// bitcast from a vector, or when the target rejects the shuffle mask.
	SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
	  SDValue Inserted = N->getOperand(1);
	  if (Inserted.getOpcode() != ISD::BITCAST)
	    return SDValue();
	  if (!Inserted.hasOneUse())
	    return SDValue();
	  if (!Inserted.getOperand(0).getValueType().isVector())
	    return SDValue();

	  SDValue SrcVec = Inserted.getOperand(0);
	  SDValue DstVec = N->getOperand(0);
	  EVT SrcVT = SrcVec.getValueType();
	  EVT DstVT = DstVec.getValueType();
	  unsigned SrcElts = SrcVT.getVectorNumElements();
	  unsigned Ratio = DstVT.getSizeInBits() / SrcVT.getSizeInBits();
	  unsigned MaskLen = Ratio * SrcElts;

	  // Step 1: Build the shuffle mask for the insertion. Operand 0 of the
	  // shuffle is the destination vector, so untouched lanes map to themselves.
	  // The inserted subvector occupies the first lanes of operand 1. Example:
	  // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
	  SmallVector<int, 16> Mask;
	  Mask.reserve(MaskLen);
	  for (unsigned Lane = 0; Lane != MaskLen; ++Lane) {
	    const bool InInsertedSlot = Lane / SrcElts == InsIndex;
	    Mask.push_back(InInsertedSlot ? (Lane % SrcElts) + MaskLen : Lane);
	  }

	  // Give up if the target cannot handle the shuffle we would create.
	  EVT SrcEltVT = SrcVT.getVectorElementType();
	  EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, MaskLen);
	  if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
	    return SDValue();

	  // Step 2: Widen the inserted source vector to the destination size by
	  // concatenating it with undefined vectors.
	  SDLoc DL(N);
	  SmallVector<SDValue, 8> Pieces(Ratio, DAG.getUNDEF(SrcVT));
	  Pieces[0] = SrcVec;
	  SDValue WideSrc = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, Pieces);

	  // Step 3: Shuffle the widened subvector into the (bitcast) destination.
	  SDValue DstAsShufVT = DAG.getBitcast(ShufVT, DstVec);
	  SDValue Shuffled =
	      DAG.getVectorShuffle(ShufVT, DL, DstAsShufVT, WideSrc, Mask);
	  AddToWorklist(WideSrc.getNode());
	  AddToWorklist(DstAsShufVT.getNode());
	  AddToWorklist(Shuffled.getNode());
	  return DAG.getBitcast(DstVT, Shuffled);
	}

	/// Combine an insert_vector_elt node (operands: vector, value, index).
	///
	/// Folds tried, in order: inserting undef; re-inserting an element just
	/// extracted from the same vector at the same index; the bitcast-subvector
	/// shuffle form; reordering two nested constant-index inserts; and folding
	/// the insert into a BUILD_VECTOR (or an all-undef vector). Returns an empty
	/// SDValue when none applies.
	SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
	  SDValue BaseVec = N->getOperand(0);
	  SDValue NewElt = N->getOperand(1);
	  SDValue IdxOp = N->getOperand(2);
	  SDLoc DL(N);

	  // Inserting UNDEF leaves the input vector as it is.
	  if (NewElt.isUndef())
	    return BaseVec;

	  EVT VecVT = BaseVec.getValueType();

	  // Redundant insertion:
	  // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
	  if (NewElt.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	      BaseVec == NewElt.getOperand(0) && IdxOp == NewElt.getOperand(1))
	    return BaseVec;

	  // Everything below needs a constant insertion index.
	  auto *IdxC = dyn_cast<ConstantSDNode>(IdxOp);
	  if (!IdxC)
	    return SDValue();
	  unsigned InsIdx = IdxC->getZExtValue();

	  if (SDValue AsShuffle = combineInsertEltToShuffle(N, InsIdx))
	    return AsShuffle;

	  // Canonicalize insert_vector_elt dag nodes.
	  // Example:
	  // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
	  // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
	  //
	  // Only done when the inner insert has a single use, both indices are
	  // constants, and Idx1 < Idx0.
	  if (BaseVec.getOpcode() == ISD::INSERT_VECTOR_ELT && BaseVec.hasOneUse() &&
	      isa<ConstantSDNode>(BaseVec.getOperand(2))) {
	    unsigned InnerIdx = BaseVec.getConstantOperandVal(2);
	    if (InsIdx < InnerIdx) {
	      // Perform this node's insert first, then redo the inner one on top.
	      SDValue Hoisted = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
	                                    BaseVec.getOperand(0), NewElt, IdxOp);
	      AddToWorklist(Hoisted.getNode());
	      return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(BaseVec.getNode()),
	                         VecVT, Hoisted, BaseVec.getOperand(1),
	                         BaseVec.getOperand(2));
	    }
	  }

	  // No point continuing if a legal BUILD_VECTOR cannot be produced.
	  if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VecVT))
	    return SDValue();

	  // The input must be a BUILD_VECTOR (or UNDEF, which can essentially be
	  // converted to a BUILD_VECTOR). A BUILD_VECTOR is only folded when it has
	  // one use, i.e. when the output vector will replace the input vector.
	  SmallVector<SDValue, 8> Elts;
	  const bool FromBuildVector =
	      BaseVec.getOpcode() == ISD::BUILD_VECTOR && BaseVec.hasOneUse();
	  if (!FromBuildVector && !BaseVec.isUndef())
	    return SDValue();

	  if (FromBuildVector) {
	    Elts.append(BaseVec.getNode()->op_begin(), BaseVec.getNode()->op_end());
	  } else {
	    unsigned NumElts = VecVT.getVectorNumElements();
	    Elts.append(NumElts, DAG.getUNDEF(NewElt.getValueType()));
	  }

	  // Place the new element; indices past the end are left alone.
	  if (InsIdx < Elts.size()) {
	    // All the operands of BUILD_VECTOR must have the same type, so integer
	    // values are any-extended or truncated to the type of operand 0.
	    EVT EltOpVT = Elts[0].getValueType();
	    if (EltOpVT.isInteger())
	      Elts[InsIdx] = DAG.getAnyExtOrTrunc(NewElt, DL, EltOpVT);
	    else
	      Elts[InsIdx] = NewElt;
	  }

	  // Return the new vector.
	  return DAG.getBuildVector(VecVT, DL, Elts);
	}

	/// Replace an extract_vector_elt of a vector load with a load of just the
	/// extracted element.
	///
	/// \param EVE          the extract node being replaced.
	/// \param InVecVT      type of the vector being extracted from.
	/// \param EltNo        element index; may be a constant or a variable.
	/// \param OriginalLoad the vector load feeding the extract; must not be
	///                     volatile (asserted).
	///
	/// Both the value of \p EVE and the chain result of \p OriginalLoad are
	/// replaced with the new load. Returns SDValue(EVE, 0) on success, or an empty
	/// SDValue when the element's ABI alignment exceeds the load's alignment, a
	/// load of the element type is not legal or custom, or the target declines to
	/// reduce the load width.
	SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
	    SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) {
	  assert(!OriginalLoad->isVolatile());

	  EVT ResultVT = EVE->getValueType(0);
	  EVT VecEltVT = InVecVT.getVectorElementType();
	  unsigned Align = OriginalLoad->getAlignment();
	  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
	      VecEltVT.getTypeForEVT(*DAG.getContext()));

	  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
	    return SDValue();

	  // NOTE(review): an extending load is emitted below exactly when ResultVT is
	  // wider than VecEltVT, yet that case is reported to the target hook as
	  // NON_EXTLOAD here. The two arms look swapped; confirm before changing.
	  ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
	    ISD::NON_EXTLOAD : ISD::EXTLOAD;
	  if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
	    return SDValue();

	  Align = NewAlign;

	  SDValue NewPtr = OriginalLoad->getBasePtr();
	  SDValue Offset;
	  EVT PtrType = NewPtr.getValueType();
	  MachinePointerInfo MPI;
	  SDLoc DL(EVE);
	  if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
	    // Constant index: the byte offset is known, so fold it into both the
	    // address and the pointer info.
	    int Elt = ConstEltNo->getZExtValue();
	    unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
	    Offset = DAG.getConstant(PtrOff, DL, PtrType);
	    MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
	  } else {
	    // Variable index: compute index * element store size at run time and
	    // keep the original load's pointer info.
	    Offset = DAG.getZExtOrTrunc(EltNo, DL, PtrType);
	    Offset = DAG.getNode(
	        ISD::MUL, DL, PtrType, Offset,
	        DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType));
	    MPI = OriginalLoad->getPointerInfo();
	  }
	  NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, NewPtr, Offset);

	  // The replacement we need to do here is a little tricky: we need to
	  // replace an extractelement of a load with a load.
	  // Use ReplaceAllUsesOfValuesWith to do the replacement.
	  // Note that this replacement assumes that the extractvalue is the only
	  // use of the load; that's okay because we don't want to perform this
	  // transformation in other cases anyway.
	  SDValue Load;
	  SDValue Chain;
	  if (ResultVT.bitsGT(VecEltVT)) {
	    // If the result type of vextract is wider than the load, then issue an
	    // extending load instead.
	    ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT,
	                                                  VecEltVT)
	                                   ? ISD::ZEXTLOAD
	                                   : ISD::EXTLOAD;
	    Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT,
	                          OriginalLoad->getChain(), NewPtr, MPI, VecEltVT,
	                          Align, OriginalLoad->getMemOperand()->getFlags(),
	                          OriginalLoad->getAAInfo());
	    Chain = Load.getValue(1);
	  } else {
	    Load = DAG.getLoad(VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr,
	                       MPI, Align, OriginalLoad->getMemOperand()->getFlags(),
	                       OriginalLoad->getAAInfo());
	    // Take the chain from the load itself, before Load is rewrapped below.
	    Chain = Load.getValue(1);
	    if (ResultVT.bitsLT(VecEltVT))
	      Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
	    else
	      Load = DAG.getBitcast(ResultVT, Load);
	  }
	  WorklistRemover DeadNodes(*this);
	  // Replace the extract's value and the old load's chain in one step.
	  SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
	  SDValue To[] = { Load, Chain };
	  DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
	  // Since we're explicitly calling ReplaceAllUses, add the new node to the
	  // worklist explicitly as well.
	  AddToWorklist(Load.getNode());
	  AddUsersToWorklist(Load.getNode()); // Add users too
	  // Make sure to revisit this node to clean it up; it will usually be dead.
	  AddToWorklist(EVE);
	  ++OpsNarrowed;
	  return SDValue(EVE, 0);
	}

	// Combine an EXTRACT_VECTOR_ELT node.
	//
	// Tries, in order: folding extracts of undef, SCALAR_TO_VECTOR, BUILD_VECTOR,
	// a bitcast scalar, a matching INSERT_VECTOR_ELT and VECTOR_SHUFFLE inputs;
	// then narrowing an extract of a vector load into a scalar load.
	// Returns the replacement value, or an empty SDValue if nothing applied.
	SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
	  // (vextract (scalar_to_vector val, 0) -> val
	  SDValue InVec = N->getOperand(0);
	  EVT VT = InVec.getValueType();
	  EVT NVT = N->getValueType(0);

	  if (InVec.isUndef())
	    return DAG.getUNDEF(NVT);

	  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
	    // Check if the result type doesn't match the inserted element type. A
	    // SCALAR_TO_VECTOR may truncate the inserted element and the
	    // EXTRACT_VECTOR_ELT may widen the extracted vector.
	    SDValue InOp = InVec.getOperand(0);
	    if (InOp.getValueType() != NVT) {
	      assert(InOp.getValueType().isInteger() && NVT.isInteger());
	      return DAG.getSExtOrTrunc(InOp, SDLoc(InVec), NVT);
	    }
	    return InOp;
	  }

	  SDValue EltNo = N->getOperand(1);
	  ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);

	  // extract_vector_elt (build_vector x, y), 1 -> y
	  if (ConstEltNo &&
	      InVec.getOpcode() == ISD::BUILD_VECTOR &&
	      TLI.isTypeLegal(VT) &&
	      (InVec.hasOneUse() ||
	       TLI.aggressivelyPreferBuildVectorSources(VT))) {
	    SDValue Elt = InVec.getOperand(ConstEltNo->getZExtValue());
	    EVT InEltVT = Elt.getValueType();

	    // Sometimes build_vector's scalar input types do not match result type.
	    if (NVT == InEltVT)
	      return Elt;

	    // TODO: It may be useful to truncate if free if the build_vector implicitly
	    // converts.
	  }

	  // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x)
	  bool isLE = DAG.getDataLayout().isLittleEndian();
	  unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1;
	  if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() &&
	      ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) {
	    SDValue BCSrc = InVec.getOperand(0);
	    if (BCSrc.getValueType().isScalarInteger())
	      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
	  }

	  // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
	  //
	  // This only really matters if the index is non-constant since other combines
	  // on the constant elements already work.
	  if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT &&
	      EltNo == InVec.getOperand(2)) {
	    SDValue Elt = InVec.getOperand(1);
	    return VT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, SDLoc(N), NVT) : Elt;
	  }

	  // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
	  // We only perform this optimization before the op legalization phase because
	  // we may introduce new vector instructions which are not backed by TD
	  // patterns. For example on AVX, extracting elements from a wide vector
	  // without using extract_subvector. However, if we can find an underlying
	  // scalar value, then we can always use that.
	  if (ConstEltNo && InVec.getOpcode() == ISD::VECTOR_SHUFFLE) {
	    int NumElem = VT.getVectorNumElements();
	    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(InVec);
	    // Find the new index to extract from.
	    int OrigElt = SVOp->getMaskElt(ConstEltNo->getZExtValue());

	    // Extracting an undef index is undef.
	    if (OrigElt == -1)
	      return DAG.getUNDEF(NVT);

	    // Select the right vector half to extract from.
	    SDValue SVInVec;
	    if (OrigElt < NumElem) {
	      SVInVec = InVec->getOperand(0);
	    } else {
	      SVInVec = InVec->getOperand(1);
	      OrigElt -= NumElem;
	    }

	    if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) {
	      SDValue InOp = SVInVec.getOperand(OrigElt);
	      if (InOp.getValueType() != NVT) {
	        assert(InOp.getValueType().isInteger() && NVT.isInteger());
	        InOp = DAG.getSExtOrTrunc(InOp, SDLoc(SVInVec), NVT);
	      }

	      return InOp;
	    }

	    // FIXME: We should handle recursing on other vector shuffles and
	    // scalar_to_vector here as well.

	    if (!LegalOperations ||
	        // FIXME: Should really be just isOperationLegalOrCustom.
	        TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VT) ||
	        TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VT)) {
	      EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
	      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec,
	                         DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy));
	    }
	  }

	  bool BCNumEltsChanged = false;
	  EVT ExtVT = VT.getVectorElementType();
	  EVT LVT = ExtVT;

	  // If the result of load has to be truncated, then it's not necessarily
	  // profitable.
	  if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT))
	    return SDValue();

	  if (InVec.getOpcode() == ISD::BITCAST) {
	    // Don't duplicate a load with other uses.
	    if (!InVec.hasOneUse())
	      return SDValue();

	    EVT BCVT = InVec.getOperand(0).getValueType();
	    if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
	      return SDValue();
	    if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
	      BCNumEltsChanged = true;
	    InVec = InVec.getOperand(0);
	    ExtVT = BCVT.getVectorElementType();
	  }

	  // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size)
	  if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() &&
	      ISD::isNormalLoad(InVec.getNode()) &&
	      !N->getOperand(1)->hasPredecessor(InVec.getNode())) {
	    SDValue Index = N->getOperand(1);
	    if (LoadSDNode *OrigLoad = dyn_cast<LoadSDNode>(InVec)) {
	      if (!OrigLoad->isVolatile()) {
	        return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index,
	                                                             OrigLoad);
	      }
	    }
	  }

	  // Perform only after legalization to ensure build_vector / vector_shuffle
	  // optimizations have already been done.
	  if (!LegalOperations) return SDValue();

	  // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
	  // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
	  // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)

	  if (ConstEltNo) {
	    int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();

	    LoadSDNode *LN0 = nullptr;
	    const ShuffleVectorSDNode *SVN = nullptr;
	    if (ISD::isNormalLoad(InVec.getNode())) {
	      LN0 = cast<LoadSDNode>(InVec);
	    } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	               InVec.getOperand(0).getValueType() == ExtVT &&
	               ISD::isNormalLoad(InVec.getOperand(0).getNode())) {
	      // Don't duplicate a load with other uses.
	      if (!InVec.hasOneUse())
	        return SDValue();

	      LN0 = cast<LoadSDNode>(InVec.getOperand(0));
	    } else if ((SVN = dyn_cast<ShuffleVectorSDNode>(InVec))) {
	      // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
	      // =>
	      // (load $addr+1*size)

	      // Don't duplicate a load with other uses.
	      if (!InVec.hasOneUse())
	        return SDValue();

	      // If the bit convert changed the number of elements, it is unsafe
	      // to examine the mask.
	      if (BCNumEltsChanged)
	        return SDValue();

	      // Select the input vector, guarding against out of range extract vector.
	      // Valid mask positions are [0, NumElems), so Elt == NumElems must be
	      // treated as out of range as well.
	      unsigned NumElems = VT.getVectorNumElements();
	      int Idx = (Elt >= (int)NumElems) ? -1 : SVN->getMaskElt(Elt);
	      InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1);

	      if (InVec.getOpcode() == ISD::BITCAST) {
	        // Don't duplicate a load with other uses.
	        if (!InVec.hasOneUse())
	          return SDValue();

	        InVec = InVec.getOperand(0);
	      }
	      if (ISD::isNormalLoad(InVec.getNode())) {
	        LN0 = cast<LoadSDNode>(InVec);
	        Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems;
	        EltNo = DAG.getConstant(Elt, SDLoc(EltNo), EltNo.getValueType());
	      }
	    }

	    // Make sure we found a non-volatile load and the extractelement is
	    // the only use.
	    if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
	      return SDValue();

	    // If Idx was -1 above, Elt is going to be -1, so just return undef.
	    // The undef must have the type of this node's result (NVT), which may
	    // differ from the vector element type.
	    if (Elt == -1)
	      return DAG.getUNDEF(NVT);

	    return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0);
	  }

	  return SDValue();
	}

	// Simplify (build_vec (ext )) to (bitcast (build_vec ))
	//
	// If every defined operand of the BUILD_VECTOR N is an ANY_EXTEND or
	// ZERO_EXTEND from one common source type, build a vector of the narrower
	// source type (padding with undef or zero) and bitcast it to N's type.
	// Returns an empty SDValue when the pattern does not match or the narrower
	// vector type is not legal.
	SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
	  // We perform this optimization post type-legalization because
	  // the type-legalizer often scalarizes integer-promoted vectors.
	  // Performing this optimization before may create bit-casts which
	  // will be type-legalized to complex code sequences.
	  // We perform this optimization only before the operation legalizer because we
	  // may introduce illegal operations.
	  if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
	    return SDValue();

	  unsigned NumInScalars = N->getNumOperands();
	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);

	  // Check to see if this is a BUILD_VECTOR of a bunch of values
	  // which come from any_extend or zero_extend nodes. If so, we can create
	  // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
	  // optimizations. We do not handle sign-extend because we can't fill the sign
	  // using shuffles.
	  EVT SourceType = MVT::Other;
	  bool AllAnyExt = true;

	  for (unsigned i = 0; i != NumInScalars; ++i) {
	    SDValue In = N->getOperand(i);
	    // Ignore undef inputs.
	    if (In.isUndef()) continue;

	    bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND;
	    bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND;

	    // Abort if the element is not an extension.
	    if (!ZeroExt && !AnyExt) {
	      SourceType = MVT::Other;
	      break;
	    }

	    // The input is a ZeroExt or AnyExt. Check the original type.
	    EVT InTy = In.getOperand(0).getValueType();

	    // Check that all of the widened source types are the same.
	    if (SourceType == MVT::Other)
	      // First time.
	      SourceType = InTy;
	    else if (InTy != SourceType) {
	      // Multiple incoming types. Abort.
	      SourceType = MVT::Other;
	      break;
	    }

	    // Check if all of the extends are ANY_EXTENDs.
	    AllAnyExt &= AnyExt;
	  }

	  // In order to have valid types, all of the inputs must be extended from the
	  // same source type and all of the inputs must be any or zero extend.
	  // Scalar sizes must be a power of two.
	  EVT OutScalarTy = VT.getScalarType();
	  bool ValidTypes = SourceType != MVT::Other &&
	                    isPowerOf2_32(OutScalarTy.getSizeInBits()) &&
	                    isPowerOf2_32(SourceType.getSizeInBits());

	  // Create a new simpler BUILD_VECTOR sequence which other optimizations can
	  // turn into a single shuffle instruction.
	  if (!ValidTypes)
	    return SDValue();

	  bool isLE = DAG.getDataLayout().isLittleEndian();
	  unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
	  assert(ElemRatio > 1 && "Invalid element size ratio");
	  // Lanes not covered by a source value are undef when every extend is an
	  // ANY_EXTEND, and zero otherwise.
	  SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
	                               DAG.getConstant(0, DL, SourceType);

	  unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
	  SmallVector<SDValue, 8> Ops(NewBVElems, Filler);

	  // Populate the new build_vector
	  for (unsigned i = 0; i != NumInScalars; ++i) {
	    SDValue Cast = N->getOperand(i);
	    assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
	            Cast.getOpcode() == ISD::ZERO_EXTEND ||
	            Cast.isUndef()) && "Invalid cast opcode");
	    SDValue In;
	    if (Cast.isUndef())
	      In = DAG.getUNDEF(SourceType);
	    else
	      In = Cast->getOperand(0);
	    // The source value goes in the first narrow lane of its group on
	    // little-endian targets and in the last one on big-endian targets.
	    unsigned Index = isLE ? (i * ElemRatio) :
	                            (i * ElemRatio + (ElemRatio - 1));

	    assert(Index < Ops.size() && "Invalid index");
	    Ops[Index] = In;
	  }

	  // The type of the new BUILD_VECTOR node.
	  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
	  assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
	         "Invalid vector size");
	  // Check if the new vector type is legal.
	  if (!isTypeLegal(VecVT)) return SDValue();

	  // Make the new BUILD_VECTOR.
	  SDValue BV = DAG.getBuildVector(VecVT, DL, Ops);

	  // The new BUILD_VECTOR node has the potential to be further optimized.
	  AddToWorklist(BV.getNode());
	  // Bitcast to the desired type.
	  return DAG.getBitcast(VT, BV);
	}

	// Turn a BUILD_VECTOR whose defined operands are all the same integer-to-FP
	// conversion (UINT_TO_FP or SINT_TO_FP) of one common source type into a
	// single vector conversion of a BUILD_VECTOR of the unconverted scalars.
	// Returns an empty SDValue when the pattern does not match, fewer than two
	// operands are defined, or the vector conversion / source vector type is not
	// supported by the target.
	SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
	  EVT VT = N->getValueType(0);

	  unsigned NumInScalars = N->getNumOperands();
	  SDLoc DL(N);

	  EVT SrcVT = MVT::Other;
	  // DELETED_NODE serves as the "no conversion opcode seen yet" marker.
	  unsigned Opcode = ISD::DELETED_NODE;
	  unsigned NumDefs = 0;

	  for (unsigned i = 0; i != NumInScalars; ++i) {
	    SDValue In = N->getOperand(i);
	    unsigned Opc = In.getOpcode();

	    if (Opc == ISD::UNDEF)
	      continue;

	    // If all scalar values are floats and converted from integers.
	    if (Opcode == ISD::DELETED_NODE &&
	        (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) {
	      Opcode = Opc;
	    }

	    // Every defined operand must use the same conversion opcode.
	    if (Opc != Opcode)
	      return SDValue();

	    EVT InVT = In.getOperand(0).getValueType();

	    // If all scalar values are typed differently, bail out. It's chosen to
	    // simplify BUILD_VECTOR of integer types.
	    if (SrcVT == MVT::Other)
	      SrcVT = InVT;
	    if (SrcVT != InVT)
	      return SDValue();
	    NumDefs++;
	  }

	  // If the vector has just one element defined, it's not worth to fold it into
	  // a vectorized one.
	  if (NumDefs < 2)
	    return SDValue();

	  assert((Opcode == ISD::UINT_TO_FP || Opcode == ISD::SINT_TO_FP)
	         && "Should only handle conversion from integer to float.");
	  assert(SrcVT != MVT::Other && "Cannot determine source type!");

	  EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);

	  if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
	    return SDValue();

	  // Just because the floating-point vector type is legal does not necessarily
	  // mean that the corresponding integer vector type is.
	  if (!isTypeLegal(NVT))
	    return SDValue();

	  // Collect the unconverted scalars; undef operands stay undef.
	  SmallVector<SDValue, 8> Opnds;
	  for (unsigned i = 0; i != NumInScalars; ++i) {
	    SDValue In = N->getOperand(i);

	    if (In.isUndef())
	      Opnds.push_back(DAG.getUNDEF(SrcVT));
	    else
	      Opnds.push_back(In.getOperand(0));
	  }
	  SDValue BV = DAG.getBuildVector(NVT, DL, Opnds);
	  AddToWorklist(BV.getNode());

	  return DAG.getNode(Opcode, DL, VT, BV);
	}

	// Build one VECTOR_SHUFFLE that places the elements of the BUILD_VECTOR N
	// that come from VecIn1 / VecIn2 into their final lanes.
	//
	// VectorMask[i] names the input vector feeding lane i of N: the value
	// LeftIdx selects VecIn1 and LeftIdx + 1 selects VecIn2; lanes with any other
	// value are left undef in the produced shuffle. VecIn2 may be an empty
	// SDValue. When the input types differ from N's type the inputs are
	// concatenated, split or padded first. Returns the shuffle (of N's type), or
	// an empty SDValue if the type mismatch cannot be handled.
	SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
	                                           ArrayRef<int> VectorMask,
	                                           SDValue VecIn1, SDValue VecIn2,
	                                           unsigned LeftIdx) {
	  MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
	  SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy);

	  EVT VT = N->getValueType(0);
	  EVT InVT1 = VecIn1.getValueType();
	  EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;

	  unsigned Vec2Offset = 0;
	  unsigned NumElems = VT.getVectorNumElements();
	  unsigned ShuffleNumElems = NumElems;

	  // In case both the input vectors are extracted from same base
	  // vector we do not need extra addend (Vec2Offset) while
	  // computing shuffle mask.
	  if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
	      !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
	      !(VecIn1.getOperand(0) == VecIn2.getOperand(0)))
	    Vec2Offset = InVT1.getVectorNumElements();

	  // We can't generate a shuffle node with mismatched input and output types.
	  // Try to make the types match the type of the output.
	  if (InVT1 != VT || InVT2 != VT) {
	    if ((VT.getSizeInBits() % InVT1.getSizeInBits() == 0) && InVT1 == InVT2) {
	      // If the output vector length is a multiple of both input lengths,
	      // we can concatenate them and pad the rest with undefs.
	      unsigned NumConcats = VT.getSizeInBits() / InVT1.getSizeInBits();
	      assert(NumConcats >= 2 && "Concat needs at least two inputs!");
	      SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1));
	      ConcatOps[0] = VecIn1;
	      ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1);
	      VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
	      VecIn2 = SDValue();
	    } else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) {
	      if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))
	        return SDValue();

	      if (!VecIn2.getNode()) {
	        // If we only have one input vector, and it's twice the size of the
	        // output, split it in two.
	        VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1,
	                             DAG.getConstant(NumElems, DL, IdxTy));
	        VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx);
	        // Since we now have shorter input vectors, adjust the offset of the
	        // second vector's start.
	        Vec2Offset = NumElems;
	      } else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) {
	        // VecIn1 is wider than the output, and we have another, possibly
	        // smaller input. Pad the smaller input with undefs, shuffle at the
	        // input vector width, and extract the output.
	        // The shuffle type is different than VT, so check legality again.
	        if (LegalOperations &&
	            !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
	          return SDValue();

	        // Legalizing INSERT_SUBVECTOR is tricky - you basically have to
	        // lower it back into a BUILD_VECTOR. So if the inserted type is
	        // illegal, don't even try.
	        if (InVT1 != InVT2) {
	          if (!TLI.isTypeLegal(InVT2))
	            return SDValue();
	          VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1,
	                               DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
	        }
	        ShuffleNumElems = NumElems * 2;
	      } else {
	        // Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider
	        // than VecIn1. We can't handle this for now - this case will disappear
	        // when we start sorting the vectors by type.
	        return SDValue();
	      }
	    } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() &&
	               InVT1.getSizeInBits() == VT.getSizeInBits()) {
	      // VecIn1 already has the output width; widen the half-width VecIn2 by
	      // concatenating it with undef.
	      SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2));
	      ConcatOps[0] = VecIn2;
	      VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
	    } else {
	      // TODO: Support cases where the length mismatch isn't exactly by a
	      // factor of 2.
	      // TODO: Move this check upwards, so that if we have bad type
	      // mismatches, we don't create any DAG nodes.
	      return SDValue();
	    }
	  }

	  // Initialize mask to undef.
	  SmallVector<int, 8> Mask(ShuffleNumElems, -1);

	  // Only need to run up to the number of elements actually used, not the
	  // total number of elements in the shuffle - if we are shuffling a wider
	  // vector, the high lanes should be set to undef.
	  for (unsigned i = 0; i != NumElems; ++i) {
	    if (VectorMask[i] <= 0)
	      continue;

	    unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1);
	    if (VectorMask[i] == (int)LeftIdx) {
	      Mask[i] = ExtIndex;
	    } else if (VectorMask[i] == (int)LeftIdx + 1) {
	      Mask[i] = Vec2Offset + ExtIndex;
	    }
	  }

	  // The type the input vectors may have changed above.
	  InVT1 = VecIn1.getValueType();

	  // If we already have a VecIn2, it should have the same type as VecIn1.
	  // If we don't, get an undef/zero vector of the appropriate type.
	  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1);
	  assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type.");

	  SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask);
	  // If we shuffled at a wider width, take the low part as the result.
	  if (ShuffleNumElems > NumElems)
	    Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx);

	  return Shuffle;
	}

	// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
	// operations. If the types of the vectors we're extracting from allow it,
	// turn this into a vector_shuffle node.
	//
	// Operands may also be undef or zero constants. Returns the resulting
	// shuffle (or tree of shuffles), or an empty SDValue if the node does not
	// match or one of the pairwise shuffles cannot be created.
	SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);

	  // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
	  if (!isTypeLegal(VT))
	    return SDValue();

	  // May only combine to shuffle after legalize if shuffle is legal.
	  if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
	    return SDValue();

	  bool UsesZeroVector = false;
	  unsigned NumElems = N->getNumOperands();

	  // Record, for each element of the newly built vector, which input vector
	  // that element comes from. -1 stands for undef, 0 for the zero vector,
	  // and positive values for the input vectors.
	  // VectorMask maps each element to its vector number, and VecIn maps vector
	  // numbers to their initial SDValues.

	  SmallVector<int, 8> VectorMask(NumElems, -1);
	  SmallVector<SDValue, 8> VecIn;
	  // Slot 0 is a placeholder so that real input vectors get numbers >= 1.
	  VecIn.push_back(SDValue());

	  for (unsigned i = 0; i != NumElems; ++i) {
	    SDValue Op = N->getOperand(i);

	    if (Op.isUndef())
	      continue;

	    // See if we can use a blend with a zero vector.
	    // TODO: Should we generalize this to a blend with an arbitrary constant
	    // vector?
	    if (isNullConstant(Op) || isNullFPConstant(Op)) {
	      UsesZeroVector = true;
	      VectorMask[i] = 0;
	      continue;
	    }

	    // Not an undef or zero. If the input is something other than an
	    // EXTRACT_VECTOR_ELT with a constant index, bail out.
	    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	        !isa<ConstantSDNode>(Op.getOperand(1)))
	      return SDValue();
	    SDValue ExtractedFromVec = Op.getOperand(0);

	    // All inputs must have the same element type as the output.
	    if (VT.getVectorElementType() !=
	        ExtractedFromVec.getValueType().getVectorElementType())
	      return SDValue();

	    // Have we seen this input vector before?
	    // The vectors are expected to be tiny (usually 1 or 2 elements), so using
	    // a map back from SDValues to numbers isn't worth it.
	    unsigned Idx = std::distance(
	        VecIn.begin(), std::find(VecIn.begin(), VecIn.end(), ExtractedFromVec));
	    if (Idx == VecIn.size())
	      VecIn.push_back(ExtractedFromVec);

	    VectorMask[i] = Idx;
	  }

	  // If we didn't find at least one input vector, bail out.
	  if (VecIn.size() < 2)
	    return SDValue();

	  // If all the Operands of BUILD_VECTOR extract from same
	  // vector, then split the vector efficiently based on the maximum
	  // vector access index and adjust the VectorMask and
	  // VecIn accordingly.
	  if (VecIn.size() == 2) {
	    unsigned MaxIndex = 0;
	    unsigned NearestPow2 = 0;
	    SDValue Vec = VecIn.back();
	    EVT InVT = Vec.getValueType();
	    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
	    SmallVector<unsigned, 8> IndexVec(NumElems, 0);

	    // Remember the extract index of every defined lane and track the largest.
	    for (unsigned i = 0; i < NumElems; i++) {
	      if (VectorMask[i] <= 0)
	        continue;
	      unsigned Index = N->getOperand(i).getConstantOperandVal(1);
	      IndexVec[i] = Index;
	      MaxIndex = std::max(MaxIndex, Index);
	    }

	    NearestPow2 = PowerOf2Ceil(MaxIndex);
	    if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 &&
	        NumElems * 2 < NearestPow2) {
	      unsigned SplitSize = NearestPow2 / 2;
	      EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
	                                     InVT.getVectorElementType(), SplitSize);
	      if (TLI.isTypeLegal(SplitVT)) {
	        SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
	                                     DAG.getConstant(SplitSize, DL, IdxTy));
	        SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
	                                     DAG.getConstant(0, DL, IdxTy));
	        VecIn.pop_back();
	        VecIn.push_back(VecIn1);
	        VecIn.push_back(VecIn2);

	        // Re-point every defined lane at the half that contains its index.
	        for (unsigned i = 0; i < NumElems; i++) {
	          if (VectorMask[i] <= 0)
	            continue;
	          VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
	        }
	      }
	    }
	  }

	  // TODO: We want to sort the vectors by descending length, so that adjacent
	  // pairs have similar length, and the longer vector is always first in the
	  // pair.

	  // TODO: Should this fire if some of the input vectors has illegal type (like
	  // it does now), or should we let legalization run its course first?

	  // Shuffle phase:
	  // Take pairs of vectors, and shuffle them so that the result has elements
	  // from these vectors in the correct places.
	  // For example, given:
	  // t10: i32 = extract_vector_elt t1, Constant:i64<0>
	  // t11: i32 = extract_vector_elt t2, Constant:i64<0>
	  // t12: i32 = extract_vector_elt t3, Constant:i64<0>
	  // t13: i32 = extract_vector_elt t1, Constant:i64<1>
	  // t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13
	  // We will generate:
	  // t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2
	  // t21: v4i32 = vector_shuffle<u,u,0,u> t3, undef
	  SmallVector<SDValue, 4> Shuffles;
	  for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) {
	    // Skip the placeholder in slot 0: pairs are (1,2), (3,4), ...
	    unsigned LeftIdx = 2 * In + 1;
	    SDValue VecLeft = VecIn[LeftIdx];
	    SDValue VecRight =
	        (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();

	    if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
	                                                VecRight, LeftIdx))
	      Shuffles.push_back(Shuffle);
	    else
	      return SDValue();
	  }

	  // If we need the zero vector as an "ingredient" in the blend tree, add it
	  // to the list of shuffles.
	  if (UsesZeroVector)
	    Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT)
	                                      : DAG.getConstantFP(0.0, DL, VT));

	  // If we only have one shuffle, we're done.
	  if (Shuffles.size() == 1)
	    return Shuffles[0];

	  // Update the vector mask to point to the post-shuffle vectors.
	  for (int &Vec : VectorMask)
	    if (Vec == 0)
	      Vec = Shuffles.size() - 1;
	    else
	      Vec = (Vec - 1) / 2;

	  // More than one shuffle. Generate a binary tree of blends, e.g. if from
	  // the previous step we got the set of shuffles t10, t11, t12, t13, we will
	  // generate:
	  // t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2
	  // t11: v8i32 = vector_shuffle<u,u,0,8,u,u,u,u> t3, t4
	  // t12: v8i32 = vector_shuffle<u,u,u,u,0,8,u,u> t5, t6
	  // t13: v8i32 = vector_shuffle<u,u,u,u,u,u,0,8> t7, t8
	  // t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11
	  // t21: v8i32 = vector_shuffle<u,u,u,u,4,5,14,15> t12, t13
	  // t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21

	  // Make sure the initial size of the shuffle list is even.
	  if (Shuffles.size() % 2)
	    Shuffles.push_back(DAG.getUNDEF(VT));

	  for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) {
	    // Pad an odd level with an undef vector so every entry has a partner.
	    if (CurSize % 2) {
	      Shuffles[CurSize] = DAG.getUNDEF(VT);
	      CurSize++;
	    }
	    for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) {
	      int Left = 2 * In;
	      int Right = 2 * In + 1;
	      SmallVector<int, 8> Mask(NumElems, -1);
	      // Blend lane-for-lane: each lane is taken from whichever of the two
	      // shuffles currently owns it, and is then owned by the merged result.
	      for (unsigned i = 0; i != NumElems; ++i) {
	        if (VectorMask[i] == Left) {
	          Mask[i] = i;
	          VectorMask[i] = In;
	        } else if (VectorMask[i] == Right) {
	          Mask[i] = i + NumElems;
	          VectorMask[i] = In;
	        }
	      }

	      Shuffles[In] =
	          DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask);
	    }
	  }
	  return Shuffles[0];
	}

	// Combine a BUILD_VECTOR node: fold an all-undef vector to undef, recognise
	// a run of consecutive extracts from one vector as that vector (or a
	// subvector of it), and otherwise hand the node to the reduceBuildVec*
	// helpers in turn. Returns an empty SDValue when nothing applies.
	SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
	  EVT VT = N->getValueType(0);

	  // A vector built entirely of undefs is undef.
	  if (ISD::allOperandsUndef(N))
	    return DAG.getUNDEF(VT);

	  // Check if we can express BUILD VECTOR via subvector extract.
	  if (!LegalTypes && (N->getNumOperands() > 1)) {
	    SDValue First = N->getOperand(0);

	    // Yields the constant lane index when Elt is an EXTRACT_VECTOR_ELT from
	    // the same vector as the first operand; otherwise -1 converted to
	    // uint64_t.
	    auto laneOf = [&](SDValue Elt) -> uint64_t {
	      if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	        return -1;
	      if (First.getOperand(0) != Elt.getOperand(0))
	        return -1;
	      if (auto IdxNode = dyn_cast<ConstantSDNode>(Elt.getOperand(1)))
	        return IdxNode->getZExtValue();
	      return -1;
	    };

	    // Every operand i has to extract lane (Base + i) of that vector.
	    int Base = laneOf(First);
	    for (unsigned i = 0; i < N->getNumOperands(); ++i) {
	      if (Base + i == laneOf(N->getOperand(i)))
	        continue;
	      Base = -1;
	      break;
	    }

	    // The whole source vector is being rebuilt lane for lane.
	    if (Base == 0 && First.getOperand(0).getValueType() == VT)
	      return First.getOperand(0);

	    // IDX must be multiple of output size.
	    if (Base != -1 && (Base % VT.getVectorNumElements()) == 0)
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
	                         First.getOperand(0), First.getOperand(1));
	  }

	  if (SDValue Ext = reduceBuildVecExtToExtBuildVec(N))
	    return Ext;

	  if (SDValue Conv = reduceBuildVecConvertToConvertBuildVec(N))
	    return Conv;

	  // The last helper's result (possibly empty) is the final answer.
	  return reduceBuildVecToShuffle(N);
	}

	// Fold a CONCAT_VECTORS whose operands are all either undef or a bitcast
	// from a scalar into a bitcast of a BUILD_VECTOR of those scalars. Only done
	// when the operand vector type is not legal. If any scalar is floating
	// point, the BUILD_VECTOR uses a floating-point element type and integer
	// scalars are bitcast to it. Returns an empty SDValue on any mismatch.
	static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT OpVT = N->getOperand(0).getValueType();

	  // If the operands are legal vectors, leave them alone.
	  if (TLI.isTypeLegal(OpVT))
	    return SDValue();

	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  SmallVector<SDValue, 8> Scalars;

	  // Start out with an integer scalar as wide as one operand vector.
	  EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
	  SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);

	  // Keep track of what we encounter.
	  bool SawInteger = false;
	  bool SawFP = false;
	  for (const SDValue &Op : N->ops()) {
	    const bool IsScalarBitcast =
	        Op.getOpcode() == ISD::BITCAST &&
	        !Op.getOperand(0).getValueType().isVector();
	    if (IsScalarBitcast) {
	      Scalars.push_back(Op.getOperand(0));
	    } else if (Op.getOpcode() == ISD::UNDEF) {
	      Scalars.push_back(ScalarUndef);
	    } else {
	      return SDValue();
	    }

	    // Note whether we encounter an integer or floating point scalar.
	    // If it's neither, bail out, it could be something weird like x86mmx.
	    EVT PushedVT = Scalars.back().getValueType();
	    if (PushedVT.isFloatingPoint()) {
	      SawFP = true;
	    } else if (PushedVT.isInteger()) {
	      SawInteger = true;
	    } else {
	      return SDValue();
	    }
	  }

	  // If any of the operands is a floating point scalar bitcast to a vector,
	  // use floating point types throughout, and bitcast everything.
	  // Replace UNDEFs by another scalar UNDEF node, of the final desired type.
	  if (SawFP) {
	    SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits());
	    ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
	    if (SawInteger) {
	      for (SDValue &Scalar : Scalars) {
	        if (Scalar.getValueType() == SVT)
	          continue;
	        Scalar = Scalar.isUndef() ? ScalarUndef : DAG.getBitcast(SVT, Scalar);
	      }
	    }
	  }

	  const unsigned NumScalars = VT.getSizeInBits() / SVT.getSizeInBits();
	  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumScalars);
	  SDValue BV = DAG.getBuildVector(VecVT, DL, Scalars);
	  return DAG.getBitcast(VT, BV);
	}

	// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
	// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at
	// most two distinct vectors the same size as the result, attempt to turn this
	// into a legal shuffle.
	//
	// Bitcasts on the operands and on the extracted-from vectors are looked
	// through; undef operands become undef mask entries. Returns an empty
	// SDValue if the pattern does not match or the target rejects the mask.
	static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);
	  EVT OpVT = N->getOperand(0).getValueType();
	  int NumElts = VT.getVectorNumElements();
	  int NumOpElts = OpVT.getVectorNumElements();

	  // The (at most two) shuffle sources; undef means "slot not taken yet".
	  SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT);
	  SmallVector<int, 8> Mask;

	  for (SDValue Op : N->ops()) {
	    // Peek through any bitcast.
	    Op = peekThroughBitcast(Op);

	    // UNDEF nodes convert to UNDEF shuffle mask values.
	    if (Op.isUndef()) {
	      Mask.append((unsigned)NumOpElts, -1);
	      continue;
	    }

	    if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	      return SDValue();

	    // What vector are we extracting the subvector from and at what index?
	    SDValue ExtVec = Op.getOperand(0);

	    // We want the EVT of the original extraction to correctly scale the
	    // extraction index.
	    EVT ExtVT = ExtVec.getValueType();

	    // Peek through any bitcast.
	    ExtVec = peekThroughBitcast(ExtVec);

	    // UNDEF nodes convert to UNDEF shuffle mask values.
	    if (ExtVec.isUndef()) {
	      Mask.append((unsigned)NumOpElts, -1);
	      continue;
	    }

	    if (!isa<ConstantSDNode>(Op.getOperand(1)))
	      return SDValue();
	    int ExtIdx = Op.getConstantOperandVal(1);

	    // Ensure that we are extracting a subvector from a vector the same
	    // size as the result.
	    if (ExtVT.getSizeInBits() != VT.getSizeInBits())
	      return SDValue();

	    // Scale the subvector index to account for any bitcast.
	    int NumExtElts = ExtVT.getVectorNumElements();
	    if (0 == (NumExtElts % NumElts))
	      ExtIdx /= (NumExtElts / NumElts);
	    else if (0 == (NumElts % NumExtElts))
	      ExtIdx *= (NumElts / NumExtElts);
	    else
	      return SDValue();

	    // At most we can reference 2 inputs in the final shuffle.
	    if (SV0.isUndef() || SV0 == ExtVec) {
	      SV0 = ExtVec;
	      for (int i = 0; i != NumOpElts; ++i)
	        Mask.push_back(i + ExtIdx);
	    } else if (SV1.isUndef() || SV1 == ExtVec) {
	      SV1 = ExtVec;
	      // Lanes of the second shuffle input are numbered after the first's.
	      for (int i = 0; i != NumOpElts; ++i)
	        Mask.push_back(i + ExtIdx + NumElts);
	    } else {
	      return SDValue();
	    }
	  }

	  if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, VT))
	    return SDValue();

	  return DAG.getVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
	                              DAG.getBitcast(VT, SV1), Mask);
	}

	// Combine a CONCAT_VECTORS node. In order, this tries: returning the single
	// operand, folding an all-undef concat to UNDEF, turning
	// concat(bitcast(scalar), undef...) into SCALAR_TO_VECTOR, merging
	// BUILD_VECTOR/UNDEF operands into one BUILD_VECTOR, the helper folds for
	// bitcast scalars and EXTRACT_SUBVECTORs, and finally removing a concat that
	// simply reassembles a single source vector. Returns an empty SDValue when
	// nothing applies.
	SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
	  // If we only have one input vector, we don't need to do any concatenation.
	  if (N->getNumOperands() == 1)
	    return N->getOperand(0);

	  // Check if all of the operands are undefs.
	  EVT VT = N->getValueType(0);
	  if (ISD::allOperandsUndef(N))
	    return DAG.getUNDEF(VT);

	  // Optimize concat_vectors where all but the first of the vectors are undef.
	  if (std::all_of(std::next(N->op_begin()), N->op_end(),
	                  [](const SDValue &Op) { return Op.isUndef(); })) {
	    SDValue In = N->getOperand(0);
	    assert(In.getValueType().isVector() && "Must concat vectors");

	    // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr).
	    if (In->getOpcode() == ISD::BITCAST &&
	        !In->getOperand(0).getValueType().isVector()) {
	      SDValue Scalar = In->getOperand(0);

	      // If the bitcast type isn't legal, it might be a trunc of a legal type;
	      // look through the trunc so we can still do the transform:
	      //   concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
	      if (Scalar->getOpcode() == ISD::TRUNCATE &&
	          !TLI.isTypeLegal(Scalar.getValueType()) &&
	          TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
	        Scalar = Scalar->getOperand(0);

	      EVT SclTy = Scalar->getValueType(0);

	      if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
	        return SDValue();

	      unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
	      if (VNTNumElms < 2)
	        return SDValue();

	      EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
	      if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
	        return SDValue();

	      SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar);
	      return DAG.getBitcast(VT, Res);
	    }
	  }

	  // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
	  // We have already tested above for an UNDEF only concatenation.
	  // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
	  // -> (BUILD_VECTOR A, B, ..., C, D, ...)
	  auto IsBuildVectorOrUndef = [](const SDValue &Op) {
	    return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
	  };
	  if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
	    SmallVector<SDValue, 8> Opnds;
	    EVT SVT = VT.getScalarType();

	    EVT MinVT = SVT;
	    if (!SVT.isFloatingPoint()) {
	      // If the BUILD_VECTORs are built from integers, they may have different
	      // operand types. Get the smallest type and truncate all operands to it.
	      bool FoundMinVT = false;
	      for (const SDValue &Op : N->ops())
	        if (ISD::BUILD_VECTOR == Op.getOpcode()) {
	          EVT OpSVT = Op.getOperand(0).getValueType();
	          MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
	          FoundMinVT = true;
	        }
	      assert(FoundMinVT && "Concat vector type mismatch");
	    }

	    for (const SDValue &Op : N->ops()) {
	      EVT OpVT = Op.getValueType();
	      unsigned NumElts = OpVT.getVectorNumElements();

	      if (ISD::UNDEF == Op.getOpcode())
	        Opnds.append(NumElts, DAG.getUNDEF(MinVT));

	      if (ISD::BUILD_VECTOR == Op.getOpcode()) {
	        if (SVT.isFloatingPoint()) {
	          assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
	          Opnds.append(Op->op_begin(), Op->op_begin() + NumElts);
	        } else {
	          for (unsigned i = 0; i != NumElts; ++i)
	            Opnds.push_back(
	                DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
	        }
	      }
	    }

	    assert(VT.getVectorNumElements() == Opnds.size() &&
	           "Concat vector type mismatch");
	    return DAG.getBuildVector(VT, SDLoc(N), Opnds);
	  }

	  // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
	  if (SDValue V = combineConcatVectorOfScalars(N, DAG))
	    return V;

	  // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
	  if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
	    if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
	      return V;

	  // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
	  // nodes often generate nop CONCAT_VECTOR nodes.
	  // Scan the CONCAT_VECTOR operands and look for CONCAT operations that
	  // place the incoming vectors at the exact same location.
	  SDValue SingleSource = SDValue();
	  unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements();

	  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
	    SDValue Op = N->getOperand(i);

	    if (Op.isUndef())
	      continue;

	    // Check if this is the identity extract:
	    if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
	      return SDValue();

	    // Find the single incoming vector for the extract_subvector.
	    if (SingleSource.getNode()) {
	      if (Op.getOperand(0) != SingleSource)
	        return SDValue();
	    } else {
	      SingleSource = Op.getOperand(0);

	      // Check the source type is the same as the type of the result.
	      // If not, this concat may extend the vector, so we can not
	      // optimize it away.
	      if (SingleSource.getValueType() != N->getValueType(0))
	        return SDValue();
	    }

	    unsigned IdentityIndex = i * PartNumElem;
	    ConstantSDNode *CS = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	    // The extract index must be constant.
	    if (!CS)
	      return SDValue();

	    // Check that we are reading from the identity index.
	    if (CS->getZExtValue() != IdentityIndex)
	      return SDValue();
	  }

	  if (SingleSource.getNode())
	    return SingleSource;

	  return SDValue();
	}

	/// If we are extracting a subvector produced by a wide binary operator with
	/// at least one operand that was the result of a vector concatenation, then
	/// try to use the narrow vector operands directly to avoid the concatenation
	/// and extraction.
	static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
	  // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
	  // some of these bailouts with other transforms.

	  // A constant extract index is required so it can be mapped onto a concat
	  // operand.
	  auto *IdxC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
	  if (!IdxC)
	    return SDValue();

	  // Only the "double then halve" shape is handled. A larger ratio may need
	  // more than two narrow binops to replace the wide one.
	  EVT VT = Extract->getValueType(0);
	  unsigned NumElems = VT.getVectorNumElements();
	  assert((IdxC->getZExtValue() % NumElems) == 0 &&
	         "Extract index is not a multiple of the vector length.");
	  SDValue WideSrc = Extract->getOperand(0);
	  if (WideSrc.getValueSizeInBits() != VT.getSizeInBits() * 2)
	    return SDValue();

	  // The extract source must be an (optionally bitcasted) wide vector binop.
	  SDValue BinOp = peekThroughBitcast(WideSrc);

	  // TODO: The motivating case for this transform is an x86 AVX1 target. That
	  // target has temptingly almost legal versions of bitwise logic ops in 256-bit
	  // flavors, but no other 256-bit integer support. This could be extended to
	  // handle any binop, but that may require fixing/adding other folds to avoid
	  // codegen regressions.
	  unsigned BOpcode = BinOp.getOpcode();
	  bool IsBitwiseLogic =
	      BOpcode == ISD::AND || BOpcode == ISD::OR || BOpcode == ISD::XOR;
	  if (!IsBitwiseLogic)
	    return SDValue();

	  // Halving only makes sense for a vector-typed binop.
	  EVT WideBVT = BinOp.getValueType();
	  if (!WideBVT.isVector())
	    return SDValue();

	  // Give up if the target cannot handle the half-width binop.
	  EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
	                                   WideBVT.getVectorNumElements() / 2);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
	    return SDValue();

	  // Look through bitcasts on the binop operands.
	  SDValue LHS = peekThroughBitcast(BinOp.getOperand(0));
	  SDValue RHS = peekThroughBitcast(BinOp.getOperand(1));

	  // At least one operand has to be a two-operand concat for this to pay off;
	  // such a concat doubles its input vector size.
	  // TODO: Should we also handle INSERT_SUBVECTOR patterns?
	  auto IsTwoOpConcat = [](SDValue V) {
	    return V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2;
	  };
	  bool ConcatL = IsTwoOpConcat(LHS);
	  bool ConcatR = IsTwoOpConcat(RHS);
	  if (!ConcatL && !ConcatR)
	    return SDValue();

	  // A non-concat operand needs its own half-sized extract. The original
	  // extract index cannot be reused because a bitcast may have changed the
	  // element count, so recompute it in terms of the narrow binop type.
	  unsigned ConcatOpNum = IdxC->getZExtValue() / NumElems;
	  unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
	  EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
	  SDLoc DL(Extract);

	  // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
	  // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N)
	  // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN
	  auto NarrowOperand = [&](bool IsConcat, SDValue Peeked, SDValue Original) {
	    if (IsConcat)
	      return DAG.getBitcast(NarrowBVT, Peeked.getOperand(ConcatOpNum));
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, Original,
	                       DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT));
	  };
	  SDValue X = NarrowOperand(ConcatL, LHS, BinOp.getOperand(0));
	  SDValue Y = NarrowOperand(ConcatR, RHS, BinOp.getOperand(1));

	  SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y);
	  return DAG.getBitcast(VT, NarrowBinOp);
	}

	/// If we are extracting a subvector from a wide vector load, convert to a
	/// narrow load to eliminate the extraction:
	/// (extract_subvector (load wide vector)) --> (load narrow vector)
	/// Returns an empty SDValue on big-endian targets, or when the source is not
	/// a single-use, non-extending, non-volatile load with a constant extract
	/// index.
	static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
	  // TODO: Add support for big-endian. The offset calculation must be adjusted.
	  if (DAG.getDataLayout().isBigEndian())
	    return SDValue();

	  // TODO: The one-use check is overly conservative. Check the cost of the
	  // extract instead or remove that condition entirely.
	  auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
	  auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
	  if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() ||
	      !ExtIdx)
	    return SDValue();

	  // The narrow load will be offset from the base address of the old load if
	  // we are extracting from something besides index 0 (little-endian).
	  EVT VT = Extract->getValueType(0);
	  SDLoc DL(Extract);
	  SDValue BaseAddr = Ld->getOperand(1);
	  unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize();

	  // TODO: Use "BaseIndexOffset" to make this more effective.
	  SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset,
	                                                   VT.getStoreSize());
	  SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
	  DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
	  return NewLd;
	}

	// Combine an EXTRACT_SUBVECTOR node. Tries, in order: extract of UNDEF,
	// narrowing a wide load, picking the matching operand of a CONCAT_VECTORS,
	// building a smaller BUILD_VECTOR, looking through an INSERT_SUBVECTOR of the
	// same size, and narrowing a wide bitwise binop. Returns an empty SDValue
	// when nothing applies.
	SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
	  EVT NVT = N->getValueType(0);
	  SDValue V = N->getOperand(0);

	  // Extract from UNDEF is UNDEF.
	  if (V.isUndef())
	    return DAG.getUNDEF(NVT);

	  if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT))
	    if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
	      return NarrowLoad;

	  // Combine:
	  //    (extract_subvec (concat V1, V2, ...), i)
	  // Into:
	  //    Vi if possible
	  // Only operand 0 is checked as 'concat' assumes all inputs of the same
	  // type.
	  if (V->getOpcode() == ISD::CONCAT_VECTORS &&
	      isa<ConstantSDNode>(N->getOperand(1)) &&
	      V->getOperand(0).getValueType() == NVT) {
	    unsigned Idx = N->getConstantOperandVal(1);
	    unsigned NumElems = NVT.getVectorNumElements();
	    assert((Idx % NumElems) == 0 &&
	           "IDX in concat is not a multiple of the result vector length.");
	    return V->getOperand(Idx / NumElems);
	  }

	  // Skip bitcasting
	  V = peekThroughBitcast(V);

	  // If the input is a build vector. Try to make a smaller build vector.
	  if (V->getOpcode() == ISD::BUILD_VECTOR) {
	    if (auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
	      EVT InVT = V->getValueType(0);
	      unsigned ExtractSize = NVT.getSizeInBits();
	      unsigned EltSize = InVT.getScalarSizeInBits();
	      // Only do this if we won't split any elements.
	      if (ExtractSize % EltSize == 0) {
	        unsigned NumElems = ExtractSize / EltSize;
	        EVT ExtractVT = EVT::getVectorVT(*DAG.getContext(),
	                                         InVT.getVectorElementType(), NumElems);
	        if ((!LegalOperations ||
	             TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT)) &&
	            (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
	          // Rescale the extract index from NVT elements to InVT elements.
	          unsigned IdxVal = (Idx->getZExtValue() * NVT.getScalarSizeInBits()) /
	                            EltSize;

	          // Extract the pieces from the original build_vector.
	          SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
	                                                makeArrayRef(V->op_begin() + IdxVal,
	                                                             NumElems));
	          return DAG.getBitcast(NVT, BuildVec);
	        }
	      }
	    }
	  }

	  if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
	    // Handle only simple case where vector being inserted and vector
	    // being extracted are of same size.
	    EVT SmallVT = V->getOperand(1).getValueType();
	    if (!NVT.bitsEq(SmallVT))
	      return SDValue();

	    // Only handle cases where both indexes are constants.
	    ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
	    ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));

	    if (InsIdx && ExtIdx) {
	      // Combine:
	      //    (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
	      // Into:
	      //    indices are equal or bit offsets are equal => V1
	      //    otherwise => (extract_subvec V1, ExtIdx)
	      if (InsIdx->getZExtValue() * SmallVT.getScalarSizeInBits() ==
	          ExtIdx->getZExtValue() * NVT.getScalarSizeInBits())
	        return DAG.getBitcast(NVT, V->getOperand(1));
	      return DAG.getNode(
	          ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT,
	          DAG.getBitcast(N->getOperand(0).getValueType(), V->getOperand(0)),
	          N->getOperand(1));
	    }
	  }

	  if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG))
	    return NarrowBOp;

	  return SDValue();
	}

	// Simplify a shuffle operand V given the set of its elements that the shuffle
	// actually reads (UsedElements). For CONCAT_VECTORS and INSERT_SUBVECTOR
	// nodes, sub-operands none of whose elements are used are replaced by UNDEF
	// and the remaining sub-operands are simplified recursively. Any other node
	// is returned unchanged.
	//
	// Note: for INSERT_SUBVECTOR the bits of UsedElements covered by the inserted
	// subvector are cleared before recursing into the base vector, so the
	// caller's bit vector is modified.
	static SDValue simplifyShuffleOperandRecursively(SmallBitVector &UsedElements,
	                                                 SDValue V, SelectionDAG &DAG) {
	  SDLoc DL(V);
	  EVT VT = V.getValueType();

	  switch (V.getOpcode()) {
	  default:
	    return V;

	  case ISD::CONCAT_VECTORS: {
	    EVT OpVT = V->getOperand(0).getValueType();
	    int OpSize = OpVT.getVectorNumElements();
	    SmallBitVector OpUsedElements(OpSize, false);
	    bool FoundSimplification = false;
	    SmallVector<SDValue, 4> NewOps;
	    NewOps.reserve(V->getNumOperands());
	    for (int i = 0, NumOps = V->getNumOperands(); i < NumOps; ++i) {
	      SDValue Op = V->getOperand(i);
	      bool OpUsed = false;
	      // Translate the used-element bits of V into bits local to operand i.
	      for (int j = 0; j < OpSize; ++j)
	        if (UsedElements[i * OpSize + j]) {
	          OpUsedElements[j] = true;
	          OpUsed = true;
	        }
	      NewOps.push_back(
	          OpUsed ? simplifyShuffleOperandRecursively(OpUsedElements, Op, DAG)
	                 : DAG.getUNDEF(OpVT));
	      // A simplification happened only if the operand was actually replaced.
	      FoundSimplification |= Op != NewOps.back();
	      OpUsedElements.reset();
	    }
	    if (FoundSimplification)
	      V = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, NewOps);
	    return V;
	  }

	  case ISD::INSERT_SUBVECTOR: {
	    SDValue BaseV = V->getOperand(0);
	    SDValue SubV = V->getOperand(1);
	    auto *IdxN = dyn_cast<ConstantSDNode>(V->getOperand(2));
	    if (!IdxN)
	      return V;

	    int SubSize = SubV.getValueType().getVectorNumElements();
	    int Idx = IdxN->getZExtValue();
	    bool SubVectorUsed = false;
	    SmallBitVector SubUsedElements(SubSize, false);
	    // Elements covered by the inserted subvector come from SubV, not BaseV:
	    // move their "used" bits over to the subvector's own bit vector.
	    for (int i = 0; i < SubSize; ++i)
	      if (UsedElements[i + Idx]) {
	        SubVectorUsed = true;
	        SubUsedElements[i] = true;
	        UsedElements[i + Idx] = false;
	      }

	    // Now recurse on both the base and sub vectors.
	    SDValue SimplifiedSubV =
	        SubVectorUsed
	            ? simplifyShuffleOperandRecursively(SubUsedElements, SubV, DAG)
	            : DAG.getUNDEF(SubV.getValueType());
	    SDValue SimplifiedBaseV =
	        simplifyShuffleOperandRecursively(UsedElements, BaseV, DAG);
	    if (SimplifiedSubV != SubV || SimplifiedBaseV != BaseV)
	      V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
	                      SimplifiedBaseV, SimplifiedSubV, V->getOperand(2));
	    return V;
	  }
	  }
	}

	// Record which elements of each shuffle input the mask of SVN reads, then
	// simplify both inputs against those sets. Returns a new shuffle over the
	// simplified inputs, or an empty SDValue if neither input changed.
	static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
	                                       SDValue N1, SelectionDAG &DAG) {
	  EVT VT = SVN->getValueType(0);
	  int NumElts = VT.getVectorNumElements();

	  SmallBitVector UsedFromN0(NumElts, false);
	  SmallBitVector UsedFromN1(NumElts, false);
	  for (int MaskElt : SVN->getMask()) {
	    // Negative mask entries are undef and reference neither input.
	    if (MaskElt < 0)
	      continue;
	    if (MaskElt < NumElts)
	      UsedFromN0[MaskElt] = true;
	    else
	      UsedFromN1[MaskElt - NumElts] = true;
	  }

	  SDValue S0 = simplifyShuffleOperandRecursively(UsedFromN0, N0, DAG);
	  SDValue S1 = simplifyShuffleOperandRecursively(UsedFromN1, N1, DAG);
	  bool Unchanged = (S0 == N0) && (S1 == N1);
	  if (Unchanged)
	    return SDValue();

	  return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask());
	}

	// Replace shuffle mask entries that select a known-undef element of N0 or N1
	// with -1 (undef). Returns the shuffle with the updated mask, or an empty
	// SDValue if no mask entry changed.
	static SDValue simplifyShuffleMask(ShuffleVectorSDNode *SVN, SDValue N0,
	                                   SDValue N1, SelectionDAG &DAG) {
	  auto isUndefElt = [](SDValue V, int Idx) {
	    // TODO - handle more cases as required.
	    if (V.getOpcode() == ISD::BUILD_VECTOR)
	      return V.getOperand(Idx).isUndef();
	    // Only element 0 of a SCALAR_TO_VECTOR is defined.
	    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
	      return (Idx != 0) || V.getOperand(0).isUndef();
	    return false;
	  };

	  EVT VT = SVN->getValueType(0);
	  unsigned NumElts = VT.getVectorNumElements();

	  bool Changed = false;
	  SmallVector<int, 8> NewMask;
	  for (unsigned i = 0; i != NumElts; ++i) {
	    int Idx = SVN->getMaskElt(i);
	    // Mask values [0, NumElts) select from N0; values [NumElts, 2*NumElts)
	    // select from N1. Idx == NumElts is element 0 of N1 and must be included.
	    if ((0 <= Idx && Idx < (int)NumElts && isUndefElt(N0, Idx)) ||
	        ((int)NumElts <= Idx && isUndefElt(N1, Idx - NumElts))) {
	      Changed = true;
	      Idx = -1;
	    }
	    NewMask.push_back(Idx);
	  }
	  if (Changed)
	    return DAG.getVectorShuffle(VT, SDLoc(SVN), N0, N1, NewMask);

	  return SDValue();
	}

	// Attempts to rewrite a shuffle of two CONCAT_VECTORS as a single concat, or
	// a shuffle of one concat as a narrower shuffle followed by a concat.
	// Returns an empty SDValue if the mask does not copy whole concat operands.
	static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);
	  unsigned NumElts = VT.getVectorNumElements();

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
	  ArrayRef<int> Mask = SVN->getMask();

	  EVT ConcatVT = N0.getOperand(0).getValueType();
	  unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
	  unsigned NumConcats = NumElts / NumElemsPerConcat;

	  // Special case: shuffle(concat(A,B)) is better expressed as
	  // concat(shuffle(A,B),UNDEF) when the shuffle leaves every element of the
	  // high half undefined.
	  bool HighHalfUndef =
	      NumElemsPerConcat * 2 == NumElts && N1.isUndef() &&
	      std::all_of(Mask.begin() + NumElemsPerConcat, Mask.end(),
	                  [](int M) { return M == -1; });
	  if (HighHalfUndef) {
	    SDValue LowShuffle = DAG.getVectorShuffle(
	        ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1),
	        makeArrayRef(Mask.begin(), NumElemsPerConcat));
	    SDValue HighUndef = DAG.getUNDEF(ConcatVT);
	    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, LowShuffle,
	                       HighUndef);
	  }

	  // Walk the mask one concat-operand-sized chunk at a time. Each chunk must
	  // be either an exact copy of one concat operand or entirely undef.
	  SmallVector<SDValue, 4> Ops;
	  for (unsigned Chunk = 0; Chunk != NumConcats; ++Chunk) {
	    unsigned Begin = Chunk * NumElemsPerConcat;

	    unsigned NumDefined = 0;
	    for (unsigned Off = 0; Off != NumElemsPerConcat; ++Off)
	      if (Mask[Begin + Off] >= 0)
	        ++NumDefined;

	    if (NumDefined == NumElemsPerConcat) {
	      // The copy has to start on a concat operand boundary...
	      if (Mask[Begin] % NumElemsPerConcat != 0)
	        return SDValue();

	      // ...and run through consecutive elements.
	      for (unsigned Off = 1; Off != NumElemsPerConcat; ++Off)
	        if (Mask[Begin + Off - 1] + 1 != Mask[Begin + Off])
	          return SDValue();

	      // Operand numbers past the end of N0 refer to operands of N1.
	      unsigned FirstElt = Mask[Begin] / NumElemsPerConcat;
	      unsigned NumN0Ops = N0.getNumOperands();
	      Ops.push_back(FirstElt < NumN0Ops ? N0.getOperand(FirstElt)
	                                        : N1.getOperand(FirstElt - NumN0Ops));
	      continue;
	    }

	    // A mix of defined and undef entries cannot be optimized.
	    if (NumDefined != 0)
	      return SDValue();

	    Ops.push_back(DAG.getUNDEF(ConcatVT));
	  }

	  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
	}

	// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
	// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
	//
	// SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always
	// a simplification in some sense, but it isn't appropriate in general: some
	// BUILD_VECTORs are substantially cheaper than others. The general case
	// of a BUILD_VECTOR requires inserting each element individually (or
	// performing the equivalent in a temporary stack variable). A BUILD_VECTOR of
	// all constants is a single constant pool load.  A BUILD_VECTOR where each
	// element is identical is a splat.  A BUILD_VECTOR where most of the operands
	// are undef lowers to a small number of element insertions.
	//
	// To deal with this, we currently use a bunch of mostly arbitrary heuristics.
	// We don't fold shuffles where one side is a non-zero constant, and we don't
	// fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate
	// non-constant operands. This seems to work out reasonably well in practice.
	static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
	                                       SelectionDAG &DAG,
	                                       const TargetLowering &TLI) {
	  EVT VT = SVN->getValueType(0);
	  unsigned NumElts = VT.getVectorNumElements();
	  SDValue N0 = SVN->getOperand(0);
	  SDValue N1 = SVN->getOperand(1);

	  if (!N0->hasOneUse() || !N1->hasOneUse())
	    return SDValue();

	  // If only one of N0,N1 is constant, bail out if it is not ALL_ZEROS as
	  // discussed above.
	  if (!N1.isUndef()) {
	    bool N0AnyConst = isAnyConstantBuildVector(N0.getNode());
	    bool N1AnyConst = isAnyConstantBuildVector(N1.getNode());
	    if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode()))
	      return SDValue();
	    if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode()))
	      return SDValue();
	  }

	  // If both inputs are splats of the same value then we can safely merge this
	  // to a single BUILD_VECTOR with undef elements based on the shuffle mask.
	  bool IsSplat = false;
	  auto *BV0 = dyn_cast<BuildVectorSDNode>(N0);
	  auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
	  if (BV0 && BV1)
	    if (SDValue Splat0 = BV0->getSplatValue())
	      IsSplat = (Splat0 == BV1->getSplatValue());

	  SmallVector<SDValue, 8> Ops;
	  SmallSet<SDValue, 16> DuplicateOps;
	  for (int M : SVN->getMask()) {
	    // Undef mask entries become undef BUILD_VECTOR operands.
	    SDValue Op = DAG.getUNDEF(VT.getScalarType());
	    if (M >= 0) {
	      // Map the mask value to (input vector, element index within it).
	      int Idx = M < (int)NumElts ? M : M - NumElts;
	      SDValue &S = (M < (int)NumElts ? N0 : N1);
	      if (S.getOpcode() == ISD::BUILD_VECTOR) {
	        Op = S.getOperand(Idx);
	      } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) {
	        assert(Idx == 0 && "Unexpected SCALAR_TO_VECTOR operand index.");
	        Op = S.getOperand(0);
	      } else {
	        // Operand can't be combined - bail out.
	        return SDValue();
	      }
	    }

	    // Don't duplicate a non-constant BUILD_VECTOR operand unless we're
	    // generating a splat; semantically, this is fine, but it's likely to
	    // generate low-quality code if the target can't reconstruct an appropriate
	    // shuffle.
	    if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op))
	      if (!IsSplat && !DuplicateOps.insert(Op).second)
	        return SDValue();

	    Ops.push_back(Op);
	  }

	  // BUILD_VECTOR requires all inputs to be of the same type, find the
	  // maximum type and extend them all.
	  EVT SVT = VT.getScalarType();
	  if (SVT.isInteger())
	    for (SDValue &Op : Ops)
	      SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
	  if (SVT != VT.getScalarType())
	    for (SDValue &Op : Ops)
	      Op = TLI.isZExtFree(Op.getValueType(), SVT)
	               ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT)
	               : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT);
	  return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
	}

	// Match shuffles that can be converted to any_vector_extend_in_reg.
	// This is often generated during legalization.
	// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src))
	// Returns an empty SDValue for non-integer types, big-endian targets, or when
	// no power-of-2 scale matches a legal extension.
	// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case.
	static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
	                                            SelectionDAG &DAG,
	                                            const TargetLowering &TLI,
	                                            bool LegalOperations,
	                                            bool LegalTypes) {
	  EVT VT = SVN->getValueType(0);
	  bool IsBigEndian = DAG.getDataLayout().isBigEndian();

	  // TODO Add support for big-endian when we have a test case.
	  if (!VT.isInteger() || IsBigEndian)
	    return SDValue();

	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  ArrayRef<int> Mask = SVN->getMask();
	  SDValue N0 = SVN->getOperand(0);

	  // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32))
	  // Every defined mask entry must sit at a multiple of Scale and select
	  // source element i / Scale.
	  auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) {
	    for (unsigned i = 0; i != NumElts; ++i) {
	      if (Mask[i] < 0)
	        continue;
	      if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
	        continue;
	      return false;
	    }
	    return true;
	  };

	  // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for
	  // power-of-2 extensions as they are the most likely.
	  for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
	    // Check for non power of 2 vector sizes
	    if (NumElts % Scale != 0)
	      continue;
	    if (!isAnyExtend(Scale))
	      continue;

	    // The extended type has Scale-times wider elements and Scale-times fewer
	    // of them, so its total size matches VT.
	    EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
	    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
	    if (!LegalTypes || TLI.isTypeLegal(OutVT))
	      if (!LegalOperations ||
	          TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
	        return DAG.getBitcast(VT,
	                              DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT));
	  }

	  return SDValue();
	}

	// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
	// each source element of a large type into the lowest elements of a smaller
	// destination type. This is often generated during legalization.
	// If the source node itself was a '*_extend_vector_inreg' node then we should
	// then be able to remove it.
	static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
	                                        SelectionDAG &DAG) {
	  EVT VT = SVN->getValueType(0);
	  bool IsBigEndian = DAG.getDataLayout().isBigEndian();

	  // TODO Add support for big-endian when we have a test case.
	  if (!VT.isInteger() || IsBigEndian)
	    return SDValue();

	  SDValue N0 = peekThroughBitcast(SVN->getOperand(0));

	  unsigned Opcode = N0.getOpcode();
	  if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
	      Opcode != ISD::SIGN_EXTEND_VECTOR_INREG &&
	      Opcode != ISD::ZERO_EXTEND_VECTOR_INREG)
	    return SDValue();

	  SDValue N00 = N0.getOperand(0);
	  ArrayRef<int> Mask = SVN->getMask();
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
	  unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits();

	  if (ExtDstSizeInBits % ExtSrcSizeInBits != 0)
	    return SDValue();
	  unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits;

	  // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
	  // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
	  // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
	  auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
	    for (unsigned i = 0; i != NumElts; ++i) {
	      if (Mask[i] < 0)
	        continue;
	      if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
	        continue;
	      return false;
	    }
	    return true;
	  };

	  // At the moment we just handle the case where we've truncated back to the
	  // same size as before the extension.
	  // TODO: handle more extension/truncation cases as cases arise.
	  if (EltSizeInBits != ExtSrcSizeInBits)
	    return SDValue();

	  // We can remove *extend_vector_inreg only if the truncation happens at
	  // the same scale as the extension.
	  if (isTruncate(ExtScale))
	    return DAG.getBitcast(VT, N00);

	  return SDValue();
	}

	// Combine shuffles of splat-shuffles of the form:
	// shuffle (shuffle V, undef, splat-mask), undef, M
	// When splat-mask has undef elements, the folded mask must not gain undefs
	// that do not come from composing the two shuffle masks.
	static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask,
	                                     ShuffleVectorSDNode *Splat,
	                                     SelectionDAG &DAG) {
	  ArrayRef<int> SplatMask = Splat->getMask();
	  assert(UserMask.size() == SplatMask.size() && "Mask length mismatch");

	  // Reusing the existing splat-shuffle is preferred. That is legal when each
	  // undef element of the splat mask lines up with an undef element of the
	  // user mask, or with a position where composing the masks yields undef.
	  // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
	  // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
	  //   Not legal: element 1 would become undef for users of the shuffle,
	  //   which it was not before the combine.
	  // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
	  //   Legal: the composition is exactly SplatMask.
	  // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
	  //   Legal: the composition keeps every undef of SplatMask and additionally
	  //   makes element zero undef.
	  bool CanReuseSplat = true;
	  for (unsigned Lane = 0, E = UserMask.size(); Lane != E; ++Lane) {
	    int UserIdx = UserMask[Lane];
	    if (UserIdx == -1 || SplatMask[Lane] != -1)
	      continue;
	    // The splat leaves this lane undef, but the composed mask would not.
	    if (SplatMask[UserIdx] != -1) {
	      CanReuseSplat = false;
	      break;
	    }
	  }
	  if (CanReuseSplat)
	    return SDValue(Splat, 0);

	  // Otherwise build a shuffle of the splat's inputs using the composition of
	  // the two masks.
	  SmallVector<int, 32> NewMask;
	  for (int Idx : UserMask) {
	    if (Idx == -1)
	      NewMask.push_back(-1);
	    else
	      NewMask.push_back(SplatMask[Idx]);
	  }

	  return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
	                              Splat->getOperand(0), Splat->getOperand(1),
	                              NewMask);
	}

	/// Returns the position in the mask that selects from the first vector
	/// operand, provided the mask takes exactly one element from the first
	/// operand and passes every other element of the second operand through in
	/// its own lane. Returns -1 for any other mask.
	static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
	  const int NumMaskElts = Mask.size();
	  int Op0Lane = -1;
	  // TODO: This does not match if there are undef elements in the shuffle mask.
	  // Should we ignore undefs in the shuffle mask instead? The trade-off is
	  // removing an instruction (a shuffle), but losing the knowledge that some
	  // vector lanes are not needed.
	  for (int Lane = 0; Lane != NumMaskElts; ++Lane) {
	    const int M = Mask[Lane];
	    const bool FromOp0 = M >= 0 && M < NumMaskElts;
	    if (!FromOp0) {
	      // Elements of operand 1 must stay in their own lane.
	      if (M != Lane + NumMaskElts)
	        return -1;
	      continue;
	    }
	    // Only a single element may come from operand 0.
	    if (Op0Lane != -1)
	      return -1;
	    Op0Lane = Lane;
	  }
	  return Op0Lane;
	}

	/// If a shuffle inserts exactly one element from a source vector operand into
	/// another vector operand and we can access the specified element as a scalar,
	/// then we can eliminate the shuffle.
	static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
	                                      SelectionDAG &DAG) {
	  // First, check if we are taking one element of a vector and shuffling that
	  // element into another vector.
	  ArrayRef<int> Mask = Shuf->getMask();
	  SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
	  SDValue Op0 = Shuf->getOperand(0);
	  SDValue Op1 = Shuf->getOperand(1);
	  int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
	  if (ShufOp0Index == -1) {
	    // Commute mask and check again.
	    ShuffleVectorSDNode::commuteMask(CommutedMask);
	    ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
	    if (ShufOp0Index == -1)
	      return SDValue();
	    // Commute operands to match the commuted shuffle mask.
	    std::swap(Op0, Op1);
	    Mask = CommutedMask;
	  }

	  // The shuffle inserts exactly one element from operand 0 into operand 1.
	  // Now see if we can access that element as a scalar via a real insert element
	  // instruction.
	  // TODO: We can try harder to locate the element as a scalar. Examples: it
	  // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
	  assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
	         "Shuffle mask value must be from operand 0");
	  if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
	    return SDValue();

	  // The insert must have written exactly the element the shuffle selects.
	  auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
	  if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
	    return SDValue();

	  // There's an existing insertelement with constant insertion index, so we
	  // don't need to check the legality/profitability of a replacement operation
	  // that differs at most in the constant value. The target should be able to
	  // lower any of those in a similar way. If not, legalization will expand this
	  // to a scalar-to-vector plus shuffle.
	  //
	  // Note that the shuffle may move the scalar from the position that the insert
	  // element used. Therefore, our new insert element occurs at the shuffle's
	  // mask index value, not the insert's index value.
	  // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
	  SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf),
	                                        Op0.getOperand(2).getValueType());
	  return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
	                     Op1, Op0.getOperand(1), NewInsIndex);
	}

	/// Combine a VECTOR_SHUFFLE node. The folds below are attempted in order and
	/// the first one that produces a value is returned:
	///  - canonicalizations for undef / identical operands,
	///  - mask simplification and shuffle-of-insert replacement,
	///  - splat handling (splat of splat, splat of BUILD_VECTOR),
	///  - operand simplification, vector-extend / truncation matching,
	///  - shuffle of CONCAT_VECTORS and shuffle of scalar sources,
	///  - merging with a bitcasted inner shuffle,
	///  - merging shuffle(shuffle(A, B), C) into a single shuffle.
	/// Returns an empty SDValue when nothing applies.
	SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
	EVT VT = N->getValueType(0);
	unsigned NumElts = VT.getVectorNumElements();

	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);

	assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");

	// Canonicalize shuffle undef, undef -> undef
	if (N0.isUndef() && N1.isUndef())
	return DAG.getUNDEF(VT);

	ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);

	// Canonicalize shuffle v, v -> v, undef
	if (N0 == N1) {
	SmallVector<int, 8> NewMask;
	for (unsigned i = 0; i != NumElts; ++i) {
	int Idx = SVN->getMaskElt(i);
	// Indices into the second operand are redirected to the (identical) first.
	if (Idx >= (int)NumElts) Idx -= NumElts;
	NewMask.push_back(Idx);
	}
	return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask);
	}

	// Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
	if (N0.isUndef())
	return DAG.getCommutedVectorShuffle(*SVN);

	// Remove references to rhs if it is undef
	if (N1.isUndef()) {
	bool Changed = false;
	SmallVector<int, 8> NewMask;
	for (unsigned i = 0; i != NumElts; ++i) {
	int Idx = SVN->getMaskElt(i);
	if (Idx >= (int)NumElts) {
	Idx = -1;
	Changed = true;
	}
	NewMask.push_back(Idx);
	}
	// Only build a new node when at least one mask entry was rewritten.
	if (Changed)
	return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
	}

	// Simplify shuffle mask if a referenced element is UNDEF.
	if (SDValue V = simplifyShuffleMask(SVN, N0, N1, DAG))
	return V;

	if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
	return InsElt;

	// A shuffle of a single vector that is a splat can always be folded.
	if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))
	if (N1->isUndef() && N0Shuf->isSplat())
	return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG);

	// If it is a splat, check if the argument vector is another splat or a
	// build_vector.
	if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
	SDNode *V = N0.getNode();

	// If this is a bit convert that changes the element type of the vector but
	// not the number of vector elements, look through it. Be careful not to
	// look through conversions that change things like v4f32 to v2f64.
	if (V->getOpcode() == ISD::BITCAST) {
	SDValue ConvInput = V->getOperand(0);
	if (ConvInput.getValueType().isVector() &&
	ConvInput.getValueType().getVectorNumElements() == NumElts)
	V = ConvInput.getNode();
	}

	if (V->getOpcode() == ISD::BUILD_VECTOR) {
	assert(V->getNumOperands() == NumElts &&
	"BUILD_VECTOR has wrong number of operands");
	SDValue Base;
	bool AllSame = true;
	// Base becomes the first non-undef operand, if there is one.
	for (unsigned i = 0; i != NumElts; ++i) {
	if (!V->getOperand(i).isUndef()) {
	Base = V->getOperand(i);
	break;
	}
	}
	// Splat of <u, u, u, u>, return <u, u, u, u>
	if (!Base.getNode())
	return N0;
	for (unsigned i = 0; i != NumElts; ++i) {
	if (V->getOperand(i) != Base) {
	AllSame = false;
	break;
	}
	}
	// Splat of <x, x, x, x>, return <x, x, x, x>
	if (AllSame)
	return N0;

	// Canonicalize any other splat as a build_vector.
	const SDValue &Splatted = V->getOperand(SVN->getSplatIndex());
	SmallVector<SDValue, 8> Ops(NumElts, Splatted);
	SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);

	// We may have jumped through bitcasts, so the type of the
	// BUILD_VECTOR may not match the type of the shuffle.
	if (V->getValueType(0) != VT)
	NewBV = DAG.getBitcast(VT, NewBV);
	return NewBV;
	}
	}

	// There are various patterns used to build up a vector from smaller vectors,
	// subvectors, or elements. Scan chains of these and replace unused insertions
	// or components with undef.
	if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG))
	return S;

	// Match shuffles that can be converted to any_vector_extend_in_reg.
	if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations, LegalTypes))
	return V;

	// Combine "truncate_vector_in_reg" style shuffles.
	if (SDValue V = combineTruncationShuffle(SVN, DAG))
	return V;

	// Shuffle of CONCAT_VECTORS: N1 must be undef or a CONCAT_VECTORS whose
	// pieces have the same type as N0's pieces.
	if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
	Level < AfterLegalizeVectorOps &&
	(N1.isUndef() \|\|
	(N1.getOpcode() == ISD::CONCAT_VECTORS &&
	N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
	if (SDValue V = partitionShuffleOfConcats(N, DAG))
	return V;
	}

	// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
	// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
	if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
	if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))
	return Res;

	// If this shuffle only has a single input that is a bitcasted shuffle,
	// attempt to merge the 2 shuffles and suitably bitcast the inputs/output
	// back to their original types.
	if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
	N1.isUndef() && Level < AfterLegalizeVectorOps &&
	TLI.isTypeLegal(VT)) {

	// Peek through the bitcast only if there is one user.
	SDValue BC0 = N0;
	while (BC0.getOpcode() == ISD::BITCAST) {
	if (!BC0.hasOneUse())
	break;
	BC0 = BC0.getOperand(0);
	}

	// Expand each mask entry M into Scale consecutive entries
	// Scale*M .. Scale*M+Scale-1; negative (undef) entries stay -1.
	auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) {
	if (Scale == 1)
	return SmallVector<int, 8>(Mask.begin(), Mask.end());

	SmallVector<int, 8> NewMask;
	for (int M : Mask)
	for (int s = 0; s != Scale; ++s)
	NewMask.push_back(M < 0 ? -1 : Scale * M + s);
	return NewMask;
	};

	if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
	EVT SVT = VT.getScalarType();
	EVT InnerVT = BC0->getValueType(0);
	EVT InnerSVT = InnerVT.getScalarType();

	// Determine which shuffle works with the smaller scalar type.
	EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
	EVT ScaleSVT = ScaleVT.getScalarType();

	// Both scalar sizes must be whole multiples of the smaller scalar size.
	if (TLI.isTypeLegal(ScaleVT) &&
	0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
	0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
	int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
	int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();

	// Scale the shuffle masks to the smaller scalar type.
	ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
	SmallVector<int, 8> InnerMask =
	ScaleShuffleMask(InnerSVN->getMask(), InnerScale);
	SmallVector<int, 8> OuterMask =
	ScaleShuffleMask(SVN->getMask(), OuterScale);

	// Merge the shuffle masks.
	SmallVector<int, 8> NewMask;
	for (int M : OuterMask)
	NewMask.push_back(M < 0 ? -1 : InnerMask[M]);

	// Test for shuffle mask legality over both commutations.
	SDValue SV0 = BC0->getOperand(0);
	SDValue SV1 = BC0->getOperand(1);
	bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
	if (!LegalMask) {
	std::swap(SV0, SV1);
	ShuffleVectorSDNode::commuteMask(NewMask);
	LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
	}

	if (LegalMask) {
	SV0 = DAG.getBitcast(ScaleVT, SV0);
	SV1 = DAG.getBitcast(ScaleVT, SV1);
	return DAG.getBitcast(
	VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
	}
	}
	}
	}

	// Canonicalize shuffles according to rules:
	// shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
	// shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
	// shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
	if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
	N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
	TLI.isTypeLegal(VT)) {
	// The incoming shuffle must be of the same type as the result of the
	// current shuffle.
	assert(N1->getOperand(0).getValueType() == VT &&
	"Shuffle types don't match");

	SDValue SV0 = N1->getOperand(0);
	SDValue SV1 = N1->getOperand(1);
	bool HasSameOp0 = N0 == SV0;
	bool IsSV1Undef = SV1.isUndef();
	if (HasSameOp0 \|\| IsSV1Undef \|\| N0 == SV1)
	// Commute the operands of this shuffle so that next rule
	// will trigger.
	return DAG.getCommutedVectorShuffle(*SVN);
	}

	// Try to fold according to rules:
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
	// Don't try to fold shuffles with illegal type.
	// Only fold if this shuffle is the only user of the other shuffle.
	if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) &&
	Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
	ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);

	// Don't try to fold splats; they're likely to simplify somehow, or they
	// might be free.
	if (OtherSV->isSplat())
	return SDValue();

	// The incoming shuffle must be of the same type as the result of the
	// current shuffle.
	assert(OtherSV->getOperand(0).getValueType() == VT &&
	"Shuffle types don't match");

	SDValue SV0, SV1;
	SmallVector<int, 4> Mask;
	// Compute the combined shuffle mask for a shuffle with SV0 as the first
	// operand, and SV1 as the second operand.
	for (unsigned i = 0; i != NumElts; ++i) {
	int Idx = SVN->getMaskElt(i);
	if (Idx < 0) {
	// Propagate Undef.
	Mask.push_back(Idx);
	continue;
	}

	SDValue CurrentVec;
	if (Idx < (int)NumElts) {
	// This shuffle index refers to the inner shuffle N0. Lookup the inner
	// shuffle mask to identify which vector is actually referenced.
	Idx = OtherSV->getMaskElt(Idx);
	if (Idx < 0) {
	// Propagate Undef.
	Mask.push_back(Idx);
	continue;
	}

	CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0)
	: OtherSV->getOperand(1);
	} else {
	// This shuffle index references an element within N1.
	CurrentVec = N1;
	}

	// Simple case where 'CurrentVec' is UNDEF.
	if (CurrentVec.isUndef()) {
	Mask.push_back(-1);
	continue;
	}

	// Canonicalize the shuffle index. We don't know yet if CurrentVec
	// will be the first or second operand of the combined shuffle.
	Idx = Idx % NumElts;
	if (!SV0.getNode() \|\| SV0 == CurrentVec) {
	// Ok. CurrentVec is the left hand side.
	// Update the mask accordingly.
	SV0 = CurrentVec;
	Mask.push_back(Idx);
	continue;
	}

	// Bail out if we cannot convert the shuffle pair into a single shuffle.
	if (SV1.getNode() && SV1 != CurrentVec)
	return SDValue();

	// Ok. CurrentVec is the right hand side.
	// Update the mask accordingly.
	SV1 = CurrentVec;
	Mask.push_back(Idx + NumElts);
	}

	// Check if all indices in Mask are Undef. In case, propagate Undef.
	bool isUndefMask = true;
	for (unsigned i = 0; i != NumElts && isUndefMask; ++i)
	isUndefMask &= Mask[i] < 0;

	if (isUndefMask)
	return DAG.getUNDEF(VT);

	// Any operand slot that was never assigned above becomes undef.
	if (!SV0.getNode())
	SV0 = DAG.getUNDEF(VT);
	if (!SV1.getNode())
	SV1 = DAG.getUNDEF(VT);

	// Avoid introducing shuffles with illegal mask.
	if (!TLI.isShuffleMaskLegal(Mask, VT)) {
	ShuffleVectorSDNode::commuteMask(Mask);

	if (!TLI.isShuffleMaskLegal(Mask, VT))
	return SDValue();

	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
	std::swap(SV0, SV1);
	}

	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
	// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
	return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
	}

	return SDValue();
	}

	/// Combine SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V, C0)) with a constant C0
	/// into either a truncate of the extracted scalar or a VECTOR_SHUFFLE of V
	/// (followed by EXTRACT_SUBVECTOR when the result has fewer elements).
	/// Returns an empty SDValue when no rewrite applies.
	SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
	  SDValue InVal = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  // Only the extract-with-constant-index form is handled.
	  if (InVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();

	  SDValue SrcVec = InVal->getOperand(0);
	  EVT SrcVT = SrcVec.getValueType();
	  auto *IdxC = dyn_cast<ConstantSDNode>(InVal->getOperand(1));
	  if (!IdxC)
	    return SDValue();

	  // Mask that places the extracted lane in element 0 and leaves every other
	  // lane undefined.
	  SmallVector<int, 8> LaneMask(SrcVT.getVectorNumElements(), -1);
	  int Elt = IdxC->getZExtValue();
	  LaneMask[0] = Elt;

	  // The node carries an implicit truncate: make it explicit when the result
	  // scalar type is legal.
	  EVT ResEltVT = VT.getScalarType();
	  if (ResEltVT != InVal.getValueType() &&
	      InVal.getValueType().isScalarInteger() && isTypeLegal(ResEltVT)) {
	    SDValue Narrowed =
	        DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), ResEltVT, InVal);
	    return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Narrowed);
	  }

	  // The shuffle form needs matching element types, a result no wider (in
	  // elements) than the input, and a legal mask.
	  // NOTE(review): the mask has SrcVT's element count but legality is queried
	  // against VT - confirm this is intended.
	  if (ResEltVT != SrcVT.getScalarType() ||
	      VT.getVectorNumElements() > SrcVT.getVectorNumElements() ||
	      !TLI.isShuffleMaskLegal(LaneMask, VT))
	    return SDValue();

	  SDValue Shuf = DAG.getVectorShuffle(SrcVT, SDLoc(N), SrcVec,
	                                      DAG.getUNDEF(SrcVT), LaneMask);
	  // Same type: the shuffle already is the result.
	  if (VT == SrcVT)
	    return Shuf;

	  // Same element count but different type: nothing further is attempted.
	  if (VT.getVectorNumElements() == SrcVT.getVectorNumElements())
	    return SDValue();

	  // Fewer elements: take the leading subvector of the shuffle.
	  MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
	  SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy);
	  EVT SubVT = EVT::getVectorVT(*DAG.getContext(),
	                               SrcVT.getVectorElementType(),
	                               VT.getVectorNumElements());
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, Shuf, ZeroIdx);
	}

	/// Combine an INSERT_SUBVECTOR node (N0 = destination vector, N1 = inserted
	/// subvector, N2 = insertion index). The folds below are tried in order;
	/// an empty SDValue is returned when none applies.
	SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	SDValue N2 = N->getOperand(2);

	// If inserting an UNDEF, just return the original vector.
	if (N1.isUndef())
	return N0;

	// For nested INSERT_SUBVECTORs, attempt to combine inner node first to allow
	// us to pull BITCASTs from input to output.
	if (N0.hasOneUse() && N0->getOpcode() == ISD::INSERT_SUBVECTOR)
	if (SDValue NN0 = visitINSERT_SUBVECTOR(N0.getNode()))
	return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, NN0, N1, N2);

	// If this is an insert of an extracted vector into an undef vector, we can
	// just use the input to the extract.
	if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
	return N1.getOperand(0);

	// If we are inserting a bitcast value into an undef, with the same
	// number of elements, just use the bitcast input of the extract.
	// i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 ->
	// BITCAST (INSERT_SUBVECTOR UNDEF N1 N2)
	// The '-'/'+' lines below are patch markers from the enclosing diff: the
	// change adds a requirement that the extract source also has the same total
	// size in bits as VT.
	if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST &&
	N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	N1.getOperand(0).getOperand(1) == N2 &&
	N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() ==
	- VT.getVectorNumElements()) {
	+ VT.getVectorNumElements() &&
	+ N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() ==
	+ VT.getSizeInBits()) {
	return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
	}

	// If both N0 and N1 are bitcast values on which insert_subvector
	// would make sense, pull the bitcast through.
	// i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 ->
	// BITCAST (INSERT_SUBVECTOR N0 N1 N2)
	if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
	SDValue CN0 = N0.getOperand(0);
	SDValue CN1 = N1.getOperand(0);
	if (CN0.getValueType().getVectorElementType() ==
	CN1.getValueType().getVectorElementType() &&
	CN0.getValueType().getVectorNumElements() ==
	VT.getVectorNumElements()) {
	SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
	CN0.getValueType(), CN0, CN1, N2);
	return DAG.getBitcast(VT, NewINSERT);
	}
	}

	// Combine INSERT_SUBVECTORs where we are inserting to the same index.
	// INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
	// --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
	if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
	N0.getOperand(1).getValueType() == N1.getValueType() &&
	N0.getOperand(2) == N2)
	return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
	N1, N2);

	// Every fold past this point needs a constant insertion index.
	if (!isa<ConstantSDNode>(N2))
	return SDValue();

	unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue();

	// Canonicalize insert_subvector dag nodes.
	// Example:
	// (insert_subvector (insert_subvector A, Idx0), Idx1)
	// -> (insert_subvector (insert_subvector A, Idx1), Idx0)
	if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
	N1.getValueType() == N0.getOperand(1).getValueType() &&
	isa<ConstantSDNode>(N0.getOperand(2))) {
	unsigned OtherIdx = N0.getConstantOperandVal(2);
	// Only swap when this insert's index is the smaller one, so the insert
	// with the smaller index ends up as the inner node.
	if (InsIdx < OtherIdx) {
	// Swap nodes.
	SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
	N0.getOperand(0), N1, N2);
	AddToWorklist(NewOp.getNode());
	return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()),
	VT, NewOp, N0.getOperand(1), N0.getOperand(2));
	}
	}

	// If the input vector is a concatenation, and the insert replaces
	// one of the pieces, we can optimize into a single concat_vectors.
	if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
	N0.getOperand(0).getValueType() == N1.getValueType()) {
	// Factor is the element count of one concat piece; index / Factor selects
	// the piece that is replaced.
	unsigned Factor = N1.getValueType().getVectorNumElements();

	SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
	Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1;

	return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
	}

	return SDValue();
	}

	/// Combine FP_TO_FP16: a round trip through FP16_TO_FP cancels out.
	/// Returns an empty SDValue when the operand is not an FP16_TO_FP node.
	SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // fold (fp_to_fp16 (fp16_to_fp op)) -> op
	  if (Src->getOpcode() != ISD::FP16_TO_FP)
	    return SDValue();
	  return Src->getOperand(0);
	}

	/// Combine FP16_TO_FP: drop an AND with the constant 0xffff on the operand.
	/// Returns an empty SDValue when the pattern does not match.
	SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
	  SDValue Src = N->getOperand(0);

	  // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
	  if (Src->getOpcode() != ISD::AND)
	    return SDValue();

	  ConstantSDNode *MaskC = getAsNonOpaqueConstant(Src.getOperand(1));
	  if (!MaskC)
	    return SDValue();
	  if (!(MaskC->getAPIntValue() == 0xffff))
	    return SDValue();

	  return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
	                     Src.getOperand(0));
	}

	/// Try to rewrite an AND with a constant BUILD_VECTOR mask as a shuffle of
	/// the other operand with a zero vector.
	/// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
	/// vector_shuffle V, Zero, <0, 4, 2, 4>
	/// Returns an empty SDValue when no suitable shuffle can be formed.
	SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
	  assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");

	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = peekThroughBitcast(N->getOperand(1));
	  SDLoc DL(N);

	  // After operation legalization the vector shuffles may already have been
	  // custom lowered, so do nothing then.
	  if (LegalOperations)
	    return SDValue();

	  // The mask operand must be a BUILD_VECTOR.
	  if (RHS.getOpcode() != ISD::BUILD_VECTOR)
	    return SDValue();

	  EVT RVT = RHS.getValueType();
	  unsigned NumElts = RHS.getNumOperands();

	  // Build a clear mask at the given granularity: every mask element is cut
	  // into Split equal pieces and each piece must be all ones (keep the lane
	  // from LHS) or all zeros (take the lane from the zero vector).
	  auto TryClearMask = [&](int Split) -> SDValue {
	    int PieceCount = NumElts * Split;
	    int PieceBits = RVT.getScalarSizeInBits() / Split;
	    const bool BigEndian = DAG.getDataLayout().isBigEndian();

	    SmallVector<int, 8> ShufIdx;
	    for (int Piece = 0; Piece != PieceCount; ++Piece) {
	      SDValue MaskElt = RHS.getOperand(Piece / Split);
	      if (MaskElt.isUndef()) {
	        ShufIdx.push_back(-1);
	        continue;
	      }

	      // Only integer and FP constants can be inspected.
	      APInt PieceVal;
	      if (auto *IntC = dyn_cast<ConstantSDNode>(MaskElt))
	        PieceVal = IntC->getAPIntValue();
	      else if (auto *FpC = dyn_cast<ConstantFPSDNode>(MaskElt))
	        PieceVal = FpC->getValueAPF().bitcastToAPInt();
	      else
	        return SDValue();

	      // Shift the wanted piece down to bit 0; the piece order within an
	      // element depends on endianness.
	      int PosInElt = Piece % Split;
	      if (BigEndian)
	        PieceVal.lshrInPlace((Split - PosInElt - 1) * PieceBits);
	      else
	        PieceVal.lshrInPlace(PosInElt * PieceBits);

	      if (Split > 1)
	        PieceVal = PieceVal.trunc(PieceBits);

	      if (PieceVal.isAllOnesValue())
	        ShufIdx.push_back(Piece);
	      else if (PieceVal == 0)
	        ShufIdx.push_back(Piece + PieceCount);
	      else
	        return SDValue();
	    }

	    // Ask the target whether it supports this clear-mask shuffle.
	    EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), PieceBits);
	    EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, PieceCount);
	    if (!TLI.isVectorClearMaskLegal(ShufIdx, ClearVT))
	      return SDValue();

	    SDValue Zero = DAG.getConstant(0, DL, ClearVT);
	    SDValue Shuf = DAG.getVectorShuffle(ClearVT, DL,
	                                        DAG.getBitcast(ClearVT, LHS), Zero,
	                                        ShufIdx);
	    return DAG.getBitcast(VT, Shuf);
	  };

	  // The finest granularity tried is one byte, and only when the element
	  // width is a whole number of bytes.
	  int MaxSplit = 1;
	  if (RVT.getScalarSizeInBits() % 8 == 0)
	    MaxSplit = RVT.getScalarSizeInBits() / 8;

	  for (int Split = 1; Split <= MaxSplit; ++Split) {
	    // Skip splits that do not divide the element width evenly.
	    if (RVT.getScalarSizeInBits() % Split != 0)
	      continue;
	    if (SDValue S = TryClearMask(Split))
	      return S;
	  }

	  return SDValue();
	}

	/// Simplify a binary vector operation such as ADD: first try constant
	/// folding, then (once types are legal) hoist a pair of identical
	/// single-input shuffles above the operation. Returns an empty SDValue when
	/// neither applies.
	SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
	  assert(N->getValueType(0).isVector() &&
	         "SimplifyVBinOp only works on vectors!");

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  SDValue Ops[] = {LHS, RHS};

	  // See if we can constant fold the vector operation.
	  if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
	          N->getOpcode(), SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags()))
	    return Fold;

	  // Type legalization might introduce new shuffles in the DAG.
	  // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
	  // -> (shuffle (VBinOp (A, B)), Undef, Mask).
	  if (!LegalTypes)
	    return SDValue();

	  auto *LShuf = dyn_cast<ShuffleVectorSDNode>(LHS);
	  auto *RShuf = dyn_cast<ShuffleVectorSDNode>(RHS);
	  if (!LShuf || !RShuf)
	    return SDValue();

	  // Both shuffles must be single-use and have an undef second operand.
	  if (!LHS.hasOneUse() || !RHS.hasOneUse())
	    return SDValue();
	  if (!LHS.getOperand(1).isUndef() || !RHS.getOperand(1).isUndef())
	    return SDValue();

	  // The two shuffles must apply the same permutation.
	  if (!LShuf->getMask().equals(RShuf->getMask()))
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  SDValue UndefVector = LHS.getOperand(1);
	  SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
	                                 LHS.getOperand(0), RHS.getOperand(0),
	                                 N->getFlags());
	  AddUsersToWorklist(N);
	  return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector,
	                              LShuf->getMask());
	}

	/// Simplify a select whose condition N0 is a SETCC by delegating to
	/// SimplifySelectCC. A SELECT_CC result is split back into a SETCC plus a
	/// SELECT, because the caller started from a SELECT node; any other result
	/// is returned as is. Returns an empty SDValue when nothing was simplified.
	SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
	                                    SDValue N2) {
	  assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!");

	  SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
	                                 cast<CondCodeSDNode>(N0.getOperand(2))->get());

	  // Nothing was simplified.
	  if (!SCC.getNode())
	    return SDValue();

	  // Anything that is not a select_cc (e.g. fabs) is already the answer.
	  if (SCC.getOpcode() != ISD::SELECT_CC)
	    return SCC;

	  // Turn the select_cc into setcc + select.
	  SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0), N0.getValueType(),
	                              SCC.getOperand(0), SCC.getOperand(1),
	                              SCC.getOperand(4));
	  AddToWorklist(SETCC.getNode());
	  return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
	                       SCC.getOperand(2), SCC.getOperand(3));
	}

	/// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
	/// being selected between, see if we can simplify the select. Callers of this
	/// should assume that TheSelect is deleted if this returns true. As such, they
	/// should return the appropriate thing (e.g. the node) back to the top-level of
	/// the DAG combiner loop to avoid it being looked at.
	///
	/// Two simplifications are attempted:
	///  - (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x)) is replaced by the
	///    fsqrt node;
	///  - a select between two compatible loads becomes one load through a
	///    select of the two addresses.
	/// Returns false when TheSelect was left untouched.
	bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
	                                    SDValue RHS) {
	  // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
	  // The select + setcc is redundant, because fsqrt returns NaN for X < 0.
	  if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
	    if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
	      // We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
	      SDValue Sqrt = RHS;
	      // Start from a defined marker value; CC is only meaningful once Zero
	      // has been set below.
	      ISD::CondCode CC = ISD::SETCC_INVALID;
	      SDValue CmpLHS;
	      const ConstantFPSDNode *Zero = nullptr;

	      // The condition-code operand is always a CondCodeSDNode, so use the
	      // checked cast<> rather than dereferencing an unchecked dyn_cast<>.
	      if (TheSelect->getOpcode() == ISD::SELECT_CC) {
	        CC = cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
	        CmpLHS = TheSelect->getOperand(0);
	        Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
	      } else {
	        // SELECT or VSELECT
	        SDValue Cmp = TheSelect->getOperand(0);
	        if (Cmp.getOpcode() == ISD::SETCC) {
	          CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
	          CmpLHS = Cmp.getOperand(0);
	          Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
	        }
	      }
	      if (Zero && Zero->isZero() &&
	          Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT ||
	          CC == ISD::SETULT || CC == ISD::SETLT)) {
	        // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
	        CombineTo(TheSelect, Sqrt);
	        return true;
	      }
	    }
	  }
	  // Cannot simplify select with vector condition
	  if (TheSelect->getOperand(0).getValueType().isVector()) return false;

	  // If this is a select from two identical things, try to pull the operation
	  // through the select.
	  if (LHS.getOpcode() != RHS.getOpcode() ||
	      !LHS.hasOneUse() || !RHS.hasOneUse())
	    return false;

	  // If this is a load and the token chain is identical, replace the select
	  // of two loads with a load through a select of the address to load from.
	  // This triggers in things like "select bool X, 10.0, 123.0" after the FP
	  // constants have been dropped into the constant pool.
	  if (LHS.getOpcode() == ISD::LOAD) {
	    LoadSDNode *LLD = cast<LoadSDNode>(LHS);
	    LoadSDNode *RLD = cast<LoadSDNode>(RHS);

	    // Token chains must be identical.
	    if (LHS.getOperand(0) != RHS.getOperand(0) ||
	        // Do not let this transformation reduce the number of volatile loads.
	        LLD->isVolatile() || RLD->isVolatile() ||
	        // FIXME: If either is a pre/post inc/dec load,
	        // we'd need to split out the address adjustment.
	        LLD->isIndexed() || RLD->isIndexed() ||
	        // If this is an EXTLOAD, the VT's must match.
	        LLD->getMemoryVT() != RLD->getMemoryVT() ||
	        // If this is an EXTLOAD, the kind of extension must match.
	        (LLD->getExtensionType() != RLD->getExtensionType() &&
	         // The only exception is if one of the extensions is anyext.
	         LLD->getExtensionType() != ISD::EXTLOAD &&
	         RLD->getExtensionType() != ISD::EXTLOAD) ||
	        // FIXME: this discards src value information. This is
	        // over-conservative. It would be beneficial to be able to remember
	        // both potential memory locations. Since we are discarding
	        // src value info, don't do the transformation if the memory
	        // locations are not in the default address space.
	        LLD->getPointerInfo().getAddrSpace() != 0 ||
	        RLD->getPointerInfo().getAddrSpace() != 0 ||
	        !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(),
	                                      LLD->getBasePtr().getValueType()))
	      return false;

	    // Check that the select condition doesn't reach either load. If so,
	    // folding this will induce a cycle into the DAG. If not, this is safe to
	    // xform, so create a select of the addresses.
	    SDValue Addr;
	    if (TheSelect->getOpcode() == ISD::SELECT) {
	      SDNode *CondNode = TheSelect->getOperand(0).getNode();
	      if ((LLD->hasAnyUseOfValue(1) && LLD->isPredecessorOf(CondNode)) ||
	          (RLD->hasAnyUseOfValue(1) && RLD->isPredecessorOf(CondNode)))
	        return false;
	      // The loads must not depend on one another.
	      if (LLD->isPredecessorOf(RLD) ||
	          RLD->isPredecessorOf(LLD))
	        return false;
	      Addr = DAG.getSelect(SDLoc(TheSelect),
	                           LLD->getBasePtr().getValueType(),
	                           TheSelect->getOperand(0), LLD->getBasePtr(),
	                           RLD->getBasePtr());
	    } else { // Otherwise SELECT_CC
	      SDNode *CondLHS = TheSelect->getOperand(0).getNode();
	      SDNode *CondRHS = TheSelect->getOperand(1).getNode();

	      if ((LLD->hasAnyUseOfValue(1) &&
	           (LLD->isPredecessorOf(CondLHS) || LLD->isPredecessorOf(CondRHS))) ||
	          (RLD->hasAnyUseOfValue(1) &&
	           (RLD->isPredecessorOf(CondLHS) || RLD->isPredecessorOf(CondRHS))))
	        return false;

	      Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
	                         LLD->getBasePtr().getValueType(),
	                         TheSelect->getOperand(0),
	                         TheSelect->getOperand(1),
	                         LLD->getBasePtr(), RLD->getBasePtr(),
	                         TheSelect->getOperand(4));
	    }

	    SDValue Load;
	    // It is safe to replace the two loads if they have different alignments,
	    // but the new load must be the minimum (most restrictive) alignment of the
	    // inputs.
	    unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment());
	    // Start from the left load's flags and keep "invariant" and
	    // "dereferenceable" only if the right load has them as well.
	    MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
	    if (!RLD->isInvariant())
	      MMOFlags &= ~MachineMemOperand::MOInvariant;
	    if (!RLD->isDereferenceable())
	      MMOFlags &= ~MachineMemOperand::MODereferenceable;
	    if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
	      // FIXME: Discards pointer and AA info.
	      Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
	                         LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
	                         MMOFlags);
	    } else {
	      // FIXME: Discards pointer and AA info.
	      // If the left load is an anyext, use the right load's extension kind.
	      Load = DAG.getExtLoad(
	          LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
	                                                  : LLD->getExtensionType(),
	          SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
	          MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
	    }

	    // Users of the select now use the result of the load.
	    CombineTo(TheSelect, Load);

	    // Users of the old loads now use the new load's chain. We know the
	    // old-load value is dead now.
	    CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
	    CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
	    return true;
	  }

	  return false;
	}

	/// Try to fold (N0 cond N1) ? N2 : N3 into a shift of N0 followed by a
	/// bitwise 'and' with N2 (the "gzip trick"):
	///   select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A
	///   select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A
	/// Requires N3 to be the zero constant and X to be at least as wide as A.
	/// Returns an empty SDValue when the pattern does not match.
	SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
	                                            SDValue N1, SDValue N2, SDValue N3,
	                                            ISD::CondCode CC) {
	  EVT XType = N0.getValueType();
	  EVT AType = N2.getValueType();
	  if (!isNullConstant(N3) || !XType.bitsGE(AType))
	    return SDValue();

	  // Decide whether the comparison is a sign-bit test we can handle.
	  bool IsSignTest;
	  if (CC == ISD::SETGT && TLI.hasAndNot(N2)) {
	    // Testing for a positive value needs the sign mask inverted, so this
	    // form is only accepted when the target has 'and not' (free invert).
	    // (X > -1) ? A : 0
	    // (X > 0) ? X : 0 <-- This is canonical signed max.
	    IsSignTest = isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2);
	  } else if (CC == ISD::SETLT) {
	    // (X < 0) ? A : 0
	    // (X < 1) ? X : 0 <-- This is un-canonicalized signed min.
	    IsSignTest = isNullConstant(N1) || (isOneConstant(N1) && N0 == N2);
	  } else {
	    IsSignTest = false;
	  }
	  if (!IsSignTest)
	    return SDValue();

	  // Default: arithmetic shift that smears the sign bit over the whole value.
	  EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
	  unsigned ShiftOpc = ISD::SRA;
	  unsigned ShCt = XType.getSizeInBits() - 1;

	  // and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit
	  // constant: a logical shift that moves the sign bit onto A's set bit.
	  auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
	  if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) {
	    ShiftOpc = ISD::SRL;
	    ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1;
	  }

	  SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
	  SDValue Shift = DAG.getNode(ShiftOpc, DL, XType, N0, ShiftAmt);
	  AddToWorklist(Shift.getNode());

	  // Narrow the mask to A's type when X is wider.
	  if (XType.bitsGT(AType)) {
	    Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
	    AddToWorklist(Shift.getNode());
	  }

	  // For the "greater than" form the mask is inverted.
	  if (CC == ISD::SETGT)
	    Shift = DAG.getNOT(DL, Shift, AType);

	  return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
	}

	/// Simplify an expression of the form (N0 cond N1) ? N2 : N3
	/// where 'cond' is the comparison specified by CC.
	///
	/// The folds below are tried in order and the first one that matches wins.
	/// \param DL            debug location used for most of the nodes created here.
	/// \param N0,N1         operands of the comparison.
	/// \param N2,N3         value selected when the comparison is true / false.
	/// \param CC            the comparison predicate.
	/// \param NotExtCompare when set, the "select C, 1, 0 -> zext(setcc)" form is
	///                      not produced (see the power-of-two fold below).
	/// \returns the simplified value, or an empty SDValue() if nothing applied.
	SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
	SDValue N2, SDValue N3, ISD::CondCode CC,
	bool NotExtCompare) {
	// (x ? y : y) -> y.
	if (N2 == N3) return N2;

	EVT VT = N2.getValueType();
	// Constant views of N1 and N2; null when the operand is not a
	// ConstantSDNode.
	ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
	ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());

	// Determine if the condition we're dealing with is constant
	SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()),
	N0, N1, CC, DL, false);
	if (SCC.getNode()) AddToWorklist(SCC.getNode());

	if (ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode())) {
	// fold select_cc true, x, y -> x
	// fold select_cc false, x, y -> y
	return !SCCC->isNullValue() ? N2 : N3;
	}

	// Check to see if we can simplify the select into an fabs node
	if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1)) {
	// Allow either -0.0 or 0.0
	if (CFP->isZero()) {
	// select (setg[te] X, +/-0.0), X, fneg(X) -> fabs
	if ((CC == ISD::SETGE \|\| CC == ISD::SETGT) &&
	N0 == N2 && N3.getOpcode() == ISD::FNEG &&
	N2 == N3.getOperand(0))
	return DAG.getNode(ISD::FABS, DL, VT, N0);

	// select (setl[te] X, +/-0.0), fneg(X), X -> fabs
	if ((CC == ISD::SETLT \|\| CC == ISD::SETLE) &&
	N0 == N3 && N2.getOpcode() == ISD::FNEG &&
	N2.getOperand(0) == N3)
	return DAG.getNode(ISD::FABS, DL, VT, N3);
	}
	}

	// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4))"
	// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
	// in it. This is a win when the constant is not otherwise available because
	// it replaces two constant pool loads with one. We only do this if the FP
	// type is known to be legal, because if it isn't, then we are before legalize
	// types and we want the other legalization to happen first (e.g. to avoid
	// messing with soft float) and if the ConstantFP is not legal, because if
	// it is legal, we may not need to store the FP constant in a constant pool.
	if (ConstantFPSDNode *TV = dyn_cast<ConstantFPSDNode>(N2))
	if (ConstantFPSDNode *FV = dyn_cast<ConstantFPSDNode>(N3)) {
	if (TLI.isTypeLegal(N2.getValueType()) &&
	(TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) !=
	TargetLowering::Legal &&
	!TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) &&
	!TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) &&
	// If both constants have multiple uses, then we won't need to do an
	// extra load, they are likely around in registers for other users.
	(TV->hasOneUse() \|\| FV->hasOneUse())) {
	// The array is laid out as { false value, true value }, so a true
	// condition must select the element at offset EltSize (see CstOffset).
	Constant *Elts[] = {
	const_cast<ConstantFP*>(FV->getConstantFPValue()),
	const_cast<ConstantFP*>(TV->getConstantFPValue())
	};
	Type *FPTy = Elts[0]->getType();
	const DataLayout &TD = DAG.getDataLayout();

	// Create a ConstantArray of the two constants.
	Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
	SDValue CPIdx =
	DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
	TD.getPrefTypeAlignment(FPTy));
	unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();

	// Get the offsets to the 0 and 1 element of the array so that we can
	// select between them.
	SDValue Zero = DAG.getIntPtrConstant(0, DL);
	unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
	SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));

	SDValue Cond = DAG.getSetCC(DL,
	getSetCCResultType(N0.getValueType()),
	N0, N1, CC);
	AddToWorklist(Cond.getNode());
	// Byte offset into the array: EltSize when Cond is true, 0 otherwise.
	SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(),
	Cond, One, Zero);
	AddToWorklist(CstOffset.getNode());
	CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx,
	CstOffset);
	AddToWorklist(CPIdx.getNode());
	return DAG.getLoad(
	TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
	MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	Alignment);
	}
	}

	// Try the sign-bit based shift+and form of the select.
	if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
	return V;

	// fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A)
	// where y has a single bit set.
	// A plaintext description would be, we can turn the SELECT_CC into an AND
	// when the condition can be materialized as an all-ones register. Any
	// single bit-test can be materialized as an all-ones register with
	// shift-left and shift-right-arith.
	if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
	N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
	SDValue AndLHS = N0->getOperand(0);
	ConstantSDNode *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
	if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
	// Shift the tested bit over the sign bit.
	const APInt &AndMask = ConstAndRHS->getAPIntValue();
	SDValue ShlAmt =
	DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
	getShiftAmountTy(AndLHS.getValueType()));
	SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);

	// Now arithmetic right shift it all the way over, so the result is either
	// all-ones, or zero.
	SDValue ShrAmt =
	DAG.getConstant(AndMask.getBitWidth() - 1, SDLoc(Shl),
	getShiftAmountTy(Shl.getValueType()));
	SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);

	return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
	}
	}

	// fold select C, 16, 0 -> shl C, 4
	// Only done when the target represents booleans as 0/1.
	if (N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2() &&
	TLI.getBooleanContents(N0.getValueType()) ==
	TargetLowering::ZeroOrOneBooleanContent) {

	// If the caller doesn't want us to simplify this into a zext of a compare,
	// don't do it.
	if (NotExtCompare && N2C->isOne())
	return SDValue();

	// Get a SetCC of the condition
	// NOTE: Don't create a SETCC if it's not legal on this target.
	if (!LegalOperations \|\|
	TLI.isOperationLegal(ISD::SETCC, N0.getValueType())) {
	SDValue Temp, SCC;
	// cast from setcc result type to select result type
	if (LegalTypes) {
	SCC = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()),
	N0, N1, CC);
	if (N2.getValueType().bitsLT(SCC.getValueType()))
	Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2),
	N2.getValueType());
	else
	Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
	N2.getValueType(), SCC);
	} else {
	// Types are not yet required to be legal, so an i1 compare is used.
	SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
	Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
	N2.getValueType(), SCC);
	}

	AddToWorklist(SCC.getNode());
	AddToWorklist(Temp.getNode());

	// select C, 1, 0 needs no shift: the extended compare is the answer.
	if (N2C->isOne())
	return Temp;

	// shl setcc result by log2 n2c
	return DAG.getNode(
	ISD::SHL, DL, N2.getValueType(), Temp,
	DAG.getConstant(N2C->getAPIntValue().logBase2(), SDLoc(Temp),
	getShiftAmountTy(Temp.getValueType())));
	}
	}

	// Check to see if this is an integer abs.
	// select_cc setg[te] X, 0, X, -X ->
	// select_cc setgt X, -1, X, -X ->
	// select_cc setl[te] X, 0, -X, X ->
	// select_cc setlt X, 1, -X, X ->
	// Y = sra (X, size(X)-1); xor (add (X, Y), Y)
	if (N1C) {
	// SubC ends up as the left operand of a (sub C, X) feeding the select;
	// the pattern is an abs only when that constant is zero, i.e. -X.
	ConstantSDNode *SubC = nullptr;
	if (((N1C->isNullValue() && (CC == ISD::SETGT \|\| CC == ISD::SETGE)) \|\|
	(N1C->isAllOnesValue() && CC == ISD::SETGT)) &&
	N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1))
	SubC = dyn_cast<ConstantSDNode>(N3.getOperand(0));
	else if (((N1C->isNullValue() && (CC == ISD::SETLT \|\| CC == ISD::SETLE)) \|\|
	(N1C->isOne() && CC == ISD::SETLT)) &&
	N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1))
	SubC = dyn_cast<ConstantSDNode>(N2.getOperand(0));

	EVT XType = N0.getValueType();
	if (SubC && SubC->isNullValue() && XType.isInteger()) {
	// Note: this local DL (taken from N0) shadows the DL parameter for the
	// nodes built in this block.
	SDLoc DL(N0);
	SDValue Shift = DAG.getNode(ISD::SRA, DL, XType,
	N0,
	DAG.getConstant(XType.getSizeInBits() - 1, DL,
	getShiftAmountTy(N0.getValueType())));
	SDValue Add = DAG.getNode(ISD::ADD, DL,
	XType, N0, Shift);
	AddToWorklist(Shift.getNode());
	AddToWorklist(Add.getNode());
	return DAG.getNode(ISD::XOR, DL, XType, Add, Shift);
	}
	}

	// select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
	// select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
	// select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
	// select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
	// select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
	// select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
	// select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
	// select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
	if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ \|\| CC == ISD::SETNE)) {
	SDValue ValueOnZero = N2;
	SDValue Count = N3;
	// If the condition is NE instead of E, swap the operands.
	if (CC == ISD::SETNE)
	std::swap(ValueOnZero, Count);
	// Check if the value on zero is a constant equal to the bits in the type.
	if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
	if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
	// If the other operand is cttz/cttz_zero_undef of N0, and cttz is
	// legal, combine to just cttz.
	if ((Count.getOpcode() == ISD::CTTZ \|\|
	Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
	N0 == Count.getOperand(0) &&
	(!LegalOperations \|\| TLI.isOperationLegal(ISD::CTTZ, VT)))
	return DAG.getNode(ISD::CTTZ, DL, VT, N0);
	// If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
	// legal, combine to just ctlz.
	if ((Count.getOpcode() == ISD::CTLZ \|\|
	Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
	N0 == Count.getOperand(0) &&
	(!LegalOperations \|\| TLI.isOperationLegal(ISD::CTLZ, VT)))
	return DAG.getNode(ISD::CTLZ, DL, VT, N0);
	}
	}
	}

	// No simplification applied.
	return SDValue();
	}

	/// Thin forwarding wrapper around TargetLowering::SimplifySetCC.
	///
	/// Builds a DAGCombinerInfo bound to this combiner (current DAG and combine
	/// level) and hands the comparison to the target hook unchanged.
	SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
	                                   ISD::CondCode Cond, const SDLoc &DL,
	                                   bool foldBooleans) {
	  TargetLowering::DAGCombinerInfo CombinerInfo(DAG, Level, false, this);
	  SDValue Simplified =
	      TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, CombinerInfo, DL);
	  return Simplified;
	}

	/// Expand a signed divide by a constant into a multiply by a "magic number".
	///
	/// \p N is an ISD::SDIV node. Returns the replacement expression produced by
	/// the target hook, or an empty SDValue when the expansion is not attempted
	/// (minimum-size functions, non-constant divisor, or a zero divisor).
	/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
	SDValue DAGCombiner::BuildSDIV(SDNode *N) {
	  // A mul + shift sequence is larger than a divide, so leave the divide
	  // alone when the function is optimised for minimum size.
	  if (DAG.getMachineFunction().getFunction().optForMinSize())
	    return SDValue();

	  // The divisor must be a constant (or constant splat) and must not be zero.
	  ConstantSDNode *Divisor = isConstOrConstSplat(N->getOperand(1));
	  if (!Divisor || Divisor->isNullValue())
	    return SDValue();

	  std::vector<SDNode *> NewNodes;
	  SDValue Result = TLI.BuildSDIV(N, Divisor->getAPIntValue(), DAG,
	                                 LegalOperations, &NewNodes);

	  // Every node the target created is queued for combining as well.
	  for (SDNode *Created : NewNodes)
	    AddToWorklist(Created);
	  return Result;
	}

	/// Expand a signed divide by a constant power of two into right shifts.
	///
	/// \p N is an ISD::SDIV node. Returns the expression built by the target
	/// hook, or an empty SDValue when the divisor is not a constant (or constant
	/// splat) or is zero.
	SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
	  // The divisor must be a constant (or constant splat) and must not be zero.
	  ConstantSDNode *Divisor = isConstOrConstSplat(N->getOperand(1));
	  if (!Divisor || Divisor->isNullValue())
	    return SDValue();

	  std::vector<SDNode *> NewNodes;
	  SDValue Result =
	      TLI.BuildSDIVPow2(N, Divisor->getAPIntValue(), DAG, &NewNodes);

	  // Every node the target created is queued for combining as well.
	  for (SDNode *Created : NewNodes)
	    AddToWorklist(Created);
	  return Result;
	}

	/// Expand an unsigned divide by a constant into a multiply by a "magic
	/// number".
	///
	/// \p N is an ISD::UDIV node. Returns the replacement expression produced by
	/// the target hook, or an empty SDValue when the expansion is not attempted
	/// (minimum-size functions, non-constant divisor, or a zero divisor).
	/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
	SDValue DAGCombiner::BuildUDIV(SDNode *N) {
	  // A mul + shift sequence is larger than a divide, so leave the divide
	  // alone when the function is optimised for minimum size.
	  if (DAG.getMachineFunction().getFunction().optForMinSize())
	    return SDValue();

	  // The divisor must be a constant (or constant splat) and must not be zero.
	  ConstantSDNode *Divisor = isConstOrConstSplat(N->getOperand(1));
	  if (!Divisor || Divisor->isNullValue())
	    return SDValue();

	  std::vector<SDNode *> NewNodes;
	  SDValue Result = TLI.BuildUDIV(N, Divisor->getAPIntValue(), DAG,
	                                 LegalOperations, &NewNodes);

	  // Every node the target created is queued for combining as well.
	  for (SDNode *Created : NewNodes)
	    AddToWorklist(Created);
	  return Result;
	}

	/// Build the log base 2 of a non-null value \p V using
	///   LogBase2(V) = (EltBits - 1) - ctlz(V)
	/// where EltBits is the scalar bit width of V's type.
	SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
	  EVT Ty = V.getValueType();
	  SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, DL, Ty, V);
	  // Index of the most significant bit position in one element.
	  SDValue TopBitIndex = DAG.getConstant(Ty.getScalarSizeInBits() - 1, DL, Ty);
	  return DAG.getNode(ISD::SUB, DL, Ty, TopBitIndex, LeadingZeros);
	}

	/// Build a refined estimate of 1/Op.
	///
	/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
	/// For the reciprocal, we need to find the zero of the function:
	/// F(X) = A X - 1 [which has a zero at X = 1/A]
	/// =>
	/// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
	/// does not require additional intermediate precision]
	///
	/// Returns an empty SDValue once the DAG has been legalized, for element
	/// types other than f32/f64, when estimates are disabled, or when the target
	/// supplies no initial estimate.
	SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) {
	  if (Level >= AfterLegalizeDAG)
	    return SDValue();

	  // TODO: Handle half and/or extended types?
	  EVT VT = Op.getValueType();
	  EVT EltVT = VT.getScalarType();
	  if (EltVT != MVT::f32 && EltVT != MVT::f64)
	    return SDValue();

	  // If estimates are explicitly disabled for this function, we're done.
	  MachineFunction &MF = DAG.getMachineFunction();
	  int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF);
	  if (Enabled == TLI.ReciprocalEstimate::Disabled)
	    return SDValue();

	  // Estimates may be explicitly enabled for this type with a custom number of
	  // refinement steps.
	  int Iterations = TLI.getDivRefinementSteps(VT, MF);
	  SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations);
	  if (!Est)
	    return SDValue();
	  AddToWorklist(Est.getNode());

	  // No refinement requested: the raw estimate is the answer.
	  if (Iterations == 0)
	    return Est;

	  SDLoc DL(Op);
	  SDValue One = DAG.getConstantFP(1.0, DL, VT);

	  // Newton iterations: Est = Est + Est (1 - Arg * Est)
	  for (int Step = 0; Step < Iterations; ++Step) {
	    SDValue ArgTimesEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags);
	    AddToWorklist(ArgTimesEst.getNode());

	    SDValue Residual = DAG.getNode(ISD::FSUB, DL, VT, One, ArgTimesEst, Flags);
	    AddToWorklist(Residual.getNode());

	    SDValue Correction = DAG.getNode(ISD::FMUL, DL, VT, Est, Residual, Flags);
	    AddToWorklist(Correction.getNode());

	    Est = DAG.getNode(ISD::FADD, DL, VT, Est, Correction, Flags);
	    AddToWorklist(Est.getNode());
	  }
	  return Est;
	}

	/// Refine a reciprocal-square-root estimate using a single FP constant (1.5).
	///
	/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
	/// For the reciprocal sqrt, we need to find the zero of the function:
	/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
	/// =>
	/// X_{i+1} = X_i (1.5 - A X_i^2 / 2)
	/// As a result, we precompute A/2 prior to the iteration loop.
	///
	/// \p Arg is A, \p Est the starting estimate and \p Iterations the number of
	/// refinement steps. When \p Reciprocal is false the refined value is
	/// multiplied by \p Arg before being returned.
	SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
	                                         unsigned Iterations,
	                                         SDNodeFlags Flags, bool Reciprocal) {
	  EVT VT = Arg.getValueType();
	  SDLoc DL(Arg);
	  SDValue OnePointFive = DAG.getConstantFP(1.5, DL, VT);

	  // 0.5 * Arg is formed as (1.5 * Arg - Arg) so that the whole sequence needs
	  // only one FP constant.
	  SDValue ScaledArg = DAG.getNode(ISD::FMUL, DL, VT, OnePointFive, Arg, Flags);
	  AddToWorklist(ScaledArg.getNode());

	  SDValue HalfArg = DAG.getNode(ISD::FSUB, DL, VT, ScaledArg, Arg, Flags);
	  AddToWorklist(HalfArg.getNode());

	  // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
	  for (unsigned Step = 0; Step != Iterations; ++Step) {
	    SDValue EstSquared = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
	    AddToWorklist(EstSquared.getNode());

	    SDValue HalfArgEstSq =
	        DAG.getNode(ISD::FMUL, DL, VT, HalfArg, EstSquared, Flags);
	    AddToWorklist(HalfArgEstSq.getNode());

	    SDValue Factor =
	        DAG.getNode(ISD::FSUB, DL, VT, OnePointFive, HalfArgEstSq, Flags);
	    AddToWorklist(Factor.getNode());

	    Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Factor, Flags);
	    AddToWorklist(Est.getNode());
	  }

	  // A reciprocal request is already satisfied by the refined estimate.
	  if (Reciprocal)
	    return Est;

	  // Otherwise a plain square root was requested: multiply the result by Arg.
	  Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
	  AddToWorklist(Est.getNode());
	  return Est;
	}

	/// Refine a reciprocal-square-root estimate using two FP constants (-3.0 and
	/// -0.5).
	///
	/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
	/// For the reciprocal sqrt, we need to find the zero of the function:
	/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
	/// =>
	/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
	///
	/// \p Arg is A, \p Est the starting estimate. \p Iterations must be non-zero.
	/// When \p Reciprocal is false the final step is altered so that the result
	/// is the square root instead of its reciprocal.
	SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
	                                         unsigned Iterations,
	                                         SDNodeFlags Flags, bool Reciprocal) {
	  EVT VT = Arg.getValueType();
	  SDLoc DL(Arg);
	  SDValue NegThree = DAG.getConstantFP(-3.0, DL, VT);
	  SDValue NegHalf = DAG.getConstantFP(-0.5, DL, VT);

	  // This routine must enter the loop below to work correctly
	  // when (Reciprocal == false).
	  assert(Iterations > 0);

	  // Newton iterations for reciprocal square root:
	  // E = (E * -0.5) * ((A * E) * E + -3.0)
	  for (unsigned Step = 0; Step < Iterations; ++Step) {
	    SDValue ArgEst = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
	    AddToWorklist(ArgEst.getNode());

	    SDValue ArgEstEst = DAG.getNode(ISD::FMUL, DL, VT, ArgEst, Est, Flags);
	    AddToWorklist(ArgEstEst.getNode());

	    SDValue Right = DAG.getNode(ISD::FADD, DL, VT, ArgEstEst, NegThree, Flags);
	    AddToWorklist(Right.getNode());

	    // When calculating a square root, the last iteration builds
	    //   S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
	    // reusing the (A * E) subexpression; every other step (and every step
	    // of a reciprocal request) uses (E * -0.5) on the left.
	    const bool FinalSqrtStep = !Reciprocal && (Step + 1) >= Iterations;
	    SDValue Left = DAG.getNode(ISD::FMUL, DL, VT,
	                               FinalSqrtStep ? ArgEst : Est, NegHalf, Flags);
	    AddToWorklist(Left.getNode());

	    Est = DAG.getNode(ISD::FMUL, DL, VT, Left, Right, Flags);
	    AddToWorklist(Est.getNode());
	  }

	  return Est;
	}

	/// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
	/// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
	/// Op can be zero.
	///
	/// Returns an empty SDValue once the DAG has been legalized, for element
	/// types other than f32/f64, when estimates are disabled, or when the target
	/// supplies no initial estimate.
	SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
	                                           bool Reciprocal) {
	  if (Level >= AfterLegalizeDAG)
	    return SDValue();

	  // TODO: Handle half and/or extended types?
	  EVT VT = Op.getValueType();
	  EVT EltVT = VT.getScalarType();
	  if (EltVT != MVT::f32 && EltVT != MVT::f64)
	    return SDValue();

	  // If estimates are explicitly disabled for this function, we're done.
	  MachineFunction &MF = DAG.getMachineFunction();
	  int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF);
	  if (Enabled == TLI.ReciprocalEstimate::Disabled)
	    return SDValue();

	  // Estimates may be explicitly enabled for this type with a custom number of
	  // refinement steps.
	  int Iterations = TLI.getSqrtRefinementSteps(VT, MF);

	  bool UseOneConstNR = false;
	  SDValue Est = TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations,
	                                    UseOneConstNR, Reciprocal);
	  if (!Est)
	    return SDValue();
	  AddToWorklist(Est.getNode());

	  // Without refinement steps the raw estimate is returned untouched.
	  if (Iterations == 0)
	    return Est;

	  if (UseOneConstNR)
	    Est = buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal);
	  else
	    Est = buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);

	  if (Reciprocal)
	    return Est;

	  // Unfortunately, Est is now NaN if the input was exactly 0.0.
	  // Select out this case and force the answer to 0.0.
	  SDLoc DL(Op);
	  SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
	  EVT CmpVT = getSetCCResultType(VT);
	  SDValue IsZero = DAG.getSetCC(DL, CmpVT, Op, Zero, ISD::SETEQ);
	  AddToWorklist(IsZero.getNode());

	  const unsigned SelectOpc = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
	  Est = DAG.getNode(SelectOpc, DL, VT, IsZero, Zero, Est);
	  AddToWorklist(Est.getNode());
	  return Est;
	}

	/// Build an estimate of rsqrt(Op); forwards to buildSqrtEstimateImpl with the
	/// reciprocal form requested.
	SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
	  const bool WantReciprocal = true;
	  return buildSqrtEstimateImpl(Op, Flags, WantReciprocal);
	}

	/// Build an estimate of sqrt(Op); forwards to buildSqrtEstimateImpl with the
	/// non-reciprocal form requested.
	SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
	  const bool WantReciprocal = false;
	  return buildSqrtEstimateImpl(Op, Flags, WantReciprocal);
	}

	/// Return true if there is any possibility that the two addresses overlap.
	bool DAGCombiner::isAlias(LSBaseSDNode Op0, LSBaseSDNode Op1) const {
	// If they are the same then they must be aliases.
	if (Op0->getBasePtr() == Op1->getBasePtr()) return true;

	// If they are both volatile then they cannot be reordered.
	if (Op0->isVolatile() && Op1->isVolatile()) return true;

	// If one operation reads from invariant memory, and the other may store, they
	// cannot alias. These should really be checking the equivalent of mayWrite,
	// but it only matters for memory nodes other than load /store.
	if (Op0->isInvariant() && Op1->writeMem())
	return false;

	if (Op1->isInvariant() && Op0->writeMem())
	return false;

	unsigned NumBytes0 = Op0->getMemoryVT().getStoreSize();
	unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize();

	// Check for BaseIndexOffset matching.
	BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0, DAG);
	BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1, DAG);
	int64_t PtrDiff;
	if (BasePtr0.getBase().getNode() && BasePtr1.getBase().getNode()) {
	if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
	return !((NumBytes0 <= PtrDiff) \|\| (PtrDiff + NumBytes1 <= 0));

	// If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
	// able to calculate their relative offset if at least one arises
	// from an alloca. However, these allocas cannot overlap and we
	// can infer there is no alias.
	if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
	if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	// If the base are the same frame index but the we couldn't find a
	// constant offset, (indices are different) be conservative.
	if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) \|\|
	!MFI.isFixedObjectIndex(B->getIndex())))
	return false;
	}

	bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
	bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
	bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
	bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
	bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
	bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());

	// If of mismatched base types or checkable indices we can check
	// they do not alias.
	if ((BasePtr0.getIndex() == BasePtr1.getIndex() \|\| (IsFI0 != IsFI1) \|\|
	(IsGV0 != IsGV1) \|\| (IsCV0 != IsCV1)) &&
	(IsFI0 \|\| IsGV0 \|\| IsCV0) && (IsFI1 \|\| IsGV1 \|\| IsCV1))
	return false;
	}

	// If we know required SrcValue1 and SrcValue2 have relatively large
	// alignment compared to the size and offset of the access, we may be able
	// to prove they do not alias. This check is conservative for now to catch
	// cases created by splitting vector types.
	int64_t SrcValOffset0 = Op0->getSrcValueOffset();
	int64_t SrcValOffset1 = Op1->getSrcValueOffset();
	unsigned OrigAlignment0 = Op0->getOriginalAlignment();
	unsigned OrigAlignment1 = Op1->getOriginalAlignment();
	if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
	NumBytes0 == NumBytes1 && OrigAlignment0 > NumBytes0) {
	int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0;
	int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1;

	// There is no overlap between these relatively aligned accesses of
	// similar size. Return no alias.
	if ((OffAlign0 + NumBytes0) <= OffAlign1 \|\|
	(OffAlign1 + NumBytes1) <= OffAlign0)
	return false;
	}

	bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0
	? CombinerGlobalAA
	: DAG.getSubtarget().useAA();
	#ifndef NDEBUG
	if (CombinerAAOnlyFunc.getNumOccurrences() &&
	CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
	UseAA = false;
	#endif

	if (UseAA && AA &&
	Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) {
	// Use alias analysis information.
	int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
	int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset;
	int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset;
	AliasResult AAResult =
	AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0,
	UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
	MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1,
	UseTBAA ? Op1->getAAInfo() : AAMDNodes()) );
	if (AAResult == NoAlias)
	return false;
	}

	// Otherwise we have to assume they alias.
	return true;
	}

	/// Walk up the chain from \p OriginalChain, stepping over memory nodes that
	/// do not alias \p N, and append every chain value that must stay ordered
	/// before \p N to \p Aliases.
	///
	/// If the walk exceeds the target's depth limit, \p Aliases is reset to just
	/// \p OriginalChain.
	void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
	                                   SmallVectorImpl<SDValue> &Aliases) {
	  SmallVector<SDValue, 8> Worklist;  // Chains still to be examined.
	  SmallPtrSet<SDNode *, 16> Seen;    // Nodes already examined.

	  // Two non-volatile loads never need ordering against each other.
	  const bool NIsPlainLoad =
	      isa<LoadSDNode>(N) && !cast<LSBaseSDNode>(N)->isVolatile();

	  unsigned Depth = 0;
	  Worklist.push_back(OriginalChain);

	  // Examine each pending chain: record it when it aliases, otherwise keep
	  // climbing towards its own chain operand(s).
	  while (!Worklist.empty()) {
	    SDValue Cur = Worklist.pop_back_val();

	    // For TokenFactor nodes, look at each operand and only continue up the
	    // chain until we reach the depth limit.
	    //
	    // FIXME: The depth check could be made to return the last non-aliasing
	    // chain we found before we hit a tokenfactor rather than the original
	    // chain.
	    if (Depth > TLI.getGatherAllAliasesMaxDepth()) {
	      Aliases.clear();
	      Aliases.push_back(OriginalChain);
	      return;
	    }

	    // Skip nodes that were already handled.
	    if (!Seen.insert(Cur.getNode()).second)
	      continue;

	    switch (Cur.getOpcode()) {
	    case ISD::EntryToken:
	      // Entry token is ideal chain operand, but handled in FindBetterChain.
	      break;

	    case ISD::LOAD:
	    case ISD::STORE: {
	      SDNode *MemNode = Cur.getNode();
	      const bool BothPlainLoads =
	          NIsPlainLoad && isa<LoadSDNode>(MemNode) &&
	          !cast<LSBaseSDNode>(MemNode)->isVolatile();

	      if (!BothPlainLoads &&
	          isAlias(cast<LSBaseSDNode>(N), cast<LSBaseSDNode>(MemNode))) {
	        // This node must stay ordered before N: stop here.
	        Aliases.push_back(Cur);
	      } else {
	        // Independent access: look further up the chain.
	        Worklist.push_back(Cur.getOperand(0));
	        ++Depth;
	      }
	      break;
	    }

	    case ISD::TokenFactor: {
	      // We have to check each of the operands of the token factor for "small"
	      // token factors, so we queue them up. Adding the operands to the queue
	      // (stack) in reverse order maintains the original order and increases
	      // the likelihood that getNode will find a matching token factor (CSE.)
	      const unsigned NumOps = Cur.getNumOperands();
	      if (NumOps > 16) {
	        Aliases.push_back(Cur);
	        break;
	      }
	      for (unsigned Idx = NumOps; Idx != 0; --Idx)
	        Worklist.push_back(Cur.getOperand(Idx - 1));
	      ++Depth;
	      break;
	    }

	    case ISD::CopyFromReg:
	      // Forward past CopyFromReg.
	      Worklist.push_back(Cur.getOperand(0));
	      ++Depth;
	      break;

	    default:
	      // For all other instructions we will just have to take what we can get.
	      Aliases.push_back(Cur);
	      break;
	    }
	  }
	}

	/// Walk up the chain from \p OldChain, skipping memory nodes that do not
	/// alias \p N, and return the chain \p N actually has to depend on.
	///
	/// With optimisation disabled \p OldChain is returned unchanged.
	SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
	  if (OptLevel == CodeGenOpt::None)
	    return OldChain;

	  // Collect every chain value that N must remain ordered after.
	  SmallVector<SDValue, 8> Deps;
	  GatherAllAliases(N, OldChain, Deps);

	  switch (Deps.size()) {
	  case 0:
	    // Nothing aliases: chain straight to the entry token.
	    return DAG.getEntryNode();
	  case 1:
	    // A single dependency is used directly. We don't need to revisit it.
	    return Deps[0];
	  default:
	    // Several dependencies: join them with a tailored token factor.
	    return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Deps);
	  }
	}

	// Improve the chains of a whole run of neighbouring stores in one go.
	//
	// This might seem redundant, as this function gets called when visiting
	// every store node, so why not let the work be done on each store as it's
	// visited?
	//
	// I believe this is mainly important because MergeConsecutiveStores
	// is unable to deal with merging stores of different sizes, so unless
	// we improve the chains of all the potential candidates up-front
	// before running MergeConsecutiveStores, it might only see some of
	// the nodes that will eventually be candidates, and then not be able
	// to go from a partially-merged state to the desired final
	// fully-merged state.
	//
	// Returns true when the chain of St itself was replaced.
	bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
	  if (OptLevel == CodeGenOpt::None)
	    return false;

	  // Base pointer, index and byte offset of the store we start from.
	  BaseIndexOffset StAddr = BaseIndexOffset::match(St, DAG);

	  // We must have a base and an offset.
	  if (!StAddr.getBase().getNode())
	    return false;

	  // Do not handle stores to undef base pointers.
	  if (StAddr.getBase().isUndef())
	    return false;

	  SmallVector<StoreSDNode *, 8> Candidates;
	  Candidates.push_back(St);

	  // Walk up the chain and look for nodes with offsets from the same
	  // base pointer. Stop when reaching an instruction with a different kind
	  // or instruction which has a different base pointer.
	  for (StoreSDNode *Cur = St; Cur;) {
	    // If the chain has more than one use, then we can't reorder the mem ops.
	    if (Cur != St && !SDValue(Cur, 0)->hasOneUse())
	      break;

	    if (Cur->isVolatile() || Cur->isIndexed())
	      break;

	    // The store must address the same base/index as the original one.
	    BaseIndexOffset CurAddr = BaseIndexOffset::match(Cur, DAG);
	    if (!StAddr.equalBaseIndex(CurAddr, DAG))
	      break;

	    // Step over any loads sitting between this store and the next node up
	    // the chain.
	    SDNode *Up = Cur->getChain().getNode();
	    while (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Up))
	      Up = Ld->getChain().getNode();

	    // Only a plain (non-volatile, non-indexed) store continues the walk;
	    // any other kind of node ends it.
	    StoreSDNode *Prev = dyn_cast<StoreSDNode>(Up);
	    if (!Prev || Prev->isVolatile() || Prev->isIndexed())
	      break;

	    Candidates.push_back(Prev);
	    Cur = Prev;
	  }

	  // At this point, Candidates lists all of the Store nodes
	  // reachable by iterating up through chain nodes matching the above
	  // conditions. For each such store identified, try to find an
	  // earlier chain to attach the store to which won't violate the
	  // required ordering.
	  bool StChanged = false;
	  SmallVector<std::pair<StoreSDNode *, SDValue>, 8> Pending;

	  for (StoreSDNode *Store : Candidates) {
	    SDValue OldChain = Store->getChain();
	    SDValue NewChain = FindBetterChain(Store, OldChain);
	    if (OldChain == NewChain)
	      continue;

	    if (Store == St)
	      StChanged = true;
	    Pending.push_back(std::make_pair(Store, NewChain));
	  }

	  // Do all replacements after finding the replacements to make to avoid making
	  // the chains more complicated by introducing new TokenFactors.
	  for (const auto &Entry : Pending)
	    replaceStoreChain(Entry.first, Entry.second);

	  return StChanged;
	}

	/// Entry point of this file: run the DAG combiner over this SelectionDAG at
	/// the given combine level.
	void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
	                           CodeGenOpt::Level OptLevel) {
	  // All of the work happens in DAGCombiner::Run.
	  DAGCombiner Combiner(*this, AA, OptLevel);
	  Combiner.Run(Level);
	}
	Index: head/contrib/llvm/lib/CodeGen/SplitKit.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/SplitKit.cpp (revision 329409)
	+++ head/contrib/llvm/lib/CodeGen/SplitKit.cpp (revision 329410)
	@@ -1,1816 +1,1849 @@
	//===- SplitKit.cpp - Toolkit for splitting live ranges -------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the SplitAnalysis class as well as mutator functions for
	// live range splitting.
	//
	//===----------------------------------------------------------------------===//

	#include "SplitKit.h"
	#include "LiveRangeCalc.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/LiveInterval.h"
	#include "llvm/CodeGen/LiveIntervals.h"
	#include "llvm/CodeGen/LiveRangeEdit.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
	#include "llvm/CodeGen/MachineDominators.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineLoopInfo.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/SlotIndexes.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"
	#include "llvm/CodeGen/TargetOpcodes.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/CodeGen/VirtRegMap.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/MC/LaneBitmask.h"
	#include "llvm/Support/Allocator.h"
	#include "llvm/Support/BlockFrequency.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <iterator>
	#include <limits>
	#include <tuple>
	#include <utility>

	using namespace llvm;

	#define DEBUG_TYPE "regalloc"

	// Statistics counters used in this file. Each STATISTIC pairs a counter name
	// with the human-readable description given next to it.
	STATISTIC(NumFinished, "Number of splits finished");
	STATISTIC(NumSimple, "Number of splits that were simple");
	STATISTIC(NumCopies, "Number of copies inserted for splitting");
	STATISTIC(NumRemats, "Number of rematerialized defs for splitting");
	STATISTIC(NumRepairs, "Number of invalid live ranges repaired");

	//===----------------------------------------------------------------------===//
	// Last Insert Point Analysis
	//===----------------------------------------------------------------------===//

	/// Bind the analysis to \p lis and initialize the LastInsertPoint cache with
	/// \p BBNum. The cache is indexed by basic block number when insert points
	/// are computed.
	InsertPointAnalysis::InsertPointAnalysis(const LiveIntervals &lis,
	unsigned BBNum)
	: LIS(lis), LastInsertPoint(BBNum) {}

	/// Compute the last point in \p MBB where code for \p CurLI may be inserted.
	///
	/// A pair of slot indexes is cached per block number in LastInsertPoint:
	///  - first:  the index of the first terminator, or the block end index when
	///            the block has no terminator;
	///  - second: only set when the block has an EH pad successor; the index of
	///            the last call in the block, or the same value as first when no
	///            call is found.
	///
	/// \returns the cached second index when \p CurLI is live into an EH pad
	/// successor and the value leaving \p MBB is not defined at or after that
	/// call inside the block; otherwise the cached first index.
	SlotIndex
	InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI,
	const MachineBasicBlock &MBB) {
	unsigned Num = MBB.getNumber();
	std::pair<SlotIndex, SlotIndex> &LIP = LastInsertPoint[Num];
	SlotIndex MBBEnd = LIS.getMBBEndIdx(&MBB);

	// Collect the successors that are exception handling pads.
	SmallVector<const MachineBasicBlock *, 1> EHPadSuccessors;
	for (const MachineBasicBlock *SMBB : MBB.successors())
	if (SMBB->isEHPad())
	EHPadSuccessors.push_back(SMBB);

	// Compute insert points on the first call. The pair is independent of the
	// current live interval.
	if (!LIP.first.isValid()) {
	MachineBasicBlock::const_iterator FirstTerm = MBB.getFirstTerminator();
	if (FirstTerm == MBB.end())
	LIP.first = MBBEnd;
	else
	LIP.first = LIS.getInstructionIndex(*FirstTerm);

	// If there is a landing pad successor, also find the call instruction.
	if (EHPadSuccessors.empty())
	return LIP.first;
	// There may not be a call instruction (?) in which case we ignore LPad.
	LIP.second = LIP.first;
	// Scan backwards from the end of the block for the last call.
	for (MachineBasicBlock::const_iterator I = MBB.end(), E = MBB.begin();
	I != E;) {
	--I;
	if (I->isCall()) {
	LIP.second = LIS.getInstructionIndex(*I);
	break;
	}
	}
	}

	// If CurLI is live into a landing pad successor, move the last insert point
	// back to the call that may throw.
	// An unset second index means no EH pad successor was recorded for MBB.
	if (!LIP.second)
	return LIP.first;

	if (none_of(EHPadSuccessors, [&](const MachineBasicBlock *EHPad) {
	return LIS.isLiveInToMBB(CurLI, EHPad);
	}))
	return LIP.first;

	// Find the value leaving MBB.
	const VNInfo *VNI = CurLI.getVNInfoBefore(MBBEnd);
	if (!VNI)
	return LIP.first;

	// If the value leaving MBB was defined after the call in MBB, it can't
	// really be live-in to the landing pad. This can happen if the landing pad
	// has a PHI, and this register is undef on the exceptional edge.
	// <rdar://problem/10664933>
	if (!SlotIndex::isEarlierInstr(VNI->def, LIP.second) && VNI->def < MBBEnd)
	return LIP.first;

	// Value is properly live-in to the landing pad.
	// Only allow inserts before the call.
	return LIP.second;
	}

	/// Iterator form of getLastInsertPoint(): yields MBB.end() when the last
	/// insert point is the end of \p MBB, otherwise the instruction found at the
	/// insert point's slot index.
	MachineBasicBlock::iterator
	InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI,
	                                            MachineBasicBlock &MBB) {
	  SlotIndex InsertIdx = getLastInsertPoint(CurLI, MBB);
	  SlotIndex BlockEndIdx = LIS.getMBBEndIdx(&MBB);
	  if (InsertIdx != BlockEndIdx)
	    return LIS.getInstructionFromIndex(InsertIdx);
	  return MBB.end();
	}

	//===----------------------------------------------------------------------===//
	// Split Analysis
	//===----------------------------------------------------------------------===//

	/// Set up the analysis from \p vrm, \p lis and \p mli. The machine function
	/// is taken from \p vrm, the instruction info from that function's
	/// subtarget, and the insert point analysis is constructed with the
	/// function's number of block IDs.
	SplitAnalysis::SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis,
	const MachineLoopInfo &mli)
	: MF(vrm.getMachineFunction()), VRM(vrm), LIS(lis), Loops(mli),
	TII(*MF.getSubtarget().getInstrInfo()), IPA(lis, MF.getNumBlockIDs()) {}

	/// Drop all per-interval analysis state so another interval can be analyzed.
	void SplitAnalysis::clear() {
	  // Forget the interval under analysis and whether it had to be repaired.
	  CurLI = nullptr;
	  DidRepairRange = false;

	  // Empty the containers filled while analyzing uses and live blocks.
	  ThroughBlocks.clear();
	  UseBlocks.clear();
	  UseSlots.clear();
	}

	/// analyzeUses - Count instructions, basic blocks, and loops using CurLI.
	///
	/// Fills UseSlots with one sorted slot per instruction that defines or uses
	/// CurLI, then computes the per-block information. If that computation
	/// reports an inconsistent live range, the interval is shrunk to its uses
	/// and the computation is repeated once.
	void SplitAnalysis::analyzeUses() {
	assert(UseSlots.empty() && "Call clear first");

	// First get all the defs from the interval values. This provides the correct
	// slots for early clobbers.
	for (const VNInfo *VNI : CurLI->valnos)
	if (!VNI->isPHIDef() && !VNI->isUnused())
	UseSlots.push_back(VNI->def);

	// Get use slots from the use-def chain.
	const MachineRegisterInfo &MRI = MF.getRegInfo();
	for (MachineOperand &MO : MRI.use_nodbg_operands(CurLI->reg))
	if (!MO.isUndef())
	UseSlots.push_back(LIS.getInstructionIndex(*MO.getParent()).getRegSlot());

	array_pod_sort(UseSlots.begin(), UseSlots.end());

	// Remove duplicates, keeping the smaller slot for each instruction.
	// That is what we want for early clobbers.
	UseSlots.erase(std::unique(UseSlots.begin(), UseSlots.end(),
	SlotIndex::isSameInstr),
	UseSlots.end());

	// Compute per-live block info.
	if (!calcLiveBlockInfo()) {
	// FIXME: calcLiveBlockInfo found inconsistencies in the live range.
	// I am looking at you, RegisterCoalescer!
	DidRepairRange = true;
	++NumRepairs;
	DEBUG(dbgs() << "* Fixing inconsistent live interval! *\n");
	// LIS and CurLI are held as const here; constness is cast away to repair
	// the interval in place.
	const_cast<LiveIntervals&>(LIS)
	.shrinkToUses(const_cast<LiveInterval*>(CurLI));
	// Discard the partial results of the failed attempt before retrying.
	UseBlocks.clear();
	ThroughBlocks.clear();
	bool fixed = calcLiveBlockInfo();
	(void)fixed;
	assert(fixed && "Couldn't fix broken live interval");
	}

	DEBUG(dbgs() << "Analyze counted "
	<< UseSlots.size() << " instrs in "
	<< UseBlocks.size() << " blocks, through "
	<< NumThroughBlocks << " blocks.\n");
	}

	/// calcLiveBlockInfo - Fill the LiveBlocks array with information about blocks
	/// where CurLI is live.
	///
	/// Walks the live segments of CurLI and the sorted UseSlots in lock step,
	/// one basic block at a time. Blocks without uses are recorded in
	/// ThroughBlocks; blocks with uses get one BlockInfo entry in UseBlocks, or
	/// two entries when the live range has a gap inside the block.
	///
	/// \returns false if a live segment ends in the middle of a block that has
	/// no uses (an inconsistent live range), true otherwise.
	bool SplitAnalysis::calcLiveBlockInfo() {
	ThroughBlocks.resize(MF.getNumBlockIDs());
	NumThroughBlocks = NumGapBlocks = 0;
	if (CurLI->empty())
	return true;

	LiveInterval::const_iterator LVI = CurLI->begin();
	LiveInterval::const_iterator LVE = CurLI->end();

	SmallVectorImpl<SlotIndex>::const_iterator UseI, UseE;
	UseI = UseSlots.begin();
	UseE = UseSlots.end();

	// Loop over basic blocks where CurLI is live.
	MachineFunction::iterator MFI =
	LIS.getMBBFromIndex(LVI->start)->getIterator();
	while (true) {
	BlockInfo BI;
	BI.MBB = &*MFI;
	SlotIndex Start, Stop;
	std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);

	// If the block contains no uses, the range must be live through. At one
	// point, RegisterCoalescer could create dangling ranges that ended
	// mid-block.
	if (UseI == UseE \|\| *UseI >= Stop) {
	++NumThroughBlocks;
	ThroughBlocks.set(BI.MBB->getNumber());
	// The range shouldn't end mid-block if there are no uses. This shouldn't
	// happen.
	if (LVI->end < Stop)
	return false;
	} else {
	// This block has uses. Find the first and last uses in the block.
	BI.FirstInstr = *UseI;
	assert(BI.FirstInstr >= Start);
	// Advance UseI past every use slot that lies inside this block.
	do ++UseI;
	while (UseI != UseE && *UseI < Stop);
	BI.LastInstr = UseI[-1];
	assert(BI.LastInstr < Stop);

	// LVI is the first live segment overlapping MBB.
	BI.LiveIn = LVI->start <= Start;

	// When not live in, the first use should be a def.
	if (!BI.LiveIn) {
	assert(LVI->start == LVI->valno->def && "Dangling Segment start");
	assert(LVI->start == BI.FirstInstr && "First instr should be a def");
	BI.FirstDef = BI.FirstInstr;
	}

	// Look for gaps in the live range.
	BI.LiveOut = true;
	while (LVI->end < Stop) {
	SlotIndex LastStop = LVI->end;
	// No further segment in this block: the range is not live out and ends
	// at LastStop.
	if (++LVI == LVE \|\| LVI->start >= Stop) {
	BI.LiveOut = false;
	BI.LastInstr = LastStop;
	break;
	}

	if (LastStop < LVI->start) {
	// There is a gap in the live range. Create duplicate entries for the
	// live-in snippet and the live-out snippet.
	++NumGapBlocks;

	// Push the Live-in part.
	BI.LiveOut = false;
	UseBlocks.push_back(BI);
	UseBlocks.back().LastInstr = LastStop;

	// Set up BI for the live-out part.
	BI.LiveIn = false;
	BI.LiveOut = true;
	BI.FirstInstr = BI.FirstDef = LVI->start;
	}

	// A Segment that starts in the middle of the block must be a def.
	assert(LVI->start == LVI->valno->def && "Dangling Segment start");
	if (!BI.FirstDef)
	BI.FirstDef = LVI->start;
	}

	UseBlocks.push_back(BI);

	// LVI is now at LVE or LVI->end >= Stop.
	if (LVI == LVE)
	break;
	}

	// Live segment ends exactly at Stop. Move to the next segment.
	if (LVI->end == Stop && ++LVI == LVE)
	break;

	// Pick the next basic block.
	// If the current segment started before Stop, it continues into the next
	// block in layout order; otherwise jump to the block where it starts.
	if (LVI->start < Stop)
	++MFI;
	else
	MFI = LIS.getMBBFromIndex(LVI->start)->getIterator();
	}

	assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count");
	return true;
	}

	unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const {
	if (cli->empty())
	return 0;
	LiveInterval li = const_cast<LiveInterval>(cli);
	LiveInterval::iterator LVI = li->begin();
	LiveInterval::iterator LVE = li->end();
	unsigned Count = 0;

	// Loop over basic blocks where li is live.
	MachineFunction::const_iterator MFI =
	LIS.getMBBFromIndex(LVI->start)->getIterator();
	SlotIndex Stop = LIS.getMBBEndIdx(&*MFI);
	while (true) {
	++Count;
	LVI = li->advanceTo(LVI, Stop);
	if (LVI == LVE)
	return Count;
	do {
	++MFI;
	Stop = LIS.getMBBEndIdx(&*MFI);
	} while (Stop <= LVI->start);
	}
	}

	/// Check whether \p Idx is the start or the end of a live segment in the
	/// original (pre-split) interval that CurLI was derived from.
	bool SplitAnalysis::isOriginalEndpoint(SlotIndex Idx) const {
	  const LiveInterval &Orig = LIS.getInterval(VRM.getOriginal(CurLI->reg));
	  assert(!Orig.empty() && "Splitting empty interval?");
	  LiveInterval::const_iterator Seg = Orig.find(Idx);

	  // A segment that contains Idx qualifies only if it begins exactly there.
	  const bool ContainsIdx = Seg != Orig.end() && Seg->start <= Idx;
	  if (ContainsIdx)
	    return Seg->start == Idx;

	  // Otherwise the preceding segment, if there is one, must end at Idx.
	  if (Seg == Orig.begin())
	    return false;
	  --Seg;
	  return Seg->end == Idx;
	}

	/// Analyze \p li: clear any previous state, make \p li the current interval,
	/// and collect its uses and per-block information.
	void SplitAnalysis::analyze(const LiveInterval *li) {
	clear();
	CurLI = li;
	analyzeUses();
	}

	//===----------------------------------------------------------------------===//
	// Split Editor
	//===----------------------------------------------------------------------===//

	/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
	///
	/// The register info, instruction info and target register info are all
	/// taken from the machine function of \p vrm. RegAssign is constructed on
	/// top of the editor's Allocator member.
	SplitEditor::SplitEditor(SplitAnalysis &sa, AliasAnalysis &aa,
	LiveIntervals &lis, VirtRegMap &vrm,
	MachineDominatorTree &mdt,
	MachineBlockFrequencyInfo &mbfi)
	: SA(sa), AA(aa), LIS(lis), VRM(vrm),
	MRI(vrm.getMachineFunction().getRegInfo()), MDT(mdt),
	TII(*vrm.getMachineFunction().getSubtarget().getInstrInfo()),
	TRI(*vrm.getMachineFunction().getSubtarget().getRegisterInfo()),
	MBFI(mbfi), RegAssign(Allocator) {}

	/// Prepare the editor for a new round of splitting on \p LRE using spill
	/// mode \p SM, discarding all state left over from a previous edit.
	void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
	  // Start from a clean slate.
	  Values.clear();
	  RegAssign.clear();
	  OpenIdx = 0;
	  SpillMode = SM;
	  Edit = &LRE;

	  // Reset the LiveRangeCalc instances needed for this spill mode: the first
	  // one always, the second one only when a spill mode is active.
	  const unsigned NumCalcs = SpillMode ? 2 : 1;
	  for (unsigned CalcIdx = 0; CalcIdx != NumCalcs; ++CalcIdx)
	    LRCalc[CalcIdx].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(),
	                          &MDT, &LIS.getVNInfoAllocator());

	  // No AliasAnalysis is passed because only cheap-as-a-copy
	  // rematerializations will be performed anyway.
	  Edit->anyRematerializable(nullptr);
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	/// Print the register assignment map to the debug stream as a sequence of
	/// "[start;stop):value" entries, or " empty" when there are none.
	LLVM_DUMP_METHOD void SplitEditor::dump() const {
	  if (!RegAssign.empty()) {
	    RegAssignMap::const_iterator Entry = RegAssign.begin();
	    while (Entry.valid()) {
	      dbgs() << " [" << Entry.start() << ';' << Entry.stop()
	             << "):" << Entry.value();
	      ++Entry;
	    }
	    dbgs() << '\n';
	    return;
	  }

	  dbgs() << " empty\n";
	}
	#endif

	/// Look up the subrange of \p LI whose lane mask is exactly \p LM. Such a
	/// subrange must exist; not finding one is treated as unreachable.
	LiveInterval::SubRange &SplitEditor::getSubRangeForMask(LaneBitmask LM,
	                                                        LiveInterval &LI) {
	  for (LiveInterval::SubRange &Candidate : LI.subranges()) {
	    if (!(Candidate.LaneMask == LM))
	      continue;
	    return Candidate;
	  }
	  llvm_unreachable("SubRange for this mask not found");
	}

	/// Add a dead def for \p VNI to \p LI.
	///
	/// Without subranges the dead def is created directly on \p LI. With
	/// subranges, a dead def at VNI->def is created only in selected subranges:
	///  - \p Original true: subranges whose counterpart in the parent interval
	///    has a value defined exactly at that index;
	///  - \p Original false: subranges whose lane mask overlaps the lanes
	///    written by the defining instruction's operands for LI.reg.
	void SplitEditor::addDeadDef(LiveInterval &LI, VNInfo *VNI, bool Original) {
	if (!LI.hasSubRanges()) {
	LI.createDeadDef(VNI);
	return;
	}

	SlotIndex Def = VNI->def;
	if (Original) {
	// If we are transferring a def from the original interval, make sure
	// to only update the subranges for which the original subranges had
	// a def at this location.
	for (LiveInterval::SubRange &S : LI.subranges()) {
	auto &PS = getSubRangeForMask(S.LaneMask, Edit->getParent());
	VNInfo *PV = PS.getVNInfoAt(Def);
	if (PV != nullptr && PV->def == Def)
	S.createDeadDef(Def, LIS.getVNInfoAllocator());
	}
	} else {
	// This is a new def: either from rematerialization, or from an inserted
	// copy. Since rematerialization can regenerate a definition of a sub-
	// register, we need to check which subranges need to be updated.
	const MachineInstr *DefMI = LIS.getInstructionFromIndex(Def);
	assert(DefMI != nullptr);
	// Accumulate the lanes written by DefMI's defs of LI.reg. A def without a
	// sub-register index writes every lane, so the scan stops there.
	LaneBitmask LM;
	for (const MachineOperand &DefOp : DefMI->defs()) {
	unsigned R = DefOp.getReg();
	if (R != LI.reg)
	continue;
	if (unsigned SR = DefOp.getSubReg())
	LM \|= TRI.getSubRegIndexLaneMask(SR);
	else {
	LM = MRI.getMaxLaneMaskForVReg(R);
	break;
	}
	}
	for (LiveInterval::SubRange &S : LI.subranges())
	if ((S.LaneMask & LM).any())
	S.createDeadDef(Def, LIS.getVNInfoAllocator());
	}
	}

	/// Define a new value at \p Idx in the interval for \p RegIdx and record it
	/// in Values under the key (RegIdx, ParentVNI->id).
	///
	/// The first mapping of a key is kept as a "simple" mapping (the VNInfo
	/// pointer is stored and no liveness is added), unless the interval has
	/// subranges, in which case the mapping is forced from the start. On any
	/// later mapping of the same key the entry becomes "complex": the stored
	/// pointer is cleared and dead defs are added for both the earlier value
	/// and the new one.
	///
	/// \param Original forwarded to addDeadDef().
	/// \returns the newly created value number.
	VNInfo *SplitEditor::defValue(unsigned RegIdx,
	const VNInfo *ParentVNI,
	SlotIndex Idx,
	bool Original) {
	assert(ParentVNI && "Mapping NULL value");
	assert(Idx.isValid() && "Invalid SlotIndex");
	assert(Edit->getParent().getVNInfoAt(Idx) == ParentVNI && "Bad Parent VNI");
	LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));

	// Create a new value.
	VNInfo *VNI = LI->getNextValue(Idx, LIS.getVNInfoAllocator());

	// Intervals with subranges never use the simple mapping.
	bool Force = LI->hasSubRanges();
	ValueForcePair FP(Force ? nullptr : VNI, Force);
	// Use insert for lookup, so we can add missing values with a second lookup.
	std::pair<ValueMap::iterator, bool> InsP =
	Values.insert(std::make_pair(std::make_pair(RegIdx, ParentVNI->id), FP));

	// This was the first time (RegIdx, ParentVNI) was mapped, and it is not
	// forced. Keep it as a simple def without any liveness.
	if (!Force && InsP.second)
	return VNI;

	// If the previous value was a simple mapping, add liveness for it now.
	if (VNInfo *OldVNI = InsP.first->second.getPointer()) {
	addDeadDef(*LI, OldVNI, Original);

	// No longer a simple mapping. Switch to a complex mapping. If the
	// interval has subranges, make it a forced mapping.
	InsP.first->second = ValueForcePair(nullptr, Force);
	}

	// This is a complex mapping, add liveness for VNI
	addDeadDef(*LI, VNI, Original);
	return VNI;
	}

	/// Mark the mapping of (RegIdx, ParentVNI) in Values as forced. If the entry
	/// currently holds a simple mapping, a dead def is added for the stored
	/// value first and the entry is switched to a complex, forced mapping.
	-void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
	- assert(ParentVNI && "Mapping NULL value");
	- ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI->id)];
	+void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI) {
	+ ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI.id)];
	VNInfo *VNI = VFP.getPointer();

	// ParentVNI was either unmapped or already complex mapped. Either way, just
	// set the force bit.
	if (!VNI) {
	VFP.setInt(true);
	return;
	}

	// This was previously a single mapping. Make sure the old def is represented
	// by a trivial live range.
	addDeadDef(LIS.getInterval(Edit->get(RegIdx)), VNI, false);

	// Mark as complex mapped, forced.
	VFP = ValueForcePair(nullptr, true);
	}

	/// Insert one COPY of sub-register \p SubIdx from \p FromReg to \p ToReg
	/// before \p InsertBefore in \p MBB.
	///
	/// An invalid \p Def marks the first copy of a sequence: the instruction is
	/// entered into the slot index maps and its register slot becomes the def
	/// index. Any later copy is bundled with its predecessor and reuses \p Def.
	/// In both cases a dead def at that index is created in the subranges of
	/// \p DestLI refined by the lane mask of \p SubIdx.
	///
	/// \returns the def index shared by the copy sequence.
	SlotIndex SplitEditor::buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg,
	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
	unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) {
	const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
	bool FirstCopy = !Def.isValid();
	// The first copy defines ToReg with the undef flag; later copies carry the
	// internal-read flag instead.
	MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc)
	.addReg(ToReg, RegState::Define \| getUndefRegState(FirstCopy)
	\| getInternalReadRegState(!FirstCopy), SubIdx)
	.addReg(FromReg, 0, SubIdx);

	BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
	if (FirstCopy) {
	SlotIndexes &Indexes = *LIS.getSlotIndexes();
	Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
	} else {
	CopyMI->bundleWithPred();
	}
	LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubIdx);
	DestLI.refineSubRanges(Allocator, LaneMask,
	[Def, &Allocator](LiveInterval::SubRange& SR) {
	SR.createDeadDef(Def, Allocator);
	});
	return Def;
	}

	/// Copy the lanes in \p LaneMask from \p FromReg to \p ToReg before
	/// \p InsertBefore in \p MBB.
	///
	/// When all lanes (or all lanes of the virtual register) are requested, a
	/// single full-register COPY is emitted. Otherwise a sequence of
	/// sub-register COPYs is built greedily from the sub-register indexes that
	/// are compatible with the register class and lie within \p LaneMask.
	/// A fatal error is reported when no usable sub-register index exists.
	///
	/// \returns the register slot index of the (first) inserted copy.
	SlotIndex SplitEditor::buildCopy(unsigned FromReg, unsigned ToReg,
	LaneBitmask LaneMask, MachineBasicBlock &MBB,
	MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) {
	const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
	if (LaneMask.all() \|\| LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) {
	// The full vreg is copied.
	MachineInstr *CopyMI =
	BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg);
	SlotIndexes &Indexes = *LIS.getSlotIndexes();
	return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
	}

	// Only a subset of lanes needs to be copied. The following is a simple
	// heuristic to construct a sequence of COPYs. We could add a target
	// specific callback if this turns out to be suboptimal.
	LiveInterval &DestLI = LIS.getInterval(Edit->get(RegIdx));

	// First pass: Try to find a perfectly matching subregister index. If none
	// exists find the one covering the most lanemask bits.
	SmallVector<unsigned, 8> PossibleIndexes;
	unsigned BestIdx = 0;
	unsigned BestCover = 0;
	const TargetRegisterClass *RC = MRI.getRegClass(FromReg);
	assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class");
	// Sub-register index 0 is skipped; BestIdx == 0 means "none found".
	for (unsigned Idx = 1, E = TRI.getNumSubRegIndices(); Idx < E; ++Idx) {
	// Is this index even compatible with the given class?
	if (TRI.getSubClassWithSubReg(RC, Idx) != RC)
	continue;
	LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx);
	// Early exit if we found a perfect match.
	if (SubRegMask == LaneMask) {
	BestIdx = Idx;
	break;
	}

	// The index must not cover any lanes outside \p LaneMask.
	if ((SubRegMask & ~LaneMask).any())
	continue;

	unsigned PopCount = SubRegMask.getNumLanes();
	PossibleIndexes.push_back(Idx);
	if (PopCount > BestCover) {
	BestCover = PopCount;
	BestIdx = Idx;
	}
	}

	// Abort if we cannot possibly implement the COPY with the given indexes.
	if (BestIdx == 0)
	report_fatal_error("Impossible to implement partial COPY");

	// Passing an invalid SlotIndex marks this as the first copy of the sequence.
	SlotIndex Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore,
	BestIdx, DestLI, Late, SlotIndex());

	// Greedy heuristic: Keep iterating keeping the best covering subreg index
	// each time.
	LaneBitmask LanesLeft = LaneMask & ~(TRI.getSubRegIndexLaneMask(BestIdx));
	while (LanesLeft.any()) {
	unsigned BestIdx = 0;
	int BestCover = std::numeric_limits<int>::min();
	for (unsigned Idx : PossibleIndexes) {
	LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx);
	// Early exit if we found a perfect match.
	if (SubRegMask == LanesLeft) {
	BestIdx = Idx;
	break;
	}

	// Try to cover as much of the remaining lanes as possible but
	// as few of the already covered lanes as possible.
	int Cover = (SubRegMask & LanesLeft).getNumLanes()
	- (SubRegMask & ~LanesLeft).getNumLanes();
	if (Cover > BestCover) {
	BestCover = Cover;
	BestIdx = Idx;
	}
	}

	if (BestIdx == 0)
	report_fatal_error("Impossible to implement partial COPY");

	buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx,
	DestLI, Late, Def);
	LanesLeft &= ~TRI.getSubRegIndexLaneMask(BestIdx);
	}

	return Def;
	}

	/// Define the value of \p ParentVNI in the interval for \p RegIdx by
	/// inserting code before \p I in \p MBB.
	///
	/// Rematerialization is attempted first, based on the value of the original
	/// register that is live at \p UseIdx. If that is not possible, a COPY from
	/// the register being edited is inserted instead; for an interval with
	/// subranges only the lanes covered by those subranges are copied.
	///
	/// \returns the new value number created by defValue() at the inserted def.
	VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
	VNInfo *ParentVNI,
	SlotIndex UseIdx,
	MachineBasicBlock &MBB,
	MachineBasicBlock::iterator I) {
	SlotIndex Def;
	LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));

	// We may be trying to avoid interference that ends at a deleted instruction,
	// so always begin RegIdx 0 early and all others late.
	bool Late = RegIdx != 0;

	// Attempt cheap-as-a-copy rematerialization.
	unsigned Original = VRM.getOriginal(Edit->get(RegIdx));
	LiveInterval &OrigLI = LIS.getInterval(Original);
	VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);

	unsigned Reg = LI->reg;
	bool DidRemat = false;
	if (OrigVNI) {
	LiveRangeEdit::Remat RM(ParentVNI);
	RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
	if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) {
	Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late);
	++NumRemats;
	DidRemat = true;
	}
	}
	if (!DidRemat) {
	// Fall back to a COPY. Work out which lanes have to be copied.
	LaneBitmask LaneMask;
	if (LI->hasSubRanges()) {
	LaneMask = LaneBitmask::getNone();
	for (LiveInterval::SubRange &S : LI->subranges())
	LaneMask \|= S.LaneMask;
	} else {
	LaneMask = LaneBitmask::getAll();
	}

	++NumCopies;
	Def = buildCopy(Edit->getReg(), Reg, LaneMask, MBB, I, Late, RegIdx);
	}

	// Define the value in Reg.
	return defValue(RegIdx, ParentVNI, Def, false);
	}

	/// Create a new virtual register and live interval.
	///
	/// The first call on an empty edit also creates the complement interval at
	/// index 0. The new interval becomes the open interval.
	///
	/// \returns the index of the newly opened interval.
	unsigned SplitEditor::openIntv() {
	// Create the complement as index 0.
	if (Edit->empty())
	Edit->createEmptyInterval();

	// Create the open interval.
	// Its index is the size of the edit before the interval is appended.
	OpenIdx = Edit->size();
	Edit->createEmptyInterval();
	return OpenIdx;
	}

	/// Make the previously opened interval \p Idx the open interval again.
	/// \p Idx must be non-zero (index 0 is the complement) and smaller than the
	/// number of intervals in the current edit.
	void SplitEditor::selectIntv(unsigned Idx) {
	assert(Idx != 0 && "Cannot select the complement interval");
	assert(Idx < Edit->size() && "Can only select previously opened interval");
	DEBUG(dbgs() << " selectIntv " << OpenIdx << " -> " << Idx << '\n');
	OpenIdx = Idx;
	}

	/// Enter the open interval before the instruction at \p Idx.
	///
	/// \p Idx is first normalized to its base index. If the parent interval is
	/// not live there, nothing is inserted and that index is returned.
	/// Otherwise the parent value is defined in the open interval immediately
	/// before the instruction.
	///
	/// \returns the def index of the new value, or the base index when the
	/// parent is not live.
	SlotIndex SplitEditor::enterIntvBefore(SlotIndex Idx) {
	  assert(OpenIdx && "openIntv not called before enterIntvBefore");
	  DEBUG(dbgs() << " enterIntvBefore " << Idx);
	  Idx = Idx.getBaseIndex();
	  VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
	  if (!ParentVNI) {
	    DEBUG(dbgs() << ": not live\n");
	    return Idx;
	  }
	  DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
	  MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
	  assert(MI && "enterIntvBefore called with invalid index");

	  // defFromParent() takes the block by reference and returns a pointer to
	  // the new value number.
	  VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Idx, *MI->getParent(), MI);
	  return VNI->def;
	}

	/// Enter the open interval after the instruction at \p Idx.
	///
	/// \p Idx is first normalized to its boundary index. If the parent interval
	/// is not live there, nothing is inserted and that index is returned.
	/// Otherwise the parent value is defined in the open interval immediately
	/// after the instruction.
	///
	/// \returns the def index of the new value, or the boundary index when the
	/// parent is not live.
	SlotIndex SplitEditor::enterIntvAfter(SlotIndex Idx) {
	  assert(OpenIdx && "openIntv not called before enterIntvAfter");
	  DEBUG(dbgs() << " enterIntvAfter " << Idx);
	  Idx = Idx.getBoundaryIndex();
	  VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
	  if (!ParentVNI) {
	    DEBUG(dbgs() << ": not live\n");
	    return Idx;
	  }
	  DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
	  MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
	  assert(MI && "enterIntvAfter called with invalid index");

	  // defFromParent() takes the block by reference and returns a pointer to
	  // the new value number; the insertion point is the instruction after MI.
	  VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Idx, *MI->getParent(),
	                              std::next(MachineBasicBlock::iterator(MI)));
	  return VNI->def;
	}

	/// Enter the open interval at the end of \p MBB. When the parent interval
	/// is live in the last slot of the block, its value is defined at the
	/// block's last split point and the open interval is assigned from that def
	/// to the end of the block.
	///
	/// \returns the def index of the new value, or the block end index when the
	/// parent is not live.
	SlotIndex SplitEditor::enterIntvAtEnd(MachineBasicBlock &MBB) {
	  assert(OpenIdx && "openIntv not called before enterIntvAtEnd");
	  SlotIndex BlockEnd = LIS.getMBBEndIdx(&MBB);
	  SlotIndex LastSlot = BlockEnd.getPrevSlot();
	  DEBUG(dbgs() << " enterIntvAtEnd " << printMBBReference(MBB) << ", "
	               << LastSlot);

	  VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(LastSlot);
	  if (ParentVNI == nullptr) {
	    DEBUG(dbgs() << ": not live\n");
	    return BlockEnd;
	  }
	  DEBUG(dbgs() << ": valno " << ParentVNI->id);

	  VNInfo *NewVNI = defFromParent(OpenIdx, ParentVNI, LastSlot, MBB,
	                                 SA.getLastSplitPointIter(&MBB));
	  SlotIndex NewDef = NewVNI->def;
	  RegAssign.insert(NewDef, BlockEnd, OpenIdx);
	  DEBUG(dump());
	  return NewDef;
	}

	/// useIntv - indicate that all instructions in MBB should use OpenLI.
	void SplitEditor::useIntv(const MachineBasicBlock &MBB) {
	useIntv(LIS.getMBBStartIdx(&MBB), LIS.getMBBEndIdx(&MBB));
	}

	/// Assign the half-open range [\p Start; \p End) to the open interval by
	/// recording it in RegAssign.
	void SplitEditor::useIntv(SlotIndex Start, SlotIndex End) {
	assert(OpenIdx && "openIntv not called before useIntv");
	DEBUG(dbgs() << " useIntv [" << Start << ';' << End << "):");
	RegAssign.insert(Start, End, OpenIdx);
	DEBUG(dump());
	}

	/// Leave the open interval after the instruction at \p Idx by defining the
	/// parent value in the complement interval (index 0).
	///
	/// If the parent interval is not live at the boundary of the instruction,
	/// nothing is inserted and the slot following the boundary is returned.
	/// In spill mode the def is placed before the instruction instead, provided
	/// the instruction reads the edited register and does not redefine the
	/// value; \p Idx is returned in that case.
	///
	/// \returns the index at which the open interval is left.
	SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) {
	assert(OpenIdx && "openIntv not called before leaveIntvAfter");
	DEBUG(dbgs() << " leaveIntvAfter " << Idx);

	// The interval must be live beyond the instruction at Idx.
	SlotIndex Boundary = Idx.getBoundaryIndex();
	VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Boundary);
	if (!ParentVNI) {
	DEBUG(dbgs() << ": not live\n");
	return Boundary.getNextSlot();
	}
	DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
	MachineInstr *MI = LIS.getInstructionFromIndex(Boundary);
	assert(MI && "No instruction at index");

	// In spill mode, make live ranges as short as possible by inserting the copy
	// before MI. This is only possible if that instruction doesn't redefine the
	// value. The inserted COPY is not a kill, and we don't need to recompute
	// the source live range. The spiller also won't try to hoist this copy.
	if (SpillMode && !SlotIndex::isSameInstr(ParentVNI->def, Idx) &&
	MI->readsVirtualRegister(Edit->getReg())) {
	- forceRecompute(0, ParentVNI);
	+ forceRecompute(0, *ParentVNI);
	defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
	return Idx;
	}

	// defFromParent() takes the block by reference and returns a pointer to
	// the new value number; the insertion point is the instruction after MI.
	VNInfo *VNI = defFromParent(0, ParentVNI, Boundary, *MI->getParent(),
	std::next(MachineBasicBlock::iterator(MI)));
	return VNI->def;
	}

	/// Leave the open interval before the instruction at \p Idx by defining the
	/// parent value in the complement interval (index 0) right before that
	/// instruction.
	///
	/// \p Idx is first normalized to its base index. If the parent interval is
	/// not live there, nothing is inserted and the following slot is returned.
	///
	/// \returns the def index of the new complement value, or the slot after
	/// the base index when the parent is not live.
	SlotIndex SplitEditor::leaveIntvBefore(SlotIndex Idx) {
	  assert(OpenIdx && "openIntv not called before leaveIntvBefore");
	  DEBUG(dbgs() << " leaveIntvBefore " << Idx);

	  // The interval must be live into the instruction at Idx.
	  Idx = Idx.getBaseIndex();
	  VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
	  if (!ParentVNI) {
	    DEBUG(dbgs() << ": not live\n");
	    return Idx.getNextSlot();
	  }
	  DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');

	  MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
	  assert(MI && "No instruction at index");
	  // defFromParent() takes the block by reference and returns a pointer to
	  // the new value number.
	  VNInfo *VNI = defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
	  return VNI->def;
	}

	/// Leave the open interval at the top of \p MBB. When the parent interval
	/// is live at the start of the block, its value is defined in the
	/// complement interval (index 0) after any PHIs, labels and debug
	/// instructions, and the open interval is assigned from the block start up
	/// to that def.
	///
	/// \returns the def index of the new complement value, or the block start
	/// index when the parent is not live.
	SlotIndex SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) {
	  assert(OpenIdx && "openIntv not called before leaveIntvAtTop");
	  SlotIndex BlockStart = LIS.getMBBStartIdx(&MBB);
	  DEBUG(dbgs() << " leaveIntvAtTop " << printMBBReference(MBB) << ", "
	               << BlockStart);

	  VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(BlockStart);
	  if (ParentVNI == nullptr) {
	    DEBUG(dbgs() << ": not live\n");
	    return BlockStart;
	  }

	  VNInfo *NewVNI = defFromParent(0, ParentVNI, BlockStart, MBB,
	                                 MBB.SkipPHIsLabelsAndDebug(MBB.begin()));
	  SlotIndex NewDef = NewVNI->def;
	  RegAssign.insert(BlockStart, NewDef, OpenIdx);
	  DEBUG(dump());
	  return NewDef;
	}

	/// Assign [\p Start; \p End) to the open interval while marking the
	/// complement's mapping of the parent value at \p Start as forced. The
	/// range must lie within one basic block and the parent must carry the same
	/// value throughout it.
	void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
	assert(OpenIdx && "openIntv not called before overlapIntv");
	const VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Start);
	assert(ParentVNI == Edit->getParent().getVNInfoBefore(End) &&
	"Parent changes value in extended range");
	assert(LIS.getMBBFromIndex(Start) == LIS.getMBBFromIndex(End) &&
	"Range cannot span basic blocks");

	// The complement interval will be extended as needed by LRCalc.extend().
	if (ParentVNI)
	- forceRecompute(0, ParentVNI);
	+ forceRecompute(0, *ParentVNI);
	DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
	RegAssign.insert(Start, End, OpenIdx);
	DEBUG(dump());
	}

	//===----------------------------------------------------------------------===//
	// Spill modes
	//===----------------------------------------------------------------------===//

	/// Erase the back-copy instructions defining the values in \p Copies from
	/// the complement interval (index 0) and fix up RegAssign.
	///
	/// For each copy the def is removed from the interval, and the instruction
	/// is removed from the index maps and erased. If a register assignment
	/// ended exactly at the removed def, its end is moved back to the closest
	/// preceding non-debug instruction when that instruction reads the edited
	/// register; otherwise the affected mapping is marked for recomputation.
	void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
	LiveInterval *LI = &LIS.getInterval(Edit->get(0));
	DEBUG(dbgs() << "Removing " << Copies.size() << " back-copies.\n");
	RegAssignMap::iterator AssignI;
	AssignI.setMap(RegAssign);

	for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
	SlotIndex Def = Copies[i]->def;
	MachineInstr *MI = LIS.getInstructionFromIndex(Def);
	assert(MI && "No instruction for back-copy");

	MachineBasicBlock *MBB = MI->getParent();
	MachineBasicBlock::iterator MBBI(MI);
	// Step back to the closest preceding non-debug instruction, remembering
	// whether the start of the block was reached first. This must happen
	// before MI is erased below.
	bool AtBegin;
	do AtBegin = MBBI == MBB->begin();
	while (!AtBegin && (--MBBI)->isDebugValue());

	DEBUG(dbgs() << "Removing " << Def << '\t' << *MI);
	LIS.removeVRegDefAt(*LI, Def);
	LIS.RemoveMachineInstrFromMaps(*MI);
	MI->eraseFromParent();

	// Adjust RegAssign if a register assignment is killed at Def. We want to
	// avoid calculating the live range of the source register if possible.
	AssignI.find(Def.getPrevSlot());
	if (!AssignI.valid() \|\| AssignI.start() >= Def)
	continue;
	// If MI doesn't kill the assigned register, just leave it.
	if (AssignI.stop() != Def)
	continue;
	unsigned RegIdx = AssignI.value();
	if (AtBegin \|\| !MBBI->readsVirtualRegister(Edit->getReg())) {
	DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx << '\n');
	// NOTE(review): the '+' line dereferences the result of getVNInfoAt(Def)
	// without a null check -- confirm the parent is always live at Def here.
	- forceRecompute(RegIdx, Edit->getParent().getVNInfoAt(Def));
	+ forceRecompute(RegIdx, *Edit->getParent().getVNInfoAt(Def));
	} else {
	SlotIndex Kill = LIS.getInstructionIndex(*MBBI).getRegSlot();
	DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
	AssignI.setStop(Kill);
	}
	}
	}

	/// Find a block between \p DefMBB and \p MBB in the dominator tree that has
	/// the smallest loop depth.
	///
	/// Starting at \p MBB, the search repeatedly jumps to the immediate
	/// dominator of the current loop's header. It stops when a block outside
	/// any loop is reached, when the current loop is the loop of \p DefMBB, or
	/// when the next candidate would no longer be dominated by \p DefMBB.
	///
	/// \returns \p MBB itself when it equals \p DefMBB; otherwise the block
	/// chosen by the search above.
	MachineBasicBlock*
	SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
	MachineBasicBlock *DefMBB) {
	if (MBB == DefMBB)
	return MBB;
	assert(MDT.dominates(DefMBB, MBB) && "MBB must be dominated by the def.");

	const MachineLoopInfo &Loops = SA.Loops;
	const MachineLoop *DefLoop = Loops.getLoopFor(DefMBB);
	MachineDomTreeNode *DefDomNode = MDT[DefMBB];

	// Best candidate so far.
	MachineBasicBlock *BestMBB = MBB;
	unsigned BestDepth = std::numeric_limits<unsigned>::max();

	while (true) {
	const MachineLoop *Loop = Loops.getLoopFor(MBB);

	// MBB isn't in a loop, it doesn't get any better. All dominators have a
	// higher frequency by definition.
	if (!Loop) {
	DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB) << " dominates "
	<< printMBBReference(*MBB) << " at depth 0\n");
	return MBB;
	}

	// We'll never be able to exit the DefLoop.
	if (Loop == DefLoop) {
	DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB) << " dominates "
	<< printMBBReference(*MBB) << " in the same loop\n");
	return MBB;
	}

	// Least busy dominator seen so far.
	unsigned Depth = Loop->getLoopDepth();
	if (Depth < BestDepth) {
	BestMBB = MBB;
	BestDepth = Depth;
	DEBUG(dbgs() << "Def in " << printMBBReference(*DefMBB) << " dominates "
	<< printMBBReference(*MBB) << " at depth " << Depth << '\n');
	}

	// Leave loop by going to the immediate dominator of the loop header.
	// This is a bigger stride than simply walking up the dominator tree.
	MachineDomTreeNode *IDom = MDT[Loop->getHeader()]->getIDom();

	// Too far up the dominator tree?
	if (!IDom \|\| !MDT.dominates(DefDomNode, IDom))
	return BestMBB;

	MBB = IDom->getBlock();
	}
	}

	/// Collect the back-copies in the complement interval (index 0) that are
	/// made redundant by another back-copy of the same parent value.
	///
	/// Only parent values whose id is in \p NotToHoistSet are considered. For
	/// each such value, every pair of complement values mapped to it is
	/// compared: within one block the later def is redundant, across blocks the
	/// def in the dominated block is redundant. Redundant values are appended to
	/// \p BackCopies and the parent value's mapping is marked for
	/// recomputation.
	void SplitEditor::computeRedundantBackCopies(
	DenseSet<unsigned> &NotToHoistSet, SmallVectorImpl<VNInfo *> &BackCopies) {
	LiveInterval *LI = &LIS.getInterval(Edit->get(0));
	LiveInterval *Parent = &Edit->getParent();
	SmallVector<SmallPtrSet<VNInfo *, 8>, 8> EqualVNs(Parent->getNumValNums());
	SmallPtrSet<VNInfo *, 8> DominatedVNIs;

	// Aggregate VNIs having the same value as ParentVNI.
	for (VNInfo *VNI : LI->valnos) {
	if (VNI->isUnused())
	continue;
	VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
	EqualVNs[ParentVNI->id].insert(VNI);
	}

	// For VNI aggregation of each ParentVNI, collect dominated, i.e.,
	// redundant VNIs to BackCopies.
	for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
	VNInfo *ParentVNI = Parent->getValNumInfo(i);
	if (!NotToHoistSet.count(ParentVNI->id))
	continue;
	SmallPtrSetIterator<VNInfo *> It1 = EqualVNs[ParentVNI->id].begin();
	SmallPtrSetIterator<VNInfo *> It2 = It1;
	for (; It1 != EqualVNs[ParentVNI->id].end(); ++It1) {
	It2 = It1;
	for (++It2; It2 != EqualVNs[ParentVNI->id].end(); ++It2) {
	// Skip pairs in which either value is already known to be redundant.
	if (DominatedVNIs.count(*It1) \|\| DominatedVNIs.count(*It2))
	continue;

	MachineBasicBlock *MBB1 = LIS.getMBBFromIndex((*It1)->def);
	MachineBasicBlock *MBB2 = LIS.getMBBFromIndex((*It2)->def);
	if (MBB1 == MBB2) {
	// Same block: the value defined later is the redundant one.
	DominatedVNIs.insert((*It1)->def < (*It2)->def ? (*It2) : (*It1));
	} else if (MDT.dominates(MBB1, MBB2)) {
	DominatedVNIs.insert(*It2);
	} else if (MDT.dominates(MBB2, MBB1)) {
	DominatedVNIs.insert(*It1);
	}
	}
	}
	if (!DominatedVNIs.empty()) {
	- forceRecompute(0, ParentVNI);
	+ forceRecompute(0, *ParentVNI);
	for (auto VNI : DominatedVNIs) {
	BackCopies.push_back(VNI);
	}
	DominatedVNIs.clear();
	}
	}
	}

	/// For SM_Size mode, find a common dominator for all the back-copies for
	/// the same ParentVNI and hoist the backcopies to the dominator BB.
	/// For SM_Speed mode, if the common dominator is hot and it is not beneficial
	/// to do the hoisting, simply remove the dominated backcopies for the same
	/// ParentVNI.
	///
	/// The back-copies found to be redundant are collected in a local list and
	/// handed to removeBackCopies() at the end.
	void SplitEditor::hoistCopies() {
	// Get the complement interval, always RegIdx 0.
	LiveInterval *LI = &LIS.getInterval(Edit->get(0));
	LiveInterval *Parent = &Edit->getParent();

	// Track the nearest common dominator for all back-copies for each ParentVNI,
	// indexed by ParentVNI->id.
	using DomPair = std::pair<MachineBasicBlock *, SlotIndex>;
	SmallVector<DomPair, 8> NearestDom(Parent->getNumValNums());
	// The total cost of all the back-copies for each ParentVNI.
	SmallVector<BlockFrequency, 8> Costs(Parent->getNumValNums());
	// The ParentVNI->id set for which hoisting back-copies are not beneficial
	// for Speed.
	DenseSet<unsigned> NotToHoistSet;

	// Find the nearest common dominator for parent values with multiple
	// back-copies. If a single back-copy dominates, put it in DomPair.second.
	for (VNInfo *VNI : LI->valnos) {
	if (VNI->isUnused())
	continue;
	VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
	assert(ParentVNI && "Parent not live at complement def");

	// Don't hoist remats. The complement is probably going to disappear
	// completely anyway.
	if (Edit->didRematerialize(ParentVNI))
	continue;

	MachineBasicBlock *ValMBB = LIS.getMBBFromIndex(VNI->def);

	DomPair &Dom = NearestDom[ParentVNI->id];

	// Keep directly defined parent values. This is either a PHI or an
	// instruction in the complement range. All other copies of ParentVNI
	// should be eliminated.
	if (VNI->def == ParentVNI->def) {
	DEBUG(dbgs() << "Direct complement def at " << VNI->def << '\n');
	Dom = DomPair(ValMBB, VNI->def);
	continue;
	}
	// Skip the singly mapped values. There is nothing to gain from hoisting a
	// single back-copy.
	if (Values.lookup(std::make_pair(0, ParentVNI->id)).getPointer()) {
	DEBUG(dbgs() << "Single complement def at " << VNI->def << '\n');
	continue;
	}

	if (!Dom.first) {
	// First time we see ParentVNI. VNI dominates itself.
	Dom = DomPair(ValMBB, VNI->def);
	} else if (Dom.first == ValMBB) {
	// Two defs in the same block. Pick the earlier def.
	if (!Dom.second.isValid() \|\| VNI->def < Dom.second)
	Dom.second = VNI->def;
	} else {
	// Different basic blocks. Check if one dominates.
	MachineBasicBlock *Near =
	MDT.findNearestCommonDominator(Dom.first, ValMBB);
	if (Near == ValMBB)
	// Def ValMBB dominates.
	Dom = DomPair(ValMBB, VNI->def);
	else if (Near != Dom.first)
	// None dominate. Hoist to common dominator, need new def.
	Dom = DomPair(Near, SlotIndex());
	// In this different-block case, whichever block ends up as the dominator,
	// add ValMBB's frequency to the accumulated cost for this parent value.
	Costs[ParentVNI->id] += MBFI.getBlockFreq(ValMBB);
	}

	DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@' << VNI->def
	<< " for parent " << ParentVNI->id << '@' << ParentVNI->def
	<< " hoist to " << printMBBReference(*Dom.first) << ' '
	<< Dom.second << '\n');
	}

	// Insert the hoisted copies.
	for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
	DomPair &Dom = NearestDom[i];
	// Nothing to insert when no dominator was recorded, or when an existing def
	// (a valid Dom.second) already serves as the dominating copy.
	if (!Dom.first \|\| Dom.second.isValid())
	continue;
	// This value needs a hoisted copy inserted at the end of Dom.first.
	VNInfo *ParentVNI = Parent->getValNumInfo(i);
	MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(ParentVNI->def);
	// Get a less loopy dominator than Dom.first.
	Dom.first = findShallowDominator(Dom.first, DefMBB);
	// In speed mode, do not hoist when the target block's frequency exceeds the
	// accumulated cost of the back-copies; remember the parent id so that its
	// back-copies are handled by computeRedundantBackCopies() below.
	if (SpillMode == SM_Speed &&
	MBFI.getBlockFreq(Dom.first) > Costs[ParentVNI->id]) {
	NotToHoistSet.insert(ParentVNI->id);
	continue;
	}
	SlotIndex Last = LIS.getMBBEndIdx(Dom.first).getPrevSlot();
	Dom.second =
	defFromParent(0, ParentVNI, Last, *Dom.first,
	SA.getLastSplitPointIter(Dom.first))->def;
	}

	// Remove redundant back-copies that are now known to be dominated by another
	// def with the same value.
	SmallVector<VNInfo*, 8> BackCopies;
	for (VNInfo *VNI : LI->valnos) {
	if (VNI->isUnused())
	continue;
	VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
	const DomPair &Dom = NearestDom[ParentVNI->id];
	// Keep values with no recorded dominator, the value that is itself the
	// chosen dominating def, and values whose parent is in NotToHoistSet.
	if (!Dom.first \|\| Dom.second == VNI->def \|\|
	NotToHoistSet.count(ParentVNI->id))
	continue;
	BackCopies.push_back(VNI);
	- forceRecompute(0, ParentVNI);
	+ forceRecompute(0, *ParentVNI);
	}

	// If it is not beneficial to hoist all the BackCopies, simply remove
	// redundant BackCopies in speed mode.
	if (SpillMode == SM_Speed && !NotToHoistSet.empty())
	computeRedundantBackCopies(NotToHoistSet, BackCopies);

	removeBackCopies(BackCopies);
	}

	/// transferValues - Transfer all possible values to the new live ranges.
	/// Values that were rematerialized are left alone, they need LRCalc.extend().
	///
	/// Each segment of the parent interval is cut into pieces that map to a single
	/// RegIdx according to RegAssign (holes map to RegIdx 0), and each piece is
	/// either blitted directly, skipped, or registered with the LiveRangeCalc of
	/// its RegIdx.
	///
	/// \returns true if at least one piece was skipped because its value is
	/// marked for forced recomputation.
	bool SplitEditor::transferValues() {
	  bool Skipped = false;
	  RegAssignMap::const_iterator AssignI = RegAssign.begin();
	  for (const LiveRange::Segment &S : Edit->getParent()) {
	    DEBUG(dbgs() << " blit " << S << ':');
	    VNInfo *ParentVNI = S.valno;
	    // RegAssign has holes where RegIdx 0 should be used.
	    SlotIndex Start = S.start;
	    AssignI.advanceTo(Start);
	    do {
	      unsigned RegIdx;
	      SlotIndex End = S.end;
	      if (!AssignI.valid()) {
	        RegIdx = 0;
	      } else if (AssignI.start() <= Start) {
	        RegIdx = AssignI.value();
	        if (AssignI.stop() < End) {
	          End = AssignI.stop();
	          ++AssignI;
	        }
	      } else {
	        RegIdx = 0;
	        End = std::min(End, AssignI.start());
	      }

	      // The interval [Start;End) is continuously mapped to RegIdx, ParentVNI.
	      DEBUG(dbgs() << " [" << Start << ';' << End << ")=" << RegIdx
	                   << '(' << printReg(Edit->get(RegIdx)) << ')');
	      LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx));

	      // Check for a simply defined value that can be blitted directly.
	      ValueForcePair VFP = Values.lookup(std::make_pair(RegIdx, ParentVNI->id));
	      if (VNInfo *VNI = VFP.getPointer()) {
	        DEBUG(dbgs() << ':' << VNI->id);
	        LI.addSegment(LiveInterval::Segment(Start, End, VNI));
	        Start = End;
	        continue;
	      }

	      // Skip values with forced recomputation.
	      if (VFP.getInt()) {
	        DEBUG(dbgs() << "(recalc)");
	        Skipped = true;
	        Start = End;
	        continue;
	      }

	      LiveRangeCalc &LRC = getLRCalc(RegIdx);

	      // This value has multiple defs in RegIdx, but it wasn't rematerialized,
	      // so the live range is accurate. Add live-in blocks in [Start;End) to the
	      // LiveInBlocks.
	      MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start)->getIterator();
	      SlotIndex BlockStart, BlockEnd;
	      std::tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(&*MBB);

	      // The first block may be live-in, or it may have its own def.
	      if (Start != BlockStart) {
	        VNInfo *VNI = LI.extendInBlock(BlockStart, std::min(BlockEnd, End));
	        assert(VNI && "Missing def for complex mapped value");
	        // MBB is an iterator; dereference it to print the block reference.
	        DEBUG(dbgs() << ':' << VNI->id << "*" << printMBBReference(*MBB));
	        // MBB has its own def. Is it also live-out?
	        if (BlockEnd <= End)
	          LRC.setLiveOutValue(&*MBB, VNI);

	        // Skip to the next block for live-in.
	        ++MBB;
	        BlockStart = BlockEnd;
	      }

	      // Handle the live-in blocks covered by [Start;End).
	      assert(Start <= BlockStart && "Expected live-in block");
	      while (BlockStart < End) {
	        DEBUG(dbgs() << ">" << printMBBReference(*MBB));
	        BlockEnd = LIS.getMBBEndIdx(&*MBB);
	        if (BlockStart == ParentVNI->def) {
	          // This block has the def of a parent PHI, so it isn't live-in.
	          assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?");
	          VNInfo *VNI = LI.extendInBlock(BlockStart, std::min(BlockEnd, End));
	          assert(VNI && "Missing def for complex mapped parent PHI");
	          if (End >= BlockEnd)
	            LRC.setLiveOutValue(&*MBB, VNI); // Live-out as well.
	        } else {
	          // This block needs a live-in value. The last block covered may not
	          // be live-out.
	          if (End < BlockEnd)
	            LRC.addLiveInBlock(LI, MDT[&*MBB], End);
	          else {
	            // Live-through, and we don't know the value.
	            LRC.addLiveInBlock(LI, MDT[&*MBB]);
	            LRC.setLiveOutValue(&*MBB, nullptr);
	          }
	        }
	        BlockStart = BlockEnd;
	        ++MBB;
	      }
	      Start = End;
	    } while (Start != S.end);
	    DEBUG(dbgs() << '\n');
	  }

	  // Resolve the values registered above; the second calculator is only used
	  // when a spill mode is active.
	  LRCalc[0].calculateValues();
	  if (SpillMode)
	    LRCalc[1].calculateValues();

	  return Skipped;
	}

	/// Drop the segment of \p LR containing \p Def if it ends at Def's dead slot.
	///
	/// \returns true when nothing of the value remains in \p LR afterwards (no
	/// segment contained \p Def, or the dead segment was removed); false when the
	/// segment extends past the dead slot and was left untouched.
	static bool removeDeadSegment(SlotIndex Def, LiveRange &LR) {
	  if (const LiveRange::Segment *DefSeg = LR.getSegmentContaining(Def)) {
	    // A segment reaching beyond the dead slot is still in use: keep it.
	    if (DefSeg->end != Def.getDeadSlot())
	      return false;
	    // This is a dead PHI. Remove it.
	    LR.removeSegment(*DefSeg, true);
	  }
	  return true;
	}

	/// Extend \p LR to the end of each predecessor of \p B in which the parent
	/// range is live at the block's last slot.
	///
	/// \param B      Block whose predecessors are visited.
	/// \param LRC    Calculator used to perform the extension.
	/// \param LR     Live range to extend.
	/// \param LM     Lane mask selecting the parent range to test: the matching
	///               subrange of the parent interval, or the parent's main range
	///               when all lanes are set.
	/// \param Undefs Undef points forwarded to LiveRangeCalc::extend().
	void SplitEditor::extendPHIRange(MachineBasicBlock &B, LiveRangeCalc &LRC,
	                                 LiveRange &LR, LaneBitmask LM,
	                                 ArrayRef<SlotIndex> Undefs) {
	  for (MachineBasicBlock *P : B.predecessors()) {
	    SlotIndex End = LIS.getMBBEndIdx(P);
	    SlotIndex LastUse = End.getPrevSlot();
	    // The predecessor may not have a live-out value. That is OK, like an
	    // undef PHI operand.
	    LiveInterval &PLI = Edit->getParent();
	    // Need the cast because the inputs to ?: would otherwise be deemed
	    // "incompatible": SubRange vs LiveInterval.
	    LiveRange &PSR = !LM.all() ? getSubRangeForMask(LM, PLI)
	                               : static_cast<LiveRange&>(PLI);
	    if (PSR.liveAt(LastUse))
	      LRC.extend(LR, End, /*PhysReg=*/0, Undefs);
	  }
	}

	/// Make the new intervals live-out of predecessor blocks wherever a parent PHI
	/// value requires it, first for the main ranges and then for every subrange
	/// of the parent interval.
	void SplitEditor::extendPHIKillRanges() {
	  // Extend live ranges to be live-out for successor PHI values.

	  // Visit each PHI def slot in the parent live interval. If the def is dead,
	  // remove it. Otherwise, extend the live interval to reach the end indexes
	  // of all predecessor blocks.

	  LiveInterval &ParentLI = Edit->getParent();
	  for (const VNInfo *V : ParentLI.valnos) {
	    if (V->isUnused() || !V->isPHIDef())
	      continue;

	    unsigned RegIdx = RegAssign.lookup(V->def);
	    LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx));
	    LiveRangeCalc &LRC = getLRCalc(RegIdx);
	    MachineBasicBlock &B = *LIS.getMBBFromIndex(V->def);
	    if (!removeDeadSegment(V->def, LI))
	      extendPHIRange(B, LRC, LI, LaneBitmask::getAll(), /*Undefs=*/{});
	  }

	  SmallVector<SlotIndex, 4> Undefs;
	  LiveRangeCalc SubLRC;

	  // Same procedure per subrange, using a dedicated calculator that is reset
	  // before each extension and the undef points computed for that lane mask.
	  for (LiveInterval::SubRange &PS : ParentLI.subranges()) {
	    for (const VNInfo *V : PS.valnos) {
	      if (V->isUnused() || !V->isPHIDef())
	        continue;
	      unsigned RegIdx = RegAssign.lookup(V->def);
	      LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx));
	      LiveInterval::SubRange &S = getSubRangeForMask(PS.LaneMask, LI);
	      if (removeDeadSegment(V->def, S))
	        continue;

	      MachineBasicBlock &B = *LIS.getMBBFromIndex(V->def);
	      SubLRC.reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
	                   &LIS.getVNInfoAllocator());
	      Undefs.clear();
	      LI.computeSubRangeUndefs(Undefs, PS.LaneMask, MRI, *LIS.getSlotIndexes());
	      extendPHIRange(B, SubLRC, S, PS.LaneMask, Undefs);
	    }
	  }
	}

	/// rewriteAssigned - Rewrite all uses of Edit->getReg().
	///
	/// Every operand referring to Edit->getReg() is redirected to the register of
	/// the interval that RegAssign maps its slot index to. When \p ExtendRanges is
	/// true, the live ranges of the new intervals are additionally extended to
	/// reach the operands that read the register.
	void SplitEditor::rewriteAssigned(bool ExtendRanges) {
	// A delayed subrange extension: the operand, the RegIdx it was rewritten to,
	// and the slot index the range must be extended to.
	struct ExtPoint {
	ExtPoint(const MachineOperand &O, unsigned R, SlotIndex N)
	: MO(O), RegIdx(R), Next(N) {}

	MachineOperand MO;
	unsigned RegIdx;
	SlotIndex Next;
	};

	SmallVector<ExtPoint,4> ExtPoints;

	for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(Edit->getReg()),
	RE = MRI.reg_end(); RI != RE;) {
	MachineOperand &MO = *RI;
	MachineInstr *MI = MO.getParent();
	// Advance before the operand is rewritten below (presumably because
	// setReg() would otherwise invalidate the iterator -- TODO confirm).
	++RI;
	// LiveDebugVariables should have handled all DBG_VALUE instructions.
	if (MI->isDebugValue()) {
	DEBUG(dbgs() << "Zapping " << *MI);
	MO.setReg(0);
	continue;
	}

	// <undef> operands don't really read the register, so it doesn't matter
	// which register we choose. When the use operand is tied to a def, we must
	// use the same register as the def, so just do that always.
	SlotIndex Idx = LIS.getInstructionIndex(*MI);
	if (MO.isDef() \|\| MO.isUndef())
	Idx = Idx.getRegSlot(MO.isEarlyClobber());

	// Rewrite to the mapped register at Idx.
	unsigned RegIdx = RegAssign.lookup(Idx);
	LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx));
	MO.setReg(LI.reg);
	DEBUG(dbgs() << " rewr " << printMBBReference(*MI->getParent()) << '\t'
	<< Idx << ':' << RegIdx << '\t' << *MI);

	// Extend liveness to Idx if the instruction reads reg.
	if (!ExtendRanges \|\| MO.isUndef())
	continue;

	// Skip instructions that don't read Reg.
	if (MO.isDef()) {
	if (!MO.getSubReg() && !MO.isEarlyClobber())
	continue;
	// We may want to extend a live range for a partial redef, or for a use
	// tied to an early clobber.
	Idx = Idx.getPrevSlot();
	if (!Edit->getParent().liveAt(Idx))
	continue;
	} else
	Idx = Idx.getRegSlot(true);

	SlotIndex Next = Idx.getNextSlot();
	if (LI.hasSubRanges()) {
	// We have to delay extending subranges until we have seen all operands
	// defining the register. This is because a <def,read-undef> operand
	// will create an "undef" point, and we cannot extend any subranges
	// until all of them have been accounted for.
	if (MO.isUse())
	ExtPoints.push_back(ExtPoint(MO, RegIdx, Next));
	} else {
	LiveRangeCalc &LRC = getLRCalc(RegIdx);
	LRC.extend(LI, Next, 0, ArrayRef<SlotIndex>());
	}
	}

	// Perform the delayed subrange extensions now that every operand was seen.
	for (ExtPoint &EP : ExtPoints) {
	LiveInterval &LI = LIS.getInterval(Edit->get(EP.RegIdx));
	assert(LI.hasSubRanges());

	LiveRangeCalc SubLRC;
	unsigned Reg = EP.MO.getReg(), Sub = EP.MO.getSubReg();
	// Lanes read by the operand: those of its subregister index, or all lanes
	// of the virtual register when it has none.
	LaneBitmask LM = Sub != 0 ? TRI.getSubRegIndexLaneMask(Sub)
	: MRI.getMaxLaneMaskForVReg(Reg);
	for (LiveInterval::SubRange &S : LI.subranges()) {
	if ((S.LaneMask & LM).none())
	continue;
	// The problem here can be that the new register may have been created
	// for a partially defined original register. For example:
	// %0:subreg_hireg<def,read-undef> = ...
	// ...
	// %1 = COPY %0
	if (S.empty())
	continue;
	SubLRC.reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
	&LIS.getVNInfoAllocator());
	SmallVector<SlotIndex, 4> Undefs;
	LI.computeSubRangeUndefs(Undefs, S.LaneMask, MRI, *LIS.getSlotIndexes());
	SubLRC.extend(S, EP.Next, 0, Undefs);
	}
	}

	// For every interval with subranges, discard the main range and rebuild it
	// from the (non-empty) subranges.
	for (unsigned R : *Edit) {
	LiveInterval &LI = LIS.getInterval(R);
	if (!LI.hasSubRanges())
	continue;
	LI.clear();
	LI.removeEmptySubRanges();
	LIS.constructMainRangeFromSubranges(LI);
	}
	}

	void SplitEditor::deleteRematVictims() {
	SmallVector<MachineInstr*, 8> Dead;
	for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I){
	LiveInterval LI = &LIS.getInterval(I);
	for (const LiveRange::Segment &S : LI->segments) {
	// Dead defs end at the dead slot.
	if (S.end != S.valno->def.getDeadSlot())
	continue;
	if (S.valno->isPHIDef())
	continue;
	MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def);
	assert(MI && "Missing instruction for dead def");
	MI->addRegisterDead(LI->reg, &TRI);

	if (!MI->allDefsAreDead())
	continue;

	DEBUG(dbgs() << "All defs dead: " << *MI);
	Dead.push_back(MI);
	}
	}

	if (Dead.empty())
	return;

	Edit->eliminateDeadDefs(Dead, None, &AA);
	}

	+void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) {
	+ // Fast-path for the common case: a non-PHI value is forced for every RegIdx and nothing needs tracing.
	+ if (!ParentVNI.isPHIDef()) {
	+ for (unsigned I = 0, E = Edit->size(); I != E; ++I)
	+ forceRecompute(I, ParentVNI);
	+ return;
	+ }
	+
	+ // Trace the value through PHIs: every value live at the end of a predecessor of a PHI def's block is forced too.
	+ SmallPtrSet<const VNInfo *, 8> Visited; ///< Values that were or are in WorkList, so each is queued at most once.
	+ SmallVector<const VNInfo *, 4> WorkList;
	+ Visited.insert(&ParentVNI);
	+ WorkList.push_back(&ParentVNI);
	+
	+ const LiveInterval &ParentLI = Edit->getParent();
	+ const SlotIndexes &Indexes = *LIS.getSlotIndexes();
	+ do {
	+ const VNInfo &VNI = *WorkList.back();
	+ WorkList.pop_back();
	+ for (unsigned I = 0, E = Edit->size(); I != E; ++I)
	+ forceRecompute(I, VNI);
	+ if (!VNI.isPHIDef())
	+ continue;
	+
	+ MachineBasicBlock &MBB = *Indexes.getMBBFromIndex(VNI.def);
	+ for (const MachineBasicBlock *Pred : MBB.predecessors()) {
	+ SlotIndex PredEnd = Indexes.getMBBEndIdx(Pred);
	+ VNInfo *PredVNI = ParentLI.getVNInfoBefore(PredEnd);
	+ assert(PredVNI && "Value available in PhiVNI predecessor");
	+ if (Visited.insert(PredVNI).second)
	+ WorkList.push_back(PredVNI);
	+ }
	+ } while(!WorkList.empty());
	+}
	+
	/// Complete the split: add the original defs to the new intervals, hoist
	/// back-copies when a spill mode is active, transfer and extend the live
	/// ranges, rewrite the operands of the original register, and clean up the
	/// resulting intervals.
	///
	/// If \p LRMap is non-null it is cleared and filled with one entry per
	/// register in Edit (including intervals created here by splitting separate
	/// components), each entry being the index of the Edit range it maps back to.
	void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
	++NumFinished;

	// At this point, the live intervals in Edit contain VNInfos corresponding to
	// the inserted copies.

	// Add the original defs from the parent interval.
	for (const VNInfo *ParentVNI : Edit->getParent().valnos) {
	if (ParentVNI->isUnused())
	continue;
	unsigned RegIdx = RegAssign.lookup(ParentVNI->def);
	defValue(RegIdx, ParentVNI, ParentVNI->def, true);

	// Force rematted values to be recomputed everywhere.
	// The new live ranges may be truncated.
	if (Edit->didRematerialize(ParentVNI))
	- for (unsigned i = 0, e = Edit->size(); i != e; ++i)
	- forceRecompute(i, ParentVNI);
	+ forceRecomputeVNI(*ParentVNI);
	}

	// Hoist back-copies to the complement interval when in spill mode.
	switch (SpillMode) {
	case SM_Partition:
	// Leave all back-copies as is.
	break;
	case SM_Size:
	case SM_Speed:
	// hoistCopies will behave differently between size and speed.
	hoistCopies();
	}

	// Transfer the simply mapped values, check if any are skipped.
	bool Skipped = transferValues();

	// Rewrite virtual registers, possibly extending ranges.
	// Ranges are only extended when transferValues() skipped some values.
	rewriteAssigned(Skipped);

	if (Skipped)
	extendPHIKillRanges();
	else
	++NumSimple;

	// Delete defs that were rematted everywhere.
	if (Skipped)
	deleteRematVictims();

	// Get rid of unused values and set phi-kill flags.
	for (unsigned Reg : *Edit) {
	LiveInterval &LI = LIS.getInterval(Reg);
	LI.removeEmptySubRanges();
	LI.RenumberValues();
	}

	// Provide a reverse mapping from original indices to Edit ranges.
	if (LRMap) {
	LRMap->clear();
	for (unsigned i = 0, e = Edit->size(); i != e; ++i)
	LRMap->push_back(i);
	}

	// Now check if any registers were separated into multiple components.
	ConnectedVNInfoEqClasses ConEQ(LIS);
	// The bound e is captured once, so intervals created inside the loop are
	// not themselves revisited.
	for (unsigned i = 0, e = Edit->size(); i != e; ++i) {
	// Don't use iterators, they are invalidated by create() below.
	unsigned VReg = Edit->get(i);
	LiveInterval &LI = LIS.getInterval(VReg);
	SmallVector<LiveInterval*, 8> SplitLIs;
	LIS.splitSeparateComponents(LI, SplitLIs);
	unsigned Original = VRM.getOriginal(VReg);
	for (LiveInterval *SplitLI : SplitLIs)
	VRM.setIsSplitFromReg(SplitLI->reg, Original);

	// The new intervals all map back to i.
	if (LRMap)
	LRMap->resize(Edit->size(), i);
	}

	// Calculate spill weight and allocation hints for new intervals.
	Edit->calculateRegClassAndHint(VRM.getMachineFunction(), SA.Loops, MBFI);

	assert(!LRMap \|\| LRMap->size() == Edit->size());
	}

	//===----------------------------------------------------------------------===//
	// Single Block Splitting
	//===----------------------------------------------------------------------===//

	/// Decide whether the live range piece described by \p BI is worth isolating
	/// in its own interval.
	///
	/// \param BI           Per-block use information for the current interval.
	/// \param SingleInstrs Whether blocks with a single using instruction may be
	///                     split at all.
	/// \returns true if the block should be split.
	bool SplitAnalysis::shouldSplitSingleBlock(const BlockInfo &BI,
	                                           bool SingleInstrs) const {
	  // Several instructions: always split.
	  if (!BI.isOneInstr())
	    return true;

	  // A lone instruction is only split on explicit request.
	  if (!SingleInstrs)
	    return false;

	  // Splitting a live-through range always makes progress.
	  const bool LiveThrough = BI.LiveIn && BI.LiveOut;
	  if (LiveThrough)
	    return true;

	  // Isolating a copy is pointless: it has no register class constraints.
	  // Otherwise only isolate end points that were not created by earlier
	  // splits.
	  const bool IsCopy =
	      LIS.getInstructionFromIndex(BI.FirstInstr)->isCopyLike();
	  return !IsCopy && isOriginalEndpoint(BI.FirstInstr);
	}

	/// Open a new interval covering the uses described by \p BI inside its block.
	///
	/// The interval is entered before the first use (or before the last split
	/// point, whichever comes first). If the value is live-out and its last use
	/// is not before the last split point, the interval is left before that split
	/// point and overlapped up to the last use; otherwise it is left right after
	/// the last use.
	void SplitEditor::splitSingleBlock(const SplitAnalysis::BlockInfo &BI) {
	  openIntv();
	  const SlotIndex LSP = SA.getLastSplitPoint(BI.MBB->getNumber());
	  const SlotIndex SegStart = enterIntvBefore(std::min(BI.FirstInstr, LSP));

	  const bool LastUseAfterLSP = BI.LiveOut && !(BI.LastInstr < LSP);
	  if (LastUseAfterLSP) {
	    // The last use is after the last valid split point.
	    const SlotIndex SegStop = leaveIntvBefore(LSP);
	    useIntv(SegStart, SegStop);
	    overlapIntv(SegStop, BI.LastInstr);
	    return;
	  }

	  const SlotIndex SegStop = leaveIntvAfter(BI.LastInstr);
	  useIntv(SegStart, SegStop);
	}

	//===----------------------------------------------------------------------===//
	// Global Live Range Splitting Support
	//===----------------------------------------------------------------------===//

	// These methods support a method of global live range splitting that uses a
	// global algorithm to decide intervals for CFG edges. They will insert split
	// points and color intervals in basic blocks while avoiding interference.
	//
	// Note that splitSingleBlock is also useful for blocks where both CFG edges
	// are on the stack.

	/// Split the live range across block number \p MBBNum, through which it is
	/// live.
	///
	/// \param MBBNum      Number of the block to process.
	/// \param IntvIn      Interval index used on entry; zero selects the "spill on
	///                    entry" case below.
	/// \param LeaveBefore When valid, IntvIn must be left at or before this index
	///                    (checked by the "Interference" asserts).
	/// \param IntvOut     Interval index used on exit; zero selects the "reload on
	///                    exit" case below.
	/// \param EnterAfter  When valid, IntvOut must be entered at or after this
	///                    index (checked by the "Interference" asserts).
	///
	/// At least one of \p IntvIn and \p IntvOut must be non-zero.
	void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
	unsigned IntvIn, SlotIndex LeaveBefore,
	unsigned IntvOut, SlotIndex EnterAfter){
	SlotIndex Start, Stop;
	std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(MBBNum);

	DEBUG(dbgs() << "%bb." << MBBNum << " [" << Start << ';' << Stop << ") intf "
	<< LeaveBefore << '-' << EnterAfter << ", live-through "
	<< IntvIn << " -> " << IntvOut);

	assert((IntvIn \|\| IntvOut) && "Use splitSingleBlock for isolated blocks");

	// Sanity-check that the interference bounds fall inside the block.
	assert((!LeaveBefore \|\| LeaveBefore < Stop) && "Interference after block");
	assert((!IntvIn \|\| !LeaveBefore \|\| LeaveBefore > Start) && "Impossible intf");
	assert((!EnterAfter \|\| EnterAfter >= Start) && "Interference before block");

	MachineBasicBlock *MBB = VRM.getMachineFunction().getBlockNumbered(MBBNum);

	if (!IntvOut) {
	DEBUG(dbgs() << ", spill on entry.\n");
	//
	// <<<<<<<<< Possible LeaveBefore interference.
	// \|-----------\| Live through.
	// -____________ Spill on entry.
	//
	selectIntv(IntvIn);
	SlotIndex Idx = leaveIntvAtTop(*MBB);
	assert((!LeaveBefore \|\| Idx <= LeaveBefore) && "Interference");
	// Idx is only read by the assert above; silence unused-variable warnings.
	(void)Idx;
	return;
	}

	if (!IntvIn) {
	DEBUG(dbgs() << ", reload on exit.\n");
	//
	// >>>>>>> Possible EnterAfter interference.
	// \|-----------\| Live through.
	// ___________-- Reload on exit.
	//
	selectIntv(IntvOut);
	SlotIndex Idx = enterIntvAtEnd(*MBB);
	assert((!EnterAfter \|\| Idx >= EnterAfter) && "Interference");
	// Idx is only read by the assert above; silence unused-variable warnings.
	(void)Idx;
	return;
	}

	if (IntvIn == IntvOut && !LeaveBefore && !EnterAfter) {
	DEBUG(dbgs() << ", straight through.\n");
	//
	// \|-----------\| Live through.
	// ------------- Straight through, same intv, no interference.
	//
	selectIntv(IntvOut);
	useIntv(Start, Stop);
	return;
	}

	// We cannot legally insert splits after LSP.
	SlotIndex LSP = SA.getLastSplitPoint(MBBNum);
	assert((!IntvOut \|\| !EnterAfter \|\| EnterAfter < LSP) && "Impossible intf");

	if (IntvIn != IntvOut && (!LeaveBefore \|\| !EnterAfter \|\|
	LeaveBefore.getBaseIndex() > EnterAfter.getBoundaryIndex())) {
	DEBUG(dbgs() << ", switch avoiding interference.\n");
	//
	// >>>> <<<< Non-overlapping EnterAfter/LeaveBefore interference.
	// \|-----------\| Live through.
	// ------======= Switch intervals between interference.
	//
	selectIntv(IntvOut);
	SlotIndex Idx;
	// Enter IntvOut before the LeaveBefore interference when that point is a
	// legal split point; otherwise enter it at the end of the block.
	if (LeaveBefore && LeaveBefore < LSP) {
	Idx = enterIntvBefore(LeaveBefore);
	useIntv(Idx, Stop);
	} else {
	Idx = enterIntvAtEnd(*MBB);
	}
	selectIntv(IntvIn);
	useIntv(Start, Idx);
	assert((!LeaveBefore \|\| Idx <= LeaveBefore) && "Interference");
	assert((!EnterAfter \|\| Idx >= EnterAfter) && "Interference");
	return;
	}

	DEBUG(dbgs() << ", create local intv for interference.\n");
	//
	// >>><><><><<<< Overlapping EnterAfter/LeaveBefore interference.
	// \|-----------\| Live through.
	// ==---------== Switch intervals before/after interference.
	//
	assert(LeaveBefore <= EnterAfter && "Missed case");

	selectIntv(IntvOut);
	SlotIndex Idx = enterIntvAfter(EnterAfter);
	useIntv(Idx, Stop);
	assert((!EnterAfter \|\| Idx >= EnterAfter) && "Interference");

	selectIntv(IntvIn);
	Idx = leaveIntvBefore(LeaveBefore);
	useIntv(Start, Idx);
	assert((!LeaveBefore \|\| Idx <= LeaveBefore) && "Interference");
	}

	/// Split the live range in \p BI's block where the value is live-in in
	/// interval \p IntvIn.
	///
	/// \param BI          Use information for the block; must be live-in.
	/// \param IntvIn      Non-zero interval index that is live on entry.
	/// \param LeaveBefore When valid, IntvIn must be left at or before this index
	///                    (checked by the "Interference" asserts); it must lie
	///                    after the start of the block.
	void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
	unsigned IntvIn, SlotIndex LeaveBefore) {
	SlotIndex Start, Stop;
	std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);

	DEBUG(dbgs() << printMBBReference(*BI.MBB) << " [" << Start << ';' << Stop
	<< "), uses " << BI.FirstInstr << '-' << BI.LastInstr
	<< ", reg-in " << IntvIn << ", leave before " << LeaveBefore
	<< (BI.LiveOut ? ", stack-out" : ", killed in block"));

	assert(IntvIn && "Must have register in");
	assert(BI.LiveIn && "Must be live-in");
	assert((!LeaveBefore \|\| LeaveBefore > Start) && "Bad interference");

	if (!BI.LiveOut && (!LeaveBefore \|\| LeaveBefore >= BI.LastInstr)) {
	DEBUG(dbgs() << " before interference.\n");
	//
	// <<< Interference after kill.
	// \|---o---x \| Killed in block.
	// ========= Use IntvIn everywhere.
	//
	selectIntv(IntvIn);
	useIntv(Start, BI.LastInstr);
	return;
	}

	SlotIndex LSP = SA.getLastSplitPoint(BI.MBB->getNumber());

	if (!LeaveBefore \|\| LeaveBefore > BI.LastInstr.getBoundaryIndex()) {
	//
	// <<< Possible interference after last use.
	// \|---o---o---\| Live-out on stack.
	// =========____ Leave IntvIn after last use.
	//
	// < Interference after last use.
	// \|---o---o--o\| Live-out on stack, late last use.
	// ============ Copy to stack after LSP, overlap IntvIn.
	// \_____ Stack interval is live-out.
	//
	if (BI.LastInstr < LSP) {
	DEBUG(dbgs() << ", spill after last use before interference.\n");
	selectIntv(IntvIn);
	SlotIndex Idx = leaveIntvAfter(BI.LastInstr);
	useIntv(Start, Idx);
	assert((!LeaveBefore \|\| Idx <= LeaveBefore) && "Interference");
	} else {
	DEBUG(dbgs() << ", spill before last split point.\n");
	selectIntv(IntvIn);
	SlotIndex Idx = leaveIntvBefore(LSP);
	overlapIntv(Idx, BI.LastInstr);
	useIntv(Start, Idx);
	assert((!LeaveBefore \|\| Idx <= LeaveBefore) && "Interference");
	}
	return;
	}

	// The interference is overlapping somewhere we wanted to use IntvIn. That
	// means we need to create a local interval that can be allocated a
	// different register.
	unsigned LocalIntv = openIntv();
	// LocalIntv is only read by the DEBUG output below.
	(void)LocalIntv;
	DEBUG(dbgs() << ", creating local interval " << LocalIntv << ".\n");

	if (!BI.LiveOut \|\| BI.LastInstr < LSP) {
	//
	// <<<<<<< Interference overlapping uses.
	// \|---o---o---\| Live-out on stack.
	// =====----____ Leave IntvIn before interference, then spill.
	//
	SlotIndex To = leaveIntvAfter(BI.LastInstr);
	SlotIndex From = enterIntvBefore(LeaveBefore);
	useIntv(From, To);
	selectIntv(IntvIn);
	useIntv(Start, From);
	assert((!LeaveBefore \|\| From <= LeaveBefore) && "Interference");
	return;
	}

	// <<<<<<< Interference overlapping uses.
	// \|---o---o--o\| Live-out on stack, late last use.
	// =====------- Copy to stack before LSP, overlap LocalIntv.
	// \_____ Stack interval is live-out.
	//
	SlotIndex To = leaveIntvBefore(LSP);
	overlapIntv(To, BI.LastInstr);
	// Enter the local interval at whichever comes first: the point where it is
	// left again, or the start of the interference.
	SlotIndex From = enterIntvBefore(std::min(To, LeaveBefore));
	useIntv(From, To);
	selectIntv(IntvIn);
	useIntv(Start, From);
	assert((!LeaveBefore \|\| From <= LeaveBefore) && "Interference");
	}

	/// Split the live range in \p BI's block where the value is live-out in
	/// interval \p IntvOut.
	///
	/// \param BI         Use information for the block; must be live-out.
	/// \param IntvOut    Non-zero interval index that is live on exit.
	/// \param EnterAfter When valid, IntvOut must be entered at or after this
	///                   index (checked by the "Interference" asserts); it must
	///                   lie before the block's last split point.
	void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
	unsigned IntvOut, SlotIndex EnterAfter) {
	SlotIndex Start, Stop;
	std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);

	DEBUG(dbgs() << printMBBReference(*BI.MBB) << " [" << Start << ';' << Stop
	<< "), uses " << BI.FirstInstr << '-' << BI.LastInstr
	<< ", reg-out " << IntvOut << ", enter after " << EnterAfter
	<< (BI.LiveIn ? ", stack-in" : ", defined in block"));

	SlotIndex LSP = SA.getLastSplitPoint(BI.MBB->getNumber());

	assert(IntvOut && "Must have register out");
	assert(BI.LiveOut && "Must be live-out");
	assert((!EnterAfter \|\| EnterAfter < LSP) && "Bad interference");

	if (!BI.LiveIn && (!EnterAfter \|\| EnterAfter <= BI.FirstInstr)) {
	DEBUG(dbgs() << " after interference.\n");
	//
	// >>>> Interference before def.
	// \| o---o---\| Defined in block.
	// ========= Use IntvOut everywhere.
	//
	selectIntv(IntvOut);
	useIntv(BI.FirstInstr, Stop);
	return;
	}

	if (!EnterAfter \|\| EnterAfter < BI.FirstInstr.getBaseIndex()) {
	DEBUG(dbgs() << ", reload after interference.\n");
	//
	// >>>> Interference before def.
	// \|---o---o---\| Live-through, stack-in.
	// ____========= Enter IntvOut before first use.
	//
	selectIntv(IntvOut);
	// Enter before the first use, but never after the last split point.
	SlotIndex Idx = enterIntvBefore(std::min(LSP, BI.FirstInstr));
	useIntv(Idx, Stop);
	assert((!EnterAfter \|\| Idx >= EnterAfter) && "Interference");
	return;
	}

	// The interference is overlapping somewhere we wanted to use IntvOut. That
	// means we need to create a local interval that can be allocated a
	// different register.
	DEBUG(dbgs() << ", interference overlaps uses.\n");
	//
	// >>>>>>> Interference overlapping uses.
	// \|---o---o---\| Live-through, stack-in.
	// ____---====== Create local interval for interference range.
	//
	selectIntv(IntvOut);
	SlotIndex Idx = enterIntvAfter(EnterAfter);
	useIntv(Idx, Stop);
	assert((!EnterAfter \|\| Idx >= EnterAfter) && "Interference");

	// Open the local interval and let it cover the range from the first use (or
	// Idx, if earlier) up to the point where IntvOut was entered.
	openIntv();
	SlotIndex From = enterIntvBefore(std::min(Idx, BI.FirstInstr));
	useIntv(From, Idx);
	}
	Index: head/contrib/llvm/lib/CodeGen/SplitKit.h
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/SplitKit.h (revision 329409)
	+++ head/contrib/llvm/lib/CodeGen/SplitKit.h (revision 329410)
	@@ -1,544 +1,548 @@
	//===- SplitKit.h - Toolkit for splitting live ranges -----------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the SplitAnalysis class as well as mutator functions for
	// live range splitting.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_CODEGEN_SPLITKIT_H
	#define LLVM_LIB_CODEGEN_SPLITKIT_H

	#include "LiveRangeCalc.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/IntervalMap.h"
	#include "llvm/ADT/PointerIntPair.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/CodeGen/LiveInterval.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/SlotIndexes.h"
	#include "llvm/MC/LaneBitmask.h"
	#include "llvm/Support/Compiler.h"
	#include <utility>

	namespace llvm {

	class LiveIntervals;
	class LiveRangeEdit;
	class MachineBlockFrequencyInfo;
	class MachineDominatorTree;
	class MachineLoopInfo;
	class MachineRegisterInfo;
	class TargetInstrInfo;
	class TargetRegisterInfo;
	class VirtRegMap;

	/// Determines the latest safe point in a block in which we can insert a split,
	/// spill or other instruction related with CurLI.
	class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis {
	private:
	  const LiveIntervals &LIS;

	  /// Per-block cache of the last legal insert point, indexed by block number.
	  /// The first entry of each pair is the first terminator, the second entry is
	  /// the last valid point to insert a split or spill for a variable that is
	  /// live into a landing pad successor.
	  SmallVector<std::pair<SlotIndex, SlotIndex>, 8> LastInsertPoint;

	  /// Out-of-line path of getLastInsertPoint() for blocks not covered by the
	  /// inlined simple case.
	  SlotIndex computeLastInsertPoint(const LiveInterval &CurLI,
	                                   const MachineBasicBlock &MBB);

	public:
	  InsertPointAnalysis(const LiveIntervals &lis, unsigned BBNum);

	  /// Return the base index of the last valid insert point for \pCurLI in \pMBB.
	  SlotIndex getLastInsertPoint(const LiveInterval &CurLI,
	                               const MachineBasicBlock &MBB) {
	    unsigned Num = MBB.getNumber();
	    const std::pair<SlotIndex, SlotIndex> &Cached = LastInsertPoint[Num];
	    // Common simple case, handled inline: the first entry is already known
	    // and no second (landing pad) entry is recorded.
	    const bool SimpleCase = Cached.first.isValid() && !Cached.second.isValid();
	    if (SimpleCase)
	      return Cached.first;
	    return computeLastInsertPoint(CurLI, MBB);
	  }

	  /// Returns the last insert point as an iterator for \pCurLI in \pMBB.
	  MachineBasicBlock::iterator getLastInsertPointIter(const LiveInterval &CurLI,
	                                                     MachineBasicBlock &MBB);
	};

	/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
	/// opportunities.
	class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
	public:
	const MachineFunction &MF;
	const VirtRegMap &VRM;
	const LiveIntervals &LIS;
	const MachineLoopInfo &Loops;
	const TargetInstrInfo &TII;

	/// Additional information about basic blocks where the current variable is
	/// live. Such a block will look like one of these templates:
	///
	/// 1. \|   o---x   \| Internal to block. Variable is only live in this block.
	/// 2. \|---x       \| Live-in, kill.
	/// 3. \|       o---\| Def, live-out.
	/// 4. \|---x   o---\| Live-in, kill, def, live-out. Counted by NumGapBlocks.
	/// 5. \|---o---o---\| Live-through with uses or defs.
	/// 6. \|-----------\| Live-through without uses. Counted by NumThroughBlocks.
	///
	/// Two BlockInfo entries are created for template 4. One for the live-in
	/// segment, and one for the live-out segment. These entries look as if the
	/// block were split in the middle where the live range isn't live.
	///
	/// Live-through blocks without any uses don't get BlockInfo entries. They
	/// are simply listed in ThroughBlocks instead.
	///
	struct BlockInfo {
	MachineBasicBlock *MBB;
	SlotIndex FirstInstr; ///< First instr accessing current reg.
	SlotIndex LastInstr; ///< Last instr accessing current reg.
	SlotIndex FirstDef; ///< First non-phi valno->def, or SlotIndex().
	bool LiveIn; ///< Current reg is live in.
	bool LiveOut; ///< Current reg is live out.

	/// isOneInstr - Returns true when this BlockInfo describes a single
	/// instruction.
	bool isOneInstr() const {
	return SlotIndex::isSameInstr(FirstInstr, LastInstr);
	}
	};

	private:
	// Current live interval.
	const LiveInterval *CurLI = nullptr;

	/// Insert Point Analysis.
	InsertPointAnalysis IPA;

	// Sorted slot indexes of using instructions.
	SmallVector<SlotIndex, 8> UseSlots;

	/// UseBlocks - Blocks where CurLI has uses.
	SmallVector<BlockInfo, 8> UseBlocks;

	/// NumGapBlocks - Number of duplicate entries in UseBlocks for blocks where
	/// the live range has a gap.
	unsigned NumGapBlocks;

	/// ThroughBlocks - Block numbers where CurLI is live through without uses.
	BitVector ThroughBlocks;

	/// NumThroughBlocks - Number of live-through blocks.
	unsigned NumThroughBlocks;

	/// DidRepairRange - analyze was forced to shrinkToUses().
	bool DidRepairRange;

	// Summarize statistics by counting instructions using CurLI.
	void analyzeUses();

	/// calcLiveBlockInfo - Compute per-block information about CurLI.
	bool calcLiveBlockInfo();

	public:
	SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis,
	const MachineLoopInfo &mli);

	/// analyze - set CurLI to the specified interval, and analyze how it may be
	/// split.
	void analyze(const LiveInterval *li);

	/// didRepairRange() - Returns true if CurLI was invalid and has been repaired
	/// by analyze(). This really shouldn't happen, but sometimes the coalescer
	/// can create live ranges that end in mid-air.
	bool didRepairRange() const { return DidRepairRange; }

	/// clear - clear all data structures so SplitAnalysis is ready to analyze a
	/// new interval.
	void clear();

	/// getParent - Return the last analyzed interval.
	const LiveInterval &getParent() const { return *CurLI; }

	/// isOriginalEndpoint - Return true if the original live range was killed or
	/// (re-)defined at Idx. Idx should be the 'def' slot for a normal kill/def,
	/// and 'use' for an early-clobber def.
	/// This can be used to recognize code inserted by earlier live range
	/// splitting.
	bool isOriginalEndpoint(SlotIndex Idx) const;

	/// getUseSlots - Return an array of SlotIndexes of instructions using CurLI.
	/// This include both use and def operands, at most one entry per instruction.
	ArrayRef<SlotIndex> getUseSlots() const { return UseSlots; }

	/// getUseBlocks - Return an array of BlockInfo objects for the basic blocks
	/// where CurLI has uses.
	ArrayRef<BlockInfo> getUseBlocks() const { return UseBlocks; }

	/// getNumThroughBlocks - Return the number of through blocks.
	unsigned getNumThroughBlocks() const { return NumThroughBlocks; }

	/// isThroughBlock - Return true if CurLI is live through MBB without uses.
	bool isThroughBlock(unsigned MBB) const { return ThroughBlocks.test(MBB); }

	/// getThroughBlocks - Return the set of through blocks.
	const BitVector &getThroughBlocks() const { return ThroughBlocks; }

	/// getNumLiveBlocks - Return the number of blocks where CurLI is live.
	unsigned getNumLiveBlocks() const {
	return getUseBlocks().size() - NumGapBlocks + getNumThroughBlocks();
	}

	/// countLiveBlocks - Return the number of blocks where li is live. This is
	/// guaranteed to return the same number as getNumLiveBlocks() after calling
	/// analyze(li).
	unsigned countLiveBlocks(const LiveInterval *li) const;

	using BlockPtrSet = SmallPtrSet<const MachineBasicBlock *, 16>;

	/// shouldSplitSingleBlock - Returns true if it would help to create a local
	/// live range for the instructions in BI. There is normally no benefit to
	/// creating a live range for a single instruction, but it does enable
	/// register class inflation if the instruction has a restricted register
	/// class.
	///
	/// @param BI The block to be isolated.
	/// @param SingleInstrs True when single instructions should be isolated.
	bool shouldSplitSingleBlock(const BlockInfo &BI, bool SingleInstrs) const;

	/// getLastSplitPoint - Return the last insert point for CurLI in the basic
	/// block numbered Num, as reported by the insert point analysis. CurLI is
	/// dereferenced here, so analyze() must have set it to a non-null interval.
	SlotIndex getLastSplitPoint(unsigned Num) {
	// IPA takes references; CurLI and the numbered block are held as pointers.
	return IPA.getLastInsertPoint(*CurLI, *MF.getBlockNumbered(Num));
	}

	/// getLastSplitPointIter - Same query as getLastSplitPoint(), but for the
	/// block BB and returned as an iterator. Both CurLI and BB are dereferenced
	/// and must be non-null.
	MachineBasicBlock::iterator getLastSplitPointIter(MachineBasicBlock *BB) {
	return IPA.getLastInsertPointIter(*CurLI, *BB);
	}
	};

	/// SplitEditor - Edit machine code and LiveIntervals for live range
	/// splitting.
	///
	/// - Create a SplitEditor from a SplitAnalysis.
	/// - Start a new live interval with openIntv.
	/// - Mark the places where the new interval is entered using enterIntv*
	/// - Mark the ranges where the new interval is used with useIntv*
	/// - Mark the places where the interval is exited with exitIntv*.
	/// - Finish the current interval with closeIntv and repeat from the second
	/// step (openIntv).
	/// - Rewrite instructions with finish().
	///
	class LLVM_LIBRARY_VISIBILITY SplitEditor {
	SplitAnalysis &SA;
	AliasAnalysis &AA;
	LiveIntervals &LIS;
	VirtRegMap &VRM;
	MachineRegisterInfo &MRI;
	MachineDominatorTree &MDT;
	const TargetInstrInfo &TII;
	const TargetRegisterInfo &TRI;
	const MachineBlockFrequencyInfo &MBFI;

	public:
	/// ComplementSpillMode - Select how the complement live range should be
	/// created. SplitEditor automatically creates interval 0 to contain
	/// anything that isn't added to another interval. This complement interval
	/// can get quite complicated, and it can sometimes be an advantage to allow
	/// it to overlap the other intervals. If it is going to spill anyway, no
	/// registers are wasted by keeping a value in two places at the same time.
	enum ComplementSpillMode {
	/// SM_Partition(Default) - Try to create the complement interval so it
	/// doesn't overlap any other intervals, and the original interval is
	/// partitioned. This may require a large number of back copies and extra
	/// PHI-defs. Only segments marked with overlapIntv will be overlapping.
	SM_Partition,

	/// SM_Size - Overlap intervals to minimize the number of inserted COPY
	/// instructions. Copies to the complement interval are hoisted to their
	/// common dominator, so only one COPY is required per value in the
	/// complement interval. This also means that no extra PHI-defs need to be
	/// inserted in the complement interval.
	SM_Size,

	/// SM_Speed - Overlap intervals to minimize the expected execution
	/// frequency of the inserted copies. This is very similar to SM_Size, but
	/// the complement interval may get some extra PHI-defs.
	SM_Speed
	};

	private:
	/// Edit - The current parent register and new intervals created.
	LiveRangeEdit *Edit = nullptr;

	/// Index into Edit of the currently open interval.
	/// The index 0 is used for the complement, so the first interval started by
	/// openIntv will be 1.
	unsigned OpenIdx = 0;

	/// The current spill mode, selected by reset().
	ComplementSpillMode SpillMode = SM_Partition;

	using RegAssignMap = IntervalMap<SlotIndex, unsigned>;

	/// Allocator for the interval map. This will eventually be shared with
	/// SlotIndexes and LiveIntervals.
	RegAssignMap::Allocator Allocator;

	/// RegAssign - Map of the assigned register indexes.
	/// Edit.get(RegAssign.lookup(Idx)) is the register that should be live at
	/// Idx.
	RegAssignMap RegAssign;

	using ValueForcePair = PointerIntPair<VNInfo *, 1>;
	using ValueMap = DenseMap<std::pair<unsigned, unsigned>, ValueForcePair>;

	/// Values - keep track of the mapping from parent values to values in the new
	/// intervals. Given a pair (RegIdx, ParentVNI->id), Values contains:
	///
	/// 1. No entry - the value is not mapped to Edit.get(RegIdx).
	/// 2. (Null, false) - the value is mapped to multiple values in
	/// Edit.get(RegIdx). Each value is represented by a minimal live range at
	/// its def. The full live range can be inferred exactly from the range
	/// of RegIdx in RegAssign.
	/// 3. (Null, true). As above, but the ranges in RegAssign are too large, and
	/// the live range must be recomputed using LiveRangeCalc::extend().
	/// 4. (VNI, false) The value is mapped to a single new value.
	/// The new value has no live ranges anywhere.
	ValueMap Values;

	/// LRCalc - Cache for computing live ranges and SSA update. Each instance
	/// can only handle non-overlapping live ranges, so use a separate
	/// LiveRangeCalc instance for the complement interval when in spill mode.
	LiveRangeCalc LRCalc[2];

	/// getLRCalc - Return the LRCalc to use for RegIdx. In spill mode, the
	/// complement interval can overlap the other intervals, so it gets its own
	/// LRCalc instance. When not in spill mode, all intervals can share one.
	LiveRangeCalc &getLRCalc(unsigned RegIdx) {
	return LRCalc[SpillMode != SM_Partition && RegIdx != 0];
	}

	/// Find a subrange corresponding to the lane mask @p LM in the live
	/// interval @p LI. The interval @p LI is assumed to contain such a subrange.
	/// This function is used to find corresponding subranges between the
	/// original interval and the new intervals.
	LiveInterval::SubRange &getSubRangeForMask(LaneBitmask LM, LiveInterval &LI);

	/// Add a segment to the interval LI for the value number VNI. If LI has
	/// subranges, corresponding segments will be added to them as well, but
	/// with newly created value numbers. If Original is true, dead def will
	/// only be added a subrange of LI if the corresponding subrange of the
	/// original interval has a def at this index. Otherwise, all subranges
	/// of LI will be updated.
	void addDeadDef(LiveInterval &LI, VNInfo *VNI, bool Original);

	/// defValue - define a value in RegIdx from ParentVNI at Idx.
	/// Idx does not have to be ParentVNI->def, but it must be contained within
	/// ParentVNI's live range in ParentLI. The new value is added to the value
	/// map. The value being defined may either come from rematerialization
	/// (or an inserted copy), or it may be coming from the original interval.
	/// The parameter Original should be true in the latter case, otherwise
	/// it should be false.
	/// Return the new LI value.
	VNInfo *defValue(unsigned RegIdx, const VNInfo *ParentVNI, SlotIndex Idx,
	bool Original);

	/// forceRecompute - Force the live range of ParentVNI in RegIdx to be
	/// recomputed by LiveRangeCalc::extend regardless of the number of defs.
	/// This is used for values whose live range doesn't match RegAssign exactly.
	/// They could have rematerialized, or back-copies may have been moved.
	- void forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI);
	+ void forceRecompute(unsigned RegIdx, const VNInfo &ParentVNI);
	+
	+ /// Calls forceRecompute() on any affected regidx and on ParentVNI
	+ /// predecessors in case of a phi definition.
	+ void forceRecomputeVNI(const VNInfo &ParentVNI);

	/// defFromParent - Define Reg from ParentVNI at UseIdx using either
	/// rematerialization or a COPY from parent. Return the new value.
	VNInfo *defFromParent(unsigned RegIdx,
	VNInfo *ParentVNI,
	SlotIndex UseIdx,
	MachineBasicBlock &MBB,
	MachineBasicBlock::iterator I);

	/// removeBackCopies - Remove the copy instructions that defines the values
	/// in the vector in the complement interval.
	void removeBackCopies(SmallVectorImpl<VNInfo*> &Copies);

	/// findShallowDominator - Returns the least busy dominator of MBB that is
	/// also dominated by DefMBB. Busy is measured by loop depth.
	MachineBasicBlock *findShallowDominator(MachineBasicBlock *MBB,
	MachineBasicBlock *DefMBB);

	/// Find out all the backCopies dominated by others.
	void computeRedundantBackCopies(DenseSet<unsigned> &NotToHoistSet,
	SmallVectorImpl<VNInfo *> &BackCopies);

	/// Hoist back-copies to the complement interval. It tries to hoist all
	/// the back-copies to one BB if it is beneficial, or else simply remove
	/// redundant backcopies dominated by others.
	void hoistCopies();

	/// transferValues - Transfer values to the new ranges.
	/// Return true if any ranges were skipped.
	bool transferValues();

	/// Live range @p LR corresponding to the lane Mask @p LM has a live
	/// PHI def at the beginning of block @p B. Extend the range @p LR of
	/// all predecessor values that reach this def. If @p LR is a subrange,
	/// the array @p Undefs is the set of all locations where it is undefined
	/// via <def,read-undef> in other subranges for the same register.
	void extendPHIRange(MachineBasicBlock &B, LiveRangeCalc &LRC,
	LiveRange &LR, LaneBitmask LM,
	ArrayRef<SlotIndex> Undefs);

	/// extendPHIKillRanges - Extend the ranges of all values killed by original
	/// parent PHIDefs.
	void extendPHIKillRanges();

	/// rewriteAssigned - Rewrite all uses of Edit.getReg() to assigned registers.
	void rewriteAssigned(bool ExtendRanges);

	/// deleteRematVictims - Delete defs that are dead after rematerializing.
	void deleteRematVictims();

	/// Add a copy instruction copying \p FromReg to \p ToReg before
	/// \p InsertBefore. This can be invoked with a \p LaneMask which may make it
	/// necessary to construct a sequence of copies to cover it exactly.
	SlotIndex buildCopy(unsigned FromReg, unsigned ToReg, LaneBitmask LaneMask,
	MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
	bool Late, unsigned RegIdx);

	SlotIndex buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg,
	MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore,
	unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex PrevCopy);

	public:
	/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
	/// Newly created intervals will be appended to newIntervals.
	SplitEditor(SplitAnalysis &sa, AliasAnalysis &aa, LiveIntervals &lis,
	VirtRegMap &vrm, MachineDominatorTree &mdt,
	MachineBlockFrequencyInfo &mbfi);

	/// reset - Prepare for a new split.
	void reset(LiveRangeEdit&, ComplementSpillMode = SM_Partition);

	/// Create a new virtual register and live interval.
	/// Return the interval index, starting from 1. Interval index 0 is the
	/// implicit complement interval.
	unsigned openIntv();

	/// currentIntv - Return the current interval index.
	unsigned currentIntv() const { return OpenIdx; }

	/// selectIntv - Select a previously opened interval index.
	void selectIntv(unsigned Idx);

	/// enterIntvBefore - Enter the open interval before the instruction at Idx.
	/// If the parent interval is not live before Idx, a COPY is not inserted.
	/// Return the beginning of the new live range.
	SlotIndex enterIntvBefore(SlotIndex Idx);

	/// enterIntvAfter - Enter the open interval after the instruction at Idx.
	/// Return the beginning of the new live range.
	SlotIndex enterIntvAfter(SlotIndex Idx);

	/// enterIntvAtEnd - Enter the open interval at the end of MBB.
	/// Use the open interval from the inserted copy to the MBB end.
	/// Return the beginning of the new live range.
	SlotIndex enterIntvAtEnd(MachineBasicBlock &MBB);

	/// useIntv - indicate that all instructions in MBB should use OpenLI.
	void useIntv(const MachineBasicBlock &MBB);

	/// useIntv - indicate that all instructions in range should use OpenLI.
	void useIntv(SlotIndex Start, SlotIndex End);

	/// leaveIntvAfter - Leave the open interval after the instruction at Idx.
	/// Return the end of the live range.
	SlotIndex leaveIntvAfter(SlotIndex Idx);

	/// leaveIntvBefore - Leave the open interval before the instruction at Idx.
	/// Return the end of the live range.
	SlotIndex leaveIntvBefore(SlotIndex Idx);

	/// leaveIntvAtTop - Leave the interval at the top of MBB.
	/// Add liveness from the MBB top to the copy.
	/// Return the end of the live range.
	SlotIndex leaveIntvAtTop(MachineBasicBlock &MBB);

	/// overlapIntv - Indicate that all instructions in range should use the open
	/// interval, but also let the complement interval be live.
	///
	/// This doubles the register pressure, but is sometimes required to deal with
	/// register uses after the last valid split point.
	///
	/// The Start index should be a return value from a leaveIntv* call, and End
	/// should be in the same basic block. The parent interval must have the same
	/// value across the range.
	///
	void overlapIntv(SlotIndex Start, SlotIndex End);

	/// finish - after all the new live ranges have been created, compute the
	/// remaining live range, and rewrite instructions to use the new registers.
	/// @param LRMap When not null, this vector will map each live range in Edit
	/// back to the indices returned by openIntv.
	/// There may be extra indices created by dead code elimination.
	void finish(SmallVectorImpl<unsigned> *LRMap = nullptr);

	/// dump - print the current interval mapping to dbgs().
	void dump() const;

	// ===--- High level methods ---===

	/// splitSingleBlock - Split CurLI into a separate live interval around the
	/// uses in a single block. This is intended to be used as part of a larger
	/// split, and doesn't call finish().
	void splitSingleBlock(const SplitAnalysis::BlockInfo &BI);

	/// splitLiveThroughBlock - Split CurLI in the given block such that it
	/// enters the block in IntvIn and leaves it in IntvOut. There may be uses in
	/// the block, but they will be ignored when placing split points.
	///
	/// @param MBBNum Block number.
	/// @param IntvIn Interval index entering the block.
	/// @param LeaveBefore When set, leave IntvIn before this point.
	/// @param IntvOut Interval index leaving the block.
	/// @param EnterAfter When set, enter IntvOut after this point.
	void splitLiveThroughBlock(unsigned MBBNum,
	unsigned IntvIn, SlotIndex LeaveBefore,
	unsigned IntvOut, SlotIndex EnterAfter);

	/// splitRegInBlock - Split CurLI in the given block such that it enters the
	/// block in IntvIn and leaves it on the stack (or not at all). Split points
	/// are placed in a way that avoids putting uses in the stack interval. This
	/// may require creating a local interval when there is interference.
	///
	/// @param BI Block descriptor.
	/// @param IntvIn Interval index entering the block. Not 0.
	/// @param LeaveBefore When set, leave IntvIn before this point.
	void splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
	unsigned IntvIn, SlotIndex LeaveBefore);

	/// splitRegOutBlock - Split CurLI in the given block such that it enters the
	/// block on the stack (or isn't live-in at all) and leaves it in IntvOut.
	/// Split points are placed to avoid interference and such that the uses are
	/// not in the stack interval. This may require creating a local interval
	/// when there is interference.
	///
	/// @param BI Block descriptor.
	/// @param IntvOut Interval index leaving the block.
	/// @param EnterAfter When set, enter IntvOut after this point.
	void splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
	unsigned IntvOut, SlotIndex EnterAfter);
	};

	} // end namespace llvm

	#endif // LLVM_LIB_CODEGEN_SPLITKIT_H
	Index: head/contrib/llvm/lib/IR/AutoUpgrade.cpp
	===================================================================
	--- head/contrib/llvm/lib/IR/AutoUpgrade.cpp (revision 329409)
	+++ head/contrib/llvm/lib/IR/AutoUpgrade.cpp (revision 329410)
	@@ -1,2598 +1,2591 @@
	//===-- AutoUpgrade.cpp - Implement auto-upgrade helper functions ---------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the auto-upgrade helper functions.
	// This is where deprecated IR intrinsics and other IR features are updated to
	// current specifications.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/IR/AutoUpgrade.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DIBuilder.h"
	#include "llvm/IR/DebugInfo.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Verifier.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/Regex.h"
	#include <cstring>
	using namespace llvm;

	// Move GV out of the way by appending ".old" to its current name.
	static void rename(GlobalValue *GV) {
	GV->setName(GV->getName() + ".old");
	}

	// The SSE4.1 ptest intrinsics changed their argument type from v4f32 to
	// v2i64. When F still takes a v4f32 first argument, rename it aside, store
	// the declaration of IID in NewFn and return true; otherwise leave F and
	// NewFn untouched and return false.
	static bool UpgradePTESTIntrinsic(Function* F, Intrinsic::ID IID,
	Function *&NewFn) {
	// First parameter as currently declared on F.
	Type *DeclaredArgTy = F->getFunctionType()->getParamType(0);
	// A 4 x float vector there identifies the old version of the function.
	const bool IsOldVersion =
	DeclaredArgTy == VectorType::get(Type::getFloatTy(F->getContext()), 4);

	if (IsOldVersion) {
	// Replace the old declaration with the new version.
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
	}
	return IsOldVersion;
	}

	// Intrinsics with an 8-bit immediate mask changed that argument's type from
	// i32 to i8. When the last parameter of F is still an i32, rename F aside,
	// store the declaration of IID in NewFn and return true; otherwise return
	// false without touching F or NewFn.
	static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID,
	Function *&NewFn) {
	auto *FnTy = F->getFunctionType();
	// The mask is the trailing parameter.
	Type *MaskTy = FnTy->getParamType(FnTy->getNumParams() - 1);

	if (MaskTy->isIntegerTy(32)) {
	// Old declaration: move it aside and map down to the current one.
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
	return true;
	}
	return false;
	}

	// Return true when Name is one of the x86 intrinsic names, or starts with one
	// of the name prefixes, listed below. The caller in this file strips the
	// leading "x86." from Name before calling. F is only consulted for the
	// "xop.vpcom" prefix, which matches only when F takes exactly two arguments.
	static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
	// All of the intrinsics matches below should be marked with which llvm
	// version started autoupgrading them. At some point in the future we would
	// like to use this information to remove upgrade code for some older
	// intrinsics. It is currently undecided how we will determine that future
	// point.
	if (Name=="ssse3.pabs.b.128" \|\| // Added in 6.0
	Name=="ssse3.pabs.w.128" \|\| // Added in 6.0
	Name=="ssse3.pabs.d.128" \|\| // Added in 6.0
	Name.startswith("avx512.mask.shuf.i") \|\| // Added in 6.0
	Name.startswith("avx512.mask.shuf.f") \|\| // Added in 6.0
	- Name.startswith("avx512.kunpck") \|\| //added in 6.0
	Name.startswith("avx2.pabs.") \|\| // Added in 6.0
	Name.startswith("avx512.mask.pabs.") \|\| // Added in 6.0
	Name.startswith("avx512.broadcastm") \|\| // Added in 6.0
	Name.startswith("avx512.mask.pbroadcast") \|\| // Added in 6.0
	Name.startswith("sse2.pcmpeq.") \|\| // Added in 3.1
	Name.startswith("sse2.pcmpgt.") \|\| // Added in 3.1
	Name.startswith("avx2.pcmpeq.") \|\| // Added in 3.1
	Name.startswith("avx2.pcmpgt.") \|\| // Added in 3.1
	Name.startswith("avx512.mask.pcmpeq.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.pcmpgt.") \|\| // Added in 3.9
	Name.startswith("avx.vperm2f128.") \|\| // Added in 6.0
	Name == "avx2.vperm2i128" \|\| // Added in 6.0
	Name == "sse.add.ss" \|\| // Added in 4.0
	Name == "sse2.add.sd" \|\| // Added in 4.0
	Name == "sse.sub.ss" \|\| // Added in 4.0
	Name == "sse2.sub.sd" \|\| // Added in 4.0
	Name == "sse.mul.ss" \|\| // Added in 4.0
	Name == "sse2.mul.sd" \|\| // Added in 4.0
	Name == "sse.div.ss" \|\| // Added in 4.0
	Name == "sse2.div.sd" \|\| // Added in 4.0
	Name == "sse41.pmaxsb" \|\| // Added in 3.9
	Name == "sse2.pmaxs.w" \|\| // Added in 3.9
	Name == "sse41.pmaxsd" \|\| // Added in 3.9
	Name == "sse2.pmaxu.b" \|\| // Added in 3.9
	Name == "sse41.pmaxuw" \|\| // Added in 3.9
	Name == "sse41.pmaxud" \|\| // Added in 3.9
	Name == "sse41.pminsb" \|\| // Added in 3.9
	Name == "sse2.pmins.w" \|\| // Added in 3.9
	Name == "sse41.pminsd" \|\| // Added in 3.9
	Name == "sse2.pminu.b" \|\| // Added in 3.9
	Name == "sse41.pminuw" \|\| // Added in 3.9
	Name == "sse41.pminud" \|\| // Added in 3.9
	Name.startswith("avx512.mask.pshuf.b.") \|\| // Added in 4.0
	Name.startswith("avx2.pmax") \|\| // Added in 3.9
	Name.startswith("avx2.pmin") \|\| // Added in 3.9
	Name.startswith("avx512.mask.pmax") \|\| // Added in 4.0
	Name.startswith("avx512.mask.pmin") \|\| // Added in 4.0
	Name.startswith("avx2.vbroadcast") \|\| // Added in 3.8
	Name.startswith("avx2.pbroadcast") \|\| // Added in 3.8
	Name.startswith("avx.vpermil.") \|\| // Added in 3.1
	Name.startswith("sse2.pshuf") \|\| // Added in 3.9
	Name.startswith("avx512.pbroadcast") \|\| // Added in 3.9
	Name.startswith("avx512.mask.broadcast.s") \|\| // Added in 3.9
	Name.startswith("avx512.mask.movddup") \|\| // Added in 3.9
	Name.startswith("avx512.mask.movshdup") \|\| // Added in 3.9
	Name.startswith("avx512.mask.movsldup") \|\| // Added in 3.9
	Name.startswith("avx512.mask.pshuf.d.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.pshufl.w.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.pshufh.w.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.shuf.p") \|\| // Added in 4.0
	Name.startswith("avx512.mask.vpermil.p") \|\| // Added in 3.9
	Name.startswith("avx512.mask.perm.df.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.perm.di.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.punpckl") \|\| // Added in 3.9
	Name.startswith("avx512.mask.punpckh") \|\| // Added in 3.9
	Name.startswith("avx512.mask.unpckl.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.unpckh.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.pand.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.pandn.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.por.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.pxor.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.and.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.andn.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.or.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.xor.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.padd.") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psub.") \|\| // Added in 4.0
	Name.startswith("avx512.mask.pmull.") \|\| // Added in 4.0
	Name.startswith("avx512.mask.cvtdq2pd.") \|\| // Added in 4.0
	Name.startswith("avx512.mask.cvtudq2pd.") \|\| // Added in 4.0
	Name.startswith("avx512.mask.pmul.dq.") \|\| // Added in 4.0
	Name.startswith("avx512.mask.pmulu.dq.") \|\| // Added in 4.0
	Name.startswith("avx512.mask.packsswb.") \|\| // Added in 5.0
	Name.startswith("avx512.mask.packssdw.") \|\| // Added in 5.0
	Name.startswith("avx512.mask.packuswb.") \|\| // Added in 5.0
	Name.startswith("avx512.mask.packusdw.") \|\| // Added in 5.0
	Name.startswith("avx512.mask.cmp.b") \|\| // Added in 5.0
	Name.startswith("avx512.mask.cmp.d") \|\| // Added in 5.0
	Name.startswith("avx512.mask.cmp.q") \|\| // Added in 5.0
	Name.startswith("avx512.mask.cmp.w") \|\| // Added in 5.0
	Name.startswith("avx512.mask.ucmp.") \|\| // Added in 5.0
	Name == "avx512.mask.add.pd.128" \|\| // Added in 4.0
	Name == "avx512.mask.add.pd.256" \|\| // Added in 4.0
	Name == "avx512.mask.add.ps.128" \|\| // Added in 4.0
	Name == "avx512.mask.add.ps.256" \|\| // Added in 4.0
	Name == "avx512.mask.div.pd.128" \|\| // Added in 4.0
	Name == "avx512.mask.div.pd.256" \|\| // Added in 4.0
	Name == "avx512.mask.div.ps.128" \|\| // Added in 4.0
	Name == "avx512.mask.div.ps.256" \|\| // Added in 4.0
	Name == "avx512.mask.mul.pd.128" \|\| // Added in 4.0
	Name == "avx512.mask.mul.pd.256" \|\| // Added in 4.0
	Name == "avx512.mask.mul.ps.128" \|\| // Added in 4.0
	Name == "avx512.mask.mul.ps.256" \|\| // Added in 4.0
	Name == "avx512.mask.sub.pd.128" \|\| // Added in 4.0
	Name == "avx512.mask.sub.pd.256" \|\| // Added in 4.0
	Name == "avx512.mask.sub.ps.128" \|\| // Added in 4.0
	Name == "avx512.mask.sub.ps.256" \|\| // Added in 4.0
	Name == "avx512.mask.max.pd.128" \|\| // Added in 5.0
	Name == "avx512.mask.max.pd.256" \|\| // Added in 5.0
	Name == "avx512.mask.max.ps.128" \|\| // Added in 5.0
	Name == "avx512.mask.max.ps.256" \|\| // Added in 5.0
	Name == "avx512.mask.min.pd.128" \|\| // Added in 5.0
	Name == "avx512.mask.min.pd.256" \|\| // Added in 5.0
	Name == "avx512.mask.min.ps.128" \|\| // Added in 5.0
	Name == "avx512.mask.min.ps.256" \|\| // Added in 5.0
	Name.startswith("avx512.mask.vpermilvar.") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psll.d") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psll.q") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psll.w") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psra.d") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psra.q") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psra.w") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psrl.d") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psrl.q") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psrl.w") \|\| // Added in 4.0
	Name.startswith("avx512.mask.pslli") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psrai") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psrli") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psllv") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psrav") \|\| // Added in 4.0
	Name.startswith("avx512.mask.psrlv") \|\| // Added in 4.0
	Name.startswith("sse41.pmovsx") \|\| // Added in 3.8
	Name.startswith("sse41.pmovzx") \|\| // Added in 3.9
	Name.startswith("avx2.pmovsx") \|\| // Added in 3.9
	Name.startswith("avx2.pmovzx") \|\| // Added in 3.9
	Name.startswith("avx512.mask.pmovsx") \|\| // Added in 4.0
	Name.startswith("avx512.mask.pmovzx") \|\| // Added in 4.0
	Name.startswith("avx512.mask.lzcnt.") \|\| // Added in 5.0
	Name == "sse2.cvtdq2pd" \|\| // Added in 3.9
	Name == "sse2.cvtps2pd" \|\| // Added in 3.9
	Name == "avx.cvtdq2.pd.256" \|\| // Added in 3.9
	Name == "avx.cvt.ps2.pd.256" \|\| // Added in 3.9
	Name.startswith("avx.vinsertf128.") \|\| // Added in 3.7
	Name == "avx2.vinserti128" \|\| // Added in 3.7
	Name.startswith("avx512.mask.insert") \|\| // Added in 4.0
	Name.startswith("avx.vextractf128.") \|\| // Added in 3.7
	Name == "avx2.vextracti128" \|\| // Added in 3.7
	Name.startswith("avx512.mask.vextract") \|\| // Added in 4.0
	Name.startswith("sse4a.movnt.") \|\| // Added in 3.9
	Name.startswith("avx.movnt.") \|\| // Added in 3.2
	Name.startswith("avx512.storent.") \|\| // Added in 3.9
	Name == "sse41.movntdqa" \|\| // Added in 5.0
	Name == "avx2.movntdqa" \|\| // Added in 5.0
	Name == "avx512.movntdqa" \|\| // Added in 5.0
	Name == "sse2.storel.dq" \|\| // Added in 3.9
	Name.startswith("sse.storeu.") \|\| // Added in 3.9
	Name.startswith("sse2.storeu.") \|\| // Added in 3.9
	Name.startswith("avx.storeu.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.storeu.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.store.p") \|\| // Added in 3.9
	Name.startswith("avx512.mask.store.b.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.store.w.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.store.d.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.store.q.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.loadu.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.load.") \|\| // Added in 3.9
	Name == "sse42.crc32.64.8" \|\| // Added in 3.4
	Name.startswith("avx.vbroadcast.s") \|\| // Added in 3.5
	Name.startswith("avx512.mask.palignr.") \|\| // Added in 3.9
	Name.startswith("avx512.mask.valign.") \|\| // Added in 4.0
	Name.startswith("sse2.psll.dq") \|\| // Added in 3.7
	Name.startswith("sse2.psrl.dq") \|\| // Added in 3.7
	Name.startswith("avx2.psll.dq") \|\| // Added in 3.7
	Name.startswith("avx2.psrl.dq") \|\| // Added in 3.7
	Name.startswith("avx512.psll.dq") \|\| // Added in 3.9
	Name.startswith("avx512.psrl.dq") \|\| // Added in 3.9
	Name == "sse41.pblendw" \|\| // Added in 3.7
	Name.startswith("sse41.blendp") \|\| // Added in 3.7
	Name.startswith("avx.blend.p") \|\| // Added in 3.7
	Name == "avx2.pblendw" \|\| // Added in 3.7
	Name.startswith("avx2.pblendd.") \|\| // Added in 3.7
	Name.startswith("avx.vbroadcastf128") \|\| // Added in 4.0
	Name == "avx2.vbroadcasti128" \|\| // Added in 3.7
	Name.startswith("avx512.mask.broadcastf") \|\| // Added in 6.0
	Name.startswith("avx512.mask.broadcasti") \|\| // Added in 6.0
	Name == "xop.vpcmov" \|\| // Added in 3.8
	Name == "xop.vpcmov.256" \|\| // Added in 5.0
	Name.startswith("avx512.mask.move.s") \|\| // Added in 4.0
	Name.startswith("avx512.cvtmask2") \|\| // Added in 5.0
	(Name.startswith("xop.vpcom") && // Added in 3.2
	F->arg_size() == 2) \|\|
	Name.startswith("avx512.ptestm") \|\| // Added in 6.0
	Name.startswith("avx512.ptestnm") \|\| // Added in 6.0
	Name.startswith("sse2.pavg") \|\| // Added in 6.0
	Name.startswith("avx2.pavg") \|\| // Added in 6.0
	Name.startswith("avx512.mask.pavg")) // Added in 6.0
	return true;

	return false;
	}

	// Decide whether the x86 intrinsic declaration F (whose name, with the
	// leading "llvm." already removed, is Name) is an outdated form.
	//
	// Returns true when F needs upgrading. NewFn then holds either the
	// replacement declaration, or nullptr when ShouldUpgradeX86Intrinsic
	// recognised the name and no replacement declaration is produced here.
	// Returns false when no rule in this function matched.
	static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name,
	                                        Function *&NewFn) {
	  // Names outside the "x86." namespace are not handled here.
	  if (!Name.startswith("x86."))
	    return false;
	  Name = Name.substr(4); // Drop the "x86." prefix.

	  if (ShouldUpgradeX86Intrinsic(F, Name)) {
	    NewFn = nullptr;
	    return true;
	  }

	  // SSE4.1 ptest functions may have an old signature. Added in 3.2
	  if (Name.startswith("sse41.ptest")) {
	    StringRef Suffix = Name.substr(11); // Text following "sse41.ptest".
	    if (Suffix == "c")
	      return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestc, NewFn);
	    if (Suffix == "z")
	      return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestz, NewFn);
	    if (Suffix == "nzc")
	      return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestnzc, NewFn);
	  }

	  // Several blend and other instructions with masks used the wrong number of
	  // bits. All of these were added in 3.6.
	  Intrinsic::ID Mask8ID =
	      StringSwitch<Intrinsic::ID>(Name)
	          .Case("sse41.insertps", Intrinsic::x86_sse41_insertps)
	          .Case("sse41.dppd", Intrinsic::x86_sse41_dppd)
	          .Case("sse41.dpps", Intrinsic::x86_sse41_dpps)
	          .Case("sse41.mpsadbw", Intrinsic::x86_sse41_mpsadbw)
	          .Case("avx.dp.ps.256", Intrinsic::x86_avx_dp_ps_256)
	          .Case("avx2.mpsadbw", Intrinsic::x86_avx2_mpsadbw)
	          .Default(Intrinsic::not_intrinsic);
	  if (Mask8ID != Intrinsic::not_intrinsic)
	    return UpgradeX86IntrinsicsWith8BitMask(F, Mask8ID, NewFn);

	  // frcz.ss/sd may need to have an argument dropped. Added in 3.2
	  if (F->arg_size() == 2) {
	    if (Name.startswith("xop.vfrcz.ss")) {
	      rename(F);
	      NewFn = Intrinsic::getDeclaration(F->getParent(),
	                                        Intrinsic::x86_xop_vfrcz_ss);
	      return true;
	    }
	    if (Name.startswith("xop.vfrcz.sd")) {
	      rename(F);
	      NewFn = Intrinsic::getDeclaration(F->getParent(),
	                                        Intrinsic::x86_xop_vfrcz_sd);
	      return true;
	    }
	  }

	  // Upgrade any XOP PERMIL2 index operand still using a float/double vector.
	  // Added in 3.9
	  if (!Name.startswith("xop.vpermil2"))
	    return false;

	  Type *IdxTy = F->getFunctionType()->getParamType(2);
	  if (!IdxTy->isFPOrFPVectorTy())
	    return false;

	  rename(F);
	  unsigned TotalBits = IdxTy->getPrimitiveSizeInBits();
	  unsigned ScalarBits = IdxTy->getScalarSizeInBits();

	  // Choose the variant from the element width and the overall vector width;
	  // anything not matched explicitly falls back to the 256-bit ps form.
	  Intrinsic::ID ReplacementID;
	  if (ScalarBits == 64 && TotalBits == 128)
	    ReplacementID = Intrinsic::x86_xop_vpermil2pd;
	  else if (ScalarBits == 32 && TotalBits == 128)
	    ReplacementID = Intrinsic::x86_xop_vpermil2ps;
	  else if (ScalarBits == 64 && TotalBits == 256)
	    ReplacementID = Intrinsic::x86_xop_vpermil2pd_256;
	  else
	    ReplacementID = Intrinsic::x86_xop_vpermil2ps_256;

	  NewFn = Intrinsic::getDeclaration(F->getParent(), ReplacementID);
	  return true;
	}

	static bool UpgradeIntrinsicFunction1(Function F, Function &NewFn) {
	assert(F && "Illegal to upgrade a non-existent Function.");

	// Quickly eliminate it, if it's not a candidate.
	StringRef Name = F->getName();
	if (Name.size() <= 8 \|\| !Name.startswith("llvm."))
	return false;
	Name = Name.substr(5); // Strip off "llvm."

	switch (Name[0]) {
	default: break;
	case 'a': {
	if (Name.startswith("arm.rbit") \|\| Name.startswith("aarch64.rbit")) {
	NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::bitreverse,
	F->arg_begin()->getType());
	return true;
	}
	if (Name.startswith("arm.neon.vclz")) {
	Type* args[2] = {
	F->arg_begin()->getType(),
	Type::getInt1Ty(F->getContext())
	};
	// Can't use Intrinsic::getDeclaration here as it adds a ".i1" to
	// the end of the name. Change name from llvm.arm.neon.vclz.* to
	// llvm.ctlz.*
	FunctionType* fType = FunctionType::get(F->getReturnType(), args, false);
	NewFn = Function::Create(fType, F->getLinkage(),
	"llvm.ctlz." + Name.substr(14), F->getParent());
	return true;
	}
	if (Name.startswith("arm.neon.vcnt")) {
	NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
	F->arg_begin()->getType());
	return true;
	}
	Regex vldRegex("^arm\\.neon\\.vld([1234]\|[234]lane)\\.v[a-z0-9]*$");
	if (vldRegex.match(Name)) {
	auto fArgs = F->getFunctionType()->params();
	SmallVector<Type *, 4> Tys(fArgs.begin(), fArgs.end());
	// Can't use Intrinsic::getDeclaration here as the return types might
	// then only be structurally equal.
	FunctionType* fType = FunctionType::get(F->getReturnType(), Tys, false);
	NewFn = Function::Create(fType, F->getLinkage(),
	"llvm." + Name + ".p0i8", F->getParent());
	return true;
	}
	Regex vstRegex("^arm\\.neon\\.vst([1234]\|[234]lane)\\.v[a-z0-9]*$");
	if (vstRegex.match(Name)) {
	static const Intrinsic::ID StoreInts[] = {Intrinsic::arm_neon_vst1,
	Intrinsic::arm_neon_vst2,
	Intrinsic::arm_neon_vst3,
	Intrinsic::arm_neon_vst4};

	static const Intrinsic::ID StoreLaneInts[] = {
	Intrinsic::arm_neon_vst2lane, Intrinsic::arm_neon_vst3lane,
	Intrinsic::arm_neon_vst4lane
	};

	auto fArgs = F->getFunctionType()->params();
	Type *Tys[] = {fArgs[0], fArgs[1]};
	if (Name.find("lane") == StringRef::npos)
	NewFn = Intrinsic::getDeclaration(F->getParent(),
	StoreInts[fArgs.size() - 3], Tys);
	else
	NewFn = Intrinsic::getDeclaration(F->getParent(),
	StoreLaneInts[fArgs.size() - 5], Tys);
	return true;
	}
	if (Name == "aarch64.thread.pointer" \|\| Name == "arm.thread.pointer") {
	NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::thread_pointer);
	return true;
	}
	break;
	}

	case 'c': {
	if (Name.startswith("ctlz.") && F->arg_size() == 1) {
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
	F->arg_begin()->getType());
	return true;
	}
	if (Name.startswith("cttz.") && F->arg_size() == 1) {
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::cttz,
	F->arg_begin()->getType());
	return true;
	}
	break;
	}
	case 'd': {
	if (Name == "dbg.value" && F->arg_size() == 4) {
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::dbg_value);
	return true;
	}
	break;
	}
	case 'i':
	case 'l': {
	bool IsLifetimeStart = Name.startswith("lifetime.start");
	if (IsLifetimeStart \|\| Name.startswith("invariant.start")) {
	Intrinsic::ID ID = IsLifetimeStart ?
	Intrinsic::lifetime_start : Intrinsic::invariant_start;
	auto Args = F->getFunctionType()->params();
	Type* ObjectPtr[1] = {Args[1]};
	if (F->getName() != Intrinsic::getName(ID, ObjectPtr)) {
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(), ID, ObjectPtr);
	return true;
	}
	}

	bool IsLifetimeEnd = Name.startswith("lifetime.end");
	if (IsLifetimeEnd \|\| Name.startswith("invariant.end")) {
	Intrinsic::ID ID = IsLifetimeEnd ?
	Intrinsic::lifetime_end : Intrinsic::invariant_end;

	auto Args = F->getFunctionType()->params();
	Type* ObjectPtr[1] = {Args[IsLifetimeEnd ? 1 : 2]};
	if (F->getName() != Intrinsic::getName(ID, ObjectPtr)) {
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(), ID, ObjectPtr);
	return true;
	}
	}
	break;
	}
	case 'm': {
	if (Name.startswith("masked.load.")) {
	Type *Tys[] = { F->getReturnType(), F->arg_begin()->getType() };
	if (F->getName() != Intrinsic::getName(Intrinsic::masked_load, Tys)) {
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(),
	Intrinsic::masked_load,
	Tys);
	return true;
	}
	}
	if (Name.startswith("masked.store.")) {
	auto Args = F->getFunctionType()->params();
	Type *Tys[] = { Args[0], Args[1] };
	if (F->getName() != Intrinsic::getName(Intrinsic::masked_store, Tys)) {
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(),
	Intrinsic::masked_store,
	Tys);
	return true;
	}
	}
	// Renaming gather/scatter intrinsics with no address space overloading
	// to the new overload which includes an address space
	if (Name.startswith("masked.gather.")) {
	Type *Tys[] = {F->getReturnType(), F->arg_begin()->getType()};
	if (F->getName() != Intrinsic::getName(Intrinsic::masked_gather, Tys)) {
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(),
	Intrinsic::masked_gather, Tys);
	return true;
	}
	}
	if (Name.startswith("masked.scatter.")) {
	auto Args = F->getFunctionType()->params();
	Type *Tys[] = {Args[0], Args[1]};
	if (F->getName() != Intrinsic::getName(Intrinsic::masked_scatter, Tys)) {
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(),
	Intrinsic::masked_scatter, Tys);
	return true;
	}
	}
	break;
	}
	case 'n': {
	if (Name.startswith("nvvm.")) {
	Name = Name.substr(5);

	// The following nvvm intrinsics correspond exactly to an LLVM intrinsic.
	Intrinsic::ID IID = StringSwitch<Intrinsic::ID>(Name)
	.Cases("brev32", "brev64", Intrinsic::bitreverse)
	.Case("clz.i", Intrinsic::ctlz)
	.Case("popc.i", Intrinsic::ctpop)
	.Default(Intrinsic::not_intrinsic);
	if (IID != Intrinsic::not_intrinsic && F->arg_size() == 1) {
	NewFn = Intrinsic::getDeclaration(F->getParent(), IID,
	{F->getReturnType()});
	return true;
	}

	// The following nvvm intrinsics correspond exactly to an LLVM idiom, but
	// not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
	//
	// TODO: We could add lohi.i2d.
	bool Expand = StringSwitch<bool>(Name)
	.Cases("abs.i", "abs.ll", true)
	.Cases("clz.ll", "popc.ll", "h2f", true)
	.Cases("max.i", "max.ll", "max.ui", "max.ull", true)
	.Cases("min.i", "min.ll", "min.ui", "min.ull", true)
	.Default(false);
	if (Expand) {
	NewFn = nullptr;
	return true;
	}
	}
	break;
	}
	case 'o':
	// We only need to change the name to match the mangling including the
	// address space.
	if (Name.startswith("objectsize.")) {
	Type *Tys[2] = { F->getReturnType(), F->arg_begin()->getType() };
	if (F->arg_size() == 2 \|\|
	F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys)) {
	rename(F);
	NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::objectsize,
	Tys);
	return true;
	}
	}
	break;

	case 's':
	if (Name == "stackprotectorcheck") {
	NewFn = nullptr;
	return true;
	}
	break;

	case 'x':
	if (UpgradeX86IntrinsicFunction(F, Name, NewFn))
	return true;
	}
	// Remangle our intrinsic since we upgrade the mangling
	auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F);
	if (Result != None) {
	NewFn = Result.getValue();
	return true;
	}

	// This may not belong here. This function is effectively being overloaded
	// to both detect an intrinsic which needs upgrading, and to provide the
	// upgraded form of the intrinsic. We should perhaps have two separate
	// functions for this.
	return false;
	}

	bool llvm::UpgradeIntrinsicFunction(Function F, Function &NewFn) {
	NewFn = nullptr;
	bool Upgraded = UpgradeIntrinsicFunction1(F, NewFn);
	assert(F != NewFn && "Intrinsic function upgraded to the same function");

	// Upgrade intrinsic attributes. This does not change the function.
	if (NewFn)
	F = NewFn;
	if (Intrinsic::ID id = F->getIntrinsicID())
	F->setAttributes(Intrinsic::getAttributes(F->getContext(), id));
	return Upgraded;
	}

	/// Upgrade hook for global variables. No upgrade rules exist at present, so
	/// \p GV is left untouched and the function always reports false.
	bool llvm::UpgradeGlobalVariable(GlobalVariable *GV) {
	  // No global-variable upgrades are implemented.
	  return false;
	}

	// Upgrade the SSE2/AVX2/AVX512BW PSLLDQ intrinsics by rewriting them as a
	// byte shuffle between the operand and a zero vector.
	//
	// Op is the vector operand and Shift the byte count. The returned value has
	// the same type as Op.
	static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder,
	                                         Value *Op, unsigned Shift) {
	  Type *OrigTy = Op->getType();
	  unsigned NumBytes = OrigTy->getVectorNumElements() * 8;

	  // Reinterpret the 64-bit element vector as a vector of bytes.
	  Type *ByteVecTy = VectorType::get(Builder.getInt8Ty(), NumBytes);
	  Op = Builder.CreateBitCast(Op, ByteVecTy, "cast");

	  // Zeroes are what gets shuffled in; for a shift of 16 or more this zero
	  // vector is the final result.
	  Value *Shuffled = Constant::getNullValue(ByteVecTy);

	  if (Shift < 16) {
	    uint32_t Mask[64];
	    // 256/512-bit versions are processed as 2/4 separate 16-byte lanes.
	    for (unsigned Lane = 0; Lane != NumBytes; Lane += 16) {
	      for (unsigned Byte = 0; Byte != 16; ++Byte) {
	        unsigned Src = NumBytes + Byte - Shift;
	        // Indices below NumBytes select from the zero vector (first
	        // shuffle operand); fold them back into the current lane.
	        Src = Src < NumBytes ? Src - (NumBytes - 16) : Src;
	        Mask[Lane + Byte] = Src + Lane;
	      }
	    }

	    Shuffled = Builder.CreateShuffleVector(Shuffled, Op,
	                                           makeArrayRef(Mask, NumBytes));
	  }

	  // Return to the original 64-bit element type.
	  return Builder.CreateBitCast(Shuffled, OrigTy, "cast");
	}

	// Handles upgrading SSE2/AVX2/AVX512BW PSRLDQ intrinsics by converting them
	// to byte shuffles.
	//
	// Op is the vector operand and Shift the byte count. The returned value has
	// the same type as Op; for Shift >= 16 it is a bitcast of the zero vector.
	static Value *UpgradeX86PSRLDQIntrinsics(IRBuilder<> &Builder, Value *Op,
	                                         unsigned Shift) {
	  Type *ResultTy = Op->getType();
	  unsigned NumElts = ResultTy->getVectorNumElements() * 8;

	  // Bitcast from a 64-bit element type to a byte element type.
	  Type *VecTy = VectorType::get(Builder.getInt8Ty(), NumElts);
	  Op = Builder.CreateBitCast(Op, VecTy, "cast");

	  // We'll be shuffling in zeroes.
	  Value *Res = Constant::getNullValue(VecTy);

	  // If shift is less than 16, emit a shuffle to move the bytes. Otherwise,
	  // we'll just return the zero vector.
	  if (Shift < 16) {
	    uint32_t Idxs[64];
	    // 256/512-bit version is split into 2/4 16-byte lanes.
	    for (unsigned l = 0; l != NumElts; l += 16)
	      for (unsigned i = 0; i != 16; ++i) {
	        unsigned Idx = i + Shift;
	        if (Idx >= 16)
	          Idx += NumElts - 16; // end of lane, switch operand.
	        Idxs[l + i] = Idx + l;
	      }

	    // Op is the first shuffle operand here, the zero vector the second.
	    Res = Builder.CreateShuffleVector(Op, Res, makeArrayRef(Idxs, NumElts));
	  }

	  // Bitcast back to a 64-bit element type.
	  return Builder.CreateBitCast(Res, ResultTy, "cast");
	}

	// Convert an integer mask into a vector of i1 with NumElts elements.
	//
	// Mask must have an integer type (it is cast<IntegerType> unchecked); it is
	// bitcast to a vector with one i1 per mask bit. When NumElts < 8 the first
	// NumElts elements are extracted with a shuffle.
	static Value *getX86MaskVec(IRBuilder<> &Builder, Value *Mask,
	                            unsigned NumElts) {
	  llvm::VectorType *MaskTy = llvm::VectorType::get(Builder.getInt1Ty(),
	                           cast<IntegerType>(Mask->getType())->getBitWidth());
	  Mask = Builder.CreateBitCast(Mask, MaskTy);

	  // If we have less than 8 elements, then the starting mask was an i8 and
	  // we need to extract down to the right number of elements.
	  if (NumElts < 8) {
	    // Indices holds four entries, so this path requires NumElts <= 4.
	    uint32_t Indices[4];
	    for (unsigned i = 0; i != NumElts; ++i)
	      Indices[i] = i;
	    Mask = Builder.CreateShuffleVector(Mask, Mask,
	                                       makeArrayRef(Indices, NumElts),
	                                       "extract");
	  }

	  return Mask;
	}

	// Select between Op0 and Op1 under an integer mask.
	//
	// Returns Op0 unchanged when Mask is an all-ones constant. Otherwise Mask is
	// converted to a vector of i1 (one element per element of Op0) and a select
	// of Op0 / Op1 is emitted.
	static Value *EmitX86Select(IRBuilder<> &Builder, Value *Mask,
	                            Value *Op0, Value *Op1) {
	  // If the mask is all ones, every element comes from Op0, so no select is
	  // needed.
	  if (const auto *C = dyn_cast<Constant>(Mask))
	    if (C->isAllOnesValue())
	      return Op0;

	  Mask = getX86MaskVec(Builder, Mask, Op0->getType()->getVectorNumElements());
	  return Builder.CreateSelect(Mask, Op0, Op1);
	}

	// Handle autoupgrade for masked PALIGNR and VALIGND/Q intrinsics.
	// PALIGNR handles large immediates by shifting while VALIGN masks the immediate
	// so we need to handle both cases. VALIGN also doesn't have 128-bit lanes.
	//
	// Op0 and Op1 are the vector operands. Shift must be a ConstantInt (it is
	// cast<> unchecked). Passthru and Mask are forwarded to EmitX86Select to
	// blend the shuffled result. IsVALIGN selects the VALIGN behaviour.
	static Value *UpgradeX86ALIGNIntrinsics(IRBuilder<> &Builder, Value *Op0,
	                                        Value *Op1, Value *Shift,
	                                        Value *Passthru, Value *Mask,
	                                        bool IsVALIGN) {
	  unsigned ShiftVal = cast<llvm::ConstantInt>(Shift)->getZExtValue();

	  unsigned NumElts = Op0->getType()->getVectorNumElements();
	  assert((IsVALIGN || NumElts % 16 == 0) && "Illegal NumElts for PALIGNR!");
	  assert((!IsVALIGN || NumElts <= 16) && "NumElts too large for VALIGN!");
	  assert(isPowerOf2_32(NumElts) && "NumElts not a power of 2!");

	  // Mask the immediate for VALIGN.
	  if (IsVALIGN)
	    ShiftVal &= (NumElts - 1);

	  // If palignr is shifting the pair of vectors more than the size of two
	  // lanes, emit zero.
	  if (ShiftVal >= 32)
	    return llvm::Constant::getNullValue(Op0->getType());

	  // If palignr is shifting the pair of input vectors more than one lane,
	  // but less than two lanes, convert to shifting in zeroes.
	  if (ShiftVal > 16) {
	    ShiftVal -= 16;
	    Op1 = Op0;
	    Op0 = llvm::Constant::getNullValue(Op0->getType());
	  }

	  // The inner loop always fills 16 entries per step; only the first NumElts
	  // entries are handed to the shuffle below.
	  uint32_t Indices[64];
	  // 256-bit palignr operates on 128-bit lanes so we need to handle that
	  for (unsigned l = 0; l < NumElts; l += 16) {
	    for (unsigned i = 0; i != 16; ++i) {
	      unsigned Idx = ShiftVal + i;
	      if (!IsVALIGN && Idx >= 16) // Disable wrap for VALIGN.
	        Idx += NumElts - 16; // End of lane, switch operand.
	      Indices[l + i] = Idx + l;
	    }
	  }

	  Value *Align = Builder.CreateShuffleVector(Op1, Op0,
	                                             makeArrayRef(Indices, NumElts),
	                                             "palignr");

	  return EmitX86Select(Builder, Mask, Align, Passthru);
	}

	// Emit the replacement for a masked vector store intrinsic.
	//
	// Ptr is bitcast to a pointer to Data's type. Mask is an integer mask. When
	// Aligned is true the alignment is the byte size of Data's vector type,
	// otherwise 1. Returns the store instruction that was created: a plain
	// aligned store for an all-ones constant mask, a masked store otherwise.
	static Value *UpgradeMaskedStore(IRBuilder<> &Builder,
	                                 Value *Ptr, Value *Data, Value *Mask,
	                                 bool Aligned) {
	  // Cast the pointer to the right type.
	  Ptr = Builder.CreateBitCast(Ptr,
	                              llvm::PointerType::getUnqual(Data->getType()));
	  unsigned Align =
	    Aligned ? cast<VectorType>(Data->getType())->getBitWidth() / 8 : 1;

	  // If the mask is all ones just emit a regular store.
	  if (const auto *C = dyn_cast<Constant>(Mask))
	    if (C->isAllOnesValue())
	      return Builder.CreateAlignedStore(Data, Ptr, Align);

	  // Convert the mask from an integer type to a vector of i1.
	  unsigned NumElts = Data->getType()->getVectorNumElements();
	  Mask = getX86MaskVec(Builder, Mask, NumElts);
	  return Builder.CreateMaskedStore(Data, Ptr, Align, Mask);
	}

	// Emit the replacement for a masked vector load intrinsic.
	//
	// Ptr is bitcast to a pointer to Passthru's type, which is also the type
	// that gets loaded. Mask is an integer mask. When Aligned is true the
	// alignment is the byte size of Passthru's vector type, otherwise 1.
	// Returns a plain aligned load for an all-ones constant mask, and a masked
	// load using Passthru otherwise.
	static Value *UpgradeMaskedLoad(IRBuilder<> &Builder,
	                                Value *Ptr, Value *Passthru, Value *Mask,
	                                bool Aligned) {
	  // Cast the pointer to the right type.
	  Ptr = Builder.CreateBitCast(Ptr,
	                              llvm::PointerType::getUnqual(Passthru->getType()));
	  unsigned Align =
	    Aligned ? cast<VectorType>(Passthru->getType())->getBitWidth() / 8 : 1;

	  // If the mask is all ones just emit a regular load.
	  if (const auto *C = dyn_cast<Constant>(Mask))
	    if (C->isAllOnesValue())
	      return Builder.CreateAlignedLoad(Ptr, Align);

	  // Convert the mask from an integer type to a vector of i1.
	  unsigned NumElts = Passthru->getType()->getVectorNumElements();
	  Mask = getX86MaskVec(Builder, Mask, NumElts);
	  return Builder.CreateMaskedLoad(Ptr, Align, Mask, Passthru);
	}

	// Expand an absolute-value intrinsic call into compare / negate / select.
	//
	// Operand 0 of CI is the input. When CI has three operands, operand 2 is
	// the mask and operand 1 the pass-through value, and the result is blended
	// through EmitX86Select.
	static Value *upgradeAbs(IRBuilder<> &Builder, CallInst &CI) {
	  Value *Src = CI.getArgOperand(0);
	  Value *ZeroVal = llvm::Constant::getNullValue(Src->getType());

	  // Select Src where it is > 0 (signed), otherwise its negation.
	  Value *IsPositive = Builder.CreateICmp(ICmpInst::ICMP_SGT, Src, ZeroVal);
	  Value *Negated = Builder.CreateNeg(Src);
	  Value *Abs = Builder.CreateSelect(IsPositive, Src, Negated);

	  if (CI.getNumArgOperands() != 3)
	    return Abs;

	  return EmitX86Select(Builder, CI.getArgOperand(2), Abs,
	                       CI.getArgOperand(1));
	}

	// Expand an integer min/max intrinsic call into compare + select.
	//
	// Pred is the comparison under which operand 0 is chosen over operand 1.
	// When CI has four operands, operand 3 is the mask and operand 2 the
	// pass-through value, and the result is blended through EmitX86Select.
	static Value *upgradeIntMinMax(IRBuilder<> &Builder, CallInst &CI,
	                               ICmpInst::Predicate Pred) {
	  Value *Lhs = CI.getArgOperand(0);
	  Value *Rhs = CI.getArgOperand(1);
	  Value *PickLhs = Builder.CreateICmp(Pred, Lhs, Rhs);
	  Value *MinMax = Builder.CreateSelect(PickLhs, Lhs, Rhs);

	  if (CI.getNumArgOperands() != 4)
	    return MinMax;

	  return EmitX86Select(Builder, CI.getArgOperand(3), MinMax,
	                       CI.getArgOperand(2));
	}

	// Applying mask on vector of i1's and make sure result is at least 8 bits wide.
	//
	// Vec is a vector of NumElts i1 values and Mask an integer mask. Unless Mask
	// is an all-ones constant, Vec is ANDed with the mask converted to an i1
	// vector. Vectors with fewer than 8 elements are widened to 8 elements
	// using elements of a zero vector, and the result is bitcast to an integer
	// of max(NumElts, 8) bits.
	static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder, Value *Vec,
	                                     Value *Mask, unsigned NumElts) {
	  const auto *C = dyn_cast<Constant>(Mask);
	  if (!C || !C->isAllOnesValue())
	    Vec = Builder.CreateAnd(Vec, getX86MaskVec(Builder, Mask, NumElts));

	  if (NumElts < 8) {
	    uint32_t Indices[8];
	    for (unsigned i = 0; i != NumElts; ++i)
	      Indices[i] = i;
	    // Indices >= NumElts refer to the second shuffle operand, which is the
	    // zero vector below.
	    for (unsigned i = NumElts; i != 8; ++i)
	      Indices[i] = NumElts + i % NumElts;
	    Vec = Builder.CreateShuffleVector(Vec,
	                                      Constant::getNullValue(Vec->getType()),
	                                      Indices);
	  }
	  return Builder.CreateBitCast(Vec, Builder.getIntNTy(std::max(NumElts, 8U)));
	}

	// Expand a masked integer vector compare intrinsic call.
	//
	// CC is the condition code: 3 gives an all-false result, 7 an all-true
	// result, and 0/1/2/4/5/6 map to EQ/LT/LE/NE/GE/GT, using the signed or
	// unsigned predicate according to Signed. The last operand of CI is the
	// mask, applied through ApplyX86MaskOn1BitsVec.
	static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallInst &CI,
	                                   unsigned CC, bool Signed) {
	  Value *Lhs = CI.getArgOperand(0);
	  unsigned NumElts = Lhs->getType()->getVectorNumElements();

	  Value *Result;
	  if (CC == 3 || CC == 7) {
	    // Constant outcomes: no compare instruction is emitted.
	    Type *BoolVecTy = llvm::VectorType::get(Builder.getInt1Ty(), NumElts);
	    if (CC == 3)
	      Result = Constant::getNullValue(BoolVecTy);
	    else
	      Result = Constant::getAllOnesValue(BoolVecTy);
	  } else {
	    ICmpInst::Predicate Pred;
	    switch (CC) {
	    default:
	      llvm_unreachable("Unknown condition code");
	    case 0:
	      Pred = ICmpInst::ICMP_EQ;
	      break;
	    case 1:
	      Pred = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
	      break;
	    case 2:
	      Pred = Signed ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
	      break;
	    case 4:
	      Pred = ICmpInst::ICMP_NE;
	      break;
	    case 5:
	      Pred = Signed ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
	      break;
	    case 6:
	      Pred = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
	      break;
	    }
	    Result = Builder.CreateICmp(Pred, Lhs, CI.getArgOperand(1));
	  }

	  Value *MaskOp = CI.getArgOperand(CI.getNumArgOperands() - 1);
	  return ApplyX86MaskOn1BitsVec(Builder, Result, MaskOp, NumElts);
	}

	// Replace a masked intrinsic with an older unmasked intrinsic.
	//
	// The intrinsic IID is called with operands 0 and 1 of CI; the result is
	// then blended with operand 2 (pass-through) under operand 3 (mask).
	static Value *UpgradeX86MaskedShift(IRBuilder<> &Builder, CallInst &CI,
	                                    Intrinsic::ID IID) {
	  Module *M = CI.getCalledFunction()->getParent();
	  Function *Unmasked = Intrinsic::getDeclaration(M, IID);
	  Value *Shifted = Builder.CreateCall(
	      Unmasked, {CI.getArgOperand(0), CI.getArgOperand(1)});
	  return EmitX86Select(Builder, CI.getArgOperand(3), Shifted,
	                       CI.getArgOperand(2));
	}

	// Expand a masked scalar move intrinsic call.
	//
	// Bit 0 of the mask (operand 3) chooses between element 0 of operand 1 and
	// element 0 of operand 2; the chosen scalar is inserted into element 0 of
	// operand 0, and that vector is returned.
	static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallInst &CI) {
	  Value *Dst = CI.getArgOperand(0);
	  Value *OnSet = CI.getArgOperand(1);
	  Value *OnClear = CI.getArgOperand(2);
	  Value *MaskOp = CI.getArgOperand(3);

	  // Test only the lowest mask bit.
	  Value *LowBit = Builder.CreateAnd(MaskOp, APInt(8, 1));
	  Value *BitIsSet = Builder.CreateIsNotNull(LowBit);

	  Value *SetElt = Builder.CreateExtractElement(OnSet, (uint64_t)0);
	  Value *ClearElt = Builder.CreateExtractElement(OnClear, (uint64_t)0);
	  Value *Chosen = Builder.CreateSelect(BitIsSet, SetElt, ClearElt);

	  return Builder.CreateInsertElement(Dst, Chosen, (uint64_t)0);
	}


	// Expand a mask-to-vector intrinsic call: operand 0 (an integer mask) is
	// turned into a vector of i1 with one element per element of the call's
	// result type, then sign-extended to that result type.
	static Value* UpgradeMaskToInt(IRBuilder<> &Builder, CallInst &CI) {
	  Type *ResTy = CI.getType();
	  Value *MaskVec = getX86MaskVec(Builder, CI.getArgOperand(0),
	                                 ResTy->getVectorNumElements());
	  return Builder.CreateSExt(MaskVec, ResTy, "vpmovm2");
	}

	/// Upgrade a call to an old intrinsic. All argument and return casting must be
	/// provided to seamlessly integrate with existing context.
	void llvm::UpgradeIntrinsicCall(CallInst CI, Function NewFn) {
	Function *F = CI->getCalledFunction();
	LLVMContext &C = CI->getContext();
	IRBuilder<> Builder(C);
	Builder.SetInsertPoint(CI->getParent(), CI->getIterator());

	assert(F && "Intrinsic call is not direct?");

	if (!NewFn) {
	// Get the Function's name.
	StringRef Name = F->getName();

	assert(Name.startswith("llvm.") && "Intrinsic doesn't start with 'llvm.'");
	Name = Name.substr(5);

	bool IsX86 = Name.startswith("x86.");
	if (IsX86)
	Name = Name.substr(4);
	bool IsNVVM = Name.startswith("nvvm.");
	if (IsNVVM)
	Name = Name.substr(5);

	if (IsX86 && Name.startswith("sse4a.movnt.")) {
	Module *M = F->getParent();
	SmallVector<Metadata *, 1> Elts;
	Elts.push_back(
	ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
	MDNode *Node = MDNode::get(C, Elts);

	Value *Arg0 = CI->getArgOperand(0);
	Value *Arg1 = CI->getArgOperand(1);

	// Nontemporal (unaligned) store of the 0'th element of the float/double
	// vector.
	Type *SrcEltTy = cast<VectorType>(Arg1->getType())->getElementType();
	PointerType *EltPtrTy = PointerType::getUnqual(SrcEltTy);
	Value *Addr = Builder.CreateBitCast(Arg0, EltPtrTy, "cast");
	Value *Extract =
	Builder.CreateExtractElement(Arg1, (uint64_t)0, "extractelement");

	StoreInst *SI = Builder.CreateAlignedStore(Extract, Addr, 1);
	SI->setMetadata(M->getMDKindID("nontemporal"), Node);

	// Remove intrinsic.
	CI->eraseFromParent();
	return;
	}

	if (IsX86 && (Name.startswith("avx.movnt.") \|\|
	Name.startswith("avx512.storent."))) {
	Module *M = F->getParent();
	SmallVector<Metadata *, 1> Elts;
	Elts.push_back(
	ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
	MDNode *Node = MDNode::get(C, Elts);

	Value *Arg0 = CI->getArgOperand(0);
	Value *Arg1 = CI->getArgOperand(1);

	// Convert the type of the pointer to a pointer to the stored type.
	Value *BC = Builder.CreateBitCast(Arg0,
	PointerType::getUnqual(Arg1->getType()),
	"cast");
	VectorType *VTy = cast<VectorType>(Arg1->getType());
	StoreInst *SI = Builder.CreateAlignedStore(Arg1, BC,
	VTy->getBitWidth() / 8);
	SI->setMetadata(M->getMDKindID("nontemporal"), Node);

	// Remove intrinsic.
	CI->eraseFromParent();
	return;
	}

	if (IsX86 && Name == "sse2.storel.dq") {
	Value *Arg0 = CI->getArgOperand(0);
	Value *Arg1 = CI->getArgOperand(1);

	Type *NewVecTy = VectorType::get(Type::getInt64Ty(C), 2);
	Value *BC0 = Builder.CreateBitCast(Arg1, NewVecTy, "cast");
	Value *Elt = Builder.CreateExtractElement(BC0, (uint64_t)0);
	Value *BC = Builder.CreateBitCast(Arg0,
	PointerType::getUnqual(Elt->getType()),
	"cast");
	Builder.CreateAlignedStore(Elt, BC, 1);

	// Remove intrinsic.
	CI->eraseFromParent();
	return;
	}

	if (IsX86 && (Name.startswith("sse.storeu.") \|\|
	Name.startswith("sse2.storeu.") \|\|
	Name.startswith("avx.storeu."))) {
	Value *Arg0 = CI->getArgOperand(0);
	Value *Arg1 = CI->getArgOperand(1);

	Arg0 = Builder.CreateBitCast(Arg0,
	PointerType::getUnqual(Arg1->getType()),
	"cast");
	Builder.CreateAlignedStore(Arg1, Arg0, 1);

	// Remove intrinsic.
	CI->eraseFromParent();
	return;
	}

	if (IsX86 && (Name.startswith("avx512.mask.store"))) {
	// "avx512.mask.storeu." or "avx512.mask.store."
	bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu".
	UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
	CI->getArgOperand(2), Aligned);

	// Remove intrinsic.
	CI->eraseFromParent();
	return;
	}

	Value *Rep;
	// Upgrade packed integer vector compare intrinsics to compare instructions.
	if (IsX86 && (Name.startswith("sse2.pcmp") \|\|
	Name.startswith("avx2.pcmp"))) {
	// "sse2.pcpmpeq." "sse2.pcmpgt." "avx2.pcmpeq." or "avx2.pcmpgt."
	bool CmpEq = Name[9] == 'e';
	Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT,
	CI->getArgOperand(0), CI->getArgOperand(1));
	Rep = Builder.CreateSExt(Rep, CI->getType(), "");
	} else if (IsX86 && (Name.startswith("avx512.broadcastm"))) {
	Type *ExtTy = Type::getInt32Ty(C);
	if (CI->getOperand(0)->getType()->isIntegerTy(8))
	ExtTy = Type::getInt64Ty(C);
	unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() /
	ExtTy->getPrimitiveSizeInBits();
	Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy);
	Rep = Builder.CreateVectorSplat(NumElts, Rep);
	} else if (IsX86 && (Name.startswith("avx512.ptestm") \|\|
	Name.startswith("avx512.ptestnm"))) {
	Value *Op0 = CI->getArgOperand(0);
	Value *Op1 = CI->getArgOperand(1);
	Value *Mask = CI->getArgOperand(2);
	Rep = Builder.CreateAnd(Op0, Op1);
	llvm::Type *Ty = Op0->getType();
	Value *Zero = llvm::Constant::getNullValue(Ty);
	ICmpInst::Predicate Pred =
	Name.startswith("avx512.ptestm") ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
	Rep = Builder.CreateICmp(Pred, Rep, Zero);
	unsigned NumElts = Op0->getType()->getVectorNumElements();
	Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, Mask, NumElts);
	} else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))){
	unsigned NumElts =
	CI->getArgOperand(1)->getType()->getVectorNumElements();
	Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
	Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
	CI->getArgOperand(1));
	- } else if (IsX86 && (Name.startswith("avx512.kunpck"))) {
	- uint64_t Shift = CI->getType()->getScalarSizeInBits() / 2;
	- uint64_t And = (1ULL << Shift) - 1;
	- Value* LowBits = Builder.CreateAnd(CI->getArgOperand(0), And);
	- Value* HighBits = Builder.CreateShl(CI->getArgOperand(1), Shift);
	- Rep = Builder.CreateOr(LowBits, HighBits);
	} else if (IsX86 && (Name == "sse.add.ss" \|\| Name == "sse2.add.sd")) {
	Type *I32Ty = Type::getInt32Ty(C);
	Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
	ConstantInt::get(I32Ty, 0));
	Value *Elt1 = Builder.CreateExtractElement(CI->getArgOperand(1),
	ConstantInt::get(I32Ty, 0));
	Rep = Builder.CreateInsertElement(CI->getArgOperand(0),
	Builder.CreateFAdd(Elt0, Elt1),
	ConstantInt::get(I32Ty, 0));
	} else if (IsX86 && (Name == "sse.sub.ss" \|\| Name == "sse2.sub.sd")) {
	Type *I32Ty = Type::getInt32Ty(C);
	Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
	ConstantInt::get(I32Ty, 0));
	Value *Elt1 = Builder.CreateExtractElement(CI->getArgOperand(1),
	ConstantInt::get(I32Ty, 0));
	Rep = Builder.CreateInsertElement(CI->getArgOperand(0),
	Builder.CreateFSub(Elt0, Elt1),
	ConstantInt::get(I32Ty, 0));
	} else if (IsX86 && (Name == "sse.mul.ss" \|\| Name == "sse2.mul.sd")) {
	Type *I32Ty = Type::getInt32Ty(C);
	Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
	ConstantInt::get(I32Ty, 0));
	Value *Elt1 = Builder.CreateExtractElement(CI->getArgOperand(1),
	ConstantInt::get(I32Ty, 0));
	Rep = Builder.CreateInsertElement(CI->getArgOperand(0),
	Builder.CreateFMul(Elt0, Elt1),
	ConstantInt::get(I32Ty, 0));
	} else if (IsX86 && (Name == "sse.div.ss" \|\| Name == "sse2.div.sd")) {
	Type *I32Ty = Type::getInt32Ty(C);
	Value *Elt0 = Builder.CreateExtractElement(CI->getArgOperand(0),
	ConstantInt::get(I32Ty, 0));
	Value *Elt1 = Builder.CreateExtractElement(CI->getArgOperand(1),
	ConstantInt::get(I32Ty, 0));
	Rep = Builder.CreateInsertElement(CI->getArgOperand(0),
	Builder.CreateFDiv(Elt0, Elt1),
	ConstantInt::get(I32Ty, 0));
	} else if (IsX86 && Name.startswith("avx512.mask.pcmp")) {
	// "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt."
	bool CmpEq = Name[16] == 'e';
	Rep = upgradeMaskedCompare(Builder, *CI, CmpEq ? 0 : 6, true);
	} else if (IsX86 && Name.startswith("avx512.mask.cmp")) {
	unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
	Rep = upgradeMaskedCompare(Builder, *CI, Imm, true);
	} else if (IsX86 && Name.startswith("avx512.mask.ucmp")) {
	unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
	Rep = upgradeMaskedCompare(Builder, *CI, Imm, false);
	} else if(IsX86 && (Name == "ssse3.pabs.b.128" \|\|
	Name == "ssse3.pabs.w.128" \|\|
	Name == "ssse3.pabs.d.128" \|\|
	Name.startswith("avx2.pabs") \|\|
	Name.startswith("avx512.mask.pabs"))) {
	Rep = upgradeAbs(Builder, *CI);
	} else if (IsX86 && (Name == "sse41.pmaxsb" \|\|
	Name == "sse2.pmaxs.w" \|\|
	Name == "sse41.pmaxsd" \|\|
	Name.startswith("avx2.pmaxs") \|\|
	Name.startswith("avx512.mask.pmaxs"))) {
	Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_SGT);
	} else if (IsX86 && (Name == "sse2.pmaxu.b" \|\|
	Name == "sse41.pmaxuw" \|\|
	Name == "sse41.pmaxud" \|\|
	Name.startswith("avx2.pmaxu") \|\|
	Name.startswith("avx512.mask.pmaxu"))) {
	Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_UGT);
	} else if (IsX86 && (Name == "sse41.pminsb" \|\|
	Name == "sse2.pmins.w" \|\|
	Name == "sse41.pminsd" \|\|
	Name.startswith("avx2.pmins") \|\|
	Name.startswith("avx512.mask.pmins"))) {
	Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_SLT);
	} else if (IsX86 && (Name == "sse2.pminu.b" \|\|
	Name == "sse41.pminuw" \|\|
	Name == "sse41.pminud" \|\|
	Name.startswith("avx2.pminu") \|\|
	Name.startswith("avx512.mask.pminu"))) {
	Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_ULT);
	} else if (IsX86 && (Name == "sse2.cvtdq2pd" \|\|
	Name == "sse2.cvtps2pd" \|\|
	Name == "avx.cvtdq2.pd.256" \|\|
	Name == "avx.cvt.ps2.pd.256" \|\|
	Name.startswith("avx512.mask.cvtdq2pd.") \|\|
	Name.startswith("avx512.mask.cvtudq2pd."))) {
	// Lossless i32/float to double conversion.
	// Extract the bottom elements if necessary and convert to double vector.
	Value *Src = CI->getArgOperand(0);
	VectorType *SrcTy = cast<VectorType>(Src->getType());
	VectorType *DstTy = cast<VectorType>(CI->getType());
	Rep = CI->getArgOperand(0);

	unsigned NumDstElts = DstTy->getNumElements();
	if (NumDstElts < SrcTy->getNumElements()) {
	assert(NumDstElts == 2 && "Unexpected vector size");
	uint32_t ShuffleMask[2] = { 0, 1 };
	Rep = Builder.CreateShuffleVector(Rep, UndefValue::get(SrcTy),
	ShuffleMask);
	}

	bool SInt2Double = (StringRef::npos != Name.find("cvtdq2"));
	bool UInt2Double = (StringRef::npos != Name.find("cvtudq2"));
	if (SInt2Double)
	Rep = Builder.CreateSIToFP(Rep, DstTy, "cvtdq2pd");
	else if (UInt2Double)
	Rep = Builder.CreateUIToFP(Rep, DstTy, "cvtudq2pd");
	else
	Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");

	if (CI->getNumArgOperands() == 3)
	Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
	CI->getArgOperand(1));
	} else if (IsX86 && (Name.startswith("avx512.mask.loadu."))) {
	Rep = UpgradeMaskedLoad(Builder, CI->getArgOperand(0),
	CI->getArgOperand(1), CI->getArgOperand(2),
	/*Aligned*/false);
	} else if (IsX86 && (Name.startswith("avx512.mask.load."))) {
	Rep = UpgradeMaskedLoad(Builder, CI->getArgOperand(0),
	CI->getArgOperand(1),CI->getArgOperand(2),
	/*Aligned*/true);
	} else if (IsX86 && Name.startswith("xop.vpcom")) {
	Intrinsic::ID intID;
	if (Name.endswith("ub"))
	intID = Intrinsic::x86_xop_vpcomub;
	else if (Name.endswith("uw"))
	intID = Intrinsic::x86_xop_vpcomuw;
	else if (Name.endswith("ud"))
	intID = Intrinsic::x86_xop_vpcomud;
	else if (Name.endswith("uq"))
	intID = Intrinsic::x86_xop_vpcomuq;
	else if (Name.endswith("b"))
	intID = Intrinsic::x86_xop_vpcomb;
	else if (Name.endswith("w"))
	intID = Intrinsic::x86_xop_vpcomw;
	else if (Name.endswith("d"))
	intID = Intrinsic::x86_xop_vpcomd;
	else if (Name.endswith("q"))
	intID = Intrinsic::x86_xop_vpcomq;
	else
	llvm_unreachable("Unknown suffix");

	Name = Name.substr(9); // strip off "xop.vpcom"
	unsigned Imm;
	if (Name.startswith("lt"))
	Imm = 0;
	else if (Name.startswith("le"))
	Imm = 1;
	else if (Name.startswith("gt"))
	Imm = 2;
	else if (Name.startswith("ge"))
	Imm = 3;
	else if (Name.startswith("eq"))
	Imm = 4;
	else if (Name.startswith("ne"))
	Imm = 5;
	else if (Name.startswith("false"))
	Imm = 6;
	else if (Name.startswith("true"))
	Imm = 7;
	else
	llvm_unreachable("Unknown condition");

	Function *VPCOM = Intrinsic::getDeclaration(F->getParent(), intID);
	Rep =
	Builder.CreateCall(VPCOM, {CI->getArgOperand(0), CI->getArgOperand(1),
	Builder.getInt8(Imm)});
	} else if (IsX86 && Name.startswith("xop.vpcmov")) {
	Value *Sel = CI->getArgOperand(2);
	Value *NotSel = Builder.CreateNot(Sel);
	Value *Sel0 = Builder.CreateAnd(CI->getArgOperand(0), Sel);
	Value *Sel1 = Builder.CreateAnd(CI->getArgOperand(1), NotSel);
	Rep = Builder.CreateOr(Sel0, Sel1);
	} else if (IsX86 && Name == "sse42.crc32.64.8") {
	Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),
	Intrinsic::x86_sse42_crc32_32_8);
	Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C));
	Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)});
	Rep = Builder.CreateZExt(Rep, CI->getType(), "");
	} else if (IsX86 && Name.startswith("avx.vbroadcast.s")) {
	// Replace broadcasts with a series of insertelements.
	Type *VecTy = CI->getType();
	Type *EltTy = VecTy->getVectorElementType();
	unsigned EltNum = VecTy->getVectorNumElements();
	Value *Cast = Builder.CreateBitCast(CI->getArgOperand(0),
	EltTy->getPointerTo());
	Value *Load = Builder.CreateLoad(EltTy, Cast);
	Type *I32Ty = Type::getInt32Ty(C);
	Rep = UndefValue::get(VecTy);
	for (unsigned I = 0; I < EltNum; ++I)
	Rep = Builder.CreateInsertElement(Rep, Load,
	ConstantInt::get(I32Ty, I));
	} else if (IsX86 && (Name.startswith("sse41.pmovsx") \|\|
	Name.startswith("sse41.pmovzx") \|\|
	Name.startswith("avx2.pmovsx") \|\|
	Name.startswith("avx2.pmovzx") \|\|
	Name.startswith("avx512.mask.pmovsx") \|\|
	Name.startswith("avx512.mask.pmovzx"))) {
	VectorType *SrcTy = cast<VectorType>(CI->getArgOperand(0)->getType());
	VectorType *DstTy = cast<VectorType>(CI->getType());
	unsigned NumDstElts = DstTy->getNumElements();

	// Extract a subvector of the first NumDstElts lanes and sign/zero extend.
	SmallVector<uint32_t, 8> ShuffleMask(NumDstElts);
	for (unsigned i = 0; i != NumDstElts; ++i)
	ShuffleMask[i] = i;

	Value *SV = Builder.CreateShuffleVector(
	CI->getArgOperand(0), UndefValue::get(SrcTy), ShuffleMask);

	bool DoSext = (StringRef::npos != Name.find("pmovsx"));
	Rep = DoSext ? Builder.CreateSExt(SV, DstTy)
	: Builder.CreateZExt(SV, DstTy);
	// If there are 3 arguments, it's a masked intrinsic so we need a select.
	if (CI->getNumArgOperands() == 3)
	Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
	CI->getArgOperand(1));
	} else if (IsX86 && (Name.startswith("avx.vbroadcastf128") \|\|
	Name == "avx2.vbroadcasti128")) {
	// Replace vbroadcastf128/vbroadcasti128 with a vector load+shuffle.
	Type *EltTy = CI->getType()->getVectorElementType();
	unsigned NumSrcElts = 128 / EltTy->getPrimitiveSizeInBits();
	Type *VT = VectorType::get(EltTy, NumSrcElts);
	Value *Op = Builder.CreatePointerCast(CI->getArgOperand(0),
	PointerType::getUnqual(VT));
	Value *Load = Builder.CreateAlignedLoad(Op, 1);
	if (NumSrcElts == 2)
	Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
	{ 0, 1, 0, 1 });
	else
	Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
	{ 0, 1, 2, 3, 0, 1, 2, 3 });
	} else if (IsX86 && (Name.startswith("avx512.mask.shuf.i") \|\|
	Name.startswith("avx512.mask.shuf.f"))) {
	unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
	Type *VT = CI->getType();
	unsigned NumLanes = VT->getPrimitiveSizeInBits() / 128;
	unsigned NumElementsInLane = 128 / VT->getScalarSizeInBits();
	unsigned ControlBitsMask = NumLanes - 1;
	unsigned NumControlBits = NumLanes / 2;
	SmallVector<uint32_t, 8> ShuffleMask(0);

	for (unsigned l = 0; l != NumLanes; ++l) {
	unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
	// We actually need the other source.
	if (l >= NumLanes / 2)
	LaneMask += NumLanes;
	for (unsigned i = 0; i != NumElementsInLane; ++i)
	ShuffleMask.push_back(LaneMask * NumElementsInLane + i);
	}
	Rep = Builder.CreateShuffleVector(CI->getArgOperand(0),
	CI->getArgOperand(1), ShuffleMask);
	Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep,
	CI->getArgOperand(3));
	}else if (IsX86 && (Name.startswith("avx512.mask.broadcastf") \|\|
	Name.startswith("avx512.mask.broadcasti"))) {
	unsigned NumSrcElts =
	CI->getArgOperand(0)->getType()->getVectorNumElements();
	unsigned NumDstElts = CI->getType()->getVectorNumElements();

	SmallVector<uint32_t, 8> ShuffleMask(NumDstElts);
	for (unsigned i = 0; i != NumDstElts; ++i)
	ShuffleMask[i] = i % NumSrcElts;

	Rep = Builder.CreateShuffleVector(CI->getArgOperand(0),
	CI->getArgOperand(0),
	ShuffleMask);
	Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
	CI->getArgOperand(1));
	} else if (IsX86 && (Name.startswith("avx2.pbroadcast") \|\|
	Name.startswith("avx2.vbroadcast") \|\|
	Name.startswith("avx512.pbroadcast") \|\|
	Name.startswith("avx512.mask.broadcast.s"))) {
	// Replace vp?broadcasts with a vector shuffle.
	Value *Op = CI->getArgOperand(0);
	unsigned NumElts = CI->getType()->getVectorNumElements();
	Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts);
	Rep = Builder.CreateShuffleVector(Op, UndefValue::get(Op->getType()),
	Constant::getNullValue(MaskTy));

	if (CI->getNumArgOperands() == 3)
	Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
	CI->getArgOperand(1));
	} else if (IsX86 && Name.startswith("avx512.mask.palignr.")) {
	Rep = UpgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0),
	CI->getArgOperand(1),
	CI->getArgOperand(2),
	CI->getArgOperand(3),
	CI->getArgOperand(4),
	false);
	} else if (IsX86 && Name.startswith("avx512.mask.valign.")) {
	Rep = UpgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0),
	CI->getArgOperand(1),
	CI->getArgOperand(2),
	CI->getArgOperand(3),
	CI->getArgOperand(4),
	true);
	} else if (IsX86 && (Name == "sse2.psll.dq" \|\|
	Name == "avx2.psll.dq")) {
	// 128/256-bit shift left specified in bits.
	unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
	Rep = UpgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0),
	Shift / 8); // Shift is in bits.
	} else if (IsX86 && (Name == "sse2.psrl.dq" \|\|
	Name == "avx2.psrl.dq")) {
	// 128/256-bit shift right specified in bits.
	unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
	Rep = UpgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0),
	Shift / 8); // Shift is in bits.
	} else if (IsX86 && (Name == "sse2.psll.dq.bs" \|\|
	Name == "avx2.psll.dq.bs" \|\|
	Name == "avx512.psll.dq.512")) {
	// 128/256/512-bit shift left specified in bytes.
	unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
	Rep = UpgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0), Shift);
	} else if (IsX86 && (Name == "sse2.psrl.dq.bs" \|\|
	Name == "avx2.psrl.dq.bs" \|\|
	Name == "avx512.psrl.dq.512")) {
	// 128/256/512-bit shift right specified in bytes.
	unsigned Shift = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
	Rep = UpgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0), Shift);
	} else if (IsX86 && (Name == "sse41.pblendw" \|\|
	Name.startswith("sse41.blendp") \|\|
	Name.startswith("avx.blend.p") \|\|
	Name == "avx2.pblendw" \|\|
	Name.startswith("avx2.pblendd."))) {
	Value *Op0 = CI->getArgOperand(0);
	Value *Op1 = CI->getArgOperand(1);
	unsigned Imm = cast <ConstantInt>(CI->getArgOperand(2))->getZExtValue();
	VectorType *VecTy = cast<VectorType>(CI->getType());
	unsigned NumElts = VecTy->getNumElements();

	SmallVector<uint32_t, 16> Idxs(NumElts);
	for (unsigned i = 0; i != NumElts; ++i)
	Idxs[i] = ((Imm >> (i%8)) & 1) ? i + NumElts : i;

	Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs);
	} else if (IsX86 && (Name.startswith("avx.vinsertf128.") \|\|
	Name == "avx2.vinserti128" \|\|
	Name.startswith("avx512.mask.insert"))) {
	Value *Op0 = CI->getArgOperand(0);
	Value *Op1 = CI->getArgOperand(1);
	unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
	unsigned DstNumElts = CI->getType()->getVectorNumElements();
	unsigned SrcNumElts = Op1->getType()->getVectorNumElements();
	unsigned Scale = DstNumElts / SrcNumElts;

	// Mask off the high bits of the immediate value; hardware ignores those.
	Imm = Imm % Scale;

	// Extend the second operand into a vector the size of the destination.
	Value *UndefV = UndefValue::get(Op1->getType());
	SmallVector<uint32_t, 8> Idxs(DstNumElts);
	for (unsigned i = 0; i != SrcNumElts; ++i)
	Idxs[i] = i;
	for (unsigned i = SrcNumElts; i != DstNumElts; ++i)
	Idxs[i] = SrcNumElts;
	Rep = Builder.CreateShuffleVector(Op1, UndefV, Idxs);

	// Insert the second operand into the first operand.

	// Note that there is no guarantee that instruction lowering will actually
	// produce a vinsertf128 instruction for the created shuffles. In
	// particular, the 0 immediate case involves no lane changes, so it can
	// be handled as a blend.

	// Example of shuffle mask for 32-bit elements:
	// Imm = 1 <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
	// Imm = 0 <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7 >

	// First fill with identity mask.
	for (unsigned i = 0; i != DstNumElts; ++i)
	Idxs[i] = i;
	// Then replace the elements where we need to insert.
	for (unsigned i = 0; i != SrcNumElts; ++i)
	Idxs[i + Imm * SrcNumElts] = i + DstNumElts;
	Rep = Builder.CreateShuffleVector(Op0, Rep, Idxs);

	// If the intrinsic has a mask operand, handle that.
	if (CI->getNumArgOperands() == 5)
	Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep,
	CI->getArgOperand(3));
	} else if (IsX86 && (Name.startswith("avx.vextractf128.") \|\|
	Name == "avx2.vextracti128" \|\|
	Name.startswith("avx512.mask.vextract"))) {
	Value *Op0 = CI->getArgOperand(0);
	unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
	unsigned DstNumElts = CI->getType()->getVectorNumElements();
	unsigned SrcNumElts = Op0->getType()->getVectorNumElements();
	unsigned Scale = SrcNumElts / DstNumElts;

	// Mask off the high bits of the immediate value; hardware ignores those.
	Imm = Imm % Scale;

	// Get indexes for the subvector of the input vector.
	SmallVector<uint32_t, 8> Idxs(DstNumElts);
	for (unsigned i = 0; i != DstNumElts; ++i) {
	Idxs[i] = i + (Imm * DstNumElts);
	}
	Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);

	// If the intrinsic has a mask operand, handle that.
	if (CI->getNumArgOperands() == 4)
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (!IsX86 && Name == "stackprotectorcheck") {
	Rep = nullptr;
	} else if (IsX86 && (Name.startswith("avx512.mask.perm.df.") \|\|
	Name.startswith("avx512.mask.perm.di."))) {
	Value *Op0 = CI->getArgOperand(0);
	unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
	VectorType *VecTy = cast<VectorType>(CI->getType());
	unsigned NumElts = VecTy->getNumElements();

	SmallVector<uint32_t, 8> Idxs(NumElts);
	for (unsigned i = 0; i != NumElts; ++i)
	Idxs[i] = (i & ~0x3) + ((Imm >> (2 * (i & 0x3))) & 3);

	Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);

	if (CI->getNumArgOperands() == 4)
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && (Name.startswith("avx.vperm2f128.") \|\|
	Name == "avx2.vperm2i128")) {
	// The immediate permute control byte looks like this:
	// [1:0] - select 128 bits from sources for low half of destination
	// [2] - ignore
	// [3] - zero low half of destination
	// [5:4] - select 128 bits from sources for high half of destination
	// [6] - ignore
	// [7] - zero high half of destination

	uint8_t Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();

	unsigned NumElts = CI->getType()->getVectorNumElements();
	unsigned HalfSize = NumElts / 2;
	SmallVector<uint32_t, 8> ShuffleMask(NumElts);

	// Determine which operand(s) are actually in use for this instruction.
	Value *V0 = (Imm & 0x02) ? CI->getArgOperand(1) : CI->getArgOperand(0);
	Value *V1 = (Imm & 0x20) ? CI->getArgOperand(1) : CI->getArgOperand(0);

	// If needed, replace operands based on zero mask.
	V0 = (Imm & 0x08) ? ConstantAggregateZero::get(CI->getType()) : V0;
	V1 = (Imm & 0x80) ? ConstantAggregateZero::get(CI->getType()) : V1;

	// Permute low half of result.
	unsigned StartIndex = (Imm & 0x01) ? HalfSize : 0;
	for (unsigned i = 0; i < HalfSize; ++i)
	ShuffleMask[i] = StartIndex + i;

	// Permute high half of result.
	StartIndex = (Imm & 0x10) ? HalfSize : 0;
	for (unsigned i = 0; i < HalfSize; ++i)
	ShuffleMask[i + HalfSize] = NumElts + StartIndex + i;

	Rep = Builder.CreateShuffleVector(V0, V1, ShuffleMask);

	} else if (IsX86 && (Name.startswith("avx.vpermil.") \|\|
	Name == "sse2.pshuf.d" \|\|
	Name.startswith("avx512.mask.vpermil.p") \|\|
	Name.startswith("avx512.mask.pshuf.d."))) {
	Value *Op0 = CI->getArgOperand(0);
	unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
	VectorType *VecTy = cast<VectorType>(CI->getType());
	unsigned NumElts = VecTy->getNumElements();
	// Calculate the size of each index in the immediate.
	unsigned IdxSize = 64 / VecTy->getScalarSizeInBits();
	unsigned IdxMask = ((1 << IdxSize) - 1);

	SmallVector<uint32_t, 8> Idxs(NumElts);
	// Lookup the bits for this element, wrapping around the immediate every
	// 8-bits. Elements are grouped into sets of 2 or 4 elements so we need
	// to offset by the first index of each group.
	for (unsigned i = 0; i != NumElts; ++i)
	Idxs[i] = ((Imm >> ((i * IdxSize) % 8)) & IdxMask) \| (i & ~IdxMask);

	Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);

	if (CI->getNumArgOperands() == 4)
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && (Name == "sse2.pshufl.w" \|\|
	Name.startswith("avx512.mask.pshufl.w."))) {
	Value *Op0 = CI->getArgOperand(0);
	unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
	unsigned NumElts = CI->getType()->getVectorNumElements();

	SmallVector<uint32_t, 16> Idxs(NumElts);
	for (unsigned l = 0; l != NumElts; l += 8) {
	for (unsigned i = 0; i != 4; ++i)
	Idxs[i + l] = ((Imm >> (2 * i)) & 0x3) + l;
	for (unsigned i = 4; i != 8; ++i)
	Idxs[i + l] = i + l;
	}

	Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);

	if (CI->getNumArgOperands() == 4)
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && (Name == "sse2.pshufh.w" \|\|
	Name.startswith("avx512.mask.pshufh.w."))) {
	Value *Op0 = CI->getArgOperand(0);
	unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
	unsigned NumElts = CI->getType()->getVectorNumElements();

	SmallVector<uint32_t, 16> Idxs(NumElts);
	for (unsigned l = 0; l != NumElts; l += 8) {
	for (unsigned i = 0; i != 4; ++i)
	Idxs[i + l] = i + l;
	for (unsigned i = 0; i != 4; ++i)
	Idxs[i + l + 4] = ((Imm >> (2 * i)) & 0x3) + 4 + l;
	}

	Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);

	if (CI->getNumArgOperands() == 4)
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.shuf.p")) {
	Value *Op0 = CI->getArgOperand(0);
	Value *Op1 = CI->getArgOperand(1);
	unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
	unsigned NumElts = CI->getType()->getVectorNumElements();

	unsigned NumLaneElts = 128/CI->getType()->getScalarSizeInBits();
	unsigned HalfLaneElts = NumLaneElts / 2;

	SmallVector<uint32_t, 16> Idxs(NumElts);
	for (unsigned i = 0; i != NumElts; ++i) {
	// Base index is the starting element of the lane.
	Idxs[i] = i - (i % NumLaneElts);
	// If we are half way through the lane switch to the other source.
	if ((i % NumLaneElts) >= HalfLaneElts)
	Idxs[i] += NumElts;
	// Now select the specific element by adding HalfLaneElts bits from
	// the immediate, wrapping around the immediate every 8 bits.
	Idxs[i] += (Imm >> ((i * HalfLaneElts) % 8)) & ((1 << HalfLaneElts) - 1);
	}

	Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs);

	Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep,
	CI->getArgOperand(3));
	} else if (IsX86 && (Name.startswith("avx512.mask.movddup") \|\|
	Name.startswith("avx512.mask.movshdup") \|\|
	Name.startswith("avx512.mask.movsldup"))) {
	Value *Op0 = CI->getArgOperand(0);
	unsigned NumElts = CI->getType()->getVectorNumElements();
	unsigned NumLaneElts = 128/CI->getType()->getScalarSizeInBits();

	unsigned Offset = 0;
	if (Name.startswith("avx512.mask.movshdup."))
	Offset = 1;

	SmallVector<uint32_t, 16> Idxs(NumElts);
	for (unsigned l = 0; l != NumElts; l += NumLaneElts)
	for (unsigned i = 0; i != NumLaneElts; i += 2) {
	Idxs[i + l + 0] = i + l + Offset;
	Idxs[i + l + 1] = i + l + Offset;
	}

	Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs);

	Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
	CI->getArgOperand(1));
	} else if (IsX86 && (Name.startswith("avx512.mask.punpckl") \|\|
	Name.startswith("avx512.mask.unpckl."))) {
	Value *Op0 = CI->getArgOperand(0);
	Value *Op1 = CI->getArgOperand(1);
	int NumElts = CI->getType()->getVectorNumElements();
	int NumLaneElts = 128/CI->getType()->getScalarSizeInBits();

	SmallVector<uint32_t, 64> Idxs(NumElts);
	for (int l = 0; l != NumElts; l += NumLaneElts)
	for (int i = 0; i != NumLaneElts; ++i)
	Idxs[i + l] = l + (i / 2) + NumElts * (i % 2);

	Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs);

	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && (Name.startswith("avx512.mask.punpckh") \|\|
	Name.startswith("avx512.mask.unpckh."))) {
	Value *Op0 = CI->getArgOperand(0);
	Value *Op1 = CI->getArgOperand(1);
	int NumElts = CI->getType()->getVectorNumElements();
	int NumLaneElts = 128/CI->getType()->getScalarSizeInBits();

	SmallVector<uint32_t, 64> Idxs(NumElts);
	for (int l = 0; l != NumElts; l += NumLaneElts)
	for (int i = 0; i != NumLaneElts; ++i)
	Idxs[i + l] = (NumLaneElts / 2) + l + (i / 2) + NumElts * (i % 2);

	Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs);

	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.pand.")) {
	Rep = Builder.CreateAnd(CI->getArgOperand(0), CI->getArgOperand(1));
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.pandn.")) {
	Rep = Builder.CreateAnd(Builder.CreateNot(CI->getArgOperand(0)),
	CI->getArgOperand(1));
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.por.")) {
	Rep = Builder.CreateOr(CI->getArgOperand(0), CI->getArgOperand(1));
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.pxor.")) {
	Rep = Builder.CreateXor(CI->getArgOperand(0), CI->getArgOperand(1));
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.and.")) {
	VectorType *FTy = cast<VectorType>(CI->getType());
	VectorType *ITy = VectorType::getInteger(FTy);
	Rep = Builder.CreateAnd(Builder.CreateBitCast(CI->getArgOperand(0), ITy),
	Builder.CreateBitCast(CI->getArgOperand(1), ITy));
	Rep = Builder.CreateBitCast(Rep, FTy);
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.andn.")) {
	VectorType *FTy = cast<VectorType>(CI->getType());
	VectorType *ITy = VectorType::getInteger(FTy);
	Rep = Builder.CreateNot(Builder.CreateBitCast(CI->getArgOperand(0), ITy));
	Rep = Builder.CreateAnd(Rep,
	Builder.CreateBitCast(CI->getArgOperand(1), ITy));
	Rep = Builder.CreateBitCast(Rep, FTy);
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.or.")) {
	VectorType *FTy = cast<VectorType>(CI->getType());
	VectorType *ITy = VectorType::getInteger(FTy);
	Rep = Builder.CreateOr(Builder.CreateBitCast(CI->getArgOperand(0), ITy),
	Builder.CreateBitCast(CI->getArgOperand(1), ITy));
	Rep = Builder.CreateBitCast(Rep, FTy);
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.xor.")) {
	VectorType *FTy = cast<VectorType>(CI->getType());
	VectorType *ITy = VectorType::getInteger(FTy);
	Rep = Builder.CreateXor(Builder.CreateBitCast(CI->getArgOperand(0), ITy),
	Builder.CreateBitCast(CI->getArgOperand(1), ITy));
	Rep = Builder.CreateBitCast(Rep, FTy);
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.padd.")) {
	Rep = Builder.CreateAdd(CI->getArgOperand(0), CI->getArgOperand(1));
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.psub.")) {
	Rep = Builder.CreateSub(CI->getArgOperand(0), CI->getArgOperand(1));
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.pmull.")) {
	Rep = Builder.CreateMul(CI->getArgOperand(0), CI->getArgOperand(1));
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && (Name.startswith("avx512.mask.add.p"))) {
	Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1));
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.div.p")) {
	Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1));
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.mul.p")) {
	Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1));
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.sub.p")) {
	Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.lzcnt.")) {
	Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
	Intrinsic::ctlz,
	CI->getType()),
	{ CI->getArgOperand(0), Builder.getInt1(false) });
	Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
	CI->getArgOperand(1));
	} else if (IsX86 && (Name.startswith("avx512.mask.max.p") \|\|
	Name.startswith("avx512.mask.min.p"))) {
	bool IsMin = Name[13] == 'i';
	VectorType *VecTy = cast<VectorType>(CI->getType());
	unsigned VecWidth = VecTy->getPrimitiveSizeInBits();
	unsigned EltWidth = VecTy->getScalarSizeInBits();
	Intrinsic::ID IID;
	if (!IsMin && VecWidth == 128 && EltWidth == 32)
	IID = Intrinsic::x86_sse_max_ps;
	else if (!IsMin && VecWidth == 128 && EltWidth == 64)
	IID = Intrinsic::x86_sse2_max_pd;
	else if (!IsMin && VecWidth == 256 && EltWidth == 32)
	IID = Intrinsic::x86_avx_max_ps_256;
	else if (!IsMin && VecWidth == 256 && EltWidth == 64)
	IID = Intrinsic::x86_avx_max_pd_256;
	else if (IsMin && VecWidth == 128 && EltWidth == 32)
	IID = Intrinsic::x86_sse_min_ps;
	else if (IsMin && VecWidth == 128 && EltWidth == 64)
	IID = Intrinsic::x86_sse2_min_pd;
	else if (IsMin && VecWidth == 256 && EltWidth == 32)
	IID = Intrinsic::x86_avx_min_ps_256;
	else if (IsMin && VecWidth == 256 && EltWidth == 64)
	IID = Intrinsic::x86_avx_min_pd_256;
	else
	llvm_unreachable("Unexpected intrinsic");

	Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
	{ CI->getArgOperand(0), CI->getArgOperand(1) });
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.pshuf.b.")) {
	VectorType *VecTy = cast<VectorType>(CI->getType());
	Intrinsic::ID IID;
	if (VecTy->getPrimitiveSizeInBits() == 128)
	IID = Intrinsic::x86_ssse3_pshuf_b_128;
	else if (VecTy->getPrimitiveSizeInBits() == 256)
	IID = Intrinsic::x86_avx2_pshuf_b;
	else if (VecTy->getPrimitiveSizeInBits() == 512)
	IID = Intrinsic::x86_avx512_pshuf_b_512;
	else
	llvm_unreachable("Unexpected intrinsic");

	Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
	{ CI->getArgOperand(0), CI->getArgOperand(1) });
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && (Name.startswith("avx512.mask.pmul.dq.") \|\|
	Name.startswith("avx512.mask.pmulu.dq."))) {
	bool IsUnsigned = Name[16] == 'u';
	VectorType *VecTy = cast<VectorType>(CI->getType());
	Intrinsic::ID IID;
	if (!IsUnsigned && VecTy->getPrimitiveSizeInBits() == 128)
	IID = Intrinsic::x86_sse41_pmuldq;
	else if (!IsUnsigned && VecTy->getPrimitiveSizeInBits() == 256)
	IID = Intrinsic::x86_avx2_pmul_dq;
	else if (!IsUnsigned && VecTy->getPrimitiveSizeInBits() == 512)
	IID = Intrinsic::x86_avx512_pmul_dq_512;
	else if (IsUnsigned && VecTy->getPrimitiveSizeInBits() == 128)
	IID = Intrinsic::x86_sse2_pmulu_dq;
	else if (IsUnsigned && VecTy->getPrimitiveSizeInBits() == 256)
	IID = Intrinsic::x86_avx2_pmulu_dq;
	else if (IsUnsigned && VecTy->getPrimitiveSizeInBits() == 512)
	IID = Intrinsic::x86_avx512_pmulu_dq_512;
	else
	llvm_unreachable("Unexpected intrinsic");

	Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
	{ CI->getArgOperand(0), CI->getArgOperand(1) });
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.pack")) {
	bool IsUnsigned = Name[16] == 'u';
	bool IsDW = Name[18] == 'd';
	VectorType *VecTy = cast<VectorType>(CI->getType());
	Intrinsic::ID IID;
	if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 128)
	IID = Intrinsic::x86_sse2_packsswb_128;
	else if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 256)
	IID = Intrinsic::x86_avx2_packsswb;
	else if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 512)
	IID = Intrinsic::x86_avx512_packsswb_512;
	else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 128)
	IID = Intrinsic::x86_sse2_packssdw_128;
	else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 256)
	IID = Intrinsic::x86_avx2_packssdw;
	else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 512)
	IID = Intrinsic::x86_avx512_packssdw_512;
	else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 128)
	IID = Intrinsic::x86_sse2_packuswb_128;
	else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 256)
	IID = Intrinsic::x86_avx2_packuswb;
	else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 512)
	IID = Intrinsic::x86_avx512_packuswb_512;
	else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 128)
	IID = Intrinsic::x86_sse41_packusdw;
	else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 256)
	IID = Intrinsic::x86_avx2_packusdw;
	else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 512)
	IID = Intrinsic::x86_avx512_packusdw_512;
	else
	llvm_unreachable("Unexpected intrinsic");

	Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
	{ CI->getArgOperand(0), CI->getArgOperand(1) });
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.startswith("avx512.mask.psll")) {
	bool IsImmediate = Name[16] == 'i' \|\|
	(Name.size() > 18 && Name[18] == 'i');
	bool IsVariable = Name[16] == 'v';
	char Size = Name[16] == '.' ? Name[17] :
	Name[17] == '.' ? Name[18] :
	Name[18] == '.' ? Name[19] :
	Name[20];

	Intrinsic::ID IID;
	if (IsVariable && Name[17] != '.') {
	if (Size == 'd' && Name[17] == '2') // avx512.mask.psllv2.di
	IID = Intrinsic::x86_avx2_psllv_q;
	else if (Size == 'd' && Name[17] == '4') // avx512.mask.psllv4.di
	IID = Intrinsic::x86_avx2_psllv_q_256;
	else if (Size == 's' && Name[17] == '4') // avx512.mask.psllv4.si
	IID = Intrinsic::x86_avx2_psllv_d;
	else if (Size == 's' && Name[17] == '8') // avx512.mask.psllv8.si
	IID = Intrinsic::x86_avx2_psllv_d_256;
	else if (Size == 'h' && Name[17] == '8') // avx512.mask.psllv8.hi
	IID = Intrinsic::x86_avx512_psllv_w_128;
	else if (Size == 'h' && Name[17] == '1') // avx512.mask.psllv16.hi
	IID = Intrinsic::x86_avx512_psllv_w_256;
	else if (Name[17] == '3' && Name[18] == '2') // avx512.mask.psllv32hi
	IID = Intrinsic::x86_avx512_psllv_w_512;
	else
	llvm_unreachable("Unexpected size");
	} else if (Name.endswith(".128")) {
	if (Size == 'd') // avx512.mask.psll.d.128, avx512.mask.psll.di.128
	IID = IsImmediate ? Intrinsic::x86_sse2_pslli_d
	: Intrinsic::x86_sse2_psll_d;
	else if (Size == 'q') // avx512.mask.psll.q.128, avx512.mask.psll.qi.128
	IID = IsImmediate ? Intrinsic::x86_sse2_pslli_q
	: Intrinsic::x86_sse2_psll_q;
	else if (Size == 'w') // avx512.mask.psll.w.128, avx512.mask.psll.wi.128
	IID = IsImmediate ? Intrinsic::x86_sse2_pslli_w
	: Intrinsic::x86_sse2_psll_w;
	else
	llvm_unreachable("Unexpected size");
	} else if (Name.endswith(".256")) {
	if (Size == 'd') // avx512.mask.psll.d.256, avx512.mask.psll.di.256
	IID = IsImmediate ? Intrinsic::x86_avx2_pslli_d
	: Intrinsic::x86_avx2_psll_d;
	else if (Size == 'q') // avx512.mask.psll.q.256, avx512.mask.psll.qi.256
	IID = IsImmediate ? Intrinsic::x86_avx2_pslli_q
	: Intrinsic::x86_avx2_psll_q;
	else if (Size == 'w') // avx512.mask.psll.w.256, avx512.mask.psll.wi.256
	IID = IsImmediate ? Intrinsic::x86_avx2_pslli_w
	: Intrinsic::x86_avx2_psll_w;
	else
	llvm_unreachable("Unexpected size");
	} else {
	if (Size == 'd') // psll.di.512, pslli.d, psll.d, psllv.d.512
	IID = IsImmediate ? Intrinsic::x86_avx512_pslli_d_512 :
	IsVariable ? Intrinsic::x86_avx512_psllv_d_512 :
	Intrinsic::x86_avx512_psll_d_512;
	else if (Size == 'q') // psll.qi.512, pslli.q, psll.q, psllv.q.512
	IID = IsImmediate ? Intrinsic::x86_avx512_pslli_q_512 :
	IsVariable ? Intrinsic::x86_avx512_psllv_q_512 :
	Intrinsic::x86_avx512_psll_q_512;
	else if (Size == 'w') // psll.wi.512, pslli.w, psll.w
	IID = IsImmediate ? Intrinsic::x86_avx512_pslli_w_512
	: Intrinsic::x86_avx512_psll_w_512;
	else
	llvm_unreachable("Unexpected size");
	}

	Rep = UpgradeX86MaskedShift(Builder, *CI, IID);
	} else if (IsX86 && Name.startswith("avx512.mask.psrl")) {
	bool IsImmediate = Name[16] == 'i' \|\|
	(Name.size() > 18 && Name[18] == 'i');
	bool IsVariable = Name[16] == 'v';
	char Size = Name[16] == '.' ? Name[17] :
	Name[17] == '.' ? Name[18] :
	Name[18] == '.' ? Name[19] :
	Name[20];

	Intrinsic::ID IID;
	if (IsVariable && Name[17] != '.') {
	if (Size == 'd' && Name[17] == '2') // avx512.mask.psrlv2.di
	IID = Intrinsic::x86_avx2_psrlv_q;
	else if (Size == 'd' && Name[17] == '4') // avx512.mask.psrlv4.di
	IID = Intrinsic::x86_avx2_psrlv_q_256;
	else if (Size == 's' && Name[17] == '4') // avx512.mask.psrlv4.si
	IID = Intrinsic::x86_avx2_psrlv_d;
	else if (Size == 's' && Name[17] == '8') // avx512.mask.psrlv8.si
	IID = Intrinsic::x86_avx2_psrlv_d_256;
	else if (Size == 'h' && Name[17] == '8') // avx512.mask.psrlv8.hi
	IID = Intrinsic::x86_avx512_psrlv_w_128;
	else if (Size == 'h' && Name[17] == '1') // avx512.mask.psrlv16.hi
	IID = Intrinsic::x86_avx512_psrlv_w_256;
	else if (Name[17] == '3' && Name[18] == '2') // avx512.mask.psrlv32hi
	IID = Intrinsic::x86_avx512_psrlv_w_512;
	else
	llvm_unreachable("Unexpected size");
	} else if (Name.endswith(".128")) {
	if (Size == 'd') // avx512.mask.psrl.d.128, avx512.mask.psrl.di.128
	IID = IsImmediate ? Intrinsic::x86_sse2_psrli_d
	: Intrinsic::x86_sse2_psrl_d;
	else if (Size == 'q') // avx512.mask.psrl.q.128, avx512.mask.psrl.qi.128
	IID = IsImmediate ? Intrinsic::x86_sse2_psrli_q
	: Intrinsic::x86_sse2_psrl_q;
	else if (Size == 'w') // avx512.mask.psrl.w.128, avx512.mask.psrl.wi.128
	IID = IsImmediate ? Intrinsic::x86_sse2_psrli_w
	: Intrinsic::x86_sse2_psrl_w;
	else
	llvm_unreachable("Unexpected size");
	} else if (Name.endswith(".256")) {
	if (Size == 'd') // avx512.mask.psrl.d.256, avx512.mask.psrl.di.256
	IID = IsImmediate ? Intrinsic::x86_avx2_psrli_d
	: Intrinsic::x86_avx2_psrl_d;
	else if (Size == 'q') // avx512.mask.psrl.q.256, avx512.mask.psrl.qi.256
	IID = IsImmediate ? Intrinsic::x86_avx2_psrli_q
	: Intrinsic::x86_avx2_psrl_q;
	else if (Size == 'w') // avx512.mask.psrl.w.256, avx512.mask.psrl.wi.256
	IID = IsImmediate ? Intrinsic::x86_avx2_psrli_w
	: Intrinsic::x86_avx2_psrl_w;
	else
	llvm_unreachable("Unexpected size");
	} else {
	if (Size == 'd') // psrl.di.512, psrli.d, psrl.d, psrl.d.512
	IID = IsImmediate ? Intrinsic::x86_avx512_psrli_d_512 :
	IsVariable ? Intrinsic::x86_avx512_psrlv_d_512 :
	Intrinsic::x86_avx512_psrl_d_512;
	else if (Size == 'q') // psrl.qi.512, psrli.q, psrl.q, psrl.q.512
	IID = IsImmediate ? Intrinsic::x86_avx512_psrli_q_512 :
	IsVariable ? Intrinsic::x86_avx512_psrlv_q_512 :
	Intrinsic::x86_avx512_psrl_q_512;
	else if (Size == 'w') // psrl.wi.512, psrli.w, psrl.w)
	IID = IsImmediate ? Intrinsic::x86_avx512_psrli_w_512
	: Intrinsic::x86_avx512_psrl_w_512;
	else
	llvm_unreachable("Unexpected size");
	}

	Rep = UpgradeX86MaskedShift(Builder, *CI, IID);
	} else if (IsX86 && Name.startswith("avx512.mask.psra")) {
	bool IsImmediate = Name[16] == 'i' \|\|
	(Name.size() > 18 && Name[18] == 'i');
	bool IsVariable = Name[16] == 'v';
	char Size = Name[16] == '.' ? Name[17] :
	Name[17] == '.' ? Name[18] :
	Name[18] == '.' ? Name[19] :
	Name[20];

	Intrinsic::ID IID;
	if (IsVariable && Name[17] != '.') {
	if (Size == 's' && Name[17] == '4') // avx512.mask.psrav4.si
	IID = Intrinsic::x86_avx2_psrav_d;
	else if (Size == 's' && Name[17] == '8') // avx512.mask.psrav8.si
	IID = Intrinsic::x86_avx2_psrav_d_256;
	else if (Size == 'h' && Name[17] == '8') // avx512.mask.psrav8.hi
	IID = Intrinsic::x86_avx512_psrav_w_128;
	else if (Size == 'h' && Name[17] == '1') // avx512.mask.psrav16.hi
	IID = Intrinsic::x86_avx512_psrav_w_256;
	else if (Name[17] == '3' && Name[18] == '2') // avx512.mask.psrav32hi
	IID = Intrinsic::x86_avx512_psrav_w_512;
	else
	llvm_unreachable("Unexpected size");
	} else if (Name.endswith(".128")) {
	if (Size == 'd') // avx512.mask.psra.d.128, avx512.mask.psra.di.128
	IID = IsImmediate ? Intrinsic::x86_sse2_psrai_d
	: Intrinsic::x86_sse2_psra_d;
	else if (Size == 'q') // avx512.mask.psra.q.128, avx512.mask.psra.qi.128
	IID = IsImmediate ? Intrinsic::x86_avx512_psrai_q_128 :
	IsVariable ? Intrinsic::x86_avx512_psrav_q_128 :
	Intrinsic::x86_avx512_psra_q_128;
	else if (Size == 'w') // avx512.mask.psra.w.128, avx512.mask.psra.wi.128
	IID = IsImmediate ? Intrinsic::x86_sse2_psrai_w
	: Intrinsic::x86_sse2_psra_w;
	else
	llvm_unreachable("Unexpected size");
	} else if (Name.endswith(".256")) {
	if (Size == 'd') // avx512.mask.psra.d.256, avx512.mask.psra.di.256
	IID = IsImmediate ? Intrinsic::x86_avx2_psrai_d
	: Intrinsic::x86_avx2_psra_d;
	else if (Size == 'q') // avx512.mask.psra.q.256, avx512.mask.psra.qi.256
	IID = IsImmediate ? Intrinsic::x86_avx512_psrai_q_256 :
	IsVariable ? Intrinsic::x86_avx512_psrav_q_256 :
	Intrinsic::x86_avx512_psra_q_256;
	else if (Size == 'w') // avx512.mask.psra.w.256, avx512.mask.psra.wi.256
	IID = IsImmediate ? Intrinsic::x86_avx2_psrai_w
	: Intrinsic::x86_avx2_psra_w;
	else
	llvm_unreachable("Unexpected size");
	} else {
	if (Size == 'd') // psra.di.512, psrai.d, psra.d, psrav.d.512
	IID = IsImmediate ? Intrinsic::x86_avx512_psrai_d_512 :
	IsVariable ? Intrinsic::x86_avx512_psrav_d_512 :
	Intrinsic::x86_avx512_psra_d_512;
	else if (Size == 'q') // psra.qi.512, psrai.q, psra.q
	IID = IsImmediate ? Intrinsic::x86_avx512_psrai_q_512 :
	IsVariable ? Intrinsic::x86_avx512_psrav_q_512 :
	Intrinsic::x86_avx512_psra_q_512;
	else if (Size == 'w') // psra.wi.512, psrai.w, psra.w
	IID = IsImmediate ? Intrinsic::x86_avx512_psrai_w_512
	: Intrinsic::x86_avx512_psra_w_512;
	else
	llvm_unreachable("Unexpected size");
	}

	Rep = UpgradeX86MaskedShift(Builder, *CI, IID);
	} else if (IsX86 && Name.startswith("avx512.mask.move.s")) {
	Rep = upgradeMaskedMove(Builder, *CI);
	} else if (IsX86 && Name.startswith("avx512.cvtmask2")) {
	Rep = UpgradeMaskToInt(Builder, *CI);
	} else if (IsX86 && Name.startswith("avx512.mask.vpermilvar.")) {
	Intrinsic::ID IID;
	if (Name.endswith("ps.128"))
	IID = Intrinsic::x86_avx_vpermilvar_ps;
	else if (Name.endswith("pd.128"))
	IID = Intrinsic::x86_avx_vpermilvar_pd;
	else if (Name.endswith("ps.256"))
	IID = Intrinsic::x86_avx_vpermilvar_ps_256;
	else if (Name.endswith("pd.256"))
	IID = Intrinsic::x86_avx_vpermilvar_pd_256;
	else if (Name.endswith("ps.512"))
	IID = Intrinsic::x86_avx512_vpermilvar_ps_512;
	else if (Name.endswith("pd.512"))
	IID = Intrinsic::x86_avx512_vpermilvar_pd_512;
	else
	llvm_unreachable("Unexpected vpermilvar intrinsic");

	Function *Intrin = Intrinsic::getDeclaration(F->getParent(), IID);
	Rep = Builder.CreateCall(Intrin,
	{ CI->getArgOperand(0), CI->getArgOperand(1) });
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	} else if (IsX86 && Name.endswith(".movntdqa")) {
	Module *M = F->getParent();
	MDNode *Node = MDNode::get(
	C, ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));

	Value *Ptr = CI->getArgOperand(0);
	VectorType *VTy = cast<VectorType>(CI->getType());

	// Convert the type of the pointer to a pointer to the stored type.
	Value *BC =
	Builder.CreateBitCast(Ptr, PointerType::getUnqual(VTy), "cast");
	LoadInst *LI = Builder.CreateAlignedLoad(BC, VTy->getBitWidth() / 8);
	LI->setMetadata(M->getMDKindID("nontemporal"), Node);
	Rep = LI;
	} else if (IsX86 &&
	(Name.startswith("sse2.pavg") \|\| Name.startswith("avx2.pavg") \|\|
	Name.startswith("avx512.mask.pavg"))) {
	// llvm.x86.sse2.pavg.b/w, llvm.x86.avx2.pavg.b/w,
	// llvm.x86.avx512.mask.pavg.b/w
	Value *A = CI->getArgOperand(0);
	Value *B = CI->getArgOperand(1);
	VectorType *ZextType = VectorType::getExtendedElementVectorType(
	cast<VectorType>(A->getType()));
	Value *ExtendedA = Builder.CreateZExt(A, ZextType);
	Value *ExtendedB = Builder.CreateZExt(B, ZextType);
	Value *Sum = Builder.CreateAdd(ExtendedA, ExtendedB);
	Value *AddOne = Builder.CreateAdd(Sum, ConstantInt::get(ZextType, 1));
	Value *ShiftR = Builder.CreateLShr(AddOne, ConstantInt::get(ZextType, 1));
	Rep = Builder.CreateTrunc(ShiftR, A->getType());
	if (CI->getNumArgOperands() > 2) {
	Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
	CI->getArgOperand(2));
	}
	} else if (IsNVVM && (Name == "abs.i" \|\| Name == "abs.ll")) {
	Value *Arg = CI->getArgOperand(0);
	Value *Neg = Builder.CreateNeg(Arg, "neg");
	Value *Cmp = Builder.CreateICmpSGE(
	Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
	Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
	} else if (IsNVVM && (Name == "max.i" \|\| Name == "max.ll" \|\|
	Name == "max.ui" \|\| Name == "max.ull")) {
	Value *Arg0 = CI->getArgOperand(0);
	Value *Arg1 = CI->getArgOperand(1);
	Value *Cmp = Name.endswith(".ui") \|\| Name.endswith(".ull")
	? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond")
	: Builder.CreateICmpSGE(Arg0, Arg1, "max.cond");
	Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max");
	} else if (IsNVVM && (Name == "min.i" \|\| Name == "min.ll" \|\|
	Name == "min.ui" \|\| Name == "min.ull")) {
	Value *Arg0 = CI->getArgOperand(0);
	Value *Arg1 = CI->getArgOperand(1);
	Value *Cmp = Name.endswith(".ui") \|\| Name.endswith(".ull")
	? Builder.CreateICmpULE(Arg0, Arg1, "min.cond")
	: Builder.CreateICmpSLE(Arg0, Arg1, "min.cond");
	Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min");
	} else if (IsNVVM && Name == "clz.ll") {
	// llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
	Value *Arg = CI->getArgOperand(0);
	Value *Ctlz = Builder.CreateCall(
	Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
	{Arg->getType()}),
	{Arg, Builder.getFalse()}, "ctlz");
	Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
	} else if (IsNVVM && Name == "popc.ll") {
	// llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an
	// i64.
	Value *Arg = CI->getArgOperand(0);
	Value *Popc = Builder.CreateCall(
	Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
	{Arg->getType()}),
	Arg, "ctpop");
	Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
	} else if (IsNVVM && Name == "h2f") {
	Rep = Builder.CreateCall(Intrinsic::getDeclaration(
	F->getParent(), Intrinsic::convert_from_fp16,
	{Builder.getFloatTy()}),
	CI->getArgOperand(0), "h2f");
	} else {
	llvm_unreachable("Unknown function for CallInst upgrade.");
	}

	if (Rep)
	CI->replaceAllUsesWith(Rep);
	CI->eraseFromParent();
	return;
	}

	CallInst *NewCall = nullptr;
	switch (NewFn->getIntrinsicID()) {
	default: {
	// Handle generic mangling change, but nothing else
	assert(
	(CI->getCalledFunction()->getName() != NewFn->getName()) &&
	"Unknown function for CallInst upgrade and isn't just a name change");
	CI->setCalledFunction(NewFn);
	return;
	}

	case Intrinsic::arm_neon_vld1:
	case Intrinsic::arm_neon_vld2:
	case Intrinsic::arm_neon_vld3:
	case Intrinsic::arm_neon_vld4:
	case Intrinsic::arm_neon_vld2lane:
	case Intrinsic::arm_neon_vld3lane:
	case Intrinsic::arm_neon_vld4lane:
	case Intrinsic::arm_neon_vst1:
	case Intrinsic::arm_neon_vst2:
	case Intrinsic::arm_neon_vst3:
	case Intrinsic::arm_neon_vst4:
	case Intrinsic::arm_neon_vst2lane:
	case Intrinsic::arm_neon_vst3lane:
	case Intrinsic::arm_neon_vst4lane: {
	SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
	CI->arg_operands().end());
	NewCall = Builder.CreateCall(NewFn, Args);
	break;
	}

	case Intrinsic::bitreverse:
	NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
	break;

	case Intrinsic::ctlz:
	case Intrinsic::cttz:
	assert(CI->getNumArgOperands() == 1 &&
	"Mismatch between function args and call args");
	NewCall =
	Builder.CreateCall(NewFn, {CI->getArgOperand(0), Builder.getFalse()});
	break;

	case Intrinsic::objectsize: {
	Value *NullIsUnknownSize = CI->getNumArgOperands() == 2
	? Builder.getFalse()
	: CI->getArgOperand(2);
	NewCall = Builder.CreateCall(
	NewFn, {CI->getArgOperand(0), CI->getArgOperand(1), NullIsUnknownSize});
	break;
	}

	case Intrinsic::ctpop:
	NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
	break;

	case Intrinsic::convert_from_fp16:
	NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
	break;

	case Intrinsic::dbg_value:
	// Upgrade from the old version that had an extra offset argument.
	assert(CI->getNumArgOperands() == 4);
	// Drop nonzero offsets instead of attempting to upgrade them.
	if (auto *Offset = dyn_cast_or_null<Constant>(CI->getArgOperand(1)))
	if (Offset->isZeroValue()) {
	NewCall = Builder.CreateCall(
	NewFn,
	{CI->getArgOperand(0), CI->getArgOperand(2), CI->getArgOperand(3)});
	break;
	}
	CI->eraseFromParent();
	return;

	case Intrinsic::x86_xop_vfrcz_ss:
	case Intrinsic::x86_xop_vfrcz_sd:
	NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(1)});
	break;

	case Intrinsic::x86_xop_vpermil2pd:
	case Intrinsic::x86_xop_vpermil2ps:
	case Intrinsic::x86_xop_vpermil2pd_256:
	case Intrinsic::x86_xop_vpermil2ps_256: {
	SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
	CI->arg_operands().end());
	VectorType *FltIdxTy = cast<VectorType>(Args[2]->getType());
	VectorType *IntIdxTy = VectorType::getInteger(FltIdxTy);
	Args[2] = Builder.CreateBitCast(Args[2], IntIdxTy);
	NewCall = Builder.CreateCall(NewFn, Args);
	break;
	}

	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_sse41_ptestnzc: {
	// The arguments for these intrinsics used to be v4f32, and changed
	// to v2i64. This is purely a nop, since those are bitwise intrinsics.
	// So, the only thing required is a bitcast for both arguments.
	// First, check the arguments have the old type.
	Value *Arg0 = CI->getArgOperand(0);
	if (Arg0->getType() != VectorType::get(Type::getFloatTy(C), 4))
	return;

	// Old intrinsic, add bitcasts
	Value *Arg1 = CI->getArgOperand(1);

	Type *NewVecTy = VectorType::get(Type::getInt64Ty(C), 2);

	Value *BC0 = Builder.CreateBitCast(Arg0, NewVecTy, "cast");
	Value *BC1 = Builder.CreateBitCast(Arg1, NewVecTy, "cast");

	NewCall = Builder.CreateCall(NewFn, {BC0, BC1});
	break;
	}

	case Intrinsic::x86_sse41_insertps:
	case Intrinsic::x86_sse41_dppd:
	case Intrinsic::x86_sse41_dpps:
	case Intrinsic::x86_sse41_mpsadbw:
	case Intrinsic::x86_avx_dp_ps_256:
	case Intrinsic::x86_avx2_mpsadbw: {
	// Need to truncate the last argument from i32 to i8 -- this argument models
	// an inherently 8-bit immediate operand to these x86 instructions.
	SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
	CI->arg_operands().end());

	// Replace the last argument with a trunc.
	Args.back() = Builder.CreateTrunc(Args.back(), Type::getInt8Ty(C), "trunc");
	NewCall = Builder.CreateCall(NewFn, Args);
	break;
	}

	case Intrinsic::thread_pointer: {
	NewCall = Builder.CreateCall(NewFn, {});
	break;
	}

	case Intrinsic::invariant_start:
	case Intrinsic::invariant_end:
	case Intrinsic::masked_load:
	case Intrinsic::masked_store:
	case Intrinsic::masked_gather:
	case Intrinsic::masked_scatter: {
	SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
	CI->arg_operands().end());
	NewCall = Builder.CreateCall(NewFn, Args);
	break;
	}
	}
	assert(NewCall && "Should have either set this variable or returned through "
	"the default case");
	std::string Name = CI->getName();
	if (!Name.empty()) {
	CI->setName(Name + ".old");
	NewCall->setName(Name);
	}
	CI->replaceAllUsesWith(NewCall);
	CI->eraseFromParent();
	}

	/// Upgrade every call to the intrinsic \p F and then delete \p F.
	///
	/// \p F is first handed to UpgradeIntrinsicFunction; when that reports that
	/// an upgrade is required, each CallInst user of \p F is rewritten through
	/// UpgradeIntrinsicCall and the old declaration is erased from its module.
	/// When no upgrade is required the function does nothing.
	void llvm::UpgradeCallsToIntrinsic(Function *F) {
	  assert(F && "Illegal attempt to upgrade a non-existent intrinsic.");

	  // Check if this function should be upgraded and get the replacement function
	  // if there is one.
	  Function *NewFn;
	  if (UpgradeIntrinsicFunction(F, NewFn)) {
	    // Replace all users of the old function with the new function or new
	    // instructions. This is not a range loop because the call is deleted:
	    // the iterator is advanced before the current user is touched.
	    for (auto UI = F->user_begin(), UE = F->user_end(); UI != UE;)
	      if (CallInst *CI = dyn_cast<CallInst>(*UI++))
	        UpgradeIntrinsicCall(CI, NewFn);

	    // Remove old function, no longer used, from the module.
	    F->eraseFromParent();
	  }
	}

	/// Bring a TBAA tag into the struct-path aware form.
	///
	/// A tag that already starts with a node operand and has at least three
	/// operands is returned unchanged. Otherwise a new tag node is built with the
	/// scalar type used as both base and access type at offset 0.
	MDNode *llvm::UpgradeTBAANode(MDNode &MD) {
	  // Check if the tag uses struct-path aware TBAA format.
	  const bool AlreadyStructPath =
	      isa<MDNode>(MD.getOperand(0)) && MD.getNumOperands() >= 3;
	  if (AlreadyStructPath)
	    return &MD;

	  LLVMContext &Ctx = MD.getContext();
	  // Both upgraded shapes carry a 64-bit zero offset.
	  Metadata *ZeroOffset =
	      ConstantAsMetadata::get(Constant::getNullValue(Type::getInt64Ty(Ctx)));

	  if (MD.getNumOperands() != 3) {
	    // Create a MDNode <MD, MD, offset 0>
	    Metadata *Tag[] = {&MD, &MD, ZeroOffset};
	    return MDNode::get(Ctx, Tag);
	  }

	  // Three operands: rebuild the scalar type from the first two and carry the
	  // third one over as the trailing operand.
	  Metadata *ScalarOps[] = {MD.getOperand(0), MD.getOperand(1)};
	  MDNode *ScalarType = MDNode::get(Ctx, ScalarOps);
	  // Create a MDNode <ScalarType, ScalarType, offset 0, const>
	  Metadata *Tag[] = {ScalarType, ScalarType, ZeroOffset, MD.getOperand(2)};
	  return MDNode::get(Ctx, Tag);
	}

	/// Upgrade a bitcast instruction between pointers of different address
	/// spaces into a ptrtoint / inttoptr pair.
	///
	/// \param Opc    Opcode of the cast being read; anything other than
	///               Instruction::BitCast is ignored.
	/// \param V      The value being cast.
	/// \param DestTy The destination type of the cast.
	/// \param Temp   Out parameter. Only written when \p Opc is a bitcast: set to
	///               the intermediate PtrToInt instruction when an upgrade
	///               happens, and to nullptr otherwise.
	/// \returns the IntToPtr instruction that replaces the bitcast, or nullptr
	///          when no upgrade is needed.
	Instruction *llvm::UpgradeBitCastInst(unsigned Opc, Value *V, Type *DestTy,
	                                      Instruction *&Temp) {
	  if (Opc != Instruction::BitCast)
	    return nullptr;

	  Temp = nullptr;
	  Type *SrcTy = V->getType();
	  if (SrcTy->isPtrOrPtrVectorTy() && DestTy->isPtrOrPtrVectorTy() &&
	      SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace()) {
	    LLVMContext &Context = V->getContext();

	    // We have no information about target data layout, so we assume that
	    // the maximum pointer size is 64bit.
	    Type *MidTy = Type::getInt64Ty(Context);
	    Temp = CastInst::Create(Instruction::PtrToInt, V, MidTy);

	    return CastInst::Create(Instruction::IntToPtr, Temp, DestTy);
	  }

	  return nullptr;
	}

	/// Upgrade a bitcast constant expression between pointers of different
	/// address spaces into an inttoptr(ptrtoint(C)) constant expression.
	///
	/// \param Opc    Opcode of the cast expression; anything other than
	///               Instruction::BitCast is ignored.
	/// \param C      The constant being cast.
	/// \param DestTy The destination type of the cast.
	/// \returns the replacement constant expression, or nullptr when no upgrade
	///          is needed.
	Value *llvm::UpgradeBitCastExpr(unsigned Opc, Constant *C, Type *DestTy) {
	  if (Opc != Instruction::BitCast)
	    return nullptr;

	  Type *SrcTy = C->getType();
	  if (SrcTy->isPtrOrPtrVectorTy() && DestTy->isPtrOrPtrVectorTy() &&
	      SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace()) {
	    LLVMContext &Context = C->getContext();

	    // We have no information about target data layout, so we assume that
	    // the maximum pointer size is 64bit.
	    Type *MidTy = Type::getInt64Ty(Context);

	    return ConstantExpr::getIntToPtr(ConstantExpr::getPtrToInt(C, MidTy),
	                                     DestTy);
	  }

	  return nullptr;
	}

	/// Validate the module's debug info and strip it when it cannot be kept.
	///
	/// When the debug metadata version is current the module is verified: a
	/// broken module is a fatal error, intact debug info leaves the module alone,
	/// and malformed debug info is diagnosed and then stripped. When the version
	/// is out of date the debug info is stripped and, if that changed anything,
	/// a version-mismatch diagnostic is emitted.
	///
	/// \returns true if the module was modified.
	bool llvm::UpgradeDebugInfo(Module &M) {
	  const unsigned Version = getDebugMetadataVersionFromModule(M);
	  const bool IsCurrentVersion = Version == DEBUG_METADATA_VERSION;

	  if (IsCurrentVersion) {
	    bool BrokenDebugInfo = false;
	    const bool ModuleIsBroken =
	        verifyModule(M, &llvm::errs(), &BrokenDebugInfo);
	    if (ModuleIsBroken)
	      report_fatal_error("Broken module found, compilation aborted!");

	    // Everything is ok.
	    if (!BrokenDebugInfo)
	      return false;

	    // Diagnose malformed debug info, then fall through to strip it.
	    DiagnosticInfoIgnoringInvalidDebugMetadata Diag(M);
	    M.getContext().diagnose(Diag);
	  }

	  const bool Modified = StripDebugInfo(M);
	  if (!Modified || IsCurrentVersion)
	    return Modified;

	  // Diagnose a version mismatch.
	  DiagnosticInfoDebugMetadataVersion DiagVersion(M, Version);
	  M.getContext().diagnose(DiagVersion);
	  return Modified;
	}

	/// Upgrade the module flags metadata of \p M in place.
	///
	/// Three upgrades are applied:
	///  - "PIC Level" / "PIE Level" flags whose behavior is Module::Error are
	///    rewritten to use Module::Max.
	///  - The value of "Objective-C Image Info Section" has its spaces removed.
	///  - If "Objective-C Image Info Version" is present but
	///    "Objective-C Class Properties" is not, the latter is added with
	///    value 0.
	///
	/// \returns true if any flag was changed or added; false if the module has
	///          no module flags or nothing needed upgrading.
	bool llvm::UpgradeModuleFlags(Module &M) {
	  NamedMDNode *ModFlags = M.getModuleFlagsMetadata();
	  if (!ModFlags)
	    return false;

	  bool HasObjCFlag = false, HasClassProperties = false, Changed = false;
	  for (unsigned I = 0, E = ModFlags->getNumOperands(); I != E; ++I) {
	    MDNode *Op = ModFlags->getOperand(I);
	    // Only <behavior, key, value> triples with a string key are considered.
	    if (Op->getNumOperands() != 3)
	      continue;
	    MDString *ID = dyn_cast_or_null<MDString>(Op->getOperand(1));
	    if (!ID)
	      continue;
	    if (ID->getString() == "Objective-C Image Info Version")
	      HasObjCFlag = true;
	    if (ID->getString() == "Objective-C Class Properties")
	      HasClassProperties = true;
	    // Upgrade PIC/PIE Module Flags. The module flag behavior for these two
	    // fields was Error and now they are Max.
	    if (ID->getString() == "PIC Level" || ID->getString() == "PIE Level") {
	      if (auto *Behavior =
	              mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(0))) {
	        if (Behavior->getLimitedValue() == Module::Error) {
	          Type *Int32Ty = Type::getInt32Ty(M.getContext());
	          Metadata *Ops[3] = {
	              ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Module::Max)),
	              MDString::get(M.getContext(), ID->getString()),
	              Op->getOperand(2)};
	          ModFlags->setOperand(I, MDNode::get(M.getContext(), Ops));
	          Changed = true;
	        }
	      }
	    }
	    // Upgrade Objective-C Image Info Section. Remove the whitespace in the
	    // section name so that llvm-lto will not complain about mismatching
	    // module flags that are functionally the same.
	    if (ID->getString() == "Objective-C Image Info Section") {
	      if (auto *SectionStr = dyn_cast_or_null<MDString>(Op->getOperand(2))) {
	        SmallVector<StringRef, 4> ValueComp;
	        SectionStr->getString().split(ValueComp, " ");
	        // More than one component means the value contained a space.
	        if (ValueComp.size() != 1) {
	          std::string NewValue;
	          for (auto &S : ValueComp)
	            NewValue += S.str();
	          Metadata *Ops[3] = {Op->getOperand(0), Op->getOperand(1),
	                              MDString::get(M.getContext(), NewValue)};
	          ModFlags->setOperand(I, MDNode::get(M.getContext(), Ops));
	          Changed = true;
	        }
	      }
	    }
	  }

	  // "Objective-C Class Properties" is recently added for Objective-C. We
	  // upgrade ObjC bitcodes to contain a "Objective-C Class Properties" module
	  // flag of value 0, so we can correctly downgrade this flag when trying to
	  // link an ObjC bitcode without this module flag with an ObjC bitcode with
	  // this module flag.
	  if (HasObjCFlag && !HasClassProperties) {
	    M.addModuleFlag(llvm::Module::Override, "Objective-C Class Properties",
	                    (uint32_t)0);
	    Changed = true;
	  }

	  return Changed;
	}

	/// Normalise the section attribute of globals placed in the Objective-C
	/// category list section.
	///
	/// Every global whose section name starts with "__DATA, __objc_catlist" gets
	/// its section rewritten with the whitespace around each comma-separated
	/// component trimmed away.
	void llvm::UpgradeSectionAttributes(Module &M) {
	  // Split on ',', trim each piece, and join the pieces back with ','.
	  auto TrimSpaces = [](StringRef Section) -> std::string {
	    SmallVector<StringRef, 5> Parts;
	    Section.split(Parts, ',');

	    std::string Joined;
	    for (unsigned I = 0, E = Parts.size(); I != E; ++I) {
	      if (I != 0)
	        Joined += ',';
	      Joined += Parts[I].trim().str();
	    }
	    return Joined;
	  };

	  for (auto &GV : M.globals()) {
	    if (GV.hasSection()) {
	      StringRef Section = GV.getSection();

	      // __DATA, __objc_catlist, regular, no_dead_strip
	      // __DATA,__objc_catlist,regular,no_dead_strip
	      if (Section.startswith("__DATA, __objc_catlist"))
	        GV.setSection(TrimSpaces(Section));
	    }
	  }
	}

	/// Return true if \p MD is a non-empty tuple whose first operand is a string
	/// that begins with "llvm.vectorizer.".
	static bool isOldLoopArgument(Metadata *MD) {
	  if (auto *Tuple = dyn_cast_or_null<MDTuple>(MD))
	    if (Tuple->getNumOperands() >= 1)
	      if (auto *Tag = dyn_cast_or_null<MDString>(Tuple->getOperand(0)))
	        return Tag->getString().startswith("llvm.vectorizer.");
	  return false;
	}

	/// Map an old "llvm.vectorizer.*" tag onto its current spelling.
	///
	/// "llvm.vectorizer.unroll" becomes "llvm.loop.interleave.count"; every other
	/// tag keeps its suffix and gets the "llvm.loop.vectorize." prefix instead.
	/// \p OldTag must start with "llvm.vectorizer." (asserted).
	static MDString *upgradeLoopTag(LLVMContext &C, StringRef OldTag) {
	  const StringRef LegacyPrefix = "llvm.vectorizer.";
	  assert(OldTag.startswith(LegacyPrefix) && "Expected old prefix");

	  if (OldTag != "llvm.vectorizer.unroll") {
	    // Swap the prefix and keep whatever followed it.
	    StringRef Suffix = OldTag.drop_front(LegacyPrefix.size());
	    std::string NewTag = (Twine("llvm.loop.vectorize.") + Suffix).str();
	    return MDString::get(C, NewTag);
	  }

	  return MDString::get(C, "llvm.loop.interleave.count");
	}

	/// Upgrade a single loop-metadata argument.
	///
	/// If \p MD is a non-empty tuple whose first operand is a string starting
	/// with "llvm.vectorizer.", a new tuple is returned in which that tag has
	/// been replaced via upgradeLoopTag and all remaining operands are copied
	/// unchanged. Anything else is returned as is.
	static Metadata *upgradeLoopArgument(Metadata *MD) {
	  auto *T = dyn_cast_or_null<MDTuple>(MD);
	  if (!T)
	    return MD;
	  if (T->getNumOperands() < 1)
	    return MD;
	  auto *OldTag = dyn_cast_or_null<MDString>(T->getOperand(0));
	  if (!OldTag)
	    return MD;
	  if (!OldTag->getString().startswith("llvm.vectorizer."))
	    return MD;

	  // This has an old tag. Upgrade it.
	  SmallVector<Metadata *, 8> Ops;
	  Ops.reserve(T->getNumOperands());
	  Ops.push_back(upgradeLoopTag(T->getContext(), OldTag->getString()));
	  for (unsigned I = 1, E = T->getNumOperands(); I != E; ++I)
	    Ops.push_back(T->getOperand(I));

	  return MDTuple::get(T->getContext(), Ops);
	}

	/// Upgrade a loop metadata attachment that still uses old-style arguments.
	///
	/// \p N is returned unchanged when it is not a tuple or when none of its
	/// operands is an old loop argument. Otherwise a new tuple is built in which
	/// every operand has been passed through upgradeLoopArgument.
	MDNode *llvm::upgradeInstructionLoopAttachment(MDNode &N) {
	  auto *Tuple = dyn_cast<MDTuple>(&N);
	  if (!Tuple || none_of(Tuple->operands(), isOldLoopArgument))
	    return &N;

	  const unsigned NumOps = Tuple->getNumOperands();
	  SmallVector<Metadata *, 8> Upgraded;
	  Upgraded.reserve(NumOps);
	  for (unsigned I = 0; I != NumOps; ++I)
	    Upgraded.push_back(upgradeLoopArgument(Tuple->getOperand(I)));

	  return MDTuple::get(Tuple->getContext(), Upgraded);
	}
	Index: head/contrib/llvm/lib/MC/MCParser/AsmParser.cpp
	===================================================================
	--- head/contrib/llvm/lib/MC/MCParser/AsmParser.cpp (revision 329409)
	+++ head/contrib/llvm/lib/MC/MCParser/AsmParser.cpp (revision 329410)
	@@ -1,5834 +1,5788 @@
	//===- AsmParser.cpp - Parser for Assembly Files --------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This class implements the parser for assembly files.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringMap.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/BinaryFormat/Dwarf.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCCodeView.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCDirectives.h"
	#include "llvm/MC/MCDwarf.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCInstPrinter.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/MC/MCInstrInfo.h"
	#include "llvm/MC/MCObjectFileInfo.h"
	#include "llvm/MC/MCParser/AsmCond.h"
	#include "llvm/MC/MCParser/AsmLexer.h"
	#include "llvm/MC/MCParser/MCAsmLexer.h"
	#include "llvm/MC/MCParser/MCAsmParser.h"
	#include "llvm/MC/MCParser/MCAsmParserExtension.h"
	#include "llvm/MC/MCParser/MCAsmParserUtils.h"
	#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
	#include "llvm/MC/MCParser/MCTargetAsmParser.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/MC/MCSection.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/MC/MCTargetOptions.h"
	#include "llvm/MC/MCValue.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/MemoryBuffer.h"
	#include "llvm/Support/SMLoc.h"
	#include "llvm/Support/SourceMgr.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <cctype>
	#include <climits>
	#include <cstddef>
	#include <cstdint>
	#include <deque>
	#include <memory>
	#include <sstream>
	#include <string>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;

	MCAsmParserSemaCallback::~MCAsmParserSemaCallback() = default;

	// Hidden command-line option "asm-macro-max-nesting-depth"; defaults to 20.
	static cl::opt<unsigned> AsmMacroMaxNestingDepth(
	"asm-macro-max-nesting-depth", cl::init(20), cl::Hidden,
	cl::desc("The maximum nesting depth allowed for assembly macros."));

	namespace {

	/// \brief Helper types for tracking macro definitions.
	///
	/// A macro argument is a sequence of lexer tokens; an argument list is a
	/// sequence of such arguments.
	typedef std::vector<AsmToken> MCAsmMacroArgument;
	typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;

	-struct MCAsmMacroParameter {
	- StringRef Name;
	- MCAsmMacroArgument Value;
	- bool Required = false;
	- bool Vararg = false;
	-
	- MCAsmMacroParameter() = default;
	-};
	-
	-typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
	-
	-struct MCAsmMacro {
	- StringRef Name;
	- StringRef Body;
	- MCAsmMacroParameters Parameters;
	-
	-public:
	- MCAsmMacro(StringRef N, StringRef B, MCAsmMacroParameters P)
	- : Name(N), Body(B), Parameters(std::move(P)) {}
	-};
	-
	/// \brief Helper class for storing information about an active macro
	/// instantiation.
	///
	/// It records where the instantiation happened, where parsing has to resume
	/// afterwards (buffer and location), and the conditional-stack depth at the
	/// start of the instantiation.
	struct MacroInstantiation {
	/// The location of the instantiation.
	SMLoc InstantiationLoc;

	/// The buffer where parsing should resume upon instantiation completion.
	int ExitBuffer;

	/// The location where parsing should resume upon instantiation completion.
	SMLoc ExitLoc;

	/// The depth of TheCondStack at the start of the instantiation.
	size_t CondStackDepth;

	public:
	/// Declared here, defined elsewhere. The parameters presumably initialise
	/// InstantiationLoc, ExitBuffer, ExitLoc and CondStackDepth in that order
	/// -- NOTE(review): confirm against the out-of-line definition.
	MacroInstantiation(SMLoc IL, int EB, SMLoc EL, size_t CondStackDepth);
	};

	/// \brief Per-statement parse results: the parsed operands, the opcode, an
	/// error flag, and a pointer to a list of assembly rewrites.
	struct ParseStatementInfo {
	/// \brief The parsed operands from the last parsed statement.
	SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> ParsedOperands;

	/// \brief The opcode from the last parsed instruction.
	/// Initialised to ~0U.
	unsigned Opcode = ~0U;

	/// \brief Was there an error parsing the inline assembly?
	bool ParseError = false;

	/// \brief Rewrite list passed in at construction; held as a raw pointer
	/// (presumably not owned -- NOTE(review): confirm with the callers).
	SmallVectorImpl<AsmRewrite> *AsmRewrites = nullptr;

	/// Default construction is disabled; a rewrite list pointer must always be
	/// passed explicitly.
	ParseStatementInfo() = delete;
	/// Store \p rewrites in AsmRewrites; all other members keep their default
	/// initialisers.
	ParseStatementInfo(SmallVectorImpl<AsmRewrite> *rewrites)
	: AsmRewrites(rewrites) {}
	};

	/// \brief The concrete assembly parser instance.
	class AsmParser : public MCAsmParser {
	private:
	AsmLexer Lexer;
	MCContext &Ctx;
	MCStreamer &Out;
	const MCAsmInfo &MAI;
	SourceMgr &SrcMgr;
	SourceMgr::DiagHandlerTy SavedDiagHandler;
	void *SavedDiagContext;
	std::unique_ptr<MCAsmParserExtension> PlatformParser;

	/// This is the current buffer index we're lexing from as managed by the
	/// SourceMgr object.
	unsigned CurBuffer;

	AsmCond TheCondState;
	std::vector<AsmCond> TheCondStack;

	/// \brief maps directive names to handler methods in parser
	/// extensions. Extensions register themselves in this map by calling
	/// addDirectiveHandler.
	StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;

	- /// \brief Map of currently defined macros.
	- StringMap<MCAsmMacro> MacroMap;
	-
	/// \brief Stack of active macro instantiations.
	std::vector<MacroInstantiation*> ActiveMacros;

	/// \brief List of bodies of anonymous macros.
	std::deque<MCAsmMacro> MacroLikeBodies;

	/// Boolean tracking whether macro substitution is enabled.
	unsigned MacrosEnabledFlag : 1;

	/// \brief Keeps track of how many .macro's have been instantiated.
	unsigned NumOfMacroInstantiations;

	/// The values from the last parsed cpp hash file line comment if any.
	struct CppHashInfoTy {
	StringRef Filename;
	int64_t LineNumber = 0;
	SMLoc Loc;
	unsigned Buf = 0;
	};
	CppHashInfoTy CppHashInfo;

	/// \brief List of forward directional labels for diagnosis at the end.
	SmallVector<std::tuple<SMLoc, CppHashInfoTy, MCSymbol *>, 4> DirLabels;

	/// When generating dwarf for assembly source files we need to calculate the
	/// logical line number based on the last parsed cpp hash file line comment
	/// and current line. Since this is slow and messes up the SourceMgr's
	/// cache we save the last info we queried with SrcMgr.FindLineNumber().
	SMLoc LastQueryIDLoc;
	unsigned LastQueryBuffer;
	unsigned LastQueryLine;

	/// AssemblerDialect. ~0U means unset value and use value provided by MAI.
	unsigned AssemblerDialect = ~0U;

	/// \brief is Darwin compatibility enabled?
	bool IsDarwin = false;

	/// \brief Are we parsing ms-style inline assembly?
	bool ParsingInlineAsm = false;

	public:
	AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
	const MCAsmInfo &MAI, unsigned CB);
	AsmParser(const AsmParser &) = delete;
	AsmParser &operator=(const AsmParser &) = delete;
	~AsmParser() override;

	bool Run(bool NoInitialTextSection, bool NoFinalize = false) override;

	void addDirectiveHandler(StringRef Directive,
	ExtensionDirectiveHandler Handler) override {
	ExtensionDirectiveMap[Directive] = Handler;
	}

	void addAliasForDirective(StringRef Directive, StringRef Alias) override {
	DirectiveKindMap[Directive] = DirectiveKindMap[Alias];
	}

	/// @name MCAsmParser Interface
	/// {

	SourceMgr &getSourceManager() override { return SrcMgr; }
	MCAsmLexer &getLexer() override { return Lexer; }
	MCContext &getContext() override { return Ctx; }
	MCStreamer &getStreamer() override { return Out; }

	CodeViewContext &getCVContext() { return Ctx.getCVContext(); }

	unsigned getAssemblerDialect() override {
	if (AssemblerDialect == ~0U)
	return MAI.getAssemblerDialect();
	else
	return AssemblerDialect;
	}
	void setAssemblerDialect(unsigned i) override {
	AssemblerDialect = i;
	}

	void Note(SMLoc L, const Twine &Msg, SMRange Range = None) override;
	bool Warning(SMLoc L, const Twine &Msg, SMRange Range = None) override;
	bool printError(SMLoc L, const Twine &Msg, SMRange Range = None) override;

	const AsmToken &Lex() override;

	void setParsingInlineAsm(bool V) override {
	ParsingInlineAsm = V;
	Lexer.setParsingMSInlineAsm(V);
	}
	bool isParsingInlineAsm() override { return ParsingInlineAsm; }

	/// Parse a block of MS-style inline assembly (MCAsmParser override).
	/// The instruction info and instruction printer are passed by pointer so
	/// that the signature matches the virtual function being overridden.
	bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
	                      unsigned &NumOutputs, unsigned &NumInputs,
	                      SmallVectorImpl<std::pair<void *,bool>> &OpDecls,
	                      SmallVectorImpl<std::string> &Constraints,
	                      SmallVectorImpl<std::string> &Clobbers,
	                      const MCInstrInfo *MII, const MCInstPrinter *IP,
	                      MCAsmParserSemaCallback &SI) override;

	bool parseExpression(const MCExpr *&Res);
	bool parseExpression(const MCExpr *&Res, SMLoc &EndLoc) override;
	bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override;
	bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) override;
	bool parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res,
	SMLoc &EndLoc) override;
	bool parseAbsoluteExpression(int64_t &Res) override;

	/// \brief Parse a floating point expression using the float \p Semantics
	/// and set \p Res to the value.
	bool parseRealValue(const fltSemantics &Semantics, APInt &Res);

	/// \brief Parse an identifier or string (as a quoted identifier)
	/// and set \p Res to the identifier contents.
	bool parseIdentifier(StringRef &Res) override;
	void eatToEndOfStatement() override;

	bool checkForValidSection() override;

	/// }

	private:
	bool isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc);
	void altMacroString(StringRef AltMacroStr, std::string &Res);
	bool parseStatement(ParseStatementInfo &Info,
	MCAsmParserSemaCallback *SI);
	bool parseCurlyBlockScope(SmallVectorImpl<AsmRewrite>& AsmStrRewrites);
	bool parseCppHashLineFilenameComment(SMLoc L);

	void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
	ArrayRef<MCAsmMacroParameter> Parameters);
	bool expandMacro(raw_svector_ostream &OS, StringRef Body,
	ArrayRef<MCAsmMacroParameter> Parameters,
	ArrayRef<MCAsmMacroArgument> A, bool EnableAtPseudoVariable,
	SMLoc L);

	/// \brief Report whether macro processing is currently switched on.
	bool areMacrosEnabled() {
	  return MacrosEnabledFlag;
	}

	/// \brief Switch macro processing on (\p Flag true) or off (\p Flag false).
	void setMacrosEnabled(bool Flag) {
	  MacrosEnabledFlag = Flag;
	}

	- /// \brief Lookup a previously defined macro.
	- /// \param Name Macro name.
	- /// \returns Pointer to macro. NULL if no such macro was defined.
	- const MCAsmMacro* lookupMacro(StringRef Name);
	-
	- /// \brief Define a new macro with the given name and information.
	- void defineMacro(StringRef Name, MCAsmMacro Macro);
	-
	- /// \brief Undefine a macro. If no such macro was defined, it's a no-op.
	- void undefineMacro(StringRef Name);
	-
	/// \brief True while at least one entry is present in ActiveMacros.
	bool isInsideMacroInstantiation() {
	  return !ActiveMacros.empty();
	}

	/// \brief Handle entry to macro instantiation.
	///
	/// \param M The macro.
	/// \param NameLoc Instantiation location.
	bool handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);

	/// \brief Handle exit from macro instantiation.
	void handleMacroExit();

	/// \brief Extract AsmTokens for a macro argument.
	bool parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg);

	/// \brief Parse all macro arguments for a given macro.
	bool parseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);

	void printMacroInstantiations();
	/// Forward one diagnostic of the given \p Kind to the source manager,
	/// passing \p Range along as a one-element range list.
	void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
	                  SMRange Range = None) const {
	  SrcMgr.PrintMessage(Loc, Kind, Msg, ArrayRef<SMRange>(Range));
	}
	static void DiagHandler(const SMDiagnostic &Diag, void *Context);

	/// \brief Enter the specified file. This returns true on failure.
	bool enterIncludeFile(const std::string &Filename);

	/// \brief Process the specified file for the .incbin directive.
	/// This returns true on failure.
	bool processIncbinFile(const std::string &Filename, int64_t Skip = 0,
	const MCExpr *Count = nullptr, SMLoc Loc = SMLoc());

	/// \brief Reset the current lexer position to that given by \p Loc. The
	/// current token is not set; clients should ensure Lex() is called
	/// subsequently.
	///
	/// \param InBuffer If not 0, should be the known buffer id that contains the
	/// location.
	void jumpToLoc(SMLoc Loc, unsigned InBuffer = 0);

	/// \brief Parse up to the end of the statement and return the contents from
	/// the current token until the end of the statement; the current token on
	/// exit will be either the EndOfStatement or EOF.
	StringRef parseStringToEndOfStatement() override;

	/// \brief Parse until the end of a statement or a comma is encountered,
	/// return the contents from the current token up to the end or comma.
	StringRef parseStringToComma();

	bool parseAssignment(StringRef Name, bool allow_redef,
	bool NoDeadStrip = false);

	unsigned getBinOpPrecedence(AsmToken::TokenKind K,
	MCBinaryExpr::Opcode &Kind);

	bool parseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
	bool parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
	bool parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);

	bool parseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc);

	bool parseCVFunctionId(int64_t &FunctionId, StringRef DirectiveName);
	bool parseCVFileId(int64_t &FileId, StringRef DirectiveName);

	// Generic (target and platform independent) directive parsing.
	//
	// One enumerator per directive spelling handled by this class; the names
	// follow the directive text (e.g. DK_P2ALIGNW for ".p2alignw"). The values
	// are used as the mapped type of DirectiveKindMap below.
	// DK_NO_DIRECTIVE is the first enumerator and therefore has the value 0.
	enum DirectiveKind {
	DK_NO_DIRECTIVE, // Placeholder
	DK_SET,
	DK_EQU,
	DK_EQUIV,
	DK_ASCII,
	DK_ASCIZ,
	DK_STRING,
	DK_BYTE,
	DK_SHORT,
	DK_RELOC,
	DK_VALUE,
	DK_2BYTE,
	DK_LONG,
	DK_INT,
	DK_4BYTE,
	DK_QUAD,
	DK_8BYTE,
	DK_OCTA,
	DK_DC,
	DK_DC_A,
	DK_DC_B,
	DK_DC_D,
	DK_DC_L,
	DK_DC_S,
	DK_DC_W,
	DK_DC_X,
	DK_DCB,
	DK_DCB_B,
	DK_DCB_D,
	DK_DCB_L,
	DK_DCB_S,
	DK_DCB_W,
	DK_DCB_X,
	DK_DS,
	DK_DS_B,
	DK_DS_D,
	DK_DS_L,
	DK_DS_P,
	DK_DS_S,
	DK_DS_W,
	DK_DS_X,
	DK_SINGLE,
	DK_FLOAT,
	DK_DOUBLE,
	DK_ALIGN,
	DK_ALIGN32,
	DK_BALIGN,
	DK_BALIGNW,
	DK_BALIGNL,
	DK_P2ALIGN,
	DK_P2ALIGNW,
	DK_P2ALIGNL,
	DK_ORG,
	DK_FILL,
	DK_ENDR,
	DK_BUNDLE_ALIGN_MODE,
	DK_BUNDLE_LOCK,
	DK_BUNDLE_UNLOCK,
	DK_ZERO,
	DK_EXTERN,
	DK_GLOBL,
	DK_GLOBAL,
	DK_LAZY_REFERENCE,
	DK_NO_DEAD_STRIP,
	DK_SYMBOL_RESOLVER,
	DK_PRIVATE_EXTERN,
	DK_REFERENCE,
	DK_WEAK_DEFINITION,
	DK_WEAK_REFERENCE,
	DK_WEAK_DEF_CAN_BE_HIDDEN,
	DK_COMM,
	DK_COMMON,
	DK_LCOMM,
	DK_ABORT,
	DK_INCLUDE,
	DK_INCBIN,
	DK_CODE16,
	DK_CODE16GCC,
	DK_REPT,
	DK_IRP,
	DK_IRPC,
	DK_IF,
	DK_IFEQ,
	DK_IFGE,
	DK_IFGT,
	DK_IFLE,
	DK_IFLT,
	DK_IFNE,
	DK_IFB,
	DK_IFNB,
	DK_IFC,
	DK_IFEQS,
	DK_IFNC,
	DK_IFNES,
	DK_IFDEF,
	DK_IFNDEF,
	DK_IFNOTDEF,
	DK_ELSEIF,
	DK_ELSE,
	DK_ENDIF,
	DK_SPACE,
	DK_SKIP,
	DK_FILE,
	DK_LINE,
	DK_LOC,
	DK_STABS,
	DK_CV_FILE,
	DK_CV_FUNC_ID,
	DK_CV_INLINE_SITE_ID,
	DK_CV_LOC,
	DK_CV_LINETABLE,
	DK_CV_INLINE_LINETABLE,
	DK_CV_DEF_RANGE,
	DK_CV_STRINGTABLE,
	DK_CV_FILECHECKSUMS,
	DK_CV_FILECHECKSUM_OFFSET,
	DK_CV_FPO_DATA,
	DK_CFI_SECTIONS,
	DK_CFI_STARTPROC,
	DK_CFI_ENDPROC,
	DK_CFI_DEF_CFA,
	DK_CFI_DEF_CFA_OFFSET,
	DK_CFI_ADJUST_CFA_OFFSET,
	DK_CFI_DEF_CFA_REGISTER,
	DK_CFI_OFFSET,
	DK_CFI_REL_OFFSET,
	DK_CFI_PERSONALITY,
	DK_CFI_LSDA,
	DK_CFI_REMEMBER_STATE,
	DK_CFI_RESTORE_STATE,
	DK_CFI_SAME_VALUE,
	DK_CFI_RESTORE,
	DK_CFI_ESCAPE,
	DK_CFI_RETURN_COLUMN,
	DK_CFI_SIGNAL_FRAME,
	DK_CFI_UNDEFINED,
	DK_CFI_REGISTER,
	DK_CFI_WINDOW_SAVE,
	DK_MACROS_ON,
	DK_MACROS_OFF,
	DK_ALTMACRO,
	DK_NOALTMACRO,
	DK_MACRO,
	DK_EXITM,
	DK_ENDM,
	DK_ENDMACRO,
	DK_PURGEM,
	DK_SLEB128,
	DK_ULEB128,
	DK_ERR,
	DK_ERROR,
	DK_WARNING,
	DK_PRINT,
	DK_END
	};

	/// \brief Maps directive name --> DirectiveKind enum, for
	/// directives parsed by this class.
	StringMap<DirectiveKind> DirectiveKindMap;

	// ".ascii", ".asciz", ".string"
	bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
	bool parseDirectiveReloc(SMLoc DirectiveLoc); // ".reloc"
	bool parseDirectiveValue(StringRef IDVal,
	unsigned Size); // ".byte", ".long", ...
	bool parseDirectiveOctaValue(StringRef IDVal); // ".octa", ...
	bool parseDirectiveRealValue(StringRef IDVal,
	const fltSemantics &); // ".single", ...
	bool parseDirectiveFill(); // ".fill"
	bool parseDirectiveZero(); // ".zero"
	// ".set", ".equ", ".equiv"
	bool parseDirectiveSet(StringRef IDVal, bool allow_redef);
	bool parseDirectiveOrg(); // ".org"
	// ".align{,32}", ".p2align{,w,l}"
	bool parseDirectiveAlign(bool IsPow2, unsigned ValueSize);

	// ".file", ".line", ".loc", ".stabs"
	bool parseDirectiveFile(SMLoc DirectiveLoc);
	bool parseDirectiveLine();
	bool parseDirectiveLoc();
	bool parseDirectiveStabs();

	// ".cv_file", ".cv_func_id", ".cv_inline_site_id", ".cv_loc", ".cv_linetable",
	// ".cv_inline_linetable", ".cv_def_range"
	bool parseDirectiveCVFile();
	bool parseDirectiveCVFuncId();
	bool parseDirectiveCVInlineSiteId();
	bool parseDirectiveCVLoc();
	bool parseDirectiveCVLinetable();
	bool parseDirectiveCVInlineLinetable();
	bool parseDirectiveCVDefRange();
	bool parseDirectiveCVStringTable();
	bool parseDirectiveCVFileChecksums();
	bool parseDirectiveCVFileChecksumOffset();
	bool parseDirectiveCVFPOData();

	// .cfi directives
	bool parseDirectiveCFIRegister(SMLoc DirectiveLoc);
	bool parseDirectiveCFIWindowSave();
	bool parseDirectiveCFISections();
	bool parseDirectiveCFIStartProc();
	bool parseDirectiveCFIEndProc();
	bool parseDirectiveCFIDefCfaOffset();
	bool parseDirectiveCFIDefCfa(SMLoc DirectiveLoc);
	bool parseDirectiveCFIAdjustCfaOffset();
	bool parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc);
	bool parseDirectiveCFIOffset(SMLoc DirectiveLoc);
	bool parseDirectiveCFIRelOffset(SMLoc DirectiveLoc);
	bool parseDirectiveCFIPersonalityOrLsda(bool IsPersonality);
	bool parseDirectiveCFIRememberState();
	bool parseDirectiveCFIRestoreState();
	bool parseDirectiveCFISameValue(SMLoc DirectiveLoc);
	bool parseDirectiveCFIRestore(SMLoc DirectiveLoc);
	bool parseDirectiveCFIEscape();
	bool parseDirectiveCFIReturnColumn(SMLoc DirectiveLoc);
	bool parseDirectiveCFISignalFrame();
	bool parseDirectiveCFIUndefined(SMLoc DirectiveLoc);

	// macro directives
	bool parseDirectivePurgeMacro(SMLoc DirectiveLoc);
	bool parseDirectiveExitMacro(StringRef Directive);
	bool parseDirectiveEndMacro(StringRef Directive);
	bool parseDirectiveMacro(SMLoc DirectiveLoc);
	bool parseDirectiveMacrosOnOff(StringRef Directive);
	// alternate macro mode directives
	bool parseDirectiveAltmacro(StringRef Directive);
	// ".bundle_align_mode"
	bool parseDirectiveBundleAlignMode();
	// ".bundle_lock"
	bool parseDirectiveBundleLock();
	// ".bundle_unlock"
	bool parseDirectiveBundleUnlock();

	// ".space", ".skip"
	bool parseDirectiveSpace(StringRef IDVal);

	// ".dcb"
	bool parseDirectiveDCB(StringRef IDVal, unsigned Size);
	bool parseDirectiveRealDCB(StringRef IDVal, const fltSemantics &);
	// ".ds"
	bool parseDirectiveDS(StringRef IDVal, unsigned Size);

	// .sleb128 (Signed=true) and .uleb128 (Signed=false)
	bool parseDirectiveLEB128(bool Signed);

	/// \brief Parse a directive like ".globl" which
	/// accepts a single symbol (which should be a label or an external).
	bool parseDirectiveSymbolAttribute(MCSymbolAttr Attr);

	bool parseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"

	bool parseDirectiveAbort(); // ".abort"
	bool parseDirectiveInclude(); // ".include"
	bool parseDirectiveIncbin(); // ".incbin"

	// ".if", ".ifeq", ".ifge", ".ifgt" , ".ifle", ".iflt" or ".ifne"
	bool parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind);
	// ".ifb" or ".ifnb", depending on ExpectBlank.
	bool parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
	// ".ifc" or ".ifnc", depending on ExpectEqual.
	bool parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual);
	// ".ifeqs" or ".ifnes", depending on ExpectEqual.
	bool parseDirectiveIfeqs(SMLoc DirectiveLoc, bool ExpectEqual);
	// ".ifdef" or ".ifndef", depending on expect_defined
	bool parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
	bool parseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
	bool parseDirectiveElse(SMLoc DirectiveLoc); // ".else"
	bool parseDirectiveEndIf(SMLoc DirectiveLoc); // .endif
	bool parseEscapedString(std::string &Data) override;

	/// Rebuild \p E with the symbol variant \p Variant applied. The expression
	/// is taken and returned by pointer, matching the out-of-class definition.
	const MCExpr *applyModifierToExpr(const MCExpr *E,
	                                  MCSymbolRefExpr::VariantKind Variant);

	// Macro-like directives
	MCAsmMacro *parseMacroLikeBody(SMLoc DirectiveLoc);
	void instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
	raw_svector_ostream &OS);
	bool parseDirectiveRept(SMLoc DirectiveLoc, StringRef Directive);
	bool parseDirectiveIrp(SMLoc DirectiveLoc); // ".irp"
	bool parseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
	bool parseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"

	// "_emit" or "__emit"
	bool parseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info,
	size_t Len);

	// "align"
	bool parseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info);

	// "end"
	bool parseDirectiveEnd(SMLoc DirectiveLoc);

	// ".err" or ".error"
	bool parseDirectiveError(SMLoc DirectiveLoc, bool WithMessage);

	// ".warning"
	bool parseDirectiveWarning(SMLoc DirectiveLoc);

	// .print <double-quotes-string>
	bool parseDirectivePrint(SMLoc DirectiveLoc);

	void initializeDirectiveKindMap();
	};

	} // end anonymous namespace

	namespace llvm {

	extern MCAsmParserExtension *createDarwinAsmParser();
	extern MCAsmParserExtension *createELFAsmParser();
	extern MCAsmParserExtension *createCOFFAsmParser();

	} // end namespace llvm

	enum { DEFAULT_ADDRSPACE = 0 };

	/// Construct a parser reading from \p SM and emitting to \p Out.
	///
	/// \param CB buffer id to start lexing from; 0 selects the main file of
	/// \p SM.
	///
	/// The constructor installs its own diagnostic handler on the source
	/// manager (the previous handler/context pair is remembered so the
	/// destructor can put it back), points the lexer at the start buffer, and
	/// creates the platform-specific parser extension matching the object file
	/// type reported by the context.
	AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
	const MCAsmInfo &MAI, unsigned CB = 0)
	: Lexer(MAI), Ctx(Ctx), Out(Out), MAI(MAI), SrcMgr(SM),
	CurBuffer(CB ? CB : SM.getMainFileID()), MacrosEnabledFlag(true) {
	HadError = false;
	// Save the old handler.
	SavedDiagHandler = SrcMgr.getDiagHandler();
	SavedDiagContext = SrcMgr.getDiagContext();
	// Set our own handler which calls the saved handler.
	SrcMgr.setDiagHandler(DiagHandler, this);
	Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());

	// Initialize the platform / file format parser.
	switch (Ctx.getObjectFileInfo()->getObjectFileType()) {
	case MCObjectFileInfo::IsCOFF:
	PlatformParser.reset(createCOFFAsmParser());
	break;
	case MCObjectFileInfo::IsMachO:
	PlatformParser.reset(createDarwinAsmParser());
	// MachO is the only format that turns on Darwin compatibility.
	IsDarwin = true;
	break;
	case MCObjectFileInfo::IsELF:
	PlatformParser.reset(createELFAsmParser());
	break;
	case MCObjectFileInfo::IsWasm:
	llvm_unreachable("Wasm parsing not supported yet");
	break;
	}

	// The platform parser is initialized before the generic directive table
	// is filled in.
	PlatformParser->Initialize(*this);
	initializeDirectiveKindMap();

	NumOfMacroInstantiations = 0;
	}

	/// Tear the parser down. Asserts that no macro instantiation is still
	/// active unless an error was reported, then gives the source manager back
	/// the diagnostic handler and context that were saved by the constructor.
	AsmParser::~AsmParser() {
	  assert((HadError || ActiveMacros.empty()) &&
	         "Unexpected active macro instantiation!");

	  // Restore the saved diagnostics handler and context for use during
	  // finalization.
	  SrcMgr.setDiagHandler(SavedDiagHandler, SavedDiagContext);
	}

	void AsmParser::printMacroInstantiations() {
	// Print the active macro instantiation stack.
	for (std::vector<MacroInstantiation *>::const_reverse_iterator
	it = ActiveMacros.rbegin(),
	ie = ActiveMacros.rend();
	it != ie; ++it)
	printMessage((*it)->InstantiationLoc, SourceMgr::DK_Note,
	"while in macro instantiation");
	}

	/// Print a note diagnostic at \p L followed by the macro instantiation
	/// stack. printPendingErrors() is called first, so it runs before the note
	/// itself is printed.
	void AsmParser::Note(SMLoc L, const Twine &Msg, SMRange Range) {
	printPendingErrors();
	printMessage(L, SourceMgr::DK_Note, Msg, Range);
	printMacroInstantiations();
	}

	/// Report a warning at \p L, subject to the target's MC options:
	/// MCNoWarn drops it silently, MCFatalWarnings routes it through Error().
	/// \returns the result of Error() when warnings are fatal, false otherwise.
	bool AsmParser::Warning(SMLoc L, const Twine &Msg, SMRange Range) {
	  const auto &Opts = getTargetParser().getTargetOptions();

	  if (Opts.MCNoWarn)
	    return false;
	  if (Opts.MCFatalWarnings)
	    return Error(L, Msg, Range);

	  printMessage(L, SourceMgr::DK_Warning, Msg, Range);
	  printMacroInstantiations();
	  return false;
	}

	/// Print an error diagnostic at \p L plus the macro instantiation stack.
	/// Sets HadError before printing. Always returns true, so callers can write
	/// `return printError(...)` from functions that signal failure with true.
	bool AsmParser::printError(SMLoc L, const Twine &Msg, SMRange Range) {
	HadError = true;
	printMessage(L, SourceMgr::DK_Error, Msg, Range);
	printMacroInstantiations();
	return true;
	}

	/// Switch lexing to the include file \p Filename.
	/// \returns true when the source manager could not add the file (buffer id
	/// 0), false once the lexer has been pointed at the new buffer.
	bool AsmParser::enterIncludeFile(const std::string &Filename) {
	  std::string ResolvedPath;
	  const unsigned BufferId =
	      SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), ResolvedPath);
	  if (BufferId == 0)
	    return true;

	  CurBuffer = BufferId;
	  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
	  return false;
	}

	/// Process the specified .incbin file by searching for it in the include paths
	/// then just emitting the byte contents of the file to the streamer. This
	/// returns true on failure.
	bool AsmParser::processIncbinFile(const std::string &Filename, int64_t Skip,
	const MCExpr *Count, SMLoc Loc) {
	std::string IncludedFile;
	unsigned NewBuf =
	SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
	if (!NewBuf)
	return true;

	// Pick up the bytes from the file and emit them.
	StringRef Bytes = SrcMgr.getMemoryBuffer(NewBuf)->getBuffer();
	Bytes = Bytes.drop_front(Skip);
	if (Count) {
	int64_t Res;
	if (!Count->evaluateAsAbsolute(Res))
	return Error(Loc, "expected absolute expression");
	if (Res < 0)
	return Warning(Loc, "negative count has no effect");
	Bytes = Bytes.take_front(Res);
	}
	getStreamer().EmitBytes(Bytes);
	return false;
	}

	/// Reposition the lexer at \p Loc. A non-zero \p InBuffer is taken as the
	/// id of the buffer holding \p Loc; otherwise the source manager is asked
	/// which buffer contains it.
	void AsmParser::jumpToLoc(SMLoc Loc, unsigned InBuffer) {
	  if (InBuffer)
	    CurBuffer = InBuffer;
	  else
	    CurBuffer = SrcMgr.FindBufferContainingLoc(Loc);

	  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(),
	                  Loc.getPointer());
	}

	/// Advance to the next significant token and return it.
	///
	/// Before advancing, a lexer error on the current token is reported and a
	/// line comment attached to an end-of-statement token is passed to the
	/// streamer. Comment tokens are consumed here (and forwarded when comments
	/// are preserved). On reaching the end of an included file, lexing resumes
	/// in the including file.
	const AsmToken &AsmParser::Lex() {
	  if (Lexer.getTok().is(AsmToken::Error))
	    Error(Lexer.getErrLoc(), Lexer.getErr());

	  // An end-of-statement token whose text does not begin with a line break
	  // carries a line comment; emit it if comments are preserved.
	  if (getTok().is(AsmToken::EndOfStatement)) {
	    const StringRef Text = getTok().getString();
	    const bool HasCommentText =
	        !Text.empty() && Text.front() != '\n' && Text.front() != '\r';
	    if (HasCommentText && MAI.preserveAsmComments())
	      Out.addExplicitComment(Twine(Text));
	  }

	  // Comments found here are deferred until the end of the next statement.
	  const AsmToken *Next = &Lexer.Lex();
	  for (; Next->is(AsmToken::Comment); Next = &Lexer.Lex()) {
	    if (MAI.preserveAsmComments())
	      Out.addExplicitComment(Twine(Next->getString()));
	  }

	  if (Next->is(AsmToken::Eof)) {
	    // End of an included file: pop back to the parent and keep lexing.
	    const SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
	    if (ParentIncludeLoc != SMLoc()) {
	      jumpToLoc(ParentIncludeLoc);
	      return Lex();
	    }
	  }

	  return *Next;
	}

	bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
	// Create the initial section, if requested.
	if (!NoInitialTextSection)
	Out.InitSections(false);

	// Prime the lexer.
	Lex();

	HadError = false;
	AsmCond StartingCondState = TheCondState;
	SmallVector<AsmRewrite, 4> AsmStrRewrites;

	// If we are generating dwarf for assembly source files save the initial text
	// section and generate a .file directive.
	if (getContext().getGenDwarfForAssembly()) {
	MCSection *Sec = getStreamer().getCurrentSectionOnly();
	if (!Sec->getBeginSymbol()) {
	MCSymbol *SectionStartSym = getContext().createTempSymbol();
	getStreamer().EmitLabel(SectionStartSym);
	Sec->setBeginSymbol(SectionStartSym);
	}
	bool InsertResult = getContext().addGenDwarfSection(Sec);
	assert(InsertResult && ".text section should not have debug info yet");
	(void)InsertResult;
	getContext().setGenDwarfFileNumber(getStreamer().EmitDwarfFileDirective(
	0, StringRef(), getContext().getMainFileName()));
	}

	// While we have input, parse each statement.
	while (Lexer.isNot(AsmToken::Eof)) {
	ParseStatementInfo Info(&AsmStrRewrites);
	if (!parseStatement(Info, nullptr))
	continue;

	// If we have a Lexer Error we are on an Error Token. Load in Lexer Error
	// for printing ErrMsg via Lex() only if no (presumably better) parser error
	// exists.
	if (!hasPendingError() && Lexer.getTok().is(AsmToken::Error)) {
	Lex();
	}

	// parseStatement returned true so may need to emit an error.
	printPendingErrors();

	// Skipping to the next line if needed.
	if (!getLexer().isAtStartOfStatement())
	eatToEndOfStatement();
	}

	// All errors should have been emitted.
	assert(!hasPendingError() && "unexpected error from parseStatement");

	getTargetParser().flushPendingInstructions(getStreamer());

	if (TheCondState.TheCond != StartingCondState.TheCond \|\|
	TheCondState.Ignore != StartingCondState.Ignore)
	printError(getTok().getLoc(), "unmatched .ifs or .elses");
	// Check to see there are no empty DwarfFile slots.
	const auto &LineTables = getContext().getMCDwarfLineTables();
	if (!LineTables.empty()) {
	unsigned Index = 0;
	for (const auto &File : LineTables.begin()->second.getMCDwarfFiles()) {
	if (File.Name.empty() && Index != 0)
	printError(getTok().getLoc(), "unassigned file number: " +
	Twine(Index) +
	" for .file directives");
	++Index;
	}
	}

	// Check to see that all assembler local symbols were actually defined.
	// Targets that don't do subsections via symbols may not want this, though,
	// so conservatively exclude them. Only do this if we're finalizing, though,
	// as otherwise we won't necessarilly have seen everything yet.
	if (!NoFinalize) {
	if (MAI.hasSubsectionsViaSymbols()) {
	for (const auto &TableEntry : getContext().getSymbols()) {
	MCSymbol *Sym = TableEntry.getValue();
	// Variable symbols may not be marked as defined, so check those
	// explicitly. If we know it's a variable, we have a definition for
	// the purposes of this check.
	if (Sym->isTemporary() && !Sym->isVariable() && !Sym->isDefined())
	// FIXME: We would really like to refer back to where the symbol was
	// first referenced for a source location. We need to add something
	// to track that. Currently, we just point to the end of the file.
	printError(getTok().getLoc(), "assembler local symbol '" +
	Sym->getName() + "' not defined");
	}
	}

	// Temporary symbols like the ones for directional jumps don't go in the
	// symbol table. They also need to be diagnosed in all (final) cases.
	for (std::tuple<SMLoc, CppHashInfoTy, MCSymbol *> &LocSym : DirLabels) {
	if (std::get<2>(LocSym)->isUndefined()) {
	// Reset the state of any "# line file" directives we've seen to the
	// context as it was at the diagnostic site.
	CppHashInfo = std::get<1>(LocSym);
	printError(std::get<0>(LocSym), "directional label undefined");
	}
	}
	}

	// Finalize the output stream if there are no errors and if the client wants
	// us to.
	if (!HadError && !NoFinalize)
	Out.Finish();

	return HadError \|\| getContext().hadError();
	}

	/// Make sure a current section exists before emitting anything.
	/// When parsing inline asm, or when the streamer already has a current
	/// section, nothing happens and false is returned. Otherwise the default
	/// sections are initialized and an error is reported.
	bool AsmParser::checkForValidSection() {
	  if (ParsingInlineAsm || getStreamer().getCurrentSectionOnly())
	    return false;

	  Out.InitSections(false);
	  return Error(getTok().getLoc(),
	               "expected section directive before assembly directive");
	}

	/// \brief Discard tokens up to and including the next end-of-statement
	/// token, stopping early (without consuming it) at end of file.
	void AsmParser::eatToEndOfStatement() {
	  for (;;) {
	    if (Lexer.is(AsmToken::EndOfStatement)) {
	      // Eat EOL.
	      Lexer.Lex();
	      return;
	    }
	    if (Lexer.is(AsmToken::Eof))
	      return;
	    Lexer.Lex();
	  }
	}

	/// Return the raw source text from the current token up to (not including)
	/// the next end-of-statement or end-of-file token, which becomes the
	/// current token.
	StringRef AsmParser::parseStringToEndOfStatement() {
	  const char *const Begin = getTok().getLoc().getPointer();

	  while (!(Lexer.is(AsmToken::EndOfStatement) || Lexer.is(AsmToken::Eof)))
	    Lexer.Lex();

	  const char *const Stop = getTok().getLoc().getPointer();
	  return StringRef(Begin, Stop - Begin);
	}

	/// Return the raw source text from the current token up to (not including)
	/// the next comma, end-of-statement or end-of-file token, which becomes the
	/// current token.
	StringRef AsmParser::parseStringToComma() {
	  const char *const Begin = getTok().getLoc().getPointer();

	  while (!(Lexer.is(AsmToken::EndOfStatement) ||
	           Lexer.is(AsmToken::Comma) || Lexer.is(AsmToken::Eof)))
	    Lexer.Lex();

	  const char *const Stop = getTok().getLoc().getPointer();
	  return StringRef(Begin, Stop - Begin);
	}

	/// \brief Parse the remainder of a parenthesized expression.
	/// NOTE: The opening '(' must already have been consumed by the caller.
	///
	/// parenexpr ::= expr)
	///
	/// On success \p EndLoc is set to the end of the ')' token, which is then
	/// consumed. \returns true on error.
	bool AsmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
	  if (parseExpression(Res))
	    return true;

	  if (Lexer.is(AsmToken::RParen)) {
	    EndLoc = Lexer.getTok().getEndLoc();
	    Lex();
	    return false;
	  }

	  return TokError("expected ')' in parentheses expression");
	}

	/// \brief Parse the remainder of a bracketed expression.
	/// NOTE: The opening '[' must already have been consumed by the caller.
	///
	/// bracketexpr ::= expr]
	///
	/// \p EndLoc is set from the token following the expression before the
	/// closing ']' is checked. \returns true on error.
	bool AsmParser::parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
	  if (parseExpression(Res))
	    return true;

	  EndLoc = getTok().getEndLoc();
	  return parseToken(AsmToken::RBrac, "expected ']' in brackets expression");
	}

	/// \brief Parse a primary expression and return it.
	/// primaryexpr ::= (parenexpr
	/// primaryexpr ::= symbol
	/// primaryexpr ::= number
	/// primaryexpr ::= '.'
	/// primaryexpr ::= ~,+,- primaryexpr
	///
	/// \param Res receives the parsed expression on success.
	/// \param EndLoc receives the end location of the parsed expression.
	/// \returns true on error, false on success.
	bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
	  SMLoc FirstTokenLoc = getLexer().getLoc();
	  AsmToken::TokenKind FirstTokenKind = Lexer.getKind();
	  switch (FirstTokenKind) {
	  default:
	    return TokError("unknown token in expression");
	  // If we have an error assume that we've already handled it.
	  case AsmToken::Error:
	    return true;
	  case AsmToken::Exclaim:
	    Lex(); // Eat the operator.
	    if (parsePrimaryExpr(Res, EndLoc))
	      return true;
	    Res = MCUnaryExpr::createLNot(Res, getContext(), FirstTokenLoc);
	    return false;
	  case AsmToken::Dollar:
	  case AsmToken::At:
	  case AsmToken::String:
	  case AsmToken::Identifier: {
	    StringRef Identifier;
	    if (parseIdentifier(Identifier)) {
	      // We may have failed but $ may be a valid token.
	      if (getTok().is(AsmToken::Dollar)) {
	        if (Lexer.getMAI().getDollarIsPC()) {
	          Lex();
	          // This is a '$' reference, which references the current PC. Emit a
	          // temporary label to the streamer and refer to it.
	          MCSymbol *Sym = Ctx.createTempSymbol();
	          Out.EmitLabel(Sym);
	          Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
	                                        getContext());
	          EndLoc = FirstTokenLoc;
	          return false;
	        }
	        return Error(FirstTokenLoc, "invalid token in expression");
	      }
	    }
	    // Parse symbol variant
	    std::pair<StringRef, StringRef> Split;
	    if (!MAI.useParensForSymbolVariant()) {
	      if (FirstTokenKind == AsmToken::String) {
	        if (Lexer.is(AsmToken::At)) {
	          Lex(); // eat @
	          SMLoc AtLoc = getLexer().getLoc();
	          StringRef VName;
	          if (parseIdentifier(VName))
	            return Error(AtLoc, "expected symbol variant after '@'");

	          Split = std::make_pair(Identifier, VName);
	        }
	      } else {
	        Split = Identifier.split('@');
	      }
	    } else if (Lexer.is(AsmToken::LParen)) {
	      Lex(); // eat '('.
	      StringRef VName;
	      parseIdentifier(VName);
	      // eat ')'.
	      if (parseToken(AsmToken::RParen,
	                     "unexpected token in variant, expected ')'"))
	        return true;
	      Split = std::make_pair(Identifier, VName);
	    }

	    EndLoc = SMLoc::getFromPointer(Identifier.end());

	    // This is a symbol reference.
	    StringRef SymbolName = Identifier;
	    if (SymbolName.empty())
	      return true;

	    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;

	    // Lookup the symbol variant if used.
	    if (!Split.second.empty()) {
	      Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
	      if (Variant != MCSymbolRefExpr::VK_Invalid) {
	        SymbolName = Split.first;
	      } else if (MAI.doesAllowAtInName() && !MAI.useParensForSymbolVariant()) {
	        // '@' is part of the name here, so keep the full identifier.
	        Variant = MCSymbolRefExpr::VK_None;
	      } else {
	        return Error(SMLoc::getFromPointer(Split.second.begin()),
	                     "invalid variant '" + Split.second + "'");
	      }
	    }

	    MCSymbol *Sym = getContext().getOrCreateSymbol(SymbolName);

	    // If this is an absolute variable reference, substitute it now to preserve
	    // semantics in the face of reassignment.
	    if (Sym->isVariable() &&
	        isa<MCConstantExpr>(Sym->getVariableValue(/*SetUsed*/ false))) {
	      if (Variant)
	        return Error(EndLoc, "unexpected modifier on variable reference");

	      Res = Sym->getVariableValue(/*SetUsed*/ false);
	      return false;
	    }

	    // Otherwise create a symbol ref.
	    Res = MCSymbolRefExpr::create(Sym, Variant, getContext(), FirstTokenLoc);
	    return false;
	  }
	  case AsmToken::BigNum:
	    return TokError("literal value out of range for directive");
	  case AsmToken::Integer: {
	    SMLoc Loc = getTok().getLoc();
	    int64_t IntVal = getTok().getIntVal();
	    Res = MCConstantExpr::create(IntVal, getContext());
	    EndLoc = Lexer.getTok().getEndLoc();
	    Lex(); // Eat token.
	    // Look for 'b' or 'f' following an Integer as a directional label
	    if (Lexer.getKind() == AsmToken::Identifier) {
	      StringRef IDVal = getTok().getString();
	      // Lookup the symbol variant if used.
	      std::pair<StringRef, StringRef> Split = IDVal.split('@');
	      MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
	      // The sizes differ only when an '@' was found in the identifier.
	      if (Split.first.size() != IDVal.size()) {
	        Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
	        if (Variant == MCSymbolRefExpr::VK_Invalid)
	          return TokError("invalid variant '" + Split.second + "'");
	        IDVal = Split.first;
	      }
	      if (IDVal == "f" || IDVal == "b") {
	        MCSymbol *Sym =
	            Ctx.getDirectionalLocalSymbol(IntVal, IDVal == "b");
	        Res = MCSymbolRefExpr::create(Sym, Variant, getContext());
	        // A backward reference must already be defined at this point.
	        if (IDVal == "b" && Sym->isUndefined())
	          return Error(Loc, "directional label undefined");
	        DirLabels.push_back(std::make_tuple(Loc, CppHashInfo, Sym));
	        EndLoc = Lexer.getTok().getEndLoc();
	        Lex(); // Eat identifier.
	      }
	    }
	    return false;
	  }
	  case AsmToken::Real: {
	    // The constant holds the bit pattern of the IEEE double value.
	    APFloat RealVal(APFloat::IEEEdouble(), getTok().getString());
	    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
	    Res = MCConstantExpr::create(IntVal, getContext());
	    EndLoc = Lexer.getTok().getEndLoc();
	    Lex(); // Eat token.
	    return false;
	  }
	  case AsmToken::Dot: {
	    // This is a '.' reference, which references the current PC. Emit a
	    // temporary label to the streamer and refer to it.
	    MCSymbol *Sym = Ctx.createTempSymbol();
	    Out.EmitLabel(Sym);
	    Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
	    EndLoc = Lexer.getTok().getEndLoc();
	    Lex(); // Eat identifier.
	    return false;
	  }
	  case AsmToken::LParen:
	    Lex(); // Eat the '('.
	    return parseParenExpr(Res, EndLoc);
	  case AsmToken::LBrac:
	    if (!PlatformParser->HasBracketExpressions())
	      return TokError("brackets expression not supported on this target");
	    Lex(); // Eat the '['.
	    return parseBracketExpr(Res, EndLoc);
	  case AsmToken::Minus:
	    Lex(); // Eat the operator.
	    if (parsePrimaryExpr(Res, EndLoc))
	      return true;
	    Res = MCUnaryExpr::createMinus(Res, getContext(), FirstTokenLoc);
	    return false;
	  case AsmToken::Plus:
	    Lex(); // Eat the operator.
	    if (parsePrimaryExpr(Res, EndLoc))
	      return true;
	    Res = MCUnaryExpr::createPlus(Res, getContext(), FirstTokenLoc);
	    return false;
	  case AsmToken::Tilde:
	    Lex(); // Eat the operator.
	    if (parsePrimaryExpr(Res, EndLoc))
	      return true;
	    Res = MCUnaryExpr::createNot(Res, getContext(), FirstTokenLoc);
	    return false;
	  // MIPS unary expression operators. The lexer won't generate these tokens if
	  // MCAsmInfo::HasMipsExpressions is false for the target.
	  case AsmToken::PercentCall16:
	  case AsmToken::PercentCall_Hi:
	  case AsmToken::PercentCall_Lo:
	  case AsmToken::PercentDtprel_Hi:
	  case AsmToken::PercentDtprel_Lo:
	  case AsmToken::PercentGot:
	  case AsmToken::PercentGot_Disp:
	  case AsmToken::PercentGot_Hi:
	  case AsmToken::PercentGot_Lo:
	  case AsmToken::PercentGot_Ofst:
	  case AsmToken::PercentGot_Page:
	  case AsmToken::PercentGottprel:
	  case AsmToken::PercentGp_Rel:
	  case AsmToken::PercentHi:
	  case AsmToken::PercentHigher:
	  case AsmToken::PercentHighest:
	  case AsmToken::PercentLo:
	  case AsmToken::PercentNeg:
	  case AsmToken::PercentPcrel_Hi:
	  case AsmToken::PercentPcrel_Lo:
	  case AsmToken::PercentTlsgd:
	  case AsmToken::PercentTlsldm:
	  case AsmToken::PercentTprel_Hi:
	  case AsmToken::PercentTprel_Lo:
	    Lex(); // Eat the operator.
	    if (Lexer.isNot(AsmToken::LParen))
	      return TokError("expected '(' after operator");
	    Lex(); // Eat the '('.
	    if (parseExpression(Res, EndLoc))
	      return true;
	    if (Lexer.isNot(AsmToken::RParen))
	      return TokError("expected ')'");
	    Lex(); // Eat the ')'.
	    Res = getTargetParser().createTargetUnaryExpr(Res, FirstTokenKind, Ctx);
	    // A null result from the target is reported as failure.
	    return !Res;
	  }
	}

	bool AsmParser::parseExpression(const MCExpr *&Res) {
	SMLoc EndLoc;
	return parseExpression(Res, EndLoc);
	}

	/// Apply the symbol variant \p Variant to the expression \p E.
	///
	/// The target parser is consulted first; a non-null answer from it is
	/// returned as is. Otherwise the expression tree is rebuilt recursively.
	/// \returns the rewritten expression, or nullptr when \p E is a Target or
	/// Constant node or no sub-expression could be modified. A symbol reference
	/// that already carries a variant is diagnosed and returned unchanged.
	const MCExpr *
	AsmParser::applyModifierToExpr(const MCExpr *E,
	                               MCSymbolRefExpr::VariantKind Variant) {
	  // Ask the target implementation about this expression first.
	  if (const MCExpr *TargetResult =
	          getTargetParser().applyModifierToExpr(E, Variant, Ctx))
	    return TargetResult;

	  // Recurse over the given expression, rebuilding it to apply the given
	  // variant if there is exactly one symbol.
	  switch (E->getKind()) {
	  case MCExpr::Target:
	  case MCExpr::Constant:
	    return nullptr;

	  case MCExpr::SymbolRef: {
	    const auto *SymRef = cast<MCSymbolRefExpr>(E);

	    if (SymRef->getKind() == MCSymbolRefExpr::VK_None)
	      return MCSymbolRefExpr::create(&SymRef->getSymbol(), Variant,
	                                     getContext());

	    TokError("invalid variant on expression '" + getTok().getIdentifier() +
	             "' (already modified)");
	    return E;
	  }

	  case MCExpr::Unary: {
	    const auto *Unary = cast<MCUnaryExpr>(E);
	    const MCExpr *Operand = applyModifierToExpr(Unary->getSubExpr(), Variant);
	    if (Operand == nullptr)
	      return nullptr;
	    return MCUnaryExpr::create(Unary->getOpcode(), Operand, getContext());
	  }

	  case MCExpr::Binary: {
	    const auto *Binary = cast<MCBinaryExpr>(E);
	    const MCExpr *NewLHS = applyModifierToExpr(Binary->getLHS(), Variant);
	    const MCExpr *NewRHS = applyModifierToExpr(Binary->getRHS(), Variant);

	    // Neither side changed: report "nothing to modify" to the caller.
	    if (NewLHS == nullptr && NewRHS == nullptr)
	      return nullptr;

	    // Reuse the untouched operand where only one side was rewritten.
	    if (NewLHS == nullptr)
	      NewLHS = Binary->getLHS();
	    if (NewRHS == nullptr)
	      NewRHS = Binary->getRHS();

	    return MCBinaryExpr::create(Binary->getOpcode(), NewLHS, NewRHS,
	                                getContext());
	  }
	  }

	  llvm_unreachable("Invalid expression kind!");
	}

	/// Check whether the text starting at \p StrLoc is an altmacro <string>
	/// rather than an arithmetic expression.
	///
	/// The scan begins at \p StrLoc and stops at the first unescaped '>', or at
	/// '\n', '\r' or the NUL terminator. A '!' escapes the character that follows
	/// it, so an escaped '>' does not end the string.
	///
	/// \returns true if the scan stopped on '>', in which case \p EndLoc is set to
	/// the location just past that '>'; false otherwise (\p EndLoc is untouched).
	///
	/// There is a gap between the AltMacro documentation and the single-quote
	/// implementation. GCC does not fully support that feature, so it is not
	/// supported here either.
	/// TODO: Accept single quotes as string delimiters.
	bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
	  assert((StrLoc.getPointer() != nullptr) &&
	         "Argument to the function cannot be a NULL value");
	  const char *CharPtr = StrLoc.getPointer();
	  // Every test must look at the character, not at the pointer value.
	  while ((*CharPtr != '>') && (*CharPtr != '\n') && (*CharPtr != '\r') &&
	         (*CharPtr != '\0')) {
	    // Skip the escaped character, but never step over the terminating NUL:
	    // a '!' right before the end of the buffer escapes nothing.
	    if (*CharPtr == '!' && CharPtr[1] != '\0')
	      CharPtr++;
	    CharPtr++;
	  }
	  if (*CharPtr == '>') {
	    EndLoc = StrLoc.getFromPointer(CharPtr + 1);
	    return true;
	  }
	  return false;
	}

	/// \brief Append \p AltMacroStr to \p Res with the '!' escape characters
	/// removed.
	///
	/// Each '!' is dropped and the character after it is copied verbatim (so "!!"
	/// yields a single '!'). A '!' that is the very last character has nothing to
	/// escape and is dropped without reading past the end of the string.
	void AsmParser::altMacroString(StringRef AltMacroStr,std::string &Res) {
	  for (size_t Pos = 0, End = AltMacroStr.size(); Pos < End; Pos++) {
	    if (AltMacroStr[Pos] == '!') {
	      Pos++;
	      // Dangling escape at the end of the string: stop instead of indexing
	      // one past the last character.
	      if (Pos == End)
	        break;
	    }
	    Res += AltMacroStr[Pos];
	  }
	}

	/// \brief Parse an expression and return it.
	///
	///  expr ::= expr &&,|| expr               -> lowest.
	///  expr ::= expr |,^,&,! expr
	///  expr ::= expr ==,!=,<>,<,<=,>,>= expr
	///  expr ::= expr <<,>> expr
	///  expr ::= expr +,- expr
	///  expr ::= expr *,/,% expr               -> highest.
	///  expr ::= primaryexpr
	///
	/// After the operator-precedence parse, a trailing '@modifier' is applied to
	/// the whole expression, and the result is folded to a constant when it
	/// evaluates to an absolute value.
	///
	/// \param Res    receives the parsed expression; reset to nullptr on entry.
	/// \param EndLoc passed through to the primary-expression and binary-operator
	///               parsers.
	/// \returns true on error, false on success.
	bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
	  // Parse the expression.
	  Res = nullptr;
	  if (parsePrimaryExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc))
	    return true;

	  // As a special case, we support 'a op b @ modifier' by rewriting the
	  // expression to include the modifier. This is inefficient, but in general we
	  // expect users to use 'a@modifier op b'.
	  if (Lexer.getKind() == AsmToken::At) {
	    Lex();

	    if (Lexer.isNot(AsmToken::Identifier))
	      return TokError("unexpected symbol modifier following '@'");

	    MCSymbolRefExpr::VariantKind Variant =
	        MCSymbolRefExpr::getVariantKindForName(getTok().getIdentifier());
	    if (Variant == MCSymbolRefExpr::VK_Invalid)
	      return TokError("invalid variant '" + getTok().getIdentifier() + "'");

	    const MCExpr *ModifiedRes = applyModifierToExpr(Res, Variant);
	    if (!ModifiedRes) {
	      return TokError("invalid modifier '" + getTok().getIdentifier() +
	                      "' (no symbols present)");
	    }

	    Res = ModifiedRes;
	    Lex(); // Eat the modifier identifier.
	  }

	  // Try to constant fold it up front, if possible.
	  int64_t Value;
	  if (Res->evaluateAsAbsolute(Value))
	    Res = MCConstantExpr::create(Value, getContext());

	  return false;
	}

	/// Parse a parenthesized expression followed by any binary-operator tail.
	///
	/// \p Res is reset to nullptr, then parseParenExpr and parseBinOpRHS (minimum
	/// precedence 1) run in that order; the second is skipped if the first fails.
	/// \returns true on error, false on success.
	bool AsmParser::parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) {
	  Res = nullptr;
	  return parseParenExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc);
	}

	/// Parse an expression nested inside \p ParenDepth levels of parentheses.
	///
	/// After the innermost parenthesized expression, one binary-operator tail is
	/// parsed per nesting level. A closing ')' is consumed after every level
	/// except the last one, whose ')' is left for the caller.
	///
	/// \returns true on error, false on success.
	bool AsmParser::parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res,
	                                      SMLoc &EndLoc) {
	  if (parseParenExpr(Res, EndLoc))
	    return true;

	  while (ParenDepth > 0) {
	    if (parseBinOpRHS(1, Res, EndLoc))
	      return true;

	    // We don't Lex() the last RParen.
	    // This is the same behavior as parseParenExpression().
	    const bool IsLastLevel = (ParenDepth == 1);
	    if (!IsLastLevel) {
	      EndLoc = getTok().getEndLoc();
	      if (parseToken(AsmToken::RParen,
	                     "expected ')' in parentheses expression"))
	        return true;
	    }

	    --ParenDepth;
	  }

	  return false;
	}

	bool AsmParser::parseAbsoluteExpression(int64_t &Res) {
	const MCExpr *Expr;

	SMLoc StartLoc = Lexer.getLoc();
	if (parseExpression(Expr))
	return true;

	if (!Expr->evaluateAsAbsolute(Res))
	return Error(StartLoc, "expected absolute expression");

	return false;
	}

	/// Map a token to its binary opcode and Darwin precedence level.
	///
	/// \param K    the token kind to classify.
	/// \param Kind set to the matching opcode; left untouched when \p K is not a
	///             binary operator.
	/// \param ShouldUseLogicalShr selects LShr over AShr for '>>'.
	/// \returns the precedence (1 = loosest, 6 = tightest), or 0 if \p K is not a
	/// binary operator.
	static unsigned getDarwinBinOpPrecedence(AsmToken::TokenKind K,
	                                         MCBinaryExpr::Opcode &Kind,
	                                         bool ShouldUseLogicalShr) {
	  // Record the opcode and hand back the precedence in a single step.
	  auto Pick = [&Kind](MCBinaryExpr::Opcode Op, unsigned Prec) {
	    Kind = Op;
	    return Prec;
	  };

	  switch (K) {
	  // Level 1 (lowest): &&, ||
	  case AsmToken::AmpAmp:
	    return Pick(MCBinaryExpr::LAnd, 1);
	  case AsmToken::PipePipe:
	    return Pick(MCBinaryExpr::LOr, 1);

	  // Level 2: |, ^, &
	  //
	  // FIXME: gas seems to support '!' as an infix operator?
	  case AsmToken::Pipe:
	    return Pick(MCBinaryExpr::Or, 2);
	  case AsmToken::Caret:
	    return Pick(MCBinaryExpr::Xor, 2);
	  case AsmToken::Amp:
	    return Pick(MCBinaryExpr::And, 2);

	  // Level 3: ==, !=, <>, <, <=, >, >=
	  case AsmToken::EqualEqual:
	    return Pick(MCBinaryExpr::EQ, 3);
	  case AsmToken::ExclaimEqual:
	  case AsmToken::LessGreater:
	    return Pick(MCBinaryExpr::NE, 3);
	  case AsmToken::Less:
	    return Pick(MCBinaryExpr::LT, 3);
	  case AsmToken::LessEqual:
	    return Pick(MCBinaryExpr::LTE, 3);
	  case AsmToken::Greater:
	    return Pick(MCBinaryExpr::GT, 3);
	  case AsmToken::GreaterEqual:
	    return Pick(MCBinaryExpr::GTE, 3);

	  // Level 4: <<, >>
	  case AsmToken::LessLess:
	    return Pick(MCBinaryExpr::Shl, 4);
	  case AsmToken::GreaterGreater:
	    return Pick(ShouldUseLogicalShr ? MCBinaryExpr::LShr : MCBinaryExpr::AShr,
	                4);

	  // Level 5: +, -
	  case AsmToken::Plus:
	    return Pick(MCBinaryExpr::Add, 5);
	  case AsmToken::Minus:
	    return Pick(MCBinaryExpr::Sub, 5);

	  // Level 6 (highest): *, /, %
	  case AsmToken::Star:
	    return Pick(MCBinaryExpr::Mul, 6);
	  case AsmToken::Slash:
	    return Pick(MCBinaryExpr::Div, 6);
	  case AsmToken::Percent:
	    return Pick(MCBinaryExpr::Mod, 6);

	  default:
	    return 0; // not a binop.
	  }
	}

	/// Map a token to its binary opcode and GNU precedence level.
	///
	/// \param K    the token kind to classify.
	/// \param Kind set to the matching opcode; left untouched when \p K is not a
	///             binary operator.
	/// \param ShouldUseLogicalShr selects LShr over AShr for '>>'.
	/// \returns the precedence (1 = loosest, 6 = tightest), or 0 if \p K is not a
	/// binary operator.
	static unsigned getGNUBinOpPrecedence(AsmToken::TokenKind K,
	                                      MCBinaryExpr::Opcode &Kind,
	                                      bool ShouldUseLogicalShr) {
	  // Record the opcode and hand back the precedence in a single step.
	  auto Pick = [&Kind](MCBinaryExpr::Opcode Op, unsigned Prec) {
	    Kind = Op;
	    return Prec;
	  };

	  switch (K) {
	  // Levels 1 and 2 (lowest): || is level 1, && is level 2.
	  case AsmToken::PipePipe:
	    return Pick(MCBinaryExpr::LOr, 1);
	  case AsmToken::AmpAmp:
	    return Pick(MCBinaryExpr::LAnd, 2);

	  // Level 3: ==, !=, <>, <, <=, >, >=
	  case AsmToken::EqualEqual:
	    return Pick(MCBinaryExpr::EQ, 3);
	  case AsmToken::ExclaimEqual:
	  case AsmToken::LessGreater:
	    return Pick(MCBinaryExpr::NE, 3);
	  case AsmToken::Less:
	    return Pick(MCBinaryExpr::LT, 3);
	  case AsmToken::LessEqual:
	    return Pick(MCBinaryExpr::LTE, 3);
	  case AsmToken::Greater:
	    return Pick(MCBinaryExpr::GT, 3);
	  case AsmToken::GreaterEqual:
	    return Pick(MCBinaryExpr::GTE, 3);

	  // Level 4: +, -
	  case AsmToken::Plus:
	    return Pick(MCBinaryExpr::Add, 4);
	  case AsmToken::Minus:
	    return Pick(MCBinaryExpr::Sub, 4);

	  // Level 5: |, ^, &
	  //
	  // FIXME: gas seems to support '!' as an infix operator?
	  case AsmToken::Pipe:
	    return Pick(MCBinaryExpr::Or, 5);
	  case AsmToken::Caret:
	    return Pick(MCBinaryExpr::Xor, 5);
	  case AsmToken::Amp:
	    return Pick(MCBinaryExpr::And, 5);

	  // Level 6 (highest): *, /, %, <<, >>
	  case AsmToken::Star:
	    return Pick(MCBinaryExpr::Mul, 6);
	  case AsmToken::Slash:
	    return Pick(MCBinaryExpr::Div, 6);
	  case AsmToken::Percent:
	    return Pick(MCBinaryExpr::Mod, 6);
	  case AsmToken::LessLess:
	    return Pick(MCBinaryExpr::Shl, 6);
	  case AsmToken::GreaterGreater:
	    return Pick(ShouldUseLogicalShr ? MCBinaryExpr::LShr : MCBinaryExpr::AShr,
	                6);

	  default:
	    return 0; // not a binop.
	  }
	}

	/// Look up the opcode and precedence of a binary-operator token, using the
	/// Darwin table when IsDarwin is set and the GNU table otherwise.
	///
	/// \returns the precedence, or 0 if \p K is not a binary operator.
	unsigned AsmParser::getBinOpPrecedence(AsmToken::TokenKind K,
	                                       MCBinaryExpr::Opcode &Kind) {
	  const bool UseLogicalShr = MAI.shouldUseLogicalShr();
	  if (IsDarwin)
	    return getDarwinBinOpPrecedence(K, Kind, UseLogicalShr);
	  return getGNUBinOpPrecedence(K, Kind, UseLogicalShr);
	}

	/// \brief Parse all binary operators with precedence >= 'Precedence'.
	/// Res contains the LHS of the expression on input.
	bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
	SMLoc &EndLoc) {
	SMLoc StartLoc = Lexer.getLoc();
	while (true) {
	MCBinaryExpr::Opcode Kind = MCBinaryExpr::Add;
	unsigned TokPrec = getBinOpPrecedence(Lexer.getKind(), Kind);

	// If the next token is lower precedence than we are allowed to eat, return
	// successfully with what we ate already.
	if (TokPrec < Precedence)
	return false;

	Lex();

	// Eat the next primary expression.
	const MCExpr *RHS;
	if (parsePrimaryExpr(RHS, EndLoc))
	return true;

	// If BinOp binds less tightly with RHS than the operator after RHS, let
	// the pending operator take RHS as its LHS.
	MCBinaryExpr::Opcode Dummy;
	unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind(), Dummy);
	if (TokPrec < NextTokPrec && parseBinOpRHS(TokPrec + 1, RHS, EndLoc))
	return true;

	// Merge LHS and RHS according to operator.
	Res = MCBinaryExpr::create(Kind, Res, RHS, getContext(), StartLoc);
	}
	}

	/// ParseStatement:
	///   ::= EndOfStatement
	///   ::= Label* Directive ...Operands... EndOfStatement
	///   ::= Label* Identifier OperandList* EndOfStatement
	///
	/// Parses one statement in this order: blank line / comment, cpp hash line
	/// marker, conditional-assembly directives (handled even while skipping so
	/// that e.g. '.endif' is seen inside '.if 0'), labels, assignments, macro
	/// instantiations, directives (target parser, then registered extensions,
	/// then the generic ones below), MS inline-asm pseudo directives, and finally
	/// ordinary instructions.
	///
	/// \param Info receives the parsed operands, the parse-error flag and, for
	///             inline asm, the rewrite list.
	/// \param SI   optional semantic callback used to look up inline-asm labels.
	/// \returns true on error, false on success.
	bool AsmParser::parseStatement(ParseStatementInfo &Info,
	                               MCAsmParserSemaCallback *SI) {
	  assert(!hasPendingError() && "parseStatement started with pending error");
	  // Eat initial spaces and comments
	  while (Lexer.is(AsmToken::Space))
	    Lex();
	  if (Lexer.is(AsmToken::EndOfStatement)) {
	    // if this is a line comment we can drop it safely
	    if (getTok().getString().empty() || getTok().getString().front() == '\r' ||
	        getTok().getString().front() == '\n')
	      Out.AddBlankLine();
	    Lex();
	    return false;
	  }
	  // Statements always start with an identifier.
	  AsmToken ID = getTok();
	  SMLoc IDLoc = ID.getLoc();
	  StringRef IDVal;
	  int64_t LocalLabelVal = -1;
	  if (Lexer.is(AsmToken::HashDirective))
	    return parseCppHashLineFilenameComment(IDLoc);
	  // Allow an integer followed by a ':' as a directional local label.
	  if (Lexer.is(AsmToken::Integer)) {
	    LocalLabelVal = getTok().getIntVal();
	    if (LocalLabelVal < 0) {
	      if (!TheCondState.Ignore) {
	        Lex(); // always eat a token
	        return Error(IDLoc, "unexpected token at start of statement");
	      }
	      IDVal = "";
	    } else {
	      IDVal = getTok().getString();
	      Lex(); // Consume the integer token to be used as an identifier token.
	      if (Lexer.getKind() != AsmToken::Colon) {
	        if (!TheCondState.Ignore) {
	          Lex(); // always eat a token
	          return Error(IDLoc, "unexpected token at start of statement");
	        }
	      }
	    }
	  } else if (Lexer.is(AsmToken::Dot)) {
	    // Treat '.' as a valid identifier in this context.
	    Lex();
	    IDVal = ".";
	  } else if (Lexer.is(AsmToken::LCurly)) {
	    // Treat '{' as a valid identifier in this context.
	    Lex();
	    IDVal = "{";

	  } else if (Lexer.is(AsmToken::RCurly)) {
	    // Treat '}' as a valid identifier in this context.
	    Lex();
	    IDVal = "}";
	  } else if (Lexer.is(AsmToken::Star) &&
	             getTargetParser().starIsStartOfStatement()) {
	    // Accept '*' as a valid start of statement.
	    Lex();
	    IDVal = "*";
	  } else if (parseIdentifier(IDVal)) {
	    if (!TheCondState.Ignore) {
	      Lex(); // always eat a token
	      return Error(IDLoc, "unexpected token at start of statement");
	    }
	    IDVal = "";
	  }

	  // Handle conditional assembly here before checking for skipping. We
	  // have to do this so that .endif isn't skipped in a ".if 0" block for
	  // example.
	  StringMap<DirectiveKind>::const_iterator DirKindIt =
	      DirectiveKindMap.find(IDVal);
	  DirectiveKind DirKind = (DirKindIt == DirectiveKindMap.end())
	                              ? DK_NO_DIRECTIVE
	                              : DirKindIt->getValue();
	  switch (DirKind) {
	  default:
	    break;
	  case DK_IF:
	  case DK_IFEQ:
	  case DK_IFGE:
	  case DK_IFGT:
	  case DK_IFLE:
	  case DK_IFLT:
	  case DK_IFNE:
	    return parseDirectiveIf(IDLoc, DirKind);
	  case DK_IFB:
	    return parseDirectiveIfb(IDLoc, true);
	  case DK_IFNB:
	    return parseDirectiveIfb(IDLoc, false);
	  case DK_IFC:
	    return parseDirectiveIfc(IDLoc, true);
	  case DK_IFEQS:
	    return parseDirectiveIfeqs(IDLoc, true);
	  case DK_IFNC:
	    return parseDirectiveIfc(IDLoc, false);
	  case DK_IFNES:
	    return parseDirectiveIfeqs(IDLoc, false);
	  case DK_IFDEF:
	    return parseDirectiveIfdef(IDLoc, true);
	  case DK_IFNDEF:
	  case DK_IFNOTDEF:
	    return parseDirectiveIfdef(IDLoc, false);
	  case DK_ELSEIF:
	    return parseDirectiveElseIf(IDLoc);
	  case DK_ELSE:
	    return parseDirectiveElse(IDLoc);
	  case DK_ENDIF:
	    return parseDirectiveEndIf(IDLoc);
	  }

	  // Ignore the statement if in the middle of inactive conditional
	  // (e.g. ".if 0").
	  if (TheCondState.Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  // FIXME: Recurse on local labels?

	  // See what kind of statement we have.
	  switch (Lexer.getKind()) {
	  case AsmToken::Colon: {
	    if (!getTargetParser().isLabel(ID))
	      break;
	    if (checkForValidSection())
	      return true;

	    // identifier ':' -> Label.
	    Lex();

	    // Diagnose attempt to use '.' as a label.
	    if (IDVal == ".")
	      return Error(IDLoc, "invalid use of pseudo-symbol '.' as a label");

	    // Diagnose attempt to use a variable as a label.
	    //
	    // FIXME: Diagnostics. Note the location of the definition as a label.
	    // FIXME: This doesn't diagnose assignment to a symbol which has been
	    // implicitly marked as external.
	    MCSymbol *Sym;
	    if (LocalLabelVal == -1) {
	      if (ParsingInlineAsm && SI) {
	        StringRef RewrittenLabel =
	            SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true);
	        assert(!RewrittenLabel.empty() &&
	               "We should have an internal name here.");
	        Info.AsmRewrites->emplace_back(AOK_Label, IDLoc, IDVal.size(),
	                                       RewrittenLabel);
	        IDVal = RewrittenLabel;
	      }
	      Sym = getContext().getOrCreateSymbol(IDVal);
	    } else
	      Sym = Ctx.createDirectionalLocalSymbol(LocalLabelVal);
	    // End of Labels should be treated as end of line for lexing
	    // purposes but that information is not available to the Lexer who
	    // does not understand Labels. This may cause us to see a Hash
	    // here instead of a preprocessor line comment.
	    if (getTok().is(AsmToken::Hash)) {
	      StringRef CommentStr = parseStringToEndOfStatement();
	      Lexer.Lex();
	      Lexer.UnLex(AsmToken(AsmToken::EndOfStatement, CommentStr));
	    }

	    // Consume any end of statement token, if present, to avoid spurious
	    // AddBlankLine calls().
	    if (getTok().is(AsmToken::EndOfStatement)) {
	      Lex();
	    }

	    // Emit the label.
	    if (!getTargetParser().isParsingInlineAsm())
	      Out.EmitLabel(Sym, IDLoc);

	    // If we are generating dwarf for assembly source files then gather the
	    // info to make a dwarf label entry for this label if needed.
	    if (getContext().getGenDwarfForAssembly())
	      MCGenDwarfLabelEntry::Make(Sym, &getStreamer(), getSourceManager(),
	                                 IDLoc);

	    getTargetParser().onLabelParsed(Sym);

	    return false;
	  }

	  case AsmToken::Equal:
	    if (!getTargetParser().equalIsAsmAssignment())
	      break;
	    // identifier '=' ... -> assignment statement
	    Lex();

	    return parseAssignment(IDVal, true);

	  default: // Normal instruction or directive.
	    break;
	  }

	  // If macros are enabled, check to see if this is a macro instantiation.
	  if (areMacrosEnabled())
	- if (const MCAsmMacro *M = lookupMacro(IDVal)) {
	+ if (const MCAsmMacro *M = getContext().lookupMacro(IDVal)) {
	      return handleMacroEntry(M, IDLoc);
	    }

	  // Otherwise, we have a normal instruction or directive.

	  // Directives start with "."
	  // startswith() rather than IDVal[0] so that an empty identifier cannot
	  // index out of range.
	  if (IDVal.startswith(".") && IDVal != ".") {
	    // There are several entities interested in parsing directives:
	    //
	    // 1. The target-specific assembly parser. Some directives are target
	    //    specific or may potentially behave differently on certain targets.
	    // 2. Asm parser extensions. For example, platform-specific parsers
	    //    (like the ELF parser) register themselves as extensions.
	    // 3. The generic directive parser implemented by this class. These are
	    //    all the directives that behave in a target and platform independent
	    //    manner, or at least have a default behavior that's shared between
	    //    all targets and platforms.

	    getTargetParser().flushPendingInstructions(getStreamer());

	    SMLoc StartTokLoc = getTok().getLoc();
	    bool TPDirectiveReturn = getTargetParser().ParseDirective(ID);

	    if (hasPendingError())
	      return true;
	    // Currently the return value should be true if we are
	    // uninterested but as this is at odds with the standard parsing
	    // convention (return true = error) we have instances of a parsed
	    // directive that fails returning true as an error. Catch these
	    // cases as best as possible errors here.
	    if (TPDirectiveReturn && StartTokLoc != getTok().getLoc())
	      return true;
	    // Return if we did some parsing or believe we succeeded.
	    if (!TPDirectiveReturn || StartTokLoc != getTok().getLoc())
	      return false;

	    // Next, check the extension directive map to see if any extension has
	    // registered itself to parse this directive.
	    std::pair<MCAsmParserExtension *, DirectiveHandler> Handler =
	        ExtensionDirectiveMap.lookup(IDVal);
	    if (Handler.first)
	      return (*Handler.second)(Handler.first, IDVal, IDLoc);

	    // Finally, if no one else is interested in this directive, it must be
	    // generic and familiar to this class.
	    switch (DirKind) {
	    default:
	      break;
	    case DK_SET:
	    case DK_EQU:
	      return parseDirectiveSet(IDVal, true);
	    case DK_EQUIV:
	      return parseDirectiveSet(IDVal, false);
	    case DK_ASCII:
	      return parseDirectiveAscii(IDVal, false);
	    case DK_ASCIZ:
	    case DK_STRING:
	      return parseDirectiveAscii(IDVal, true);
	    case DK_BYTE:
	    case DK_DC_B:
	      return parseDirectiveValue(IDVal, 1);
	    case DK_DC:
	    case DK_DC_W:
	    case DK_SHORT:
	    case DK_VALUE:
	    case DK_2BYTE:
	      return parseDirectiveValue(IDVal, 2);
	    case DK_LONG:
	    case DK_INT:
	    case DK_4BYTE:
	    case DK_DC_L:
	      return parseDirectiveValue(IDVal, 4);
	    case DK_QUAD:
	    case DK_8BYTE:
	      return parseDirectiveValue(IDVal, 8);
	    case DK_DC_A:
	      return parseDirectiveValue(
	          IDVal, getContext().getAsmInfo()->getCodePointerSize());
	    case DK_OCTA:
	      return parseDirectiveOctaValue(IDVal);
	    case DK_SINGLE:
	    case DK_FLOAT:
	    case DK_DC_S:
	      return parseDirectiveRealValue(IDVal, APFloat::IEEEsingle());
	    case DK_DOUBLE:
	    case DK_DC_D:
	      return parseDirectiveRealValue(IDVal, APFloat::IEEEdouble());
	    case DK_ALIGN: {
	      bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
	      return parseDirectiveAlign(IsPow2, /*ExprSize=*/1);
	    }
	    case DK_ALIGN32: {
	      bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
	      return parseDirectiveAlign(IsPow2, /*ExprSize=*/4);
	    }
	    case DK_BALIGN:
	      return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1);
	    case DK_BALIGNW:
	      return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/2);
	    case DK_BALIGNL:
	      return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/4);
	    case DK_P2ALIGN:
	      return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1);
	    case DK_P2ALIGNW:
	      return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/2);
	    case DK_P2ALIGNL:
	      return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4);
	    case DK_ORG:
	      return parseDirectiveOrg();
	    case DK_FILL:
	      return parseDirectiveFill();
	    case DK_ZERO:
	      return parseDirectiveZero();
	    case DK_EXTERN:
	      eatToEndOfStatement(); // .extern is the default, ignore it.
	      return false;
	    case DK_GLOBL:
	    case DK_GLOBAL:
	      return parseDirectiveSymbolAttribute(MCSA_Global);
	    case DK_LAZY_REFERENCE:
	      return parseDirectiveSymbolAttribute(MCSA_LazyReference);
	    case DK_NO_DEAD_STRIP:
	      return parseDirectiveSymbolAttribute(MCSA_NoDeadStrip);
	    case DK_SYMBOL_RESOLVER:
	      return parseDirectiveSymbolAttribute(MCSA_SymbolResolver);
	    case DK_PRIVATE_EXTERN:
	      return parseDirectiveSymbolAttribute(MCSA_PrivateExtern);
	    case DK_REFERENCE:
	      return parseDirectiveSymbolAttribute(MCSA_Reference);
	    case DK_WEAK_DEFINITION:
	      return parseDirectiveSymbolAttribute(MCSA_WeakDefinition);
	    case DK_WEAK_REFERENCE:
	      return parseDirectiveSymbolAttribute(MCSA_WeakReference);
	    case DK_WEAK_DEF_CAN_BE_HIDDEN:
	      return parseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate);
	    case DK_COMM:
	    case DK_COMMON:
	      return parseDirectiveComm(/*IsLocal=*/false);
	    case DK_LCOMM:
	      return parseDirectiveComm(/*IsLocal=*/true);
	    case DK_ABORT:
	      return parseDirectiveAbort();
	    case DK_INCLUDE:
	      return parseDirectiveInclude();
	    case DK_INCBIN:
	      return parseDirectiveIncbin();
	    case DK_CODE16:
	    case DK_CODE16GCC:
	      return TokError(Twine(IDVal) +
	                      " not currently supported for this target");
	    case DK_REPT:
	      return parseDirectiveRept(IDLoc, IDVal);
	    case DK_IRP:
	      return parseDirectiveIrp(IDLoc);
	    case DK_IRPC:
	      return parseDirectiveIrpc(IDLoc);
	    case DK_ENDR:
	      return parseDirectiveEndr(IDLoc);
	    case DK_BUNDLE_ALIGN_MODE:
	      return parseDirectiveBundleAlignMode();
	    case DK_BUNDLE_LOCK:
	      return parseDirectiveBundleLock();
	    case DK_BUNDLE_UNLOCK:
	      return parseDirectiveBundleUnlock();
	    case DK_SLEB128:
	      return parseDirectiveLEB128(true);
	    case DK_ULEB128:
	      return parseDirectiveLEB128(false);
	    case DK_SPACE:
	    case DK_SKIP:
	      return parseDirectiveSpace(IDVal);
	    case DK_FILE:
	      return parseDirectiveFile(IDLoc);
	    case DK_LINE:
	      return parseDirectiveLine();
	    case DK_LOC:
	      return parseDirectiveLoc();
	    case DK_STABS:
	      return parseDirectiveStabs();
	    case DK_CV_FILE:
	      return parseDirectiveCVFile();
	    case DK_CV_FUNC_ID:
	      return parseDirectiveCVFuncId();
	    case DK_CV_INLINE_SITE_ID:
	      return parseDirectiveCVInlineSiteId();
	    case DK_CV_LOC:
	      return parseDirectiveCVLoc();
	    case DK_CV_LINETABLE:
	      return parseDirectiveCVLinetable();
	    case DK_CV_INLINE_LINETABLE:
	      return parseDirectiveCVInlineLinetable();
	    case DK_CV_DEF_RANGE:
	      return parseDirectiveCVDefRange();
	    case DK_CV_STRINGTABLE:
	      return parseDirectiveCVStringTable();
	    case DK_CV_FILECHECKSUMS:
	      return parseDirectiveCVFileChecksums();
	    case DK_CV_FILECHECKSUM_OFFSET:
	      return parseDirectiveCVFileChecksumOffset();
	    case DK_CV_FPO_DATA:
	      return parseDirectiveCVFPOData();
	    case DK_CFI_SECTIONS:
	      return parseDirectiveCFISections();
	    case DK_CFI_STARTPROC:
	      return parseDirectiveCFIStartProc();
	    case DK_CFI_ENDPROC:
	      return parseDirectiveCFIEndProc();
	    case DK_CFI_DEF_CFA:
	      return parseDirectiveCFIDefCfa(IDLoc);
	    case DK_CFI_DEF_CFA_OFFSET:
	      return parseDirectiveCFIDefCfaOffset();
	    case DK_CFI_ADJUST_CFA_OFFSET:
	      return parseDirectiveCFIAdjustCfaOffset();
	    case DK_CFI_DEF_CFA_REGISTER:
	      return parseDirectiveCFIDefCfaRegister(IDLoc);
	    case DK_CFI_OFFSET:
	      return parseDirectiveCFIOffset(IDLoc);
	    case DK_CFI_REL_OFFSET:
	      return parseDirectiveCFIRelOffset(IDLoc);
	    case DK_CFI_PERSONALITY:
	      return parseDirectiveCFIPersonalityOrLsda(true);
	    case DK_CFI_LSDA:
	      return parseDirectiveCFIPersonalityOrLsda(false);
	    case DK_CFI_REMEMBER_STATE:
	      return parseDirectiveCFIRememberState();
	    case DK_CFI_RESTORE_STATE:
	      return parseDirectiveCFIRestoreState();
	    case DK_CFI_SAME_VALUE:
	      return parseDirectiveCFISameValue(IDLoc);
	    case DK_CFI_RESTORE:
	      return parseDirectiveCFIRestore(IDLoc);
	    case DK_CFI_ESCAPE:
	      return parseDirectiveCFIEscape();
	    case DK_CFI_RETURN_COLUMN:
	      return parseDirectiveCFIReturnColumn(IDLoc);
	    case DK_CFI_SIGNAL_FRAME:
	      return parseDirectiveCFISignalFrame();
	    case DK_CFI_UNDEFINED:
	      return parseDirectiveCFIUndefined(IDLoc);
	    case DK_CFI_REGISTER:
	      return parseDirectiveCFIRegister(IDLoc);
	    case DK_CFI_WINDOW_SAVE:
	      return parseDirectiveCFIWindowSave();
	    case DK_MACROS_ON:
	    case DK_MACROS_OFF:
	      return parseDirectiveMacrosOnOff(IDVal);
	    case DK_MACRO:
	      return parseDirectiveMacro(IDLoc);
	    case DK_ALTMACRO:
	    case DK_NOALTMACRO:
	      return parseDirectiveAltmacro(IDVal);
	    case DK_EXITM:
	      return parseDirectiveExitMacro(IDVal);
	    case DK_ENDM:
	    case DK_ENDMACRO:
	      return parseDirectiveEndMacro(IDVal);
	    case DK_PURGEM:
	      return parseDirectivePurgeMacro(IDLoc);
	    case DK_END:
	      return parseDirectiveEnd(IDLoc);
	    case DK_ERR:
	      return parseDirectiveError(IDLoc, false);
	    case DK_ERROR:
	      return parseDirectiveError(IDLoc, true);
	    case DK_WARNING:
	      return parseDirectiveWarning(IDLoc);
	    case DK_RELOC:
	      return parseDirectiveReloc(IDLoc);
	    case DK_DCB:
	    case DK_DCB_W:
	      return parseDirectiveDCB(IDVal, 2);
	    case DK_DCB_B:
	      return parseDirectiveDCB(IDVal, 1);
	    case DK_DCB_D:
	      return parseDirectiveRealDCB(IDVal, APFloat::IEEEdouble());
	    case DK_DCB_L:
	      return parseDirectiveDCB(IDVal, 4);
	    case DK_DCB_S:
	      return parseDirectiveRealDCB(IDVal, APFloat::IEEEsingle());
	    case DK_DC_X:
	    case DK_DCB_X:
	      return TokError(Twine(IDVal) +
	                      " not currently supported for this target");
	    case DK_DS:
	    case DK_DS_W:
	      return parseDirectiveDS(IDVal, 2);
	    case DK_DS_B:
	      return parseDirectiveDS(IDVal, 1);
	    case DK_DS_D:
	      return parseDirectiveDS(IDVal, 8);
	    case DK_DS_L:
	    case DK_DS_S:
	      return parseDirectiveDS(IDVal, 4);
	    case DK_DS_P:
	    case DK_DS_X:
	      return parseDirectiveDS(IDVal, 12);
	    case DK_PRINT:
	      return parseDirectivePrint(IDLoc);
	    }

	    return Error(IDLoc, "unknown directive");
	  }

	  // __asm _emit or __asm __emit
	  if (ParsingInlineAsm && (IDVal == "_emit" || IDVal == "__emit" ||
	                           IDVal == "_EMIT" || IDVal == "__EMIT"))
	    return parseDirectiveMSEmit(IDLoc, Info, IDVal.size());

	  // __asm align
	  if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
	    return parseDirectiveMSAlign(IDLoc, Info);

	  if (ParsingInlineAsm && (IDVal == "even" || IDVal == "EVEN"))
	    Info.AsmRewrites->emplace_back(AOK_EVEN, IDLoc, 4);
	  if (checkForValidSection())
	    return true;

	  // Canonicalize the opcode to lower case.
	  std::string OpcodeStr = IDVal.lower();
	  ParseInstructionInfo IInfo(Info.AsmRewrites);
	  bool ParseHadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, ID,
	                                                          Info.ParsedOperands);
	  Info.ParseError = ParseHadError;

	  // Dump the parsed representation, if requested.
	  if (getShowParsedOperands()) {
	    SmallString<256> Str;
	    raw_svector_ostream OS(Str);
	    OS << "parsed instruction: [";
	    for (unsigned i = 0; i != Info.ParsedOperands.size(); ++i) {
	      if (i != 0)
	        OS << ", ";
	      Info.ParsedOperands[i]->print(OS);
	    }
	    OS << "]";

	    printMessage(IDLoc, SourceMgr::DK_Note, OS.str());
	  }

	  // Fail even if ParseInstruction erroneously returns false.
	  if (hasPendingError() || ParseHadError)
	    return true;

	  // If we are generating dwarf for the current section then generate a .loc
	  // directive for the instruction.
	  if (!ParseHadError && getContext().getGenDwarfForAssembly() &&
	      getContext().getGenDwarfSectionSyms().count(
	          getStreamer().getCurrentSectionOnly())) {
	    unsigned Line;
	    if (ActiveMacros.empty())
	      Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
	    else
	      Line = SrcMgr.FindLineNumber(ActiveMacros.front()->InstantiationLoc,
	                                   ActiveMacros.front()->ExitBuffer);

	    // If we previously parsed a cpp hash file line comment then make sure the
	    // current Dwarf File is for the CppHashFilename if not then emit the
	    // Dwarf File table for it and adjust the line number for the .loc.
	    if (!CppHashInfo.Filename.empty()) {
	      unsigned FileNumber = getStreamer().EmitDwarfFileDirective(
	          0, StringRef(), CppHashInfo.Filename);
	      getContext().setGenDwarfFileNumber(FileNumber);

	      // Since SrcMgr.FindLineNumber() is slow and messes up the SourceMgr's
	      // cache with the different Loc from the call above we save the last
	      // info we queried here with SrcMgr.FindLineNumber().
	      unsigned CppHashLocLineNo;
	      if (LastQueryIDLoc == CppHashInfo.Loc &&
	          LastQueryBuffer == CppHashInfo.Buf)
	        CppHashLocLineNo = LastQueryLine;
	      else {
	        CppHashLocLineNo =
	            SrcMgr.FindLineNumber(CppHashInfo.Loc, CppHashInfo.Buf);
	        LastQueryLine = CppHashLocLineNo;
	        LastQueryIDLoc = CppHashInfo.Loc;
	        LastQueryBuffer = CppHashInfo.Buf;
	      }
	      Line = CppHashInfo.LineNumber - 1 + (Line - CppHashLocLineNo);
	    }

	    getStreamer().EmitDwarfLocDirective(
	        getContext().getGenDwarfFileNumber(), Line, 0,
	        DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0, 0, 0,
	        StringRef());
	  }

	  // If parsing succeeded, match the instruction.
	  if (!ParseHadError) {
	    uint64_t ErrorInfo;
	    if (getTargetParser().MatchAndEmitInstruction(
	            IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo,
	            getTargetParser().isParsingInlineAsm()))
	      return true;
	  }
	  return false;
	}

	/// Consume a '{' or '}' block marker and schedule its removal from the
	/// rewritten asm string.
	///
	/// If the current token is neither brace nothing is consumed. Otherwise the
	/// brace and an EndOfStatement token directly after it (if any) are eaten, and
	/// an AOK_Skip rewrite covering the consumed text is appended to
	/// \p AsmStrRewrites.
	///
	/// \returns true if a brace was consumed, false otherwise.
	bool
	AsmParser::parseCurlyBlockScope(SmallVectorImpl<AsmRewrite> &AsmStrRewrites) {
	  // Only a curly brace opens or closes a block scope.
	  if (Lexer.isNot(AsmToken::LCurly) && Lexer.isNot(AsmToken::RCurly))
	    return false;

	  const SMLoc BraceLoc = Lexer.getLoc();
	  Lex(); // Eat the brace
	  if (Lexer.is(AsmToken::EndOfStatement))
	    Lex(); // Eat EndOfStatement following the brace

	  // The skipped range runs from the brace up to the current lexer position.
	  const char *SkipBegin = BraceLoc.getPointer();
	  const char *SkipEnd = Lexer.getLoc().getPointer();
	  AsmStrRewrites.emplace_back(AOK_Skip, BraceLoc, SkipEnd - SkipBegin);
	  return true;
	}

	/// parseCppHashLineFilenameComment as this:
	///   ::= # number "filename"
	///
	/// Records the line number, the unquoted filename, the location \p L and the
	/// current buffer in CppHashInfo for later use. Always returns false; a
	/// malformed marker is treated as an internal error and only asserted on.
	bool AsmParser::parseCppHashLineFilenameComment(SMLoc L) {
	  Lex(); // Eat the hash token.

	  // The lexer only produces HashDirective for a fully formed marker, so the
	  // next two tokens are expected to be the line number and the filename.
	  assert(getTok().is(AsmToken::Integer) &&
	         "Lexing Cpp line comment: Expected Integer");
	  const int64_t LineNo = getTok().getIntVal();
	  Lex();

	  assert(getTok().is(AsmToken::String) &&
	         "Lexing Cpp line comment: Expected String");
	  const StringRef QuotedName = getTok().getString();
	  Lex();

	  // Save everything for later use by diagnostics; the filename is stored
	  // without its enclosing quotes.
	  CppHashInfo.Loc = L;
	  CppHashInfo.Filename = QuotedName.substr(1, QuotedName.size() - 2);
	  CppHashInfo.LineNumber = LineNo;
	  CppHashInfo.Buf = CurBuffer;
	  return false;
	}

	/// \brief Diagnostic callback that rewrites the filename and line number of a
	/// diagnostic using the last parsed cpp hash line marker, if any.
	///
	/// \param Diag    the diagnostic to report.
	/// \param Context the AsmParser that installed this handler, passed as void*.
	///
	/// The diagnostic is forwarded unchanged when no marker has been recorded
	/// (CppHashInfo.LineNumber is zero), when it comes from a different source
	/// manager, or when it lies in a different buffer than the marker. Otherwise a
	/// copy is built with the marker's filename and a line number offset from the
	/// marker's line. Output goes to the saved handler when one is set, else to
	/// errs().
	void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
	  const AsmParser *Parser = static_cast<const AsmParser *>(Context);
	  raw_ostream &OS = errs();

	  const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr();
	  SMLoc DiagLoc = Diag.getLoc();
	  unsigned DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
	  unsigned CppHashBuf =
	      Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashInfo.Loc);

	  // Like SourceMgr::printMessage() we need to print the include stack if any
	  // before printing the message. DiagBuf already identifies the buffer that
	  // holds the diagnostic, so it is reused rather than looked up again.
	  if (!Parser->SavedDiagHandler && DiagBuf &&
	      DiagBuf != DiagSrcMgr.getMainFileID()) {
	    SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagBuf);
	    DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
	  }

	  // If we have not parsed a cpp hash line filename comment or the source
	  // manager changed or buffer changed (like in a nested include) then just
	  // print the normal diagnostic using its Filename and LineNo.
	  if (!Parser->CppHashInfo.LineNumber || &DiagSrcMgr != &Parser->SrcMgr ||
	      DiagBuf != CppHashBuf) {
	    if (Parser->SavedDiagHandler)
	      Parser->SavedDiagHandler(Diag, Parser->SavedDiagContext);
	    else
	      Diag.print(nullptr, OS);
	    return;
	  }

	  // Use the CppHashFilename and calculate a line number based on the
	  // CppHashInfo.Loc and CppHashInfo.LineNumber relative to this Diag's SMLoc
	  // for the diagnostic.
	  const std::string &Filename = Parser->CppHashInfo.Filename;

	  int DiagLocLineNo = DiagSrcMgr.FindLineNumber(DiagLoc, DiagBuf);
	  int CppHashLocLineNo =
	      Parser->SrcMgr.FindLineNumber(Parser->CppHashInfo.Loc, CppHashBuf);
	  int LineNo =
	      Parser->CppHashInfo.LineNumber - 1 + (DiagLocLineNo - CppHashLocLineNo);

	  SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(), Filename, LineNo,
	                       Diag.getColumnNo(), Diag.getKind(), Diag.getMessage(),
	                       Diag.getLineContents(), Diag.getRanges());

	  if (Parser->SavedDiagHandler)
	    Parser->SavedDiagHandler(NewDiag, Parser->SavedDiagContext);
	  else
	    NewDiag.print(nullptr, OS);
	}

	// FIXME: This is mostly duplicated from the function in AsmLexer.cpp. The
	// difference being that that function accepts '@' as part of identifiers and
	// we can't do that. AsmLexer.cpp should probably be changed to handle
	// '@' as a special case when needed.
	//
	// Returns true if c is alphanumeric or one of '_', '$', '.'. The argument is
	// converted to unsigned char before the <cctype> call so that negative char
	// values are not passed to isalnum.
	static bool isIdentifierChar(char c) {
	  return isalnum(static_cast<unsigned char>(c)) || c == '_' || c == '$' ||
	         c == '.';
	}

	/// Write \p Body to \p OS with macro substitutions applied.
	///
	/// Two substitution syntaxes are handled:
	///  - On Darwin, for a macro declared without parameters: "$$" becomes "$",
	///    "$n" becomes the number of arguments in \p A, and "$<digit>" becomes the
	///    tokens of that argument (nothing if the index is out of range).
	///  - Otherwise: "\name" is replaced by the tokens of the argument bound to
	///    the parameter called "name"; "\@" (only when \p EnableAtPseudoVariable
	///    is set) is replaced by NumOfMacroInstantiations; "\()" is dropped; any
	///    other "\text" is copied through unchanged.
	///
	/// \param Parameters formal parameters of the macro.
	/// \param A actual arguments, indexed like \p Parameters.
	/// \param L location used for the argument-count diagnostic.
	/// \returns true (after reporting "Wrong number of arguments") when the
	/// number of arguments does not match the number of parameters, except for a
	/// parameterless macro on Darwin; false otherwise.
	bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
	ArrayRef<MCAsmMacroParameter> Parameters,
	ArrayRef<MCAsmMacroArgument> A,
	bool EnableAtPseudoVariable, SMLoc L) {
	unsigned NParameters = Parameters.size();
	bool HasVararg = NParameters ? Parameters.back().Vararg : false;
	if ((!IsDarwin \|\| NParameters != 0) && NParameters != A.size())
	return Error(L, "Wrong number of arguments");

	// A macro without parameters is handled differently on Darwin:
	// gas accepts no arguments and does no substitutions
	// Each iteration copies the text up to the next substitution, emits the
	// substitution, and then drops the consumed prefix from Body.
	while (!Body.empty()) {
	// Scan for the next substitution.
	std::size_t End = Body.size(), Pos = 0;
	for (; Pos != End; ++Pos) {
	// Check for a substitution or escape.
	if (IsDarwin && !NParameters) {
	// This macro has no parameters, look for $0, $1, etc.
	if (Body[Pos] != '$' \|\| Pos + 1 == End)
	continue;

	char Next = Body[Pos + 1];
	if (Next == '$' \|\| Next == 'n' \|\|
	isdigit(static_cast<unsigned char>(Next)))
	break;
	} else {
	// This macro has parameters, look for \foo, \bar, etc.
	if (Body[Pos] == '\\' && Pos + 1 != End)
	break;
	}
	}

	// Add the prefix.
	OS << Body.slice(0, Pos);

	// Check if we reached the end.
	if (Pos == End)
	break;

	if (IsDarwin && !NParameters) {
	switch (Body[Pos + 1]) {
	// $$ => $
	case '$':
	OS << '$';
	break;

	// $n => number of arguments
	case 'n':
	OS << A.size();
	break;

	// $[0-9] => argument
	default: {
	// Missing arguments are ignored.
	unsigned Index = Body[Pos + 1] - '0';
	if (Index >= A.size())
	break;

	// Otherwise substitute with the token values, with spaces eliminated.
	for (const AsmToken &Token : A[Index])
	OS << Token.getString();
	break;
	}
	}
	// Skip the '$' and the character that followed it.
	Pos += 2;
	} else {
	// I walks forward from the character after the backslash to find the end
	// of the referenced name.
	unsigned I = Pos + 1;

	// Check for the \@ pseudo-variable.
	if (EnableAtPseudoVariable && Body[I] == '@' && I + 1 != End)
	++I;
	else
	while (isIdentifierChar(Body[I]) && I + 1 != End)
	++I;

	// Argument is the text between the backslash and position I.
	const char *Begin = Body.data() + Pos + 1;
	StringRef Argument(Begin, I - (Pos + 1));
	unsigned Index = 0;

	if (Argument == "@") {
	OS << NumOfMacroInstantiations;
	Pos += 2;
	} else {
	// Look the name up among the formal parameters.
	for (; Index < NParameters; ++Index)
	if (Parameters[Index].Name == Argument)
	break;

	if (Index == NParameters) {
	// Not a parameter: "\()" is consumed without output, anything else is
	// copied through including its backslash.
	if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')')
	Pos += 3;
	else {
	OS << '\\' << Argument;
	Pos = I;
	}
	} else {
	bool VarargParameter = HasVararg && Index == (NParameters - 1);
	for (const AsmToken &Token : A[Index])
	// For altmacro mode, you can write '%expr'.
	// The prefix '%' evaluates the expression 'expr'
	// and uses the result as a string (e.g. replace %(1+2) with the string "3").
	// Here, we identify the integer token which is the result of the
	// absolute expression evaluation and replace it with its string representation.
	if ((Lexer.IsaAltMacroMode()) &&
	(*(Token.getString().begin()) == '%') && Token.is(AsmToken::Integer))
	// Emit an integer value to the buffer.
	OS << Token.getIntVal();
	// Only Token that was validated as a string and begins with '<'
	// is considered altMacroString!!!
	else if ((Lexer.IsaAltMacroMode()) &&
	(*(Token.getString().begin()) == '<') &&
	Token.is(AsmToken::String)) {
	std::string Res;
	altMacroString(Token.getStringContents(), Res);
	OS << Res;
	}
	// We expect no quotes around the string's contents when
	// parsing for varargs.
	else if (Token.isNot(AsmToken::String) \|\| VarargParameter)
	OS << Token.getString();
	else
	OS << Token.getStringContents();

	// Skip the backslash and the parameter name.
	Pos += 1 + Argument.size();
	}
	}
	}
	// Update the scan point.
	Body = Body.substr(Pos);
	}

	return false;
	}

	/// Store the constructor arguments in the corresponding members: \p IL in
	/// InstantiationLoc, \p EB in ExitBuffer, \p EL in ExitLoc and
	/// \p CondStackDepth in the member of the same name.
	MacroInstantiation::MacroInstantiation(SMLoc IL, int EB, SMLoc EL,
	size_t CondStackDepth)
	: InstantiationLoc(IL), ExitBuffer(EB), ExitLoc(EL),
	CondStackDepth(CondStackDepth) {}

	/// Returns true if \p kind is one of the token kinds listed below, i.e. a
	/// token that parseMacroArgument treats as an operator continuing the
	/// current macro argument.
	static bool isOperator(AsmToken::TokenKind kind) {
	  switch (kind) {
	  // Arithmetic and unary operators.
	  case AsmToken::Plus:
	  case AsmToken::Minus:
	  case AsmToken::Tilde:
	  case AsmToken::Slash:
	  case AsmToken::Star:
	  case AsmToken::Dot:
	  // Assignment / equality.
	  case AsmToken::Equal:
	  case AsmToken::EqualEqual:
	  // Bitwise and logical operators.
	  case AsmToken::Pipe:
	  case AsmToken::PipePipe:
	  case AsmToken::Caret:
	  case AsmToken::Amp:
	  case AsmToken::AmpAmp:
	  case AsmToken::Exclaim:
	  case AsmToken::ExclaimEqual:
	  // Comparisons and shifts.
	  case AsmToken::Less:
	  case AsmToken::LessEqual:
	  case AsmToken::LessLess:
	  case AsmToken::LessGreater:
	  case AsmToken::Greater:
	  case AsmToken::GreaterEqual:
	  case AsmToken::GreaterGreater:
	    return true;
	  default:
	    return false;
	  }
	}

	namespace {

	class AsmLexerSkipSpaceRAII {
	public:
	AsmLexerSkipSpaceRAII(AsmLexer &Lexer, bool SkipSpace) : Lexer(Lexer) {
	Lexer.setSkipSpace(SkipSpace);
	}

	~AsmLexerSkipSpaceRAII() {
	Lexer.setSkipSpace(true);
	}

	private:
	AsmLexer &Lexer;
	};

	} // end anonymous namespace

	/// Parse a single macro argument and append its tokens to \p MA.
	///
	/// When \p Vararg is set, the rest of the statement (if any) is stored as one
	/// String token. Otherwise tokens are collected until a comma or a delimiting
	/// space outside parentheses, or until the end of the statement; the
	/// terminating token is not consumed for a comma or end of statement.
	///
	/// \returns true after reporting an error if an '=' or end-of-file token is
	/// encountered or if parentheses are unbalanced; false otherwise.
	bool AsmParser::parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg) {

	if (Vararg) {
	if (Lexer.isNot(AsmToken::EndOfStatement)) {
	StringRef Str = parseStringToEndOfStatement();
	MA.emplace_back(AsmToken::String, Str);
	}
	return false;
	}

	unsigned ParenLevel = 0;

	// Darwin doesn't use spaces to delimit arguments.
	// The helper sets skip-space to IsDarwin for the duration of this function.
	AsmLexerSkipSpaceRAII ScopedSkipSpace(Lexer, IsDarwin);

	bool SpaceEaten;

	while (true) {
	SpaceEaten = false;
	if (Lexer.is(AsmToken::Eof) \|\| Lexer.is(AsmToken::Equal))
	return TokError("unexpected token in macro instantiation");

	// Argument delimiters are only recognized outside parentheses.
	if (ParenLevel == 0) {

	if (Lexer.is(AsmToken::Comma))
	break;

	if (Lexer.is(AsmToken::Space)) {
	SpaceEaten = true;
	Lexer.Lex(); // Eat spaces
	}

	// Spaces can delimit parameters, but could also be part of an expression.
	// If the token after a space is an operator, add the token and the next
	// one into this argument
	if (!IsDarwin) {
	if (isOperator(Lexer.getKind())) {
	MA.push_back(getTok());
	Lexer.Lex();

	// Whitespace after an operator can be ignored.
	if (Lexer.is(AsmToken::Space))
	Lexer.Lex();

	continue;
	}
	}
	// A space that was not followed by an operator ends the argument.
	if (SpaceEaten)
	break;
	}

	// handleMacroEntry relies on not advancing the lexer here
	// to be able to fill in the remaining default parameter values
	if (Lexer.is(AsmToken::EndOfStatement))
	break;

	// Adjust the current parentheses level.
	if (Lexer.is(AsmToken::LParen))
	++ParenLevel;
	else if (Lexer.is(AsmToken::RParen) && ParenLevel)
	--ParenLevel;

	// Append the token to the current argument list.
	MA.push_back(getTok());
	Lexer.Lex();
	}

	if (ParenLevel != 0)
	return TokError("unbalanced parentheses in macro argument");
	return false;
	}

	// Parse the macro instantiation arguments.
	bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
	MCAsmMacroArguments &A) {
	const unsigned NParameters = M ? M->Parameters.size() : 0;
	bool NamedParametersFound = false;
	SmallVector<SMLoc, 4> FALocs;

	A.resize(NParameters);
	FALocs.resize(NParameters);

	// Parse two kinds of macro invocations:
	// - macros defined without any parameters accept an arbitrary number of them
	// - macros defined with parameters accept at most that many of them
	bool HasVararg = NParameters ? M->Parameters.back().Vararg : false;
	for (unsigned Parameter = 0; !NParameters \|\| Parameter < NParameters;
	++Parameter) {
	SMLoc IDLoc = Lexer.getLoc();
	MCAsmMacroParameter FA;

	if (Lexer.is(AsmToken::Identifier) && Lexer.peekTok().is(AsmToken::Equal)) {
	if (parseIdentifier(FA.Name))
	return Error(IDLoc, "invalid argument identifier for formal argument");

	if (Lexer.isNot(AsmToken::Equal))
	return TokError("expected '=' after formal parameter identifier");

	Lex();

	NamedParametersFound = true;
	}
	bool Vararg = HasVararg && Parameter == (NParameters - 1);

	if (NamedParametersFound && FA.Name.empty())
	return Error(IDLoc, "cannot mix positional and keyword arguments");

	SMLoc StrLoc = Lexer.getLoc();
	SMLoc EndLoc;
	if (Lexer.IsaAltMacroMode() && Lexer.is(AsmToken::Percent)) {
	const MCExpr *AbsoluteExp;
	int64_t Value;
	/// Eat '%'
	Lex();
	if (parseExpression(AbsoluteExp, EndLoc))
	return false;
	if (!AbsoluteExp->evaluateAsAbsolute(Value))
	return Error(StrLoc, "expected absolute expression");
	const char *StrChar = StrLoc.getPointer();
	const char *EndChar = EndLoc.getPointer();
	AsmToken newToken(AsmToken::Integer, StringRef(StrChar , EndChar - StrChar), Value);
	FA.Value.push_back(newToken);
	} else if (Lexer.IsaAltMacroMode() && Lexer.is(AsmToken::Less) &&
	isAltmacroString(StrLoc, EndLoc)) {
	const char *StrChar = StrLoc.getPointer();
	const char *EndChar = EndLoc.getPointer();
	jumpToLoc(EndLoc, CurBuffer);
	/// Eat from '<' to '>'
	Lex();
	AsmToken newToken(AsmToken::String, StringRef(StrChar, EndChar - StrChar));
	FA.Value.push_back(newToken);
	} else if(parseMacroArgument(FA.Value, Vararg))
	return true;

	unsigned PI = Parameter;
	if (!FA.Name.empty()) {
	unsigned FAI = 0;
	for (FAI = 0; FAI < NParameters; ++FAI)
	if (M->Parameters[FAI].Name == FA.Name)
	break;

	if (FAI >= NParameters) {
	assert(M && "expected macro to be defined");
	return Error(IDLoc, "parameter named '" + FA.Name +
	"' does not exist for macro '" + M->Name + "'");
	}
	PI = FAI;
	}

	if (!FA.Value.empty()) {
	if (A.size() <= PI)
	A.resize(PI + 1);
	A[PI] = FA.Value;

	if (FALocs.size() <= PI)
	FALocs.resize(PI + 1);

	FALocs[PI] = Lexer.getLoc();
	}

	// At the end of the statement, fill in remaining arguments that have
	// default values. If there aren't any, then the next argument is
	// required but missing
	if (Lexer.is(AsmToken::EndOfStatement)) {
	bool Failure = false;
	for (unsigned FAI = 0; FAI < NParameters; ++FAI) {
	if (A[FAI].empty()) {
	if (M->Parameters[FAI].Required) {
	Error(FALocs[FAI].isValid() ? FALocs[FAI] : Lexer.getLoc(),
	"missing value for required parameter "
	"'" + M->Parameters[FAI].Name + "' in macro '" + M->Name + "'");
	Failure = true;
	}

	if (!M->Parameters[FAI].Value.empty())
	A[FAI] = M->Parameters[FAI].Value;
	}
	}
	return Failure;
	}

	if (Lexer.is(AsmToken::Comma))
	Lex();
	}

	return TokError("too many positional arguments");
	}

	-const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) {
	- StringMap<MCAsmMacro>::iterator I = MacroMap.find(Name);
	- return (I == MacroMap.end()) ? nullptr : &I->getValue();
	-}
	-
	-void AsmParser::defineMacro(StringRef Name, MCAsmMacro Macro) {
	- MacroMap.insert(std::make_pair(Name, std::move(Macro)));
	-}
	-
	-void AsmParser::undefineMacro(StringRef Name) { MacroMap.erase(Name); }
	-
	bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
	// Arbitrarily limit macro nesting depth (default matches 'as'). We can
	// eliminate this, although we should protect against infinite loops.
	unsigned MaxNestingDepth = AsmMacroMaxNestingDepth;
	if (ActiveMacros.size() == MaxNestingDepth) {
	std::ostringstream MaxNestingDepthError;
	MaxNestingDepthError << "macros cannot be nested more than "
	<< MaxNestingDepth << " levels deep."
	<< " Use -asm-macro-max-nesting-depth to increase "
	"this limit.";
	return TokError(MaxNestingDepthError.str());
	}

	MCAsmMacroArguments A;
	if (parseMacroArguments(M, A))
	return true;

	// Macro instantiation is lexical, unfortunately. We construct a new buffer
	// to hold the macro body with substitutions.
	SmallString<256> Buf;
	StringRef Body = M->Body;
	raw_svector_ostream OS(Buf);

	if (expandMacro(OS, Body, M->Parameters, A, true, getTok().getLoc()))
	return true;

	// We include the .endmacro in the buffer as our cue to exit the macro
	// instantiation.
	OS << ".endmacro\n";

	std::unique_ptr<MemoryBuffer> Instantiation =
	MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");

	// Create the macro instantiation object and add to the current macro
	// instantiation stack.
	MacroInstantiation *MI = new MacroInstantiation(
	NameLoc, CurBuffer, getTok().getLoc(), TheCondStack.size());
	ActiveMacros.push_back(MI);

	++NumOfMacroInstantiations;

	// Jump to the macro instantiation and prime the lexer.
	CurBuffer = SrcMgr.AddNewSourceBuffer(std::move(Instantiation), SMLoc());
	Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
	Lex();

	return false;
	}

	void AsmParser::handleMacroExit() {
	// Jump to the EndOfStatement we should return to, and consume it.
	jumpToLoc(ActiveMacros.back()->ExitLoc, ActiveMacros.back()->ExitBuffer);
	Lex();

	// Pop the instantiation entry.
	delete ActiveMacros.back();
	ActiveMacros.pop_back();
	}

	/// Parse the value assigned to \p Name and emit the assignment; when
	/// \p NoDeadStrip is set the symbol is also marked MCSA_NoDeadStrip.
	/// Returns true if parsing the assignment expression failed.
	bool AsmParser::parseAssignment(StringRef Name, bool allow_redef,
	                                bool NoDeadStrip) {
	  MCSymbol *Target;
	  const MCExpr *RHS;
	  const bool Failed = MCParserUtils::parseAssignmentExpression(
	      Name, allow_redef, *this, Target, RHS);
	  if (Failed)
	    return true;

	  // In the case where we parse an expression starting with a '.', we will
	  // not generate an error, nor will we create a symbol; there is nothing to
	  // emit then.
	  if (Target) {
	    Out.EmitAssignment(Target, RHS);
	    if (NoDeadStrip)
	      Out.EmitSymbolAttribute(Target, MCSA_NoDeadStrip);
	  }

	  return false;
	}

	/// parseIdentifier:
	/// ::= identifier
	/// ::= string
	///
	/// Stores the identifier in \p Res and consumes it. Returns true (without
	/// consuming anything) if the current token does not start an identifier.
	bool AsmParser::parseIdentifier(StringRef &Res) {
	  const bool HasPrefix = Lexer.is(AsmToken::Dollar) || Lexer.is(AsmToken::At);

	  if (!HasPrefix) {
	    // Ordinary case: a single identifier or string token.
	    if (Lexer.isNot(AsmToken::Identifier) && Lexer.isNot(AsmToken::String))
	      return true;

	    Res = getTok().getIdentifier();
	    Lex(); // Consume the identifier token.
	    return false;
	  }

	  // The assembler has relaxed rules for accepting identifiers, in particular
	  // we allow things like '.globl $foo' and '.def @feat.00', which would
	  // normally be separate tokens. At this level, we have already lexed so we
	  // cannot (currently) handle this as a context dependent token, instead we
	  // detect adjacent tokens and return the combined identifier.
	  SMLoc PrefixLoc = getLexer().getLoc();

	  // Look at the token after the prefix character without consuming it.
	  AsmToken Next[1];
	  Lexer.peekTokens(Next, false);

	  if (Next[0].isNot(AsmToken::Identifier))
	    return true;

	  // The '$' or '@' must be immediately followed by the identifier.
	  const char *AfterPrefix = PrefixLoc.getPointer() + 1;
	  if (AfterPrefix != Next[0].getLoc().getPointer())
	    return true;

	  // eat $ or @
	  Lexer.Lex(); // Lexer's Lex guarantees consecutive token.
	  // Construct the joined identifier and consume the token.
	  Res = StringRef(PrefixLoc.getPointer(), getTok().getIdentifier().size() + 1);
	  Lex(); // Parser Lex to maintain invariants.
	  return false;
	}

	/// parseDirectiveSet:
	/// ::= .equ identifier ',' expression
	/// ::= .equiv identifier ',' expression
	/// ::= .set identifier ',' expression
	///
	/// \p IDVal is the directive name, used only to suffix error messages.
	bool AsmParser::parseDirectiveSet(StringRef IDVal, bool allow_redef) {
	  StringRef Name;
	  const bool Failed = check(parseIdentifier(Name), "expected identifier") ||
	                      parseToken(AsmToken::Comma) ||
	                      parseAssignment(Name, allow_redef, true);
	  if (!Failed)
	    return false;
	  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	}

	bool AsmParser::parseEscapedString(std::string &Data) {
	if (check(getTok().isNot(AsmToken::String), "expected string"))
	return true;

	Data = "";
	StringRef Str = getTok().getStringContents();
	for (unsigned i = 0, e = Str.size(); i != e; ++i) {
	if (Str[i] != '\\') {
	Data += Str[i];
	continue;
	}

	// Recognize escaped characters. Note that this escape semantics currently
	// loosely follows Darwin 'as'. Notably, it doesn't support hex escapes.
	++i;
	if (i == e)
	return TokError("unexpected backslash at end of string");

	// Recognize octal sequences.
	if ((unsigned)(Str[i] - '0') <= 7) {
	// Consume up to three octal characters.
	unsigned Value = Str[i] - '0';

	if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
	++i;
	Value = Value * 8 + (Str[i] - '0');

	if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
	++i;
	Value = Value * 8 + (Str[i] - '0');
	}
	}

	if (Value > 255)
	return TokError("invalid octal escape sequence (out of range)");

	Data += (unsigned char)Value;
	continue;
	}

	// Otherwise recognize individual escapes.
	switch (Str[i]) {
	default:
	// Just reject invalid escape sequences for now.
	return TokError("invalid escape sequence (unrecognized character)");

	case 'b': Data += '\b'; break;
	case 'f': Data += '\f'; break;
	case 'n': Data += '\n'; break;
	case 'r': Data += '\r'; break;
	case 't': Data += '\t'; break;
	case '"': Data += '"'; break;
	case '\\': Data += '\\'; break;
	}
	}

	Lex();
	return false;
	}

	/// parseDirectiveAscii:
	/// ::= ( .ascii | .asciz | .string ) [ "string" ( , "string" )* ]
	///
	/// Emits each string's bytes, followed by a NUL byte when \p ZeroTerminated
	/// is set. \p IDVal is the directive name used to suffix error messages.
	bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
	  auto EmitOneString = [&]() -> bool {
	    std::string Bytes;
	    if (checkForValidSection() || parseEscapedString(Bytes))
	      return true;

	    getStreamer().EmitBytes(Bytes);
	    if (ZeroTerminated)
	      getStreamer().EmitBytes(StringRef("\0", 1));
	    return false;
	  };

	  if (!parseMany(EmitOneString))
	    return false;
	  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	}

	/// parseDirectiveReloc
	/// ::= .reloc expression , identifier [ , expression ]
	///
	/// The offset must evaluate to a non-negative constant; the optional trailing
	/// expression must be relocatable. Returns true if an error was reported.
	bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
	  const MCExpr *Offset;
	  const MCExpr *Expr = nullptr;

	  SMLoc OffsetLoc = Lexer.getTok().getLoc();
	  int64_t OffsetValue;

	  if (parseExpression(Offset))
	    return true;

	  // We can only deal with constant expressions at the moment.
	  if (check(!Offset->evaluateAsAbsolute(OffsetValue), OffsetLoc,
	            "expression is not a constant value"))
	    return true;
	  if (check(OffsetValue < 0, OffsetLoc, "expression is negative"))
	    return true;
	  if (parseToken(AsmToken::Comma, "expected comma"))
	    return true;
	  if (check(getTok().isNot(AsmToken::Identifier), "expected relocation name"))
	    return true;

	  // Relocation name.
	  SMLoc NameLoc = Lexer.getTok().getLoc();
	  StringRef Name = Lexer.getTok().getIdentifier();
	  Lex();

	  // Optional relocation expression.
	  if (Lexer.is(AsmToken::Comma)) {
	    Lex();
	    SMLoc ExprLoc = Lexer.getLoc();
	    if (parseExpression(Expr))
	      return true;

	    MCValue Relocatable;
	    if (!Expr->evaluateAsRelocatable(Relocatable, nullptr, nullptr))
	      return Error(ExprLoc, "expression must be relocatable");
	  }

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in .reloc directive"))
	    return true;

	  const bool Rejected =
	      getStreamer().EmitRelocDirective(*Offset, Name, Expr, DirectiveLoc);
	  if (Rejected)
	    return Error(NameLoc, "unknown relocation name");

	  return false;
	}

	/// parseDirectiveValue
	/// ::= (.byte | .short | ... ) [ expression (, expression)* ]
	///
	/// Emits each expression as a value of \p Size bytes. \p IDVal is the
	/// directive name used to suffix error messages.
	bool AsmParser::parseDirectiveValue(StringRef IDVal, unsigned Size) {
	  auto EmitOneValue = [&]() -> bool {
	    const MCExpr *Value;
	    SMLoc ExprLoc = getLexer().getLoc();
	    if (checkForValidSection() || parseExpression(Value))
	      return true;

	    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
	    if (!MCE) {
	      getStreamer().EmitValue(Value, Size, ExprLoc);
	      return false;
	    }

	    // Special case constant expressions to match code generator.
	    assert(Size <= 8 && "Invalid size");
	    uint64_t IntValue = MCE->getValue();
	    const bool Fits = isUIntN(8 * Size, IntValue) || isIntN(8 * Size, IntValue);
	    if (!Fits)
	      return Error(ExprLoc, "out of range literal value");
	    getStreamer().EmitIntValue(IntValue, Size);
	    return false;
	  };

	  if (!parseMany(EmitOneValue))
	    return false;
	  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	}

	/// ParseDirectiveOctaValue
	/// ::= .octa [ hexconstant (, hexconstant)* ]
	///
	/// Emits each constant as two 8-byte halves, ordered according to the
	/// target's endianness. \p IDVal is used to suffix error messages.
	bool AsmParser::parseDirectiveOctaValue(StringRef IDVal) {
	  auto EmitOneOcta = [&]() -> bool {
	    if (checkForValidSection())
	      return true;
	    if (getTok().isNot(AsmToken::Integer) && getTok().isNot(AsmToken::BigNum))
	      return TokError("unknown token in expression");

	    SMLoc ExprLoc = getTok().getLoc();
	    APInt IntValue = getTok().getAPIntVal();
	    Lex();

	    if (!IntValue.isIntN(128))
	      return Error(ExprLoc, "out of range literal value");

	    // Split into high and low 64-bit halves.
	    uint64_t High = 0;
	    uint64_t Low;
	    if (IntValue.isIntN(64)) {
	      Low = IntValue.getZExtValue();
	    } else {
	      High = IntValue.getHiBits(IntValue.getBitWidth() - 64).getZExtValue();
	      Low = IntValue.getLoBits(64).getZExtValue();
	    }

	    const bool LittleEndian = MAI.isLittleEndian();
	    getStreamer().EmitIntValue(LittleEndian ? Low : High, 8);
	    getStreamer().EmitIntValue(LittleEndian ? High : Low, 8);
	    return false;
	  };

	  if (!parseMany(EmitOneOcta))
	    return false;
	  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	}

	/// Parse a floating point literal (optionally signed; "inf", "infinity" and
	/// "nan" are accepted case-insensitively) using \p Semantics and store its
	/// bit pattern in \p Res. Returns true after reporting an error.
	bool AsmParser::parseRealValue(const fltSemantics &Semantics, APInt &Res) {
	  // We don't truly support arithmetic on floating point expressions, so we
	  // have to manually parse unary prefixes.
	  const bool IsNeg = getLexer().is(AsmToken::Minus);
	  if (IsNeg || getLexer().is(AsmToken::Plus))
	    Lexer.Lex();

	  if (Lexer.is(AsmToken::Error))
	    return TokError(Lexer.getErr());

	  const bool IsNumber = Lexer.is(AsmToken::Integer) || Lexer.is(AsmToken::Real);
	  if (!IsNumber && Lexer.isNot(AsmToken::Identifier))
	    return TokError("unexpected token in directive");

	  // Convert to an APFloat.
	  APFloat Value(Semantics);
	  StringRef IDVal = getTok().getString();
	  if (getLexer().is(AsmToken::Identifier)) {
	    if (IDVal.compare_lower("infinity") == 0 || IDVal.compare_lower("inf") == 0)
	      Value = APFloat::getInf(Semantics);
	    else if (IDVal.compare_lower("nan") == 0)
	      Value = APFloat::getNaN(Semantics, false, ~0);
	    else
	      return TokError("invalid floating point literal");
	  } else {
	    const auto Status =
	        Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven);
	    if (Status == APFloat::opInvalidOp)
	      return TokError("invalid floating point literal");
	  }

	  if (IsNeg)
	    Value.changeSign();

	  // Consume the numeric token.
	  Lex();

	  Res = Value.bitcastToAPInt();
	  return false;
	}

	/// parseDirectiveRealValue
	/// ::= (.single | .double) [ expression (, expression)* ]
	///
	/// Each literal is parsed with \p Semantics and emitted as an integer of the
	/// matching byte width. \p IDVal is used to suffix error messages.
	bool AsmParser::parseDirectiveRealValue(StringRef IDVal,
	                                        const fltSemantics &Semantics) {
	  auto EmitOneReal = [&]() -> bool {
	    APInt Bits;
	    if (checkForValidSection() || parseRealValue(Semantics, Bits))
	      return true;
	    getStreamer().EmitIntValue(Bits.getLimitedValue(), Bits.getBitWidth() / 8);
	    return false;
	  };

	  if (!parseMany(EmitOneReal))
	    return false;
	  return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	}

	/// parseDirectiveZero
	/// ::= .zero expression
	///
	/// An optional ", value" after the size is also parsed and used as the fill
	/// value (0 by default).
	bool AsmParser::parseDirectiveZero() {
	  SMLoc NumBytesLoc = Lexer.getLoc();
	  const MCExpr *NumBytes;
	  if (checkForValidSection())
	    return true;
	  if (parseExpression(NumBytes))
	    return true;

	  int64_t FillValue = 0;
	  if (getLexer().is(AsmToken::Comma)) {
	    Lex();
	    if (parseAbsoluteExpression(FillValue))
	      return true;
	  }

	  const bool BadEnd = parseToken(AsmToken::EndOfStatement,
	                                 "unexpected token in '.zero' directive");
	  if (BadEnd)
	    return true;

	  getStreamer().emitFill(*NumBytes, FillValue, NumBytesLoc);
	  return false;
	}

	/// parseDirectiveFill
	/// ::= .fill expression [ , expression [ , expression ] ]
	bool AsmParser::parseDirectiveFill() {
	SMLoc NumValuesLoc = Lexer.getLoc();
	const MCExpr *NumValues;
	if (checkForValidSection() \|\| parseExpression(NumValues))
	return true;

	int64_t FillSize = 1;
	int64_t FillExpr = 0;

	SMLoc SizeLoc, ExprLoc;

	if (parseOptionalToken(AsmToken::Comma)) {
	SizeLoc = getTok().getLoc();
	if (parseAbsoluteExpression(FillSize))
	return true;
	if (parseOptionalToken(AsmToken::Comma)) {
	ExprLoc = getTok().getLoc();
	if (parseAbsoluteExpression(FillExpr))
	return true;
	}
	}
	if (parseToken(AsmToken::EndOfStatement,
	"unexpected token in '.fill' directive"))
	return true;

	if (FillSize < 0) {
	Warning(SizeLoc, "'.fill' directive with negative size has no effect");
	return false;
	}
	if (FillSize > 8) {
	Warning(SizeLoc, "'.fill' directive with size greater than 8 has been truncated to 8");
	FillSize = 8;
	}

	if (!isUInt<32>(FillExpr) && FillSize > 4)
	Warning(ExprLoc, "'.fill' directive pattern has been truncated to 32-bits");

	getStreamer().emitFill(*NumValues, FillSize, FillExpr, NumValuesLoc);

	return false;
	}

	/// parseDirectiveOrg
	/// ::= .org expression [ , expression ]
	///
	/// The optional second operand is the fill value (0 by default).
	bool AsmParser::parseDirectiveOrg() {
	  const MCExpr *Offset;
	  SMLoc OffsetLoc = Lexer.getLoc();
	  if (checkForValidSection())
	    return true;
	  if (parseExpression(Offset))
	    return true;

	  // Parse optional fill expression, then require the end of the statement.
	  int64_t Fill = 0;
	  const bool Failed = (parseOptionalToken(AsmToken::Comma) &&
	                       parseAbsoluteExpression(Fill)) ||
	                      parseToken(AsmToken::EndOfStatement);
	  if (Failed)
	    return addErrorSuffix(" in '.org' directive");

	  getStreamer().emitValueToOffset(Offset, Fill, OffsetLoc);
	  return false;
	}

	/// parseDirectiveAlign
	/// ::= {.align, ...} expression [ , expression [ , expression ]]
	bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
	SMLoc AlignmentLoc = getLexer().getLoc();
	int64_t Alignment;
	SMLoc MaxBytesLoc;
	bool HasFillExpr = false;
	int64_t FillExpr = 0;
	int64_t MaxBytesToFill = 0;

	auto parseAlign = [&]() -> bool {
	if (parseAbsoluteExpression(Alignment))
	return true;
	if (parseOptionalToken(AsmToken::Comma)) {
	// The fill expression can be omitted while specifying a maximum number of
	// alignment bytes, e.g:
	// .align 3,,4
	if (getTok().isNot(AsmToken::Comma)) {
	HasFillExpr = true;
	if (parseAbsoluteExpression(FillExpr))
	return true;
	}
	if (parseOptionalToken(AsmToken::Comma))
	if (parseTokenLoc(MaxBytesLoc) \|\|
	parseAbsoluteExpression(MaxBytesToFill))
	return true;
	}
	return parseToken(AsmToken::EndOfStatement);
	};

	if (checkForValidSection())
	return addErrorSuffix(" in directive");
	// Ignore empty '.p2align' directives for GNU-as compatibility
	if (IsPow2 && (ValueSize == 1) && getTok().is(AsmToken::EndOfStatement)) {
	Warning(AlignmentLoc, "p2align directive with no operand(s) is ignored");
	return parseToken(AsmToken::EndOfStatement);
	}
	if (parseAlign())
	return addErrorSuffix(" in directive");

	// Always emit an alignment here even if we thrown an error.
	bool ReturnVal = false;

	// Compute alignment in bytes.
	if (IsPow2) {
	// FIXME: Diagnose overflow.
	if (Alignment >= 32) {
	ReturnVal \|= Error(AlignmentLoc, "invalid alignment value");
	Alignment = 31;
	}

	Alignment = 1ULL << Alignment;
	} else {
	// Reject alignments that aren't either a power of two or zero,
	// for gas compatibility. Alignment of zero is silently rounded
	// up to one.
	if (Alignment == 0)
	Alignment = 1;
	if (!isPowerOf2_64(Alignment))
	ReturnVal \|= Error(AlignmentLoc, "alignment must be a power of 2");
	}

	// Diagnose non-sensical max bytes to align.
	if (MaxBytesLoc.isValid()) {
	if (MaxBytesToFill < 1) {
	ReturnVal \|= Error(MaxBytesLoc,
	"alignment directive can never be satisfied in this "
	"many bytes, ignoring maximum bytes expression");
	MaxBytesToFill = 0;
	}

	if (MaxBytesToFill >= Alignment) {
	Warning(MaxBytesLoc, "maximum bytes expression exceeds alignment and "
	"has no effect");
	MaxBytesToFill = 0;
	}
	}

	// Check whether we should use optimal code alignment for this .align
	// directive.
	const MCSection *Section = getStreamer().getCurrentSectionOnly();
	assert(Section && "must have section to emit alignment");
	bool UseCodeAlign = Section->UseCodeAlign();
	if ((!HasFillExpr \|\| Lexer.getMAI().getTextAlignFillValue() == FillExpr) &&
	ValueSize == 1 && UseCodeAlign) {
	getStreamer().EmitCodeAlignment(Alignment, MaxBytesToFill);
	} else {
	// FIXME: Target specific behavior about how the "extra" bytes are filled.
	getStreamer().EmitValueToAlignment(Alignment, FillExpr, ValueSize,
	MaxBytesToFill);
	}

	return ReturnVal;
	}

	/// parseDirectiveFile
	/// ::= .file [number] filename
	/// ::= .file number directory filename
	///
	/// Without a file number the plain file directive is emitted; with one, the
	/// DWARF file directive is emitted unless DWARF generation for assembly was
	/// enabled, in which case that option is switched off instead.
	bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
	  // FIXME: I'm not sure what this is.
	  int64_t FileNumber = -1;
	  SMLoc FileNumberLoc = getLexer().getLoc();
	  if (getLexer().is(AsmToken::Integer)) {
	    FileNumber = getTok().getIntVal();
	    Lex();

	    if (FileNumber < 1)
	      return TokError("file number less than one");
	  }

	  std::string Path = getTok().getString();

	  // Usually the directory and filename together, otherwise just the directory.
	  // Allow the strings to have escaped octal character sequence.
	  if (check(getTok().isNot(AsmToken::String),
	            "unexpected token in '.file' directive"))
	    return true;
	  if (parseEscapedString(Path))
	    return true;

	  StringRef Directory;
	  StringRef Filename;
	  std::string FilenameData;
	  if (getLexer().isNot(AsmToken::String)) {
	    // Single string: it is the file name.
	    Filename = Path;
	  } else {
	    // Two strings: directory followed by file name.
	    if (check(FileNumber == -1,
	              "explicit path specified, but no file number"))
	      return true;
	    if (parseEscapedString(FilenameData))
	      return true;
	    Filename = FilenameData;
	    Directory = Path;
	  }

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.file' directive"))
	    return true;

	  if (FileNumber == -1) {
	    getStreamer().EmitFileDirective(Filename);
	    return false;
	  }

	  // If there is -g option as well as debug info from directive file,
	  // we turn off -g option, directly use the existing debug info instead.
	  if (getContext().getGenDwarfForAssembly()) {
	    getContext().setGenDwarfForAssembly(false);
	    return false;
	  }

	  if (getStreamer().EmitDwarfFileDirective(FileNumber, Directory, Filename) ==
	      0)
	    return Error(FileNumberLoc, "file number already allocated");

	  return false;
	}

	/// parseDirectiveLine
	/// ::= .line [number]
	///
	/// The number is parsed and currently discarded.
	bool AsmParser::parseDirectiveLine() {
	  int64_t LineNumber;
	  if (getLexer().is(AsmToken::Integer)) {
	    const bool Failed =
	        parseIntToken(LineNumber, "unexpected token in '.line' directive");
	    if (Failed)
	      return true;
	    (void)LineNumber;
	    // FIXME: Do something with the .line.
	  }

	  return parseToken(AsmToken::EndOfStatement,
	                    "unexpected token in '.line' directive");
	}

	/// parseDirectiveLoc
	/// ::= .loc FileNumber [LineNumber] [ColumnPos] [basic_block] [prologue_end]
	/// [epilogue_begin] [is_stmt VALUE] [isa VALUE]
	/// The first number is a file number, must have been previously assigned with
	/// a .file directive, the second number is the line number and optionally the
	/// third number is a column position (zero if not specified). The remaining
	/// optional items are .loc sub-directives.
	bool AsmParser::parseDirectiveLoc() {
	int64_t FileNumber = 0, LineNumber = 0;
	SMLoc Loc = getTok().getLoc();
	if (parseIntToken(FileNumber, "unexpected token in '.loc' directive") \|\|
	check(FileNumber < 1, Loc,
	"file number less than one in '.loc' directive") \|\|
	check(!getContext().isValidDwarfFileNumber(FileNumber), Loc,
	"unassigned file number in '.loc' directive"))
	return true;

	// optional
	if (getLexer().is(AsmToken::Integer)) {
	LineNumber = getTok().getIntVal();
	if (LineNumber < 0)
	return TokError("line number less than zero in '.loc' directive");
	Lex();
	}

	int64_t ColumnPos = 0;
	if (getLexer().is(AsmToken::Integer)) {
	ColumnPos = getTok().getIntVal();
	if (ColumnPos < 0)
	return TokError("column position less than zero in '.loc' directive");
	Lex();
	}

	unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0;
	unsigned Isa = 0;
	int64_t Discriminator = 0;

	auto parseLocOp = [&]() -> bool {
	StringRef Name;
	SMLoc Loc = getTok().getLoc();
	if (parseIdentifier(Name))
	return TokError("unexpected token in '.loc' directive");

	if (Name == "basic_block")
	Flags \|= DWARF2_FLAG_BASIC_BLOCK;
	else if (Name == "prologue_end")
	Flags \|= DWARF2_FLAG_PROLOGUE_END;
	else if (Name == "epilogue_begin")
	Flags \|= DWARF2_FLAG_EPILOGUE_BEGIN;
	else if (Name == "is_stmt") {
	Loc = getTok().getLoc();
	const MCExpr *Value;
	if (parseExpression(Value))
	return true;
	// The expression must be the constant 0 or 1.
	if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
	int Value = MCE->getValue();
	if (Value == 0)
	Flags &= ~DWARF2_FLAG_IS_STMT;
	else if (Value == 1)
	Flags \|= DWARF2_FLAG_IS_STMT;
	else
	return Error(Loc, "is_stmt value not 0 or 1");
	} else {
	return Error(Loc, "is_stmt value not the constant value of 0 or 1");
	}
	} else if (Name == "isa") {
	Loc = getTok().getLoc();
	const MCExpr *Value;
	if (parseExpression(Value))
	return true;
	// The expression must be a constant greater or equal to 0.
	if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
	int Value = MCE->getValue();
	if (Value < 0)
	return Error(Loc, "isa number less than zero");
	Isa = Value;
	} else {
	return Error(Loc, "isa number not a constant value");
	}
	} else if (Name == "discriminator") {
	if (parseAbsoluteExpression(Discriminator))
	return true;
	} else {
	return Error(Loc, "unknown sub-directive in '.loc' directive");
	}
	return false;
	};

	if (parseMany(parseLocOp, false /hasComma/))
	return true;

	getStreamer().EmitDwarfLocDirective(FileNumber, LineNumber, ColumnPos, Flags,
	Isa, Discriminator, StringRef());

	return false;
	}

	/// parseDirectiveStabs
	/// ::= .stabs string, number, number, number
	///
	/// The directive is not supported: this always reports an error at the
	/// current token and returns the result of TokError.
	bool AsmParser::parseDirectiveStabs() {
	return TokError("unsupported directive '.stabs'");
	}

	/// parseDirectiveCVFile
	/// ::= .cv_file number filename [checksum] [checksumkind]
	///
	/// The checksum is given as a hex string and is converted to raw bytes that
	/// are copied into memory obtained from the context before being passed on.
	bool AsmParser::parseDirectiveCVFile() {
	  SMLoc FileNumberLoc = getTok().getLoc();
	  int64_t FileNumber;
	  std::string Filename;
	  std::string Checksum;
	  int64_t ChecksumKind = 0;

	  // Mandatory part: file number and file name.
	  if (parseIntToken(FileNumber, "expected file number in '.cv_file' directive"))
	    return true;
	  if (check(FileNumber < 1, FileNumberLoc, "file number less than one"))
	    return true;
	  if (check(getTok().isNot(AsmToken::String),
	            "unexpected token in '.cv_file' directive"))
	    return true;
	  if (parseEscapedString(Filename))
	    return true;

	  // Optional part: checksum string followed by checksum kind.
	  if (!parseOptionalToken(AsmToken::EndOfStatement)) {
	    if (check(getTok().isNot(AsmToken::String),
	              "unexpected token in '.cv_file' directive"))
	      return true;
	    if (parseEscapedString(Checksum))
	      return true;
	    if (parseIntToken(ChecksumKind,
	                      "expected checksum kind in '.cv_file' directive"))
	      return true;
	    if (parseToken(AsmToken::EndOfStatement,
	                   "unexpected token in '.cv_file' directive"))
	      return true;
	  }

	  Checksum = fromHex(Checksum);
	  const size_t NumBytes = Checksum.size();
	  void *CKMem = Ctx.allocate(NumBytes, 1);
	  memcpy(CKMem, Checksum.data(), NumBytes);
	  ArrayRef<uint8_t> ChecksumAsBytes(reinterpret_cast<const uint8_t *>(CKMem),
	                                    NumBytes);

	  const bool Emitted = getStreamer().EmitCVFileDirective(
	      FileNumber, Filename, ChecksumAsBytes,
	      static_cast<uint8_t>(ChecksumKind));
	  if (!Emitted)
	    return Error(FileNumberLoc, "file number already allocated");

	  return false;
	}

	/// Parse an integer function id for the CodeView directive named by
	/// \p DirectiveName and store it in \p FunctionId. The id must lie in
	/// [0, UINT_MAX); the range error is reported at the location of the id
	/// token.
	/// \returns true on error, false on success.
	bool AsmParser::parseCVFunctionId(int64_t &FunctionId,
	                                  StringRef DirectiveName) {
	  SMLoc Loc;
	  if (parseTokenLoc(Loc))
	    return true;
	  if (parseIntToken(FunctionId, "expected function id in '" + DirectiveName +
	                                    "' directive"))
	    return true;

	  const bool OutOfRange = FunctionId < 0 || FunctionId >= UINT_MAX;
	  return check(OutOfRange, Loc,
	               "expected function id within range [0, UINT_MAX)");
	}

	/// Parse an integer file number for the CodeView directive named by
	/// \p DirectiveName and store it in \p FileNumber. The number must be at
	/// least one and must be accepted by getCVContext().isValidFileNumber().
	/// \returns true on error, false on success.
	bool AsmParser::parseCVFileId(int64_t &FileNumber, StringRef DirectiveName) {
	  SMLoc Loc;
	  if (parseTokenLoc(Loc))
	    return true;
	  if (parseIntToken(FileNumber, "expected integer in '" + DirectiveName +
	                                    "' directive"))
	    return true;
	  if (check(FileNumber < 1, Loc, "file number less than one in '" +
	                                     DirectiveName + "' directive"))
	    return true;

	  // Only consult the CodeView context once the number is known positive.
	  const bool Unassigned = !getCVContext().isValidFileNumber(FileNumber);
	  return check(Unassigned, Loc,
	               "unassigned file number in '" + DirectiveName + "' directive");
	}

	/// parseDirectiveCVFuncId
	/// ::= .cv_func_id FunctionId
	///
	/// Introduces a function ID that can be used with .cv_loc.
	bool AsmParser::parseDirectiveCVFuncId() {
	SMLoc FunctionIdLoc = getTok().getLoc();
	int64_t FunctionId;

	if (parseCVFunctionId(FunctionId, ".cv_func_id") \|\|
	parseToken(AsmToken::EndOfStatement,
	"unexpected token in '.cv_func_id' directive"))
	return true;

	if (!getStreamer().EmitCVFuncIdDirective(FunctionId))
	return Error(FunctionIdLoc, "function id already allocated");

	return false;
	}

	/// parseDirectiveCVInlineSiteId
	/// ::= .cv_inline_site_id FunctionId
	/// "within" IAFunc
	/// "inlined_at" IAFile IALine [IACol]
	///
	/// Introduces a function ID that can be used with .cv_loc. Includes "inlined
	/// at" source location information for use in the line table of the caller,
	/// whether the caller is a real function or another inlined call site.
	bool AsmParser::parseDirectiveCVInlineSiteId() {
	SMLoc FunctionIdLoc = getTok().getLoc();
	int64_t FunctionId;
	int64_t IAFunc;
	int64_t IAFile;
	int64_t IALine;
	int64_t IACol = 0;

	// FunctionId
	if (parseCVFunctionId(FunctionId, ".cv_inline_site_id"))
	return true;

	// "within"
	if (check((getLexer().isNot(AsmToken::Identifier) \|\|
	getTok().getIdentifier() != "within"),
	"expected 'within' identifier in '.cv_inline_site_id' directive"))
	return true;
	Lex();

	// IAFunc
	if (parseCVFunctionId(IAFunc, ".cv_inline_site_id"))
	return true;

	// "inlined_at"
	if (check((getLexer().isNot(AsmToken::Identifier) \|\|
	getTok().getIdentifier() != "inlined_at"),
	"expected 'inlined_at' identifier in '.cv_inline_site_id' "
	"directive") )
	return true;
	Lex();

	// IAFile IALine
	if (parseCVFileId(IAFile, ".cv_inline_site_id") \|\|
	parseIntToken(IALine, "expected line number after 'inlined_at'"))
	return true;

	// [IACol]
	if (getLexer().is(AsmToken::Integer)) {
	IACol = getTok().getIntVal();
	Lex();
	}

	if (parseToken(AsmToken::EndOfStatement,
	"unexpected token in '.cv_inline_site_id' directive"))
	return true;

	if (!getStreamer().EmitCVInlineSiteIdDirective(FunctionId, IAFunc, IAFile,
	IALine, IACol, FunctionIdLoc))
	return Error(FunctionIdLoc, "function id already allocated");

	return false;
	}

	/// parseDirectiveCVLoc
	/// ::= .cv_loc FunctionId FileNumber [LineNumber] [ColumnPos] [prologue_end]
	/// [is_stmt VALUE]
	/// The first number is a file number, must have been previously assigned with
	/// a .file directive, the second number is the line number and optionally the
	/// third number is a column position (zero if not specified). The remaining
	/// optional items are .loc sub-directives.
	bool AsmParser::parseDirectiveCVLoc() {
	SMLoc DirectiveLoc = getTok().getLoc();
	int64_t FunctionId, FileNumber;
	if (parseCVFunctionId(FunctionId, ".cv_loc") \|\|
	parseCVFileId(FileNumber, ".cv_loc"))
	return true;

	int64_t LineNumber = 0;
	if (getLexer().is(AsmToken::Integer)) {
	LineNumber = getTok().getIntVal();
	if (LineNumber < 0)
	return TokError("line number less than zero in '.cv_loc' directive");
	Lex();
	}

	int64_t ColumnPos = 0;
	if (getLexer().is(AsmToken::Integer)) {
	ColumnPos = getTok().getIntVal();
	if (ColumnPos < 0)
	return TokError("column position less than zero in '.cv_loc' directive");
	Lex();
	}

	bool PrologueEnd = false;
	uint64_t IsStmt = 0;

	auto parseOp = [&]() -> bool {
	StringRef Name;
	SMLoc Loc = getTok().getLoc();
	if (parseIdentifier(Name))
	return TokError("unexpected token in '.cv_loc' directive");
	if (Name == "prologue_end")
	PrologueEnd = true;
	else if (Name == "is_stmt") {
	Loc = getTok().getLoc();
	const MCExpr *Value;
	if (parseExpression(Value))
	return true;
	// The expression must be the constant 0 or 1.
	IsStmt = ~0ULL;
	if (const auto *MCE = dyn_cast<MCConstantExpr>(Value))
	IsStmt = MCE->getValue();

	if (IsStmt > 1)
	return Error(Loc, "is_stmt value not 0 or 1");
	} else {
	return Error(Loc, "unknown sub-directive in '.cv_loc' directive");
	}
	return false;
	};

	if (parseMany(parseOp, false /hasComma/))
	return true;

	getStreamer().EmitCVLocDirective(FunctionId, FileNumber, LineNumber,
	ColumnPos, PrologueEnd, IsStmt, StringRef(),
	DirectiveLoc);
	return false;
	}

	/// parseDirectiveCVLinetable
	///  ::= .cv_linetable FunctionId, FnStart, FnEnd
	///
	/// Parses a function id and two comma-separated identifiers, looks up (or
	/// creates) the symbols they name, and forwards all three to the streamer.
	bool AsmParser::parseDirectiveCVLinetable() {
	  int64_t FunctionId;
	  StringRef FnStartName, FnEndName;
	  SMLoc Loc = getTok().getLoc();

	  if (parseCVFunctionId(FunctionId, ".cv_linetable"))
	    return true;

	  // ", FnStart"
	  if (parseToken(AsmToken::Comma,
	                 "unexpected token in '.cv_linetable' directive"))
	    return true;
	  if (parseTokenLoc(Loc))
	    return true;
	  if (check(parseIdentifier(FnStartName), Loc,
	            "expected identifier in directive"))
	    return true;

	  // ", FnEnd"
	  if (parseToken(AsmToken::Comma,
	                 "unexpected token in '.cv_linetable' directive"))
	    return true;
	  if (parseTokenLoc(Loc))
	    return true;
	  if (check(parseIdentifier(FnEndName), Loc,
	            "expected identifier in directive"))
	    return true;

	  MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
	  MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);

	  getStreamer().EmitCVLinetableDirective(FunctionId, FnStartSym, FnEndSym);
	  return false;
	}

	/// parseDirectiveCVInlineLinetable
	/// ::= .cv_inline_linetable PrimaryFunctionId FileId LineNum FnStart FnEnd
	bool AsmParser::parseDirectiveCVInlineLinetable() {
	int64_t PrimaryFunctionId, SourceFileId, SourceLineNum;
	StringRef FnStartName, FnEndName;
	SMLoc Loc = getTok().getLoc();
	if (parseCVFunctionId(PrimaryFunctionId, ".cv_inline_linetable") \|\|
	parseTokenLoc(Loc) \|\|
	parseIntToken(
	SourceFileId,
	"expected SourceField in '.cv_inline_linetable' directive") \|\|
	check(SourceFileId <= 0, Loc,
	"File id less than zero in '.cv_inline_linetable' directive") \|\|
	parseTokenLoc(Loc) \|\|
	parseIntToken(
	SourceLineNum,
	"expected SourceLineNum in '.cv_inline_linetable' directive") \|\|
	check(SourceLineNum < 0, Loc,
	"Line number less than zero in '.cv_inline_linetable' directive") \|\|
	parseTokenLoc(Loc) \|\| check(parseIdentifier(FnStartName), Loc,
	"expected identifier in directive") \|\|
	parseTokenLoc(Loc) \|\| check(parseIdentifier(FnEndName), Loc,
	"expected identifier in directive"))
	return true;

	if (parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
	return true;

	MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
	MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);
	getStreamer().EmitCVInlineLinetableDirective(PrimaryFunctionId, SourceFileId,
	SourceLineNum, FnStartSym,
	FnEndSym);
	return false;
	}

	/// parseDirectiveCVDefRange
	/// ::= .cv_def_range RangeStart RangeEnd (GapStart GapEnd), bytes
	bool AsmParser::parseDirectiveCVDefRange() {
	SMLoc Loc;
	std::vector<std::pair<const MCSymbol , const MCSymbol >> Ranges;
	while (getLexer().is(AsmToken::Identifier)) {
	Loc = getLexer().getLoc();
	StringRef GapStartName;
	if (parseIdentifier(GapStartName))
	return Error(Loc, "expected identifier in directive");
	MCSymbol *GapStartSym = getContext().getOrCreateSymbol(GapStartName);

	Loc = getLexer().getLoc();
	StringRef GapEndName;
	if (parseIdentifier(GapEndName))
	return Error(Loc, "expected identifier in directive");
	MCSymbol *GapEndSym = getContext().getOrCreateSymbol(GapEndName);

	Ranges.push_back({GapStartSym, GapEndSym});
	}

	std::string FixedSizePortion;
	if (parseToken(AsmToken::Comma, "unexpected token in directive") \|\|
	parseEscapedString(FixedSizePortion))
	return true;

	getStreamer().EmitCVDefRangeDirective(Ranges, FixedSizePortion);
	return false;
	}

	/// parseDirectiveCVStringTable
	///  ::= .cv_stringtable
	///
	/// Takes no operands; forwards straight to the streamer and reports
	/// success.
	bool AsmParser::parseDirectiveCVStringTable() {
	  auto &Out = getStreamer();
	  Out.EmitCVStringTableDirective();
	  return false;
	}

	/// parseDirectiveCVFileChecksums
	///  ::= .cv_filechecksums
	///
	/// Takes no operands; forwards straight to the streamer and reports
	/// success.
	bool AsmParser::parseDirectiveCVFileChecksums() {
	  auto &Out = getStreamer();
	  Out.EmitCVFileChecksumsDirective();
	  return false;
	}

	/// parseDirectiveCVFileChecksumOffset
	///  ::= .cv_filechecksumoffset fileno
	///
	/// Parses a single integer file number followed by end of statement and
	/// hands the number to the streamer.
	bool AsmParser::parseDirectiveCVFileChecksumOffset() {
	  int64_t FileNo;
	  const bool Failed =
	      parseIntToken(FileNo, "expected identifier in directive") ||
	      parseToken(AsmToken::EndOfStatement, "Expected End of Statement");
	  if (Failed)
	    return true;

	  getStreamer().EmitCVFileChecksumOffsetDirective(FileNo);
	  return false;
	}

	/// parseDirectiveCVFPOData
	/// ::= .cv_fpo_data procsym
	bool AsmParser::parseDirectiveCVFPOData() {
	SMLoc DirLoc = getLexer().getLoc();
	StringRef ProcName;
	if (parseIdentifier(ProcName))
	return TokError("expected symbol name");
	if (parseEOL("unexpected tokens"))
	return addErrorSuffix(" in '.cv_fpo_data' directive");
	MCSymbol *ProcSym = getContext().getOrCreateSymbol(ProcName);
	getStreamer().EmitCVFPOData(ProcSym, DirLoc);
	return false;
	}

	/// parseDirectiveCFISections
	/// ::= .cfi_sections section [, section]
	bool AsmParser::parseDirectiveCFISections() {
	StringRef Name;
	bool EH = false;
	bool Debug = false;

	if (parseIdentifier(Name))
	return TokError("Expected an identifier");

	if (Name == ".eh_frame")
	EH = true;
	else if (Name == ".debug_frame")
	Debug = true;

	if (getLexer().is(AsmToken::Comma)) {
	Lex();

	if (parseIdentifier(Name))
	return TokError("Expected an identifier");

	if (Name == ".eh_frame")
	EH = true;
	else if (Name == ".debug_frame")
	Debug = true;
	}

	getStreamer().EmitCFISections(EH, Debug);
	return false;
	}

	/// parseDirectiveCFIStartProc
	/// ::= .cfi_startproc [simple]
	bool AsmParser::parseDirectiveCFIStartProc() {
	StringRef Simple;
	if (!parseOptionalToken(AsmToken::EndOfStatement)) {
	if (check(parseIdentifier(Simple) \|\| Simple != "simple",
	"unexpected token") \|\|
	parseToken(AsmToken::EndOfStatement))
	return addErrorSuffix(" in '.cfi_startproc' directive");
	}

	getStreamer().EmitCFIStartProc(!Simple.empty());
	return false;
	}

	/// parseDirectiveCFIEndProc
	///  ::= .cfi_endproc
	///
	/// Takes no operands; forwards straight to the streamer and reports
	/// success.
	bool AsmParser::parseDirectiveCFIEndProc() {
	  auto &Out = getStreamer();
	  Out.EmitCFIEndProc();
	  return false;
	}

	/// \brief Parse a register given either by name or by number.
	///
	/// An integer token is parsed as an absolute expression directly into
	/// \p Register. Anything else is handed to the target parser and the
	/// resulting register is translated with getDwarfRegNum(RegNo, true).
	/// \returns true on error, false on success.
	bool AsmParser::parseRegisterOrRegisterNumber(int64_t &Register,
	                                              SMLoc DirectiveLoc) {
	  // Numeric form.
	  if (getLexer().is(AsmToken::Integer))
	    return parseAbsoluteExpression(Register);

	  // Named form.
	  unsigned RegNo;
	  if (getTargetParser().ParseRegister(RegNo, DirectiveLoc, DirectiveLoc))
	    return true;

	  Register = getContext().getRegisterInfo()->getDwarfRegNum(RegNo, true);
	  return false;
	}

	/// parseDirectiveCFIDefCfa
	///  ::= .cfi_def_cfa register, offset
	///
	/// Parses a register (name or number), a comma and an absolute offset, then
	/// emits them through the streamer.
	bool AsmParser::parseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
	  int64_t Register = 0;
	  int64_t Offset = 0;

	  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
	    return true;
	  if (parseToken(AsmToken::Comma, "unexpected token in directive"))
	    return true;
	  if (parseAbsoluteExpression(Offset))
	    return true;

	  getStreamer().EmitCFIDefCfa(Register, Offset);
	  return false;
	}

	/// parseDirectiveCFIDefCfaOffset
	///  ::= .cfi_def_cfa_offset offset
	///
	/// Parses one absolute expression and emits it through the streamer.
	bool AsmParser::parseDirectiveCFIDefCfaOffset() {
	  int64_t Offset = 0;
	  const bool Failed = parseAbsoluteExpression(Offset);
	  if (!Failed)
	    getStreamer().EmitCFIDefCfaOffset(Offset);
	  return Failed;
	}

	/// parseDirectiveCFIRegister
	///  ::= .cfi_register register, register
	///
	/// Parses two registers (each by name or number) separated by a comma and
	/// emits the pair through the streamer.
	bool AsmParser::parseDirectiveCFIRegister(SMLoc DirectiveLoc) {
	  int64_t Register1 = 0;
	  int64_t Register2 = 0;

	  if (parseRegisterOrRegisterNumber(Register1, DirectiveLoc))
	    return true;
	  if (parseToken(AsmToken::Comma, "unexpected token in directive"))
	    return true;
	  if (parseRegisterOrRegisterNumber(Register2, DirectiveLoc))
	    return true;

	  getStreamer().EmitCFIRegister(Register1, Register2);
	  return false;
	}

	/// parseDirectiveCFIWindowSave
	///  ::= .cfi_window_save
	///
	/// Takes no operands; forwards straight to the streamer and reports
	/// success.
	bool AsmParser::parseDirectiveCFIWindowSave() {
	  auto &Out = getStreamer();
	  Out.EmitCFIWindowSave();
	  return false;
	}

	/// parseDirectiveCFIAdjustCfaOffset
	///  ::= .cfi_adjust_cfa_offset adjustment
	///
	/// Parses one absolute expression and emits it through the streamer.
	bool AsmParser::parseDirectiveCFIAdjustCfaOffset() {
	  int64_t Adjustment = 0;
	  const bool Failed = parseAbsoluteExpression(Adjustment);
	  if (!Failed)
	    getStreamer().EmitCFIAdjustCfaOffset(Adjustment);
	  return Failed;
	}

	/// parseDirectiveCFIDefCfaRegister
	///  ::= .cfi_def_cfa_register register
	///
	/// Parses one register (name or number) and emits it through the streamer.
	bool AsmParser::parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
	  int64_t Register = 0;
	  const bool Failed = parseRegisterOrRegisterNumber(Register, DirectiveLoc);
	  if (!Failed)
	    getStreamer().EmitCFIDefCfaRegister(Register);
	  return Failed;
	}

	/// parseDirectiveCFIOffset
	///  ::= .cfi_offset register, offset
	///
	/// Parses a register (name or number), a comma and an absolute offset, then
	/// emits them through the streamer.
	bool AsmParser::parseDirectiveCFIOffset(SMLoc DirectiveLoc) {
	  int64_t Register = 0;
	  int64_t Offset = 0;

	  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
	    return true;
	  if (parseToken(AsmToken::Comma, "unexpected token in directive"))
	    return true;
	  if (parseAbsoluteExpression(Offset))
	    return true;

	  getStreamer().EmitCFIOffset(Register, Offset);
	  return false;
	}

	/// parseDirectiveCFIRelOffset
	///  ::= .cfi_rel_offset register, offset
	///
	/// Parses a register (name or number), a comma and an absolute offset, then
	/// emits them through the streamer.
	bool AsmParser::parseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
	  int64_t Register = 0;
	  int64_t Offset = 0;

	  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
	    return true;
	  if (parseToken(AsmToken::Comma, "unexpected token in directive"))
	    return true;
	  if (parseAbsoluteExpression(Offset))
	    return true;

	  getStreamer().EmitCFIRelOffset(Register, Offset);
	  return false;
	}

	/// Decide whether \p Encoding is an accepted pointer-encoding byte.
	///
	/// Values with bits set above the low eight are rejected. DW_EH_PE_omit is
	/// always accepted. Otherwise the low nibble must be one of the listed
	/// value formats and the bits selected by 0x70 must be DW_EH_PE_absptr or
	/// DW_EH_PE_pcrel.
	static bool isValidEncoding(int64_t Encoding) {
	  const bool HasHighBits = (Encoding & ~0xff) != 0;
	  if (HasHighBits)
	    return false;

	  if (Encoding == dwarf::DW_EH_PE_omit)
	    return true;

	  const unsigned Format = Encoding & 0xf;
	  const bool KnownFormat =
	      Format == dwarf::DW_EH_PE_absptr || Format == dwarf::DW_EH_PE_udata2 ||
	      Format == dwarf::DW_EH_PE_udata4 || Format == dwarf::DW_EH_PE_udata8 ||
	      Format == dwarf::DW_EH_PE_sdata2 || Format == dwarf::DW_EH_PE_sdata4 ||
	      Format == dwarf::DW_EH_PE_sdata8 || Format == dwarf::DW_EH_PE_signed;
	  if (!KnownFormat)
	    return false;

	  const unsigned Application = Encoding & 0x70;
	  return Application == dwarf::DW_EH_PE_absptr ||
	         Application == dwarf::DW_EH_PE_pcrel;
	}

	/// parseDirectiveCFIPersonalityOrLsda
	/// IsPersonality true for cfi_personality, false for cfi_lsda
	///  ::= .cfi_personality encoding, [symbol_name]
	///  ::= .cfi_lsda encoding, [symbol_name]
	///
	/// When the encoding equals DW_EH_PE_omit the directive succeeds without
	/// emitting anything. Otherwise the encoding must pass isValidEncoding and
	/// be followed by a comma and a symbol name.
	bool AsmParser::parseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
	  int64_t Encoding = 0;
	  if (parseAbsoluteExpression(Encoding))
	    return true;
	  if (Encoding == dwarf::DW_EH_PE_omit)
	    return false;

	  StringRef Name;
	  if (check(!isValidEncoding(Encoding), "unsupported encoding."))
	    return true;
	  if (parseToken(AsmToken::Comma, "unexpected token in directive"))
	    return true;
	  if (check(parseIdentifier(Name), "expected identifier in directive"))
	    return true;

	  MCSymbol *Sym = getContext().getOrCreateSymbol(Name);

	  if (!IsPersonality) {
	    getStreamer().EmitCFILsda(Sym, Encoding);
	    return false;
	  }
	  getStreamer().EmitCFIPersonality(Sym, Encoding);
	  return false;
	}

	/// parseDirectiveCFIRememberState
	///  ::= .cfi_remember_state
	///
	/// Takes no operands; forwards straight to the streamer and reports
	/// success.
	bool AsmParser::parseDirectiveCFIRememberState() {
	  auto &Out = getStreamer();
	  Out.EmitCFIRememberState();
	  return false;
	}

	/// parseDirectiveCFIRestoreState
	///  ::= .cfi_restore_state
	///
	/// Takes no operands; forwards straight to the streamer and reports
	/// success.
	bool AsmParser::parseDirectiveCFIRestoreState() {
	  auto &Out = getStreamer();
	  Out.EmitCFIRestoreState();
	  return false;
	}

	/// parseDirectiveCFISameValue
	///  ::= .cfi_same_value register
	///
	/// Parses one register (name or number) and emits it through the streamer.
	bool AsmParser::parseDirectiveCFISameValue(SMLoc DirectiveLoc) {
	  int64_t Register = 0;
	  const bool Failed = parseRegisterOrRegisterNumber(Register, DirectiveLoc);
	  if (!Failed)
	    getStreamer().EmitCFISameValue(Register);
	  return Failed;
	}

	/// parseDirectiveCFIRestore
	///  ::= .cfi_restore register
	///
	/// Parses one register (name or number) and emits it through the streamer.
	bool AsmParser::parseDirectiveCFIRestore(SMLoc DirectiveLoc) {
	  int64_t Register = 0;
	  const bool Failed = parseRegisterOrRegisterNumber(Register, DirectiveLoc);
	  if (!Failed)
	    getStreamer().EmitCFIRestore(Register);
	  return Failed;
	}

	/// parseDirectiveCFIEscape
	///  ::= .cfi_escape expression[,...]
	///
	/// Parses one or more comma-separated absolute expressions, truncates each
	/// to a byte, and passes the accumulated byte string to the streamer.
	bool AsmParser::parseDirectiveCFIEscape() {
	  std::string Values;

	  for (;;) {
	    int64_t CurrValue;
	    if (parseAbsoluteExpression(CurrValue))
	      return true;
	    Values.push_back(static_cast<uint8_t>(CurrValue));

	    // Keep going only while another comma-separated value follows.
	    if (getLexer().isNot(AsmToken::Comma))
	      break;
	    Lex();
	  }

	  getStreamer().EmitCFIEscape(Values);
	  return false;
	}

	/// parseDirectiveCFIReturnColumn
	///  ::= .cfi_return_column register
	///
	/// Parses one register (name or number) and emits it through the streamer.
	bool AsmParser::parseDirectiveCFIReturnColumn(SMLoc DirectiveLoc) {
	  int64_t Register = 0;
	  const bool Failed = parseRegisterOrRegisterNumber(Register, DirectiveLoc);
	  if (!Failed)
	    getStreamer().EmitCFIReturnColumn(Register);
	  return Failed;
	}

	/// parseDirectiveCFISignalFrame
	///  ::= .cfi_signal_frame
	///
	/// Requires the statement to end immediately, then notifies the streamer.
	bool AsmParser::parseDirectiveCFISignalFrame() {
	  const bool Failed = parseToken(AsmToken::EndOfStatement,
	                                 "unexpected token in '.cfi_signal_frame'");
	  if (!Failed)
	    getStreamer().EmitCFISignalFrame();
	  return Failed;
	}

	/// parseDirectiveCFIUndefined
	///  ::= .cfi_undefined register
	///
	/// Parses one register (name or number) and emits it through the streamer.
	bool AsmParser::parseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
	  int64_t Register = 0;
	  const bool Failed = parseRegisterOrRegisterNumber(Register, DirectiveLoc);
	  if (!Failed)
	    getStreamer().EmitCFIUndefined(Register);
	  return Failed;
	}

	/// parseDirectiveAltmacro
	///  ::= .altmacro
	///  ::= .noaltmacro
	///
	/// The statement must already be at its end. Alt-macro mode is switched on
	/// when \p Directive is ".altmacro" and off for anything else.
	bool AsmParser::parseDirectiveAltmacro(StringRef Directive) {
	  if (getLexer().isNot(AsmToken::EndOfStatement))
	    return TokError("unexpected token in '" + Directive + "' directive");

	  const bool Enable = Directive == ".altmacro";
	  getLexer().SetAltMacroMode(Enable);
	  return false;
	}

	/// parseDirectiveMacrosOnOff
	///  ::= .macros_on
	///  ::= .macros_off
	///
	/// Consumes the end of statement, then enables macros when \p Directive is
	/// ".macros_on" and disables them otherwise.
	bool AsmParser::parseDirectiveMacrosOnOff(StringRef Directive) {
	  const bool Failed =
	      parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Directive + "' directive");
	  if (Failed)
	    return true;

	  const bool Enable = Directive == ".macros_on";
	  setMacrosEnabled(Enable);
	  return false;
	}

	/// parseDirectiveMacro
	/// ::= .macro name[,] [parameters]
	///
	/// Parses the macro name and its parameter list, then skips over the macro
	/// body up to the matching '.endm'/'.endmacro' (nested '.macro' definitions
	/// are counted but not interpreted). The body text between the first body
	/// token and the terminating directive is recorded, checked with
	/// checkForBadMacro, and registered under \p Name. Redefining an existing
	/// macro is an error reported at \p DirectiveLoc.
	///
	/// Each parameter is an identifier optionally followed by ':req' or
	/// ':vararg' and/or '=default'. Commas between parameters are optional, and
	/// a vararg parameter must be the last one.
	///
	/// NOTE: this listing is a diff; lines prefixed with '-' are the old
	/// revision and lines prefixed with '+' are the new revision.
	bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
	StringRef Name;
	if (parseIdentifier(Name))
	return TokError("expected identifier in '.macro' directive");

	// An optional comma may separate the name from the parameter list.
	if (getLexer().is(AsmToken::Comma))
	Lex();

	MCAsmMacroParameters Parameters;
	while (getLexer().isNot(AsmToken::EndOfStatement)) {

	// Anything following a vararg parameter is rejected.
	if (!Parameters.empty() && Parameters.back().Vararg)
	return Error(Lexer.getLoc(),
	"Vararg parameter '" + Parameters.back().Name +
	"' should be last one in the list of parameters.");

	MCAsmMacroParameter Parameter;
	if (parseIdentifier(Parameter.Name))
	return TokError("expected identifier in '.macro' directive");

	// Emit an error if two (or more) named parameters share the same name
	for (const MCAsmMacroParameter& CurrParam : Parameters)
	if (CurrParam.Name.equals(Parameter.Name))
	return TokError("macro '" + Name + "' has multiple parameters"
	" named '" + Parameter.Name + "'");

	// Optional ':req' / ':vararg' qualifier.
	if (Lexer.is(AsmToken::Colon)) {
	Lex(); // consume ':'

	SMLoc QualLoc;
	StringRef Qualifier;

	QualLoc = Lexer.getLoc();
	if (parseIdentifier(Qualifier))
	return Error(QualLoc, "missing parameter qualifier for "
	"'" + Parameter.Name + "' in macro '" + Name + "'");

	if (Qualifier == "req")
	Parameter.Required = true;
	else if (Qualifier == "vararg")
	Parameter.Vararg = true;
	else
	return Error(QualLoc, Qualifier + " is not a valid parameter qualifier "
	"for '" + Parameter.Name + "' in macro '" + Name + "'");
	}

	// Optional '=default' value; only a warning if the parameter is required.
	if (getLexer().is(AsmToken::Equal)) {
	Lex();

	SMLoc ParamLoc;

	ParamLoc = Lexer.getLoc();
	if (parseMacroArgument(Parameter.Value, /Vararg=/false ))
	return true;

	if (Parameter.Required)
	Warning(ParamLoc, "pointless default value for required parameter "
	"'" + Parameter.Name + "' in macro '" + Name + "'");
	}

	Parameters.push_back(std::move(Parameter));

	// Commas between parameters are optional.
	if (getLexer().is(AsmToken::Comma))
	Lex();
	}

	// Eat just the end of statement.
	Lexer.Lex();

	// Consuming deferred text, so use Lexer.Lex to ignore Lexing Errors
	AsmToken EndToken, StartToken = getTok();
	// Number of nested '.macro' directives currently open inside this body.
	unsigned MacroDepth = 0;
	// Lex the macro definition.
	while (true) {
	// Ignore Lexing errors in macros.
	while (Lexer.is(AsmToken::Error)) {
	Lexer.Lex();
	}

	// Check whether we have reached the end of the file.
	if (getLexer().is(AsmToken::Eof))
	return Error(DirectiveLoc, "no matching '.endmacro' in definition");

	// Otherwise, check whether we have reach the .endmacro.
	if (getLexer().is(AsmToken::Identifier)) {
	if (getTok().getIdentifier() == ".endm" \|\|
	getTok().getIdentifier() == ".endmacro") {
	if (MacroDepth == 0) { // Outermost macro.
	EndToken = getTok();
	Lexer.Lex();
	if (getLexer().isNot(AsmToken::EndOfStatement))
	return TokError("unexpected token in '" + EndToken.getIdentifier() +
	"' directive");
	break;
	} else {
	// Otherwise we just found the end of an inner macro.
	--MacroDepth;
	}
	} else if (getTok().getIdentifier() == ".macro") {
	// We allow nested macros. Those aren't instantiated until the outermost
	// macro is expanded so just ignore them for now.
	++MacroDepth;
	}
	}

	// Otherwise, scan til the end of the statement.
	eatToEndOfStatement();
	}

	- if (lookupMacro(Name)) {
	+ if (getContext().lookupMacro(Name)) {
	return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
	}

	// The body is the source text from the first body token up to (but not
	// including) the terminating '.endm'/'.endmacro' token.
	const char *BodyStart = StartToken.getLoc().getPointer();
	const char *BodyEnd = EndToken.getLoc().getPointer();
	StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
	checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
	- defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters)));
	+ getContext().defineMacro(Name, MCAsmMacro(Name, Body, std::move(Parameters)));
	return false;
	}

	/// checkForBadMacro
	///
	/// With the support added for named parameters there may be code out there that
	/// is transitioning from positional parameters. In versions of gas that did
	/// not support named parameters they would be ignored on the macro definition.
	/// But to support both styles of parameters this is not possible so if a macro
	/// definition has named parameters but does not use them and has what appears
	/// to be positional parameters, strings like $1, $2, ... and $n, then issue a
	/// warning that the positional parameter found in body will have no effect.
	/// Hoping the developer will either remove the named parameters from the macro
	/// definition so the positional parameters get used if that was what was
	/// intended or change the macro to use the named parameters. It is possible
	/// this warning will trigger when none of the named parameters are used
	/// and the strings like $1 are in fact simply to be passed through unchanged.
	///
	/// \param DirectiveLoc location used for the warning.
	/// \param Name         macro name (not consulted by the scan itself).
	/// \param Body         text of the macro body to scan.
	/// \param Parameters   the macro's named parameters.
	void AsmParser::checkForBadMacro(SMLoc DirectiveLoc, StringRef Name,
	                                 StringRef Body,
	                                 ArrayRef<MCAsmMacroParameter> Parameters) {
	  // If this macro is not defined with named parameters the warning we are
	  // checking for here doesn't apply.
	  unsigned NParameters = Parameters.size();
	  if (NParameters == 0)
	    return;

	  bool NamedParametersFound = false;
	  bool PositionalParametersFound = false;

	  // Look at the body of the macro for use of both the named parameters and what
	  // are likely to be positional parameters. This is what expandMacro() is
	  // doing when it finds the parameters in the body.
	  while (!Body.empty()) {
	    // Scan for the next possible parameter.
	    std::size_t End = Body.size(), Pos = 0;
	    for (; Pos != End; ++Pos) {
	      // Check for a substitution or escape.
	      // This macro is defined with parameters, look for \foo, \bar, etc.
	      if (Body[Pos] == '\\' && Pos + 1 != End)
	        break;

	      // This macro should have parameters, but look for $0, $1, ..., $n too.
	      if (Body[Pos] != '$' || Pos + 1 == End)
	        continue;
	      char Next = Body[Pos + 1];
	      if (Next == '$' || Next == 'n' ||
	          isdigit(static_cast<unsigned char>(Next)))
	        break;
	    }

	    // Check if we reached the end.
	    if (Pos == End)
	      break;

	    if (Body[Pos] == '$') {
	      // "$$" is an escaped dollar sign; "$n" and "$[0-9]" look like
	      // positional parameters.
	      if (Body[Pos + 1] != '$')
	        PositionalParametersFound = true;
	      Pos += 2;
	    } else {
	      // Collect the identifier that follows the backslash.
	      unsigned I = Pos + 1;
	      while (isIdentifierChar(Body[I]) && I + 1 != End)
	        ++I;

	      const char *Begin = Body.data() + Pos + 1;
	      StringRef Argument(Begin, I - (Pos + 1));
	      unsigned Index = 0;
	      for (; Index < NParameters; ++Index)
	        if (Parameters[Index].Name == Argument)
	          break;

	      if (Index == NParameters) {
	        // Not a parameter reference. Skip a "\()" separator, taking care not
	        // to index past the end of the body when it ends in "\(".
	        if (Body[Pos + 1] == '(' && Pos + 2 != End && Body[Pos + 2] == ')')
	          Pos += 3;
	        else {
	          Pos = I;
	        }
	      } else {
	        NamedParametersFound = true;
	        Pos += 1 + Argument.size();
	      }
	    }
	    // Update the scan point.
	    Body = Body.substr(Pos);
	  }

	  if (!NamedParametersFound && PositionalParametersFound)
	    Warning(DirectiveLoc, "macro defined with named parameters which are not "
	                          "used in macro body, possible positional parameter "
	                          "found in body which will have no effect");
	}

	/// parseDirectiveExitMacro
	///  ::= .exitm
	///
	/// Only valid inside a macro instantiation. Conditional states pushed since
	/// the current macro was entered are popped (restoring TheCondState from
	/// each one) before the instantiation is left via handleMacroExit().
	bool AsmParser::parseDirectiveExitMacro(StringRef Directive) {
	  const bool TrailingTokens =
	      parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Directive + "' directive");
	  if (TrailingTokens)
	    return true;

	  if (!isInsideMacroInstantiation())
	    return TokError("unexpected '" + Directive + "' in file, "
	                    "no current macro definition");

	  // Exit all conditionals that are active in the current macro.
	  const auto TargetDepth = ActiveMacros.back()->CondStackDepth;
	  while (TheCondStack.size() != TargetDepth) {
	    TheCondState = TheCondStack.back();
	    TheCondStack.pop_back();
	  }

	  handleMacroExit();
	  return false;
	}

	/// parseDirectiveEndMacro
	///  ::= .endm
	///  ::= .endmacro
	///
	/// Inside a macro instantiation this terminates the instantiation. Anywhere
	/// else the directive is a stray one and is reported as an error, since
	/// well-formed terminators are consumed while the definition is parsed.
	bool AsmParser::parseDirectiveEndMacro(StringRef Directive) {
	  if (getLexer().isNot(AsmToken::EndOfStatement))
	    return TokError("unexpected token in '" + Directive + "' directive");

	  // A stray terminator outside any instantiation is an error.
	  if (!isInsideMacroInstantiation())
	    return TokError("unexpected '" + Directive + "' in file, "
	                    "no current macro definition");

	  // Terminate the current instantiation.
	  handleMacroExit();
	  return false;
	}

	/// parseDirectivePurgeMacro
	/// ::= .purgem
	///
	/// Parses a macro name followed by end of statement and removes that macro
	/// definition. Purging a name that is not currently defined is an error
	/// reported at \p DirectiveLoc.
	///
	/// NOTE: this listing is a diff; lines prefixed with '-' are the old
	/// revision and lines prefixed with '+' are the new revision.
	bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
	StringRef Name;
	SMLoc Loc;
	if (parseTokenLoc(Loc) \|\|
	check(parseIdentifier(Name), Loc,
	"expected identifier in '.purgem' directive") \|\|
	parseToken(AsmToken::EndOfStatement,
	"unexpected token in '.purgem' directive"))
	return true;

	// The macro must exist before it can be removed.
	- if (!lookupMacro(Name))
	+ if (!getContext().lookupMacro(Name))
	return Error(DirectiveLoc, "macro '" + Name + "' is not defined");

	- undefineMacro(Name);
	+ getContext().undefineMacro(Name);
	return false;
	}

	/// parseDirectiveBundleAlignMode
	/// ::= {.bundle_align_mode} expression
	bool AsmParser::parseDirectiveBundleAlignMode() {
	// Expect a single argument: an expression that evaluates to a constant
	// in the inclusive range 0-30.
	SMLoc ExprLoc = getLexer().getLoc();
	int64_t AlignSizePow2;
	if (checkForValidSection() \|\| parseAbsoluteExpression(AlignSizePow2) \|\|
	parseToken(AsmToken::EndOfStatement, "unexpected token after expression "
	"in '.bundle_align_mode' "
	"directive") \|\|
	check(AlignSizePow2 < 0 \|\| AlignSizePow2 > 30, ExprLoc,
	"invalid bundle alignment size (expected between 0 and 30)"))
	return true;

	// Because of AlignSizePow2's verified range we can safely truncate it to
	// unsigned.
	getStreamer().EmitBundleAlignMode(static_cast<unsigned>(AlignSizePow2));
	return false;
	}

	/// parseDirectiveBundleLock
	/// ::= {.bundle_lock} [align_to_end]
	bool AsmParser::parseDirectiveBundleLock() {
	if (checkForValidSection())
	return true;
	bool AlignToEnd = false;

	StringRef Option;
	SMLoc Loc = getTok().getLoc();
	const char *kInvalidOptionError =
	"invalid option for '.bundle_lock' directive";

	if (!parseOptionalToken(AsmToken::EndOfStatement)) {
	if (check(parseIdentifier(Option), Loc, kInvalidOptionError) \|\|
	check(Option != "align_to_end", Loc, kInvalidOptionError) \|\|
	parseToken(AsmToken::EndOfStatement,
	"unexpected token after '.bundle_lock' directive option"))
	return true;
	AlignToEnd = true;
	}

	getStreamer().EmitBundleLock(AlignToEnd);
	return false;
	}

	/// parseDirectiveBundleUnlock
	///  ::= {.bundle_unlock}
	///
	/// Takes no operands: requires a valid section and an immediate end of
	/// statement, then notifies the streamer.
	bool AsmParser::parseDirectiveBundleUnlock() {
	  if (checkForValidSection())
	    return true;
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.bundle_unlock' directive"))
	    return true;

	  getStreamer().EmitBundleUnlock();
	  return false;
	}

	/// parseDirectiveSpace
	///  ::= (.skip | .space) expression [ , expression ]
	///
	/// Parses the byte-count expression and an optional absolute fill value
	/// (zero when omitted), then emits the fill through the streamer.
	///
	/// \param IDVal the directive spelling, used only in error messages.
	/// \returns true on error, false on success.
	bool AsmParser::parseDirectiveSpace(StringRef IDVal) {
	  SMLoc NumBytesLoc = Lexer.getLoc();
	  const MCExpr *NumBytes;
	  if (checkForValidSection() || parseExpression(NumBytes))
	    return true;

	  int64_t FillExpr = 0;
	  // The suffix starts with a space, like the other addErrorSuffix callers,
	  // so it does not run into the pending message it is appended to.
	  if (parseOptionalToken(AsmToken::Comma))
	    if (parseAbsoluteExpression(FillExpr))
	      return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
	  if (parseToken(AsmToken::EndOfStatement))
	    return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");

	  // FIXME: Sometimes the fill expr is 'nop' if it isn't supplied, instead of 0.
	  getStreamer().emitFill(*NumBytes, FillExpr, NumBytesLoc);

	  return false;
	}

	/// parseDirectiveDCB
	///  ::= .dcb.{b, l, w} expression, expression
	///
	/// The first expression is an absolute repeat count, the second the value
	/// to emit that many times using \p Size bytes each. A negative count only
	/// produces a warning. Constant values are range-checked against the
	/// element width before being emitted.
	bool AsmParser::parseDirectiveDCB(StringRef IDVal, unsigned Size) {
	  const SMLoc CountLoc = Lexer.getLoc();
	  int64_t NumValues;
	  if (checkForValidSection())
	    return true;
	  if (parseAbsoluteExpression(NumValues))
	    return true;

	  if (NumValues < 0) {
	    Warning(CountLoc, "'" + Twine(IDVal) + "' directive with negative repeat count has no effect");
	    return false;
	  }

	  if (parseToken(AsmToken::Comma,
	                 "unexpected token in '" + Twine(IDVal) + "' directive"))
	    return true;

	  const MCExpr *Value;
	  const SMLoc ExprLoc = getLexer().getLoc();
	  if (parseExpression(Value))
	    return true;

	  const uint64_t Count = NumValues;

	  // Special case constant expressions to match code generator.
	  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
	  if (MCE) {
	    assert(Size <= 8 && "Invalid size");
	    const uint64_t IntValue = MCE->getValue();
	    if (!(isUIntN(8 * Size, IntValue) || isIntN(8 * Size, IntValue)))
	      return Error(ExprLoc, "literal value out of range for directive");
	    for (uint64_t Remaining = Count; Remaining != 0; --Remaining)
	      getStreamer().EmitIntValue(IntValue, Size);
	  } else {
	    for (uint64_t Remaining = Count; Remaining != 0; --Remaining)
	      getStreamer().EmitValue(Value, Size, ExprLoc);
	  }

	  return parseToken(AsmToken::EndOfStatement,
	                    "unexpected token in '" + Twine(IDVal) + "' directive");
	}

	/// parseDirectiveRealDCB
	///  ::= .dcb.{d, s} expression, expression
	///
	/// The first expression is an absolute repeat count, the second a real
	/// value parsed according to \p Semantics. A negative count only produces a
	/// warning. The value's bit pattern is emitted count times as an integer of
	/// bit-width / 8 bytes.
	bool AsmParser::parseDirectiveRealDCB(StringRef IDVal, const fltSemantics &Semantics) {
	  const SMLoc CountLoc = Lexer.getLoc();
	  int64_t NumValues;
	  if (checkForValidSection())
	    return true;
	  if (parseAbsoluteExpression(NumValues))
	    return true;

	  if (NumValues < 0) {
	    Warning(CountLoc, "'" + Twine(IDVal) + "' directive with negative repeat count has no effect");
	    return false;
	  }

	  if (parseToken(AsmToken::Comma,
	                 "unexpected token in '" + Twine(IDVal) + "' directive"))
	    return true;

	  APInt AsInt;
	  if (parseRealValue(Semantics, AsInt))
	    return true;

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Twine(IDVal) + "' directive"))
	    return true;

	  const uint64_t Count = NumValues;
	  for (uint64_t Remaining = Count; Remaining != 0; --Remaining)
	    getStreamer().EmitIntValue(AsInt.getLimitedValue(),
	                               AsInt.getBitWidth() / 8);

	  return false;
	}

	/// parseDirectiveDS
	///  ::= .ds.{b, d, l, p, s, w, x} count_expression
	///
	/// Emits `count` zero fills of \p Size bytes each.
	/// Returns true when a parse error was reported.
	bool AsmParser::parseDirectiveDS(StringRef IDVal, unsigned Size) {
	  SMLoc NumValuesLoc = Lexer.getLoc();
	  int64_t NumValues;
	  if (checkForValidSection())
	    return true;
	  if (parseAbsoluteExpression(NumValues))
	    return true;

	  // A negative repeat count only earns a warning; nothing is emitted.
	  if (NumValues < 0) {
	    Warning(NumValuesLoc, "'" + Twine(IDVal) + "' directive with negative repeat count has no effect");
	    return false;
	  }

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Twine(IDVal) + "' directive"))
	    return true;

	  for (uint64_t Left = NumValues; Left != 0; --Left)
	    getStreamer().emitFill(Size, 0);

	  return false;
	}

	/// parseDirectiveLEB128
	/// ::= (.sleb128 \| .uleb128) [ expression (, expression)* ]
	bool AsmParser::parseDirectiveLEB128(bool Signed) {
	if (checkForValidSection())
	return true;

	auto parseOp = [&]() -> bool {
	const MCExpr *Value;
	if (parseExpression(Value))
	return true;
	if (Signed)
	getStreamer().EmitSLEB128Value(Value);
	else
	getStreamer().EmitULEB128Value(Value);
	return false;
	};

	if (parseMany(parseOp))
	return addErrorSuffix(" in directive");

	return false;
	}

	/// parseDirectiveSymbolAttribute
	///  ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ]
	///
	/// Applies \p Attr to every symbol named in the operand list.
	/// Returns true when a parse error was reported.
	bool AsmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
	  // Handles a single identifier of the list.
	  auto ApplyToOneSymbol = [&]() -> bool {
	    StringRef SymbolName;
	    SMLoc NameLoc = getTok().getLoc();
	    if (parseIdentifier(SymbolName))
	      return Error(NameLoc, "expected identifier");

	    MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolName);

	    // Assembler local symbols don't make any sense here. Complain loudly.
	    if (Symbol->isTemporary())
	      return Error(NameLoc, "non-local symbol required");

	    if (getStreamer().EmitSymbolAttribute(Symbol, Attr))
	      return false;
	    return Error(NameLoc, "unable to emit symbol attribute");
	  };

	  if (!parseMany(ApplyToOneSymbol))
	    return false;
	  return addErrorSuffix(" in directive");
	}

	/// parseDirectiveComm
	/// ::= ( .comm \| .lcomm ) identifier , size_expression [ , align_expression ]
	bool AsmParser::parseDirectiveComm(bool IsLocal) {
	if (checkForValidSection())
	return true;

	SMLoc IDLoc = getLexer().getLoc();
	StringRef Name;
	if (parseIdentifier(Name))
	return TokError("expected identifier in directive");

	// Handle the identifier as the key symbol.
	MCSymbol *Sym = getContext().getOrCreateSymbol(Name);

	if (getLexer().isNot(AsmToken::Comma))
	return TokError("unexpected token in directive");
	Lex();

	int64_t Size;
	SMLoc SizeLoc = getLexer().getLoc();
	if (parseAbsoluteExpression(Size))
	return true;

	int64_t Pow2Alignment = 0;
	SMLoc Pow2AlignmentLoc;
	if (getLexer().is(AsmToken::Comma)) {
	Lex();
	Pow2AlignmentLoc = getLexer().getLoc();
	if (parseAbsoluteExpression(Pow2Alignment))
	return true;

	LCOMM::LCOMMType LCOMM = Lexer.getMAI().getLCOMMDirectiveAlignmentType();
	if (IsLocal && LCOMM == LCOMM::NoAlignment)
	return Error(Pow2AlignmentLoc, "alignment not supported on this target");

	// If this target takes alignments in bytes (not log) validate and convert.
	if ((!IsLocal && Lexer.getMAI().getCOMMDirectiveAlignmentIsInBytes()) \|\|
	(IsLocal && LCOMM == LCOMM::ByteAlignment)) {
	if (!isPowerOf2_64(Pow2Alignment))
	return Error(Pow2AlignmentLoc, "alignment must be a power of 2");
	Pow2Alignment = Log2_64(Pow2Alignment);
	}
	}

	if (parseToken(AsmToken::EndOfStatement,
	"unexpected token in '.comm' or '.lcomm' directive"))
	return true;

	// NOTE: a size of zero for a .comm should create a undefined symbol
	// but a size of .lcomm creates a bss symbol of size zero.
	if (Size < 0)
	return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
	"be less than zero");

	// NOTE: The alignment in the directive is a power of 2 value, the assembler
	// may internally end up wanting an alignment in bytes.
	// FIXME: Diagnose overflow.
	if (Pow2Alignment < 0)
	return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
	"alignment, can't be less than zero");

	Sym->redefineIfPossible();
	if (!Sym->isUndefined())
	return Error(IDLoc, "invalid symbol redefinition");

	// Create the Symbol as a common or local common with Size and Pow2Alignment
	if (IsLocal) {
	getStreamer().EmitLocalCommonSymbol(Sym, Size, 1 << Pow2Alignment);
	return false;
	}

	getStreamer().EmitCommonSymbol(Sym, Size, 1 << Pow2Alignment);
	return false;
	}

	/// parseDirectiveAbort
	///  ::= .abort [... message ...]
	///
	/// Always reports an error once the statement itself parsed; the optional
	/// message text is quoted in the diagnostic.
	bool AsmParser::parseDirectiveAbort() {
	  // FIXME: Use loc from directive.
	  SMLoc Loc = getLexer().getLoc();

	  StringRef Str = parseStringToEndOfStatement();
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.abort' directive"))
	    return true;

	  // FIXME: Actually abort assembly here.
	  if (!Str.empty())
	    return Error(Loc, ".abort '" + Str + "' detected. Assembly stopping.");
	  return Error(Loc, ".abort detected. Assembly stopping.");
	}

	/// parseDirectiveInclude
	///  ::= .include "filename"
	///
	/// Returns true when a parse error was reported or the file could not be
	/// entered.
	bool AsmParser::parseDirectiveInclude() {
	  // Allow the strings to have escaped octal character sequence.
	  std::string Filename;
	  SMLoc IncludeLoc = getTok().getLoc();

	  if (check(getTok().isNot(AsmToken::String),
	            "expected string in '.include' directive"))
	    return true;
	  if (parseEscapedString(Filename))
	    return true;
	  if (check(getTok().isNot(AsmToken::EndOfStatement),
	            "unexpected token in '.include' directive"))
	    return true;

	  // Attempt to switch the lexer to the included file before consuming the
	  // end of statement to avoid losing it when we switch.
	  return check(enterIncludeFile(Filename), IncludeLoc,
	               "Could not find include file '" + Filename + "'");
	}

	/// parseDirectiveIncbin
	///  ::= .incbin "filename" [ , skip [ , count ] ]
	///
	/// Returns true when a parse error was reported or the file could not be
	/// processed.
	bool AsmParser::parseDirectiveIncbin() {
	  // Allow the strings to have escaped octal character sequence.
	  std::string Filename;
	  SMLoc IncbinLoc = getTok().getLoc();
	  if (check(getTok().isNot(AsmToken::String),
	            "expected string in '.incbin' directive"))
	    return true;
	  if (parseEscapedString(Filename))
	    return true;

	  int64_t Skip = 0;
	  const MCExpr *Count = nullptr;
	  SMLoc SkipLoc, CountLoc;
	  if (parseOptionalToken(AsmToken::Comma)) {
	    // The skip expression can be omitted while specifying the count, e.g:
	    //  .incbin "filename",,4
	    if (getTok().isNot(AsmToken::Comma)) {
	      if (parseTokenLoc(SkipLoc))
	        return true;
	      if (parseAbsoluteExpression(Skip))
	        return true;
	    }
	    if (parseOptionalToken(AsmToken::Comma)) {
	      CountLoc = getTok().getLoc();
	      if (parseExpression(Count))
	        return true;
	    }
	  }

	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.incbin' directive"))
	    return true;

	  if (check(Skip < 0, SkipLoc, "skip is negative"))
	    return true;

	  // Attempt to process the included file.
	  if (!processIncbinFile(Filename, Skip, Count, CountLoc))
	    return false;
	  return Error(IncbinLoc, "Could not find incbin file '" + Filename + "'");
	}

	/// parseDirectiveIf
	///  ::= .if{,eq,ge,gt,le,lt,ne} expression
	///
	/// Saves the current conditional state and opens a new `.if` level. When
	/// the new level is not being ignored, the expression is compared against
	/// zero as selected by \p DirKind to decide whether the block is assembled.
	/// Returns true when a parse error was reported.
	bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind) {
	  TheCondStack.push_back(TheCondState);
	  TheCondState.TheCond = AsmCond::IfCond;

	  // Already ignoring: skip the expression without evaluating it.
	  if (TheCondState.Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  int64_t ExprValue;
	  if (parseAbsoluteExpression(ExprValue))
	    return true;
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.if' directive"))
	    return true;

	  // Reduce the expression to a truth value for the specific directive.
	  int64_t Truth = ExprValue;
	  switch (DirKind) {
	  case DK_IF:
	  case DK_IFNE:
	    // Non-zero means true; the raw value is used as is.
	    break;
	  case DK_IFEQ:
	    Truth = (ExprValue == 0);
	    break;
	  case DK_IFGE:
	    Truth = (ExprValue >= 0);
	    break;
	  case DK_IFGT:
	    Truth = (ExprValue > 0);
	    break;
	  case DK_IFLE:
	    Truth = (ExprValue <= 0);
	    break;
	  case DK_IFLT:
	    Truth = (ExprValue < 0);
	    break;
	  default:
	    llvm_unreachable("unsupported directive");
	  }

	  TheCondState.CondMet = Truth;
	  TheCondState.Ignore = !TheCondState.CondMet;
	  return false;
	}

	/// parseDirectiveIfb
	///  ::= .ifb string
	///
	/// The condition is met when the emptiness of the rest of the statement
	/// matches \p ExpectBlank. Returns true when a parse error was reported.
	bool AsmParser::parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
	  TheCondStack.push_back(TheCondState);
	  TheCondState.TheCond = AsmCond::IfCond;

	  // Already ignoring: skip the operand without looking at it.
	  if (TheCondState.Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  StringRef Operand = parseStringToEndOfStatement();
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.ifb' directive"))
	    return true;

	  const bool IsBlank = Operand.empty();
	  TheCondState.CondMet = (ExpectBlank == IsBlank);
	  TheCondState.Ignore = !TheCondState.CondMet;
	  return false;
	}

	/// parseDirectiveIfc
	///  ::= .ifc string1, string2
	///  ::= .ifnc string1, string2
	///
	/// Compares the two operands after trimming each of them; the condition is
	/// met when the comparison result matches \p ExpectEqual.
	/// Returns true when a parse error was reported.
	bool AsmParser::parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
	  TheCondStack.push_back(TheCondState);
	  TheCondState.TheCond = AsmCond::IfCond;

	  // Already ignoring: skip the operands without looking at them.
	  if (TheCondState.Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  StringRef Lhs = parseStringToComma();
	  if (parseToken(AsmToken::Comma, "unexpected token in '.ifc' directive"))
	    return true;

	  StringRef Rhs = parseStringToEndOfStatement();
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.ifc' directive"))
	    return true;

	  const bool Same = (Lhs.trim() == Rhs.trim());
	  TheCondState.CondMet = (ExpectEqual == Same);
	  TheCondState.Ignore = !TheCondState.CondMet;
	  return false;
	}

	/// parseDirectiveIfeqs
	///  ::= .ifeqs string1, string2
	///
	/// Also used for the inverted form: \p ExpectEqual is false for '.ifnes',
	/// which only changes the diagnostics and the sense of the comparison.
	/// Returns true when a parse error was reported.
	bool AsmParser::parseDirectiveIfeqs(SMLoc DirectiveLoc, bool ExpectEqual) {
	  // Diagnostics name the directive that was actually written.
	  const char *const MissingString =
	      ExpectEqual ? "expected string parameter for '.ifeqs' directive"
	                  : "expected string parameter for '.ifnes' directive";
	  const char *const MissingComma =
	      ExpectEqual
	          ? "expected comma after first string for '.ifeqs' directive"
	          : "expected comma after first string for '.ifnes' directive";

	  if (Lexer.isNot(AsmToken::String))
	    return TokError(MissingString);
	  StringRef String1 = getTok().getStringContents();
	  Lex();

	  if (Lexer.isNot(AsmToken::Comma))
	    return TokError(MissingComma);
	  Lex();

	  if (Lexer.isNot(AsmToken::String))
	    return TokError(MissingString);
	  StringRef String2 = getTok().getStringContents();
	  Lex();

	  // Only open the new conditional level once both operands were accepted.
	  TheCondStack.push_back(TheCondState);
	  TheCondState.TheCond = AsmCond::IfCond;
	  const bool Same = (String1 == String2);
	  TheCondState.CondMet = (ExpectEqual == Same);
	  TheCondState.Ignore = !TheCondState.CondMet;

	  return false;
	}

	/// parseDirectiveIfdef
	///  ::= .ifdef symbol
	///
	/// With \p expect_defined the condition is met when the symbol exists and
	/// is not undefined; otherwise it is met in exactly the opposite case.
	/// Returns true when a parse error was reported.
	bool AsmParser::parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
	  StringRef Name;
	  TheCondStack.push_back(TheCondState);
	  TheCondState.TheCond = AsmCond::IfCond;

	  // Already ignoring: skip the operand without looking it up.
	  if (TheCondState.Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  if (check(parseIdentifier(Name), "expected identifier after '.ifdef'"))
	    return true;
	  if (parseToken(AsmToken::EndOfStatement, "unexpected token in '.ifdef'"))
	    return true;

	  // A symbol that was never seen counts as not defined.
	  MCSymbol *Sym = getContext().lookupSymbol(Name);
	  const bool IsDefined = Sym && !Sym->isUndefined();

	  TheCondState.CondMet = (expect_defined == IsDefined);
	  TheCondState.Ignore = !TheCondState.CondMet;
	  return false;
	}

	/// parseDirectiveElseIf
	///  ::= .elseif expression
	///
	/// The expression is only evaluated when the enclosing level is live and no
	/// earlier branch of this conditional was taken; otherwise the branch is
	/// ignored. Returns true when an error was reported.
	bool AsmParser::parseDirectiveElseIf(SMLoc DirectiveLoc) {
	  if (TheCondState.TheCond != AsmCond::IfCond &&
	      TheCondState.TheCond != AsmCond::ElseIfCond)
	    return Error(DirectiveLoc, "Encountered a .elseif that doesn't follow an"
	                               " .if or an .elseif");
	  TheCondState.TheCond = AsmCond::ElseIfCond;

	  // Ignore state of the level that encloses this conditional, if any.
	  const bool ParentIgnored =
	      !TheCondStack.empty() && TheCondStack.back().Ignore;

	  if (!ParentIgnored && !TheCondState.CondMet) {
	    int64_t ExprValue;
	    if (parseAbsoluteExpression(ExprValue))
	      return true;

	    if (parseToken(AsmToken::EndOfStatement,
	                   "unexpected token in '.elseif' directive"))
	      return true;

	    TheCondState.CondMet = ExprValue;
	    TheCondState.Ignore = !TheCondState.CondMet;
	    return false;
	  }

	  // Either the parent is ignored or a previous branch already matched.
	  TheCondState.Ignore = true;
	  eatToEndOfStatement();
	  return false;
	}

	/// parseDirectiveElse
	///  ::= .else
	///
	/// The else branch is assembled only when the enclosing level is live and
	/// no earlier branch of this conditional was taken.
	/// Returns true when an error was reported.
	bool AsmParser::parseDirectiveElse(SMLoc DirectiveLoc) {
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.else' directive"))
	    return true;

	  // The two literal pieces are concatenated; only the first one carries the
	  // separating space so the message does not end up with a double space.
	  if (TheCondState.TheCond != AsmCond::IfCond &&
	      TheCondState.TheCond != AsmCond::ElseIfCond)
	    return Error(DirectiveLoc, "Encountered a .else that doesn't follow "
	                               "an .if or an .elseif");
	  TheCondState.TheCond = AsmCond::ElseCond;

	  // Ignore state of the level that encloses this conditional, if any.
	  const bool ParentIgnored =
	      !TheCondStack.empty() && TheCondStack.back().Ignore;
	  const bool BranchLive = !ParentIgnored && !TheCondState.CondMet;
	  TheCondState.Ignore = !BranchLive;

	  return false;
	}

	/// parseDirectiveEnd
	///  ::= .end
	///
	/// Discards every remaining token up to the end of the input.
	/// Returns true when a parse error was reported.
	bool AsmParser::parseDirectiveEnd(SMLoc DirectiveLoc) {
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.end' directive"))
	    return true;

	  // Drain the lexer; nothing after '.end' is parsed.
	  for (;;) {
	    if (Lexer.is(AsmToken::Eof))
	      break;
	    Lexer.Lex();
	  }

	  return false;
	}

	/// parseDirectiveError
	///  ::= .err
	///  ::= .error [string]
	///
	/// \p WithMessage is false for '.err' and true for '.error'. Nothing is
	/// reported while the top entry of the conditional stack is being ignored.
	bool AsmParser::parseDirectiveError(SMLoc L, bool WithMessage) {
	  if (!TheCondStack.empty() && TheCondStack.back().Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  if (!WithMessage)
	    return Error(L, ".err encountered");

	  // Default text, replaced below when the directive supplies a string.
	  StringRef Message = ".error directive invoked in source file";
	  if (Lexer.is(AsmToken::EndOfStatement))
	    return Error(L, Message);

	  if (Lexer.isNot(AsmToken::String))
	    return TokError(".error argument must be a string");

	  Message = getTok().getStringContents();
	  Lex();
	  return Error(L, Message);
	}

	/// parseDirectiveWarning
	///  ::= .warning [string]
	///
	/// Nothing is reported while the top entry of the conditional stack is
	/// being ignored. Returns whatever Warning() returns, or true on a parse
	/// error.
	bool AsmParser::parseDirectiveWarning(SMLoc L) {
	  if (!TheCondStack.empty() && TheCondStack.back().Ignore) {
	    eatToEndOfStatement();
	    return false;
	  }

	  // Default text, replaced below when the directive supplies a string.
	  StringRef Message = ".warning directive invoked in source file";

	  if (parseOptionalToken(AsmToken::EndOfStatement))
	    return Warning(L, Message);

	  if (Lexer.isNot(AsmToken::String))
	    return TokError(".warning argument must be a string");

	  Message = getTok().getStringContents();
	  Lex();
	  if (parseToken(AsmToken::EndOfStatement,
	                 "expected end of statement in '.warning' directive"))
	    return true;

	  return Warning(L, Message);
	}

	/// parseDirectiveEndIf
	///  ::= .endif
	///
	/// Restores the conditional state that was saved when the matching
	/// conditional was opened. Returns true when an error was reported.
	bool AsmParser::parseDirectiveEndIf(SMLoc DirectiveLoc) {
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '.endif' directive"))
	    return true;

	  // There must be both an open conditional and a saved state to go back to.
	  const bool InsideConditional =
	      TheCondState.TheCond != AsmCond::NoCond && !TheCondStack.empty();
	  if (!InsideConditional)
	    return Error(DirectiveLoc, "Encountered a .endif that doesn't follow "
	                               "an .if or .else");

	  // The stack is known to be non-empty here.
	  TheCondState = TheCondStack.back();
	  TheCondStack.pop_back();

	  return false;
	}

	void AsmParser::initializeDirectiveKindMap() {
	DirectiveKindMap[".set"] = DK_SET;
	DirectiveKindMap[".equ"] = DK_EQU;
	DirectiveKindMap[".equiv"] = DK_EQUIV;
	DirectiveKindMap[".ascii"] = DK_ASCII;
	DirectiveKindMap[".asciz"] = DK_ASCIZ;
	DirectiveKindMap[".string"] = DK_STRING;
	DirectiveKindMap[".byte"] = DK_BYTE;
	DirectiveKindMap[".short"] = DK_SHORT;
	DirectiveKindMap[".value"] = DK_VALUE;
	DirectiveKindMap[".2byte"] = DK_2BYTE;
	DirectiveKindMap[".long"] = DK_LONG;
	DirectiveKindMap[".int"] = DK_INT;
	DirectiveKindMap[".4byte"] = DK_4BYTE;
	DirectiveKindMap[".quad"] = DK_QUAD;
	DirectiveKindMap[".8byte"] = DK_8BYTE;
	DirectiveKindMap[".octa"] = DK_OCTA;
	DirectiveKindMap[".single"] = DK_SINGLE;
	DirectiveKindMap[".float"] = DK_FLOAT;
	DirectiveKindMap[".double"] = DK_DOUBLE;
	DirectiveKindMap[".align"] = DK_ALIGN;
	DirectiveKindMap[".align32"] = DK_ALIGN32;
	DirectiveKindMap[".balign"] = DK_BALIGN;
	DirectiveKindMap[".balignw"] = DK_BALIGNW;
	DirectiveKindMap[".balignl"] = DK_BALIGNL;
	DirectiveKindMap[".p2align"] = DK_P2ALIGN;
	DirectiveKindMap[".p2alignw"] = DK_P2ALIGNW;
	DirectiveKindMap[".p2alignl"] = DK_P2ALIGNL;
	DirectiveKindMap[".org"] = DK_ORG;
	DirectiveKindMap[".fill"] = DK_FILL;
	DirectiveKindMap[".zero"] = DK_ZERO;
	DirectiveKindMap[".extern"] = DK_EXTERN;
	DirectiveKindMap[".globl"] = DK_GLOBL;
	DirectiveKindMap[".global"] = DK_GLOBAL;
	DirectiveKindMap[".lazy_reference"] = DK_LAZY_REFERENCE;
	DirectiveKindMap[".no_dead_strip"] = DK_NO_DEAD_STRIP;
	DirectiveKindMap[".symbol_resolver"] = DK_SYMBOL_RESOLVER;
	DirectiveKindMap[".private_extern"] = DK_PRIVATE_EXTERN;
	DirectiveKindMap[".reference"] = DK_REFERENCE;
	DirectiveKindMap[".weak_definition"] = DK_WEAK_DEFINITION;
	DirectiveKindMap[".weak_reference"] = DK_WEAK_REFERENCE;
	DirectiveKindMap[".weak_def_can_be_hidden"] = DK_WEAK_DEF_CAN_BE_HIDDEN;
	DirectiveKindMap[".comm"] = DK_COMM;
	DirectiveKindMap[".common"] = DK_COMMON;
	DirectiveKindMap[".lcomm"] = DK_LCOMM;
	DirectiveKindMap[".abort"] = DK_ABORT;
	DirectiveKindMap[".include"] = DK_INCLUDE;
	DirectiveKindMap[".incbin"] = DK_INCBIN;
	DirectiveKindMap[".code16"] = DK_CODE16;
	DirectiveKindMap[".code16gcc"] = DK_CODE16GCC;
	DirectiveKindMap[".rept"] = DK_REPT;
	DirectiveKindMap[".rep"] = DK_REPT;
	DirectiveKindMap[".irp"] = DK_IRP;
	DirectiveKindMap[".irpc"] = DK_IRPC;
	DirectiveKindMap[".endr"] = DK_ENDR;
	DirectiveKindMap[".bundle_align_mode"] = DK_BUNDLE_ALIGN_MODE;
	DirectiveKindMap[".bundle_lock"] = DK_BUNDLE_LOCK;
	DirectiveKindMap[".bundle_unlock"] = DK_BUNDLE_UNLOCK;
	DirectiveKindMap[".if"] = DK_IF;
	DirectiveKindMap[".ifeq"] = DK_IFEQ;
	DirectiveKindMap[".ifge"] = DK_IFGE;
	DirectiveKindMap[".ifgt"] = DK_IFGT;
	DirectiveKindMap[".ifle"] = DK_IFLE;
	DirectiveKindMap[".iflt"] = DK_IFLT;
	DirectiveKindMap[".ifne"] = DK_IFNE;
	DirectiveKindMap[".ifb"] = DK_IFB;
	DirectiveKindMap[".ifnb"] = DK_IFNB;
	DirectiveKindMap[".ifc"] = DK_IFC;
	DirectiveKindMap[".ifeqs"] = DK_IFEQS;
	DirectiveKindMap[".ifnc"] = DK_IFNC;
	DirectiveKindMap[".ifnes"] = DK_IFNES;
	DirectiveKindMap[".ifdef"] = DK_IFDEF;
	DirectiveKindMap[".ifndef"] = DK_IFNDEF;
	DirectiveKindMap[".ifnotdef"] = DK_IFNOTDEF;
	DirectiveKindMap[".elseif"] = DK_ELSEIF;
	DirectiveKindMap[".else"] = DK_ELSE;
	DirectiveKindMap[".end"] = DK_END;
	DirectiveKindMap[".endif"] = DK_ENDIF;
	DirectiveKindMap[".skip"] = DK_SKIP;
	DirectiveKindMap[".space"] = DK_SPACE;
	DirectiveKindMap[".file"] = DK_FILE;
	DirectiveKindMap[".line"] = DK_LINE;
	DirectiveKindMap[".loc"] = DK_LOC;
	DirectiveKindMap[".stabs"] = DK_STABS;
	DirectiveKindMap[".cv_file"] = DK_CV_FILE;
	DirectiveKindMap[".cv_func_id"] = DK_CV_FUNC_ID;
	DirectiveKindMap[".cv_loc"] = DK_CV_LOC;
	DirectiveKindMap[".cv_linetable"] = DK_CV_LINETABLE;
	DirectiveKindMap[".cv_inline_linetable"] = DK_CV_INLINE_LINETABLE;
	DirectiveKindMap[".cv_inline_site_id"] = DK_CV_INLINE_SITE_ID;
	DirectiveKindMap[".cv_def_range"] = DK_CV_DEF_RANGE;
	DirectiveKindMap[".cv_stringtable"] = DK_CV_STRINGTABLE;
	DirectiveKindMap[".cv_filechecksums"] = DK_CV_FILECHECKSUMS;
	DirectiveKindMap[".cv_filechecksumoffset"] = DK_CV_FILECHECKSUM_OFFSET;
	DirectiveKindMap[".cv_fpo_data"] = DK_CV_FPO_DATA;
	DirectiveKindMap[".sleb128"] = DK_SLEB128;
	DirectiveKindMap[".uleb128"] = DK_ULEB128;
	DirectiveKindMap[".cfi_sections"] = DK_CFI_SECTIONS;
	DirectiveKindMap[".cfi_startproc"] = DK_CFI_STARTPROC;
	DirectiveKindMap[".cfi_endproc"] = DK_CFI_ENDPROC;
	DirectiveKindMap[".cfi_def_cfa"] = DK_CFI_DEF_CFA;
	DirectiveKindMap[".cfi_def_cfa_offset"] = DK_CFI_DEF_CFA_OFFSET;
	DirectiveKindMap[".cfi_adjust_cfa_offset"] = DK_CFI_ADJUST_CFA_OFFSET;
	DirectiveKindMap[".cfi_def_cfa_register"] = DK_CFI_DEF_CFA_REGISTER;
	DirectiveKindMap[".cfi_offset"] = DK_CFI_OFFSET;
	DirectiveKindMap[".cfi_rel_offset"] = DK_CFI_REL_OFFSET;
	DirectiveKindMap[".cfi_personality"] = DK_CFI_PERSONALITY;
	DirectiveKindMap[".cfi_lsda"] = DK_CFI_LSDA;
	DirectiveKindMap[".cfi_remember_state"] = DK_CFI_REMEMBER_STATE;
	DirectiveKindMap[".cfi_restore_state"] = DK_CFI_RESTORE_STATE;
	DirectiveKindMap[".cfi_same_value"] = DK_CFI_SAME_VALUE;
	DirectiveKindMap[".cfi_restore"] = DK_CFI_RESTORE;
	DirectiveKindMap[".cfi_escape"] = DK_CFI_ESCAPE;
	DirectiveKindMap[".cfi_return_column"] = DK_CFI_RETURN_COLUMN;
	DirectiveKindMap[".cfi_signal_frame"] = DK_CFI_SIGNAL_FRAME;
	DirectiveKindMap[".cfi_undefined"] = DK_CFI_UNDEFINED;
	DirectiveKindMap[".cfi_register"] = DK_CFI_REGISTER;
	DirectiveKindMap[".cfi_window_save"] = DK_CFI_WINDOW_SAVE;
	DirectiveKindMap[".macros_on"] = DK_MACROS_ON;
	DirectiveKindMap[".macros_off"] = DK_MACROS_OFF;
	DirectiveKindMap[".macro"] = DK_MACRO;
	DirectiveKindMap[".exitm"] = DK_EXITM;
	DirectiveKindMap[".endm"] = DK_ENDM;
	DirectiveKindMap[".endmacro"] = DK_ENDMACRO;
	DirectiveKindMap[".purgem"] = DK_PURGEM;
	DirectiveKindMap[".err"] = DK_ERR;
	DirectiveKindMap[".error"] = DK_ERROR;
	DirectiveKindMap[".warning"] = DK_WARNING;
	DirectiveKindMap[".altmacro"] = DK_ALTMACRO;
	DirectiveKindMap[".noaltmacro"] = DK_NOALTMACRO;
	DirectiveKindMap[".reloc"] = DK_RELOC;
	DirectiveKindMap[".dc"] = DK_DC;
	DirectiveKindMap[".dc.a"] = DK_DC_A;
	DirectiveKindMap[".dc.b"] = DK_DC_B;
	DirectiveKindMap[".dc.d"] = DK_DC_D;
	DirectiveKindMap[".dc.l"] = DK_DC_L;
	DirectiveKindMap[".dc.s"] = DK_DC_S;
	DirectiveKindMap[".dc.w"] = DK_DC_W;
	DirectiveKindMap[".dc.x"] = DK_DC_X;
	DirectiveKindMap[".dcb"] = DK_DCB;
	DirectiveKindMap[".dcb.b"] = DK_DCB_B;
	DirectiveKindMap[".dcb.d"] = DK_DCB_D;
	DirectiveKindMap[".dcb.l"] = DK_DCB_L;
	DirectiveKindMap[".dcb.s"] = DK_DCB_S;
	DirectiveKindMap[".dcb.w"] = DK_DCB_W;
	DirectiveKindMap[".dcb.x"] = DK_DCB_X;
	DirectiveKindMap[".ds"] = DK_DS;
	DirectiveKindMap[".ds.b"] = DK_DS_B;
	DirectiveKindMap[".ds.d"] = DK_DS_D;
	DirectiveKindMap[".ds.l"] = DK_DS_L;
	DirectiveKindMap[".ds.p"] = DK_DS_P;
	DirectiveKindMap[".ds.s"] = DK_DS_S;
	DirectiveKindMap[".ds.w"] = DK_DS_W;
	DirectiveKindMap[".ds.x"] = DK_DS_X;
	DirectiveKindMap[".print"] = DK_PRINT;
	}

	/// Scans forward from the current token to the '.endr' that closes the
	/// repetition block opened at \p DirectiveLoc, keeping track of nested
	/// repetition blocks so that their own '.endr' is not mistaken for the end.
	///
	/// On success the text between the starting token and the closing '.endr'
	/// is recorded as an anonymous, parameterless macro in MacroLikeBodies and
	/// a pointer to that entry is returned. On failure (end of file reached, or
	/// junk after the closing '.endr') an error is printed and nullptr is
	/// returned.
	MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) {
	  // Every directive that opens a block terminated by '.endr'. '.rep' is an
	  // alias of '.rept' in the directive map, so it has to be counted as well;
	  // otherwise the '.endr' of a nested '.rep' would close the outer body.
	  static const char *const NestingDirectives[] = {".rep", ".rept", ".irp",
	                                                  ".irpc"};

	  AsmToken EndToken, StartToken = getTok();

	  unsigned NestLevel = 0;
	  while (true) {
	    // Check whether we have reached the end of the file.
	    if (getLexer().is(AsmToken::Eof)) {
	      printError(DirectiveLoc, "no matching '.endr' in definition");
	      return nullptr;
	    }

	    if (Lexer.is(AsmToken::Identifier)) {
	      StringRef Ident = getTok().getIdentifier();

	      // A nested block opener pushes the nesting depth.
	      for (const char *Opener : NestingDirectives) {
	        if (Ident == Opener) {
	          ++NestLevel;
	          break;
	        }
	      }

	      // Otherwise, check whether we have reached the .endr.
	      if (Ident == ".endr") {
	        if (NestLevel == 0) {
	          EndToken = getTok();
	          Lex();
	          if (Lexer.isNot(AsmToken::EndOfStatement)) {
	            printError(getTok().getLoc(),
	                       "unexpected token in '.endr' directive");
	            return nullptr;
	          }
	          break;
	        }
	        --NestLevel;
	      }
	    }

	    // Otherwise, scan till the end of the statement.
	    eatToEndOfStatement();
	  }

	  // The body is the source text from the first token up to (not including)
	  // the closing '.endr'.
	  const char *BodyStart = StartToken.getLoc().getPointer();
	  const char *BodyEnd = EndToken.getLoc().getPointer();
	  StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);

	  // We Are Anonymous.
	  MacroLikeBodies.emplace_back(StringRef(), Body, MCAsmMacroParameters());
	  return &MacroLikeBodies.back();
	}

	/// Turns the expanded text accumulated in \p OS into a new source buffer
	/// and redirects the lexer to it. A '.endr' line is appended to the text
	/// first, and a MacroInstantiation recording where to resume is pushed on
	/// ActiveMacros before the switch. \p M is not used here.
	void AsmParser::instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
	                                         raw_svector_ostream &OS) {
	  OS << ".endr\n";

	  std::unique_ptr<MemoryBuffer> Expansion =
	      MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");

	  // Create the macro instantiation object and add to the current macro
	  // instantiation stack. This must happen while CurBuffer and the current
	  // token still refer to the buffer we are leaving.
	  ActiveMacros.push_back(new MacroInstantiation(
	      DirectiveLoc, CurBuffer, getTok().getLoc(), TheCondStack.size()));

	  // Jump to the macro instantiation and prime the lexer.
	  CurBuffer = SrcMgr.AddNewSourceBuffer(std::move(Expansion), SMLoc());
	  Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
	  Lex();
	}

	/// parseDirectiveRept
	///  ::= .rep count
	///  ::= .rept count
	///
	/// Expands the body up to the matching '.endr' `count` times and switches
	/// the lexer to the expansion. \p Dir is the directive spelling used in
	/// diagnostics. Returns true when an error was reported.
	bool AsmParser::parseDirectiveRept(SMLoc DirectiveLoc, StringRef Dir) {
	  const MCExpr *CountExpr;
	  SMLoc CountLoc = getTok().getLoc();
	  if (parseExpression(CountExpr))
	    return true;

	  // The count has to fold to a constant right away.
	  int64_t Count;
	  if (!CountExpr->evaluateAsAbsolute(Count))
	    return Error(CountLoc, "unexpected token in '" + Dir + "' directive");

	  if (check(Count < 0, CountLoc, "Count is negative"))
	    return true;
	  if (parseToken(AsmToken::EndOfStatement,
	                 "unexpected token in '" + Dir + "' directive"))
	    return true;

	  // Lex the rept definition.
	  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
	  if (!M)
	    return true;

	  // Macro instantiation is lexical, unfortunately. We construct a new buffer
	  // to hold the macro body with substitutions.
	  SmallString<256> Buf;
	  raw_svector_ostream OS(Buf);
	  for (int64_t Left = Count; Left != 0; --Left) {
	    // Note that the AtPseudoVariable is disabled for instantiations of .rep(t).
	    if (expandMacro(OS, M->Body, None, None, false, getTok().getLoc()))
	      return true;
	  }
	  instantiateMacroLikeBody(M, DirectiveLoc, OS);

	  return false;
	}

	/// parseDirectiveIrp
	///  ::= .irp symbol,values
	///
	/// Expands the body up to the matching '.endr' once per parsed argument,
	/// binding the named parameter to that argument each time.
	/// Returns true when an error was reported.
	bool AsmParser::parseDirectiveIrp(SMLoc DirectiveLoc) {
	  MCAsmMacroParameter Parameter;
	  MCAsmMacroArguments A;
	  if (check(parseIdentifier(Parameter.Name),
	            "expected identifier in '.irp' directive"))
	    return true;
	  if (parseToken(AsmToken::Comma, "expected comma in '.irp' directive"))
	    return true;
	  if (parseMacroArguments(nullptr, A))
	    return true;
	  if (parseToken(AsmToken::EndOfStatement, "expected End of Statement"))
	    return true;

	  // Lex the irp definition.
	  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
	  if (!M)
	    return true;

	  // Macro instantiation is lexical, unfortunately. We construct a new buffer
	  // to hold the macro body with substitutions.
	  SmallString<256> Buf;
	  raw_svector_ostream OS(Buf);

	  for (const MCAsmMacroArgument &Value : A) {
	    // Note that the AtPseudoVariable is enabled for instantiations of .irp.
	    // This is undocumented, but GAS seems to support it.
	    if (expandMacro(OS, M->Body, Parameter, Value, true, getTok().getLoc()))
	      return true;
	  }

	  instantiateMacroLikeBody(M, DirectiveLoc, OS);

	  return false;
	}

	/// parseDirectiveIrpc
	///  ::= .irpc symbol,values
	///
	/// Expands the body up to the matching '.endr' once per character of the
	/// single value token, binding the named parameter to that character.
	/// Returns true when an error was reported.
	bool AsmParser::parseDirectiveIrpc(SMLoc DirectiveLoc) {
	  MCAsmMacroParameter Parameter;
	  MCAsmMacroArguments A;

	  if (check(parseIdentifier(Parameter.Name),
	            "expected identifier in '.irpc' directive"))
	    return true;
	  if (parseToken(AsmToken::Comma, "expected comma in '.irpc' directive"))
	    return true;
	  if (parseMacroArguments(nullptr, A))
	    return true;

	  // Exactly one argument made of exactly one token is accepted. The second
	  // test is only evaluated when there is a first argument to look at.
	  const bool SingleToken = A.size() == 1 && A.front().size() == 1;
	  if (!SingleToken)
	    return TokError("unexpected token in '.irpc' directive");

	  // Eat the end of statement.
	  if (parseToken(AsmToken::EndOfStatement, "expected end of statement"))
	    return true;

	  // Lex the irpc definition.
	  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
	  if (!M)
	    return true;

	  // Macro instantiation is lexical, unfortunately. We construct a new buffer
	  // to hold the macro body with substitutions.
	  SmallString<256> Buf;
	  raw_svector_ostream OS(Buf);

	  StringRef Values = A.front().front().getString();
	  const std::size_t NumChars = Values.size();
	  for (std::size_t Pos = 0; Pos < NumChars; ++Pos) {
	    // Wrap the single character at Pos as a one-token argument.
	    MCAsmMacroArgument OneChar;
	    OneChar.emplace_back(AsmToken::Identifier, Values.slice(Pos, Pos + 1));

	    // Note that the AtPseudoVariable is enabled for instantiations of .irpc.
	    // This is undocumented, but GAS seems to support it.
	    if (expandMacro(OS, M->Body, Parameter, OneChar, true, getTok().getLoc()))
	      return true;
	  }

	  instantiateMacroLikeBody(M, DirectiveLoc, OS);

	  return false;
	}

	/// Handles a '.endr' reached during normal statement parsing by leaving the
	/// innermost active macro instantiation. Reports an error when no
	/// instantiation is active. \p DirectiveLoc is not used here.
	bool AsmParser::parseDirectiveEndr(SMLoc DirectiveLoc) {
	  const bool HaveActiveMacro = !ActiveMacros.empty();
	  if (!HaveActiveMacro)
	    return TokError("unmatched '.endr' directive");

	  // The only '.endr' that should get here are the ones created by
	  // instantiateMacroLikeBody, which are directly followed by a newline.
	  assert(getLexer().is(AsmToken::EndOfStatement));

	  handleMacroExit();
	  return false;
	}

	/// Parses the operand of an MS inline asm `_emit`: a constant expression
	/// that fits in 8 bits (signed or unsigned). On success an AOK_Emit rewrite
	/// covering \p Len characters at \p IDLoc is recorded in \p Info.
	/// Returns true when an error was reported.
	bool AsmParser::parseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
	                                     size_t Len) {
	  const MCExpr *Operand;
	  SMLoc OperandLoc = getLexer().getLoc();
	  if (parseExpression(Operand))
	    return true;

	  const MCConstantExpr *Constant = dyn_cast<MCConstantExpr>(Operand);
	  if (Constant == nullptr)
	    return Error(OperandLoc, "unexpected expression in _emit");

	  // Accept the literal if it fits either as unsigned or as signed byte.
	  const uint64_t Byte = Constant->getValue();
	  if (!isUInt<8>(Byte) && !isInt<8>(Byte))
	    return Error(OperandLoc, "literal value out of range for directive");

	  Info.AsmRewrites->emplace_back(AOK_Emit, IDLoc, Len);
	  return false;
	}

	/// Parses the operand of an MS inline asm `align`: a constant expression
	/// that is a power of two. On success an AOK_Align rewrite carrying the
	/// base-2 logarithm of the value is recorded in \p Info.
	/// Returns true when an error was reported.
	bool AsmParser::parseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
	  const MCExpr *Operand;
	  SMLoc OperandLoc = getLexer().getLoc();
	  if (parseExpression(Operand))
	    return true;

	  const MCConstantExpr *Constant = dyn_cast<MCConstantExpr>(Operand);
	  if (Constant == nullptr)
	    return Error(OperandLoc, "unexpected expression in align");

	  const uint64_t Alignment = Constant->getValue();
	  if (!isPowerOf2_64(Alignment))
	    return Error(OperandLoc, "literal value not a power of two greater then zero");

	  // NOTE(review): the rewrite length 5 presumably is the length of the
	  // "align" keyword - confirm against the caller.
	  Info.AsmRewrites->emplace_back(AOK_Align, IDLoc, 5, Log2_64(Alignment));
	  return false;
	}

	/// Handles '.print "string"': writes the string contents followed by a
	/// newline to llvm::outs(). Returns true when an error was reported.
	bool AsmParser::parseDirectivePrint(SMLoc DirectiveLoc) {
	  const AsmToken StrTok = getTok();
	  Lex();

	  // Only a string token whose spelling starts with a double quote is
	  // accepted; the spelling is inspected only for string tokens.
	  const bool IsDoubleQuoted =
	      StrTok.is(AsmToken::String) && StrTok.getString().front() == '"';
	  if (!IsDoubleQuoted)
	    return Error(DirectiveLoc, "expected double quoted string after .print");

	  if (parseToken(AsmToken::EndOfStatement, "expected end of statement"))
	    return true;

	  llvm::outs() << StrTok.getStringContents() << '\n';
	  return false;
	}

	// Three-way comparison used to order AsmRewrite entries.
	//
	// The primary key is the rewrite location. We are comparing pointers, but
	// the pointers are relative to a single string, so the result is
	// deterministic. Returns -1 when A sorts first and 1 when B sorts first.
	static int rewritesSort(const AsmRewrite *AsmRewriteA,
	                        const AsmRewrite *AsmRewriteB) {
	  const char *const PosA = AsmRewriteA->Loc.getPointer();
	  const char *const PosB = AsmRewriteB->Loc.getPointer();
	  if (PosA < PosB)
	    return -1;
	  if (PosB < PosA)
	    return 1;

	  // It's possible to have a SizeDirective, Imm/ImmPrefix and an Input/Output
	  // rewrite to the same location. Make sure the SizeDirective rewrite is
	  // performed first, then the Imm/ImmPrefix and finally the Input/Output.
	  // This ensures the sort algorithm is stable: the higher precedence value
	  // sorts first.
	  const auto RankA = AsmRewritePrecedence[AsmRewriteA->Kind];
	  const auto RankB = AsmRewritePrecedence[AsmRewriteB->Kind];
	  if (RankA > RankB)
	    return -1;
	  if (RankA < RankB)
	    return 1;

	  // Same location and same precedence is not expected to happen.
	  llvm_unreachable("Unstable rewrite sort.");
	}

	bool AsmParser::parseMSInlineAsm(
	void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
	unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool>> &OpDecls,
	SmallVectorImpl<std::string> &Constraints,
	SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
	const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) {
	SmallVector<void *, 4> InputDecls;
	SmallVector<void *, 4> OutputDecls;
	SmallVector<bool, 4> InputDeclsAddressOf;
	SmallVector<bool, 4> OutputDeclsAddressOf;
	SmallVector<std::string, 4> InputConstraints;
	SmallVector<std::string, 4> OutputConstraints;
	SmallVector<unsigned, 4> ClobberRegs;

	SmallVector<AsmRewrite, 4> AsmStrRewrites;

	// Prime the lexer.
	Lex();

	// While we have input, parse each statement.
	unsigned InputIdx = 0;
	unsigned OutputIdx = 0;
	while (getLexer().isNot(AsmToken::Eof)) {
	// Parse curly braces marking block start/end
	if (parseCurlyBlockScope(AsmStrRewrites))
	continue;

	ParseStatementInfo Info(&AsmStrRewrites);
	bool StatementErr = parseStatement(Info, &SI);

	if (StatementErr \|\| Info.ParseError) {
	// Emit pending errors if any exist.
	printPendingErrors();
	return true;
	}

	// No pending error should exist here.
	assert(!hasPendingError() && "unexpected error from parseStatement");

	if (Info.Opcode == ~0U)
	continue;

	const MCInstrDesc &Desc = MII->get(Info.Opcode);

	// Build the list of clobbers, outputs and inputs.
	for (unsigned i = 1, e = Info.ParsedOperands.size(); i != e; ++i) {
	MCParsedAsmOperand &Operand = *Info.ParsedOperands[i];

	// Immediate.
	if (Operand.isImm())
	continue;

	// Register operand.
	if (Operand.isReg() && !Operand.needAddressOf() &&
	!getTargetParser().OmitRegisterFromClobberLists(Operand.getReg())) {
	unsigned NumDefs = Desc.getNumDefs();
	// Clobber.
	if (NumDefs && Operand.getMCOperandNum() < NumDefs)
	ClobberRegs.push_back(Operand.getReg());
	continue;
	}

	// Expr/Input or Output.
	StringRef SymName = Operand.getSymName();
	if (SymName.empty())
	continue;

	void *OpDecl = Operand.getOpDecl();
	if (!OpDecl)
	continue;

	bool isOutput = (i == 1) && Desc.mayStore();
	SMLoc Start = SMLoc::getFromPointer(SymName.data());
	if (isOutput) {
	++InputIdx;
	OutputDecls.push_back(OpDecl);
	OutputDeclsAddressOf.push_back(Operand.needAddressOf());
	OutputConstraints.push_back(("=" + Operand.getConstraint()).str());
	AsmStrRewrites.emplace_back(AOK_Output, Start, SymName.size());
	} else {
	InputDecls.push_back(OpDecl);
	InputDeclsAddressOf.push_back(Operand.needAddressOf());
	InputConstraints.push_back(Operand.getConstraint().str());
	AsmStrRewrites.emplace_back(AOK_Input, Start, SymName.size());
	}
	}

	// Consider implicit defs to be clobbers. Think of cpuid and push.
	ArrayRef<MCPhysReg> ImpDefs(Desc.getImplicitDefs(),
	Desc.getNumImplicitDefs());
	ClobberRegs.insert(ClobberRegs.end(), ImpDefs.begin(), ImpDefs.end());
	}

	// Set the number of Outputs and Inputs.
	NumOutputs = OutputDecls.size();
	NumInputs = InputDecls.size();

	// Set the unique clobbers.
	array_pod_sort(ClobberRegs.begin(), ClobberRegs.end());
	ClobberRegs.erase(std::unique(ClobberRegs.begin(), ClobberRegs.end()),
	ClobberRegs.end());
	Clobbers.assign(ClobberRegs.size(), std::string());
	for (unsigned I = 0, E = ClobberRegs.size(); I != E; ++I) {
	raw_string_ostream OS(Clobbers[I]);
	IP->printRegName(OS, ClobberRegs[I]);
	}

	// Merge the various outputs and inputs. Output are expected first.
	if (NumOutputs \|\| NumInputs) {
	unsigned NumExprs = NumOutputs + NumInputs;
	OpDecls.resize(NumExprs);
	Constraints.resize(NumExprs);
	for (unsigned i = 0; i < NumOutputs; ++i) {
	OpDecls[i] = std::make_pair(OutputDecls[i], OutputDeclsAddressOf[i]);
	Constraints[i] = OutputConstraints[i];
	}
	for (unsigned i = 0, j = NumOutputs; i < NumInputs; ++i, ++j) {
	OpDecls[j] = std::make_pair(InputDecls[i], InputDeclsAddressOf[i]);
	Constraints[j] = InputConstraints[i];
	}
	}

	// Build the IR assembly string.
	std::string AsmStringIR;
	raw_string_ostream OS(AsmStringIR);
	StringRef ASMString =
	SrcMgr.getMemoryBuffer(SrcMgr.getMainFileID())->getBuffer();
	const char *AsmStart = ASMString.begin();
	const char *AsmEnd = ASMString.end();
	array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), rewritesSort);
	for (const AsmRewrite &AR : AsmStrRewrites) {
	AsmRewriteKind Kind = AR.Kind;

	const char *Loc = AR.Loc.getPointer();
	assert(Loc >= AsmStart && "Expected Loc to be at or after Start!");

	// Emit everything up to the immediate/expression.
	if (unsigned Len = Loc - AsmStart)
	OS << StringRef(AsmStart, Len);

	// Skip the original expression.
	if (Kind == AOK_Skip) {
	AsmStart = Loc + AR.Len;
	continue;
	}

	unsigned AdditionalSkip = 0;
	// Rewrite expressions in $N notation.
	switch (Kind) {
	default:
	break;
	case AOK_IntelExpr:
	assert(AR.IntelExp.isValid() && "cannot write invalid intel expression");
	if (AR.IntelExp.NeedBracs)
	OS << "[";
	if (AR.IntelExp.hasBaseReg())
	OS << AR.IntelExp.BaseReg;
	if (AR.IntelExp.hasIndexReg())
	OS << (AR.IntelExp.hasBaseReg() ? " + " : "")
	<< AR.IntelExp.IndexReg;
	if (AR.IntelExp.Scale > 1)
	OS << " * $$" << AR.IntelExp.Scale;
	if (AR.IntelExp.Imm \|\| !AR.IntelExp.hasRegs())
	OS << (AR.IntelExp.hasRegs() ? " + $$" : "$$") << AR.IntelExp.Imm;
	if (AR.IntelExp.NeedBracs)
	OS << "]";
	break;
	case AOK_Label:
	OS << Ctx.getAsmInfo()->getPrivateLabelPrefix() << AR.Label;
	break;
	case AOK_Input:
	OS << '$' << InputIdx++;
	break;
	case AOK_Output:
	OS << '$' << OutputIdx++;
	break;
	case AOK_SizeDirective:
	switch (AR.Val) {
	default: break;
	case 8: OS << "byte ptr "; break;
	case 16: OS << "word ptr "; break;
	case 32: OS << "dword ptr "; break;
	case 64: OS << "qword ptr "; break;
	case 80: OS << "xword ptr "; break;
	case 128: OS << "xmmword ptr "; break;
	case 256: OS << "ymmword ptr "; break;
	}
	break;
	case AOK_Emit:
	OS << ".byte";
	break;
	case AOK_Align: {
	// MS alignment directives are measured in bytes. If the native assembler
	// measures alignment in bytes, we can pass it straight through.
	OS << ".align";
	if (getContext().getAsmInfo()->getAlignmentIsInBytes())
	break;

	// Alignment is in log2 form, so print that instead and skip the original
	// immediate.
	unsigned Val = AR.Val;
	OS << ' ' << Val;
	assert(Val < 10 && "Expected alignment less then 2^10.");
	AdditionalSkip = (Val < 4) ? 2 : Val < 7 ? 3 : 4;
	break;
	}
	case AOK_EVEN:
	OS << ".even";
	break;
	case AOK_EndOfStatement:
	OS << "\n\t";
	break;
	}

	// Skip the original expression.
	AsmStart = Loc + AR.Len + AdditionalSkip;
	}

	// Emit the remainder of the asm string.
	if (AsmStart != AsmEnd)
	OS << StringRef(AsmStart, AsmEnd - AsmStart);

	AsmString = OS.str();
	return false;
	}

	namespace llvm {
	namespace MCParserUtils {

	/// Returns whether the given symbol is used anywhere in the given expression,
	/// or subexpressions.
	static bool isSymbolUsedInExpression(const MCSymbol Sym, const MCExpr Value) {
	switch (Value->getKind()) {
	case MCExpr::Binary: {
	const MCBinaryExpr BE = static_cast<const MCBinaryExpr >(Value);
	return isSymbolUsedInExpression(Sym, BE->getLHS()) \|\|
	isSymbolUsedInExpression(Sym, BE->getRHS());
	}
	case MCExpr::Target:
	case MCExpr::Constant:
	return false;
	case MCExpr::SymbolRef: {
	const MCSymbol &S =
	static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
	if (S.isVariable())
	return isSymbolUsedInExpression(Sym, S.getVariableValue());
	return &S == Sym;
	}
	case MCExpr::Unary:
	return isSymbolUsedInExpression(
	Sym, static_cast<const MCUnaryExpr *>(Value)->getSubExpr());
	}

	llvm_unreachable("Unknown expr kind!");
	}

	/// Parse the right-hand side of an assignment to \p Name and check that
	/// \p Name may be assigned.
	///
	/// The expression is parsed into \p Value and must be followed by an
	/// end-of-statement token. If a symbol called \p Name already exists it is
	/// returned in \p Sym after the redefinition checks below; otherwise the
	/// special name "." emits \p Value as an offset through the streamer (leaving
	/// \p Sym null), and any other name gets a freshly created symbol.
	///
	/// \param allow_redef whether an existing variable may be redefined; it is
	///        also recorded on the resulting symbol via setRedefinable().
	/// \returns false on success; otherwise the result of the parser's error
	///          reporting call (or true when the end-of-statement is missing).
	bool parseAssignmentExpression(StringRef Name, bool allow_redef,
	                               MCAsmParser &Parser, MCSymbol *&Sym,
	                               const MCExpr *&Value) {

	  // FIXME: Use better location, we should use proper tokens.
	  SMLoc EqualLoc = Parser.getTok().getLoc();

	  if (Parser.parseExpression(Value)) {
	    return Parser.TokError("missing expression");
	  }

	  // Note: we don't count b as used in "a = b". This is to allow
	  // a = b
	  // b = c

	  if (Parser.parseToken(AsmToken::EndOfStatement))
	    return true;

	  // Validate that the LHS is allowed to be a variable (either it has not been
	  // used as a symbol, or it is an absolute symbol).
	  Sym = Parser.getContext().lookupSymbol(Name);
	  if (Sym) {
	    // Diagnose assignment to a label.
	    //
	    // FIXME: Diagnostics. Note the location of the definition as a label.
	    // FIXME: Diagnose assignment to protected identifier (e.g., register name).
	    if (isSymbolUsedInExpression(Sym, Value))
	      return Parser.Error(EqualLoc, "Recursive use of '" + Name + "'");
	    else if (Sym->isUndefined(/*SetUsed*/ false) && !Sym->isUsed() &&
	             !Sym->isVariable())
	      ; // Allow redefinitions of undefined symbols only used in directives.
	    else if (Sym->isVariable() && !Sym->isUsed() && allow_redef)
	      ; // Allow redefinitions of variables that haven't yet been used.
	    else if (!Sym->isUndefined() && (!Sym->isVariable() || !allow_redef))
	      return Parser.Error(EqualLoc, "redefinition of '" + Name + "'");
	    else if (!Sym->isVariable())
	      return Parser.Error(EqualLoc, "invalid assignment to '" + Name + "'");
	    else if (!isa<MCConstantExpr>(Sym->getVariableValue()))
	      return Parser.Error(EqualLoc,
	                          "invalid reassignment of non-absolute variable '" +
	                              Name + "'");
	  } else if (Name == ".") {
	    // Assigning to "." moves the current location instead of defining a symbol.
	    Parser.getStreamer().emitValueToOffset(Value, 0, EqualLoc);
	    return false;
	  } else
	    Sym = Parser.getContext().getOrCreateSymbol(Name);

	  Sym->setRedefinable(allow_redef);

	  return false;
	}

	} // end namespace MCParserUtils
	} // end namespace llvm

	/// \brief Factory for the generic assembly parser.
	///
	/// Heap-allocates an AsmParser from the given arguments and returns it through
	/// the MCAsmParser interface.
	MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, MCContext &C,
	                                     MCStreamer &Out, const MCAsmInfo &MAI,
	                                     unsigned CB) {
	  MCAsmParser *NewParser = new AsmParser(SM, C, Out, MAI, CB);
	  return NewParser;
	}
	Index: head/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (revision 329409)
	+++ head/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (revision 329410)
	@@ -1,4160 +1,4164 @@
	//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file
	/// \brief This is the parent TargetLowering class for hardware code gen
	/// targets.
	//
	//===----------------------------------------------------------------------===//

	#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f
	#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f
	#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f

	#include "AMDGPUISelLowering.h"
	#include "AMDGPU.h"
	#include "AMDGPUCallLowering.h"
	#include "AMDGPUFrameLowering.h"
	#include "AMDGPUIntrinsicInfo.h"
	#include "AMDGPURegisterInfo.h"
	#include "AMDGPUSubtarget.h"
	#include "AMDGPUTargetMachine.h"
	#include "R600MachineFunctionInfo.h"
	#include "SIInstrInfo.h"
	#include "SIMachineFunctionInfo.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/Support/KnownBits.h"
	using namespace llvm;

	static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
	CCValAssign::LocInfo LocInfo,
	ISD::ArgFlagsTy ArgFlags, CCState &State) {
	MachineFunction &MF = State.getMachineFunction();
	AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

	uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
	ArgFlags.getOrigAlign());
	State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
	return true;
	}

	static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
	CCValAssign::LocInfo LocInfo,
	ISD::ArgFlagsTy ArgFlags, CCState &State,
	const TargetRegisterClass *RC,
	unsigned NumRegs) {
	ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
	unsigned RegResult = State.AllocateReg(RegList);
	if (RegResult == AMDGPU::NoRegister)
	return false;

	State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
	return true;
	}

	static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
	CCValAssign::LocInfo LocInfo,
	ISD::ArgFlagsTy ArgFlags, CCState &State) {
	switch (LocVT.SimpleTy) {
	case MVT::i64:
	case MVT::f64:
	case MVT::v2i32:
	case MVT::v2f32: {
	// Up to SGPR0-SGPR39
	return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
	&AMDGPU::SGPR_64RegClass, 20);
	}
	default:
	return false;
	}
	}

	// Allocate up to VGPR31.
	//
	// TODO: Since there are no VGPR alignent requirements would it be better to
	// split into individual scalar registers?
	static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
	CCValAssign::LocInfo LocInfo,
	ISD::ArgFlagsTy ArgFlags, CCState &State) {
	switch (LocVT.SimpleTy) {
	case MVT::i64:
	case MVT::f64:
	case MVT::v2i32:
	case MVT::v2f32: {
	return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
	&AMDGPU::VReg_64RegClass, 31);
	}
	case MVT::v4i32:
	case MVT::v4f32:
	case MVT::v2i64:
	case MVT::v2f64: {
	return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
	&AMDGPU::VReg_128RegClass, 29);
	}
	case MVT::v8i32:
	case MVT::v8f32: {
	return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
	&AMDGPU::VReg_256RegClass, 25);

	}
	case MVT::v16i32:
	case MVT::v16f32: {
	return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
	&AMDGPU::VReg_512RegClass, 17);

	}
	default:
	return false;
	}
	}

	#include "AMDGPUGenCallingConv.inc"

	// Find a larger type to do a load / store of a vector with.
	//
	// Store sizes of at most 32 bits map to an integer type of that width; larger
	// sizes (which must be a multiple of 32 bits) map to a vector of i32.
	EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
	  const unsigned Bits = VT.getStoreSizeInBits();

	  if (Bits > 32) {
	    assert(Bits % 32 == 0 && "Store size not a multiple of 32");
	    return EVT::getVectorVT(Ctx, MVT::i32, Bits / 32);
	  }

	  return EVT::getIntegerVT(Ctx, Bits);
	}

	/// Returns the bit width of \p Op's type minus the number of leading bits that
	/// the DAG's known-bits analysis reports as zero.
	unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
	  KnownBits OpBits;
	  DAG.computeKnownBits(Op, OpBits);

	  const unsigned TypeBits = Op.getValueType().getSizeInBits();
	  return TypeBits - OpBits.countMinLeadingZeros();
	}

	/// Returns the bit width of \p Op's type minus the number of sign bits the DAG
	/// computes for it.
	unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
	  const unsigned TypeBits = Op.getValueType().getSizeInBits();

	  // In order for this to be a signed 24-bit value, bit 23, must
	  // be a sign bit.
	  const unsigned SignBits = DAG.ComputeNumSignBits(Op);
	  return TypeBits - SignBits;
	}

	/// Construct the common AMDGPU lowering configuration.
	///
	/// Initializes the TargetLowering base with \p TM, remembers \p STI as the
	/// subtarget and caches the address-space info from AMDGPU::getAMDGPUAS(TM).
	/// The rest of the body is configuration only: it registers the legalization
	/// action for loads, extending loads, stores, truncating stores and the
	/// individual ISD operations (some of them conditional on subtarget feature
	/// queries), clears every libcall name, sets boolean-content, scheduling and
	/// memcpy/memmove/memset expansion limits, and finally lists the node kinds
	/// for which target DAG combines are requested.
	AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
	const AMDGPUSubtarget &STI)
	: TargetLowering(TM), Subtarget(&STI) {
	AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
	// Lower floating point store/load to integer store/load to reduce the number
	// of patterns in tablegen.
	setOperationAction(ISD::LOAD, MVT::f32, Promote);
	AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);

	setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
	AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);

	setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
	AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);

	setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
	AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);

	setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
	AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);

	setOperationAction(ISD::LOAD, MVT::i64, Promote);
	AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);

	setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
	AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

	setOperationAction(ISD::LOAD, MVT::f64, Promote);
	AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);

	setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
	AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);

	// There are no 64-bit extloads. These should be done as a 32-bit extload and
	// an extension to 64-bit.
	for (MVT VT : MVT::integer_valuetypes()) {
	setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
	setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
	}

	for (MVT VT : MVT::integer_valuetypes()) {
	if (VT == MVT::i64)
	continue;

	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);

	setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);

	setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
	}

	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
	}

	setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);

	setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);

	setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);

	setOperationAction(ISD::STORE, MVT::f32, Promote);
	AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);

	setOperationAction(ISD::STORE, MVT::v2f32, Promote);
	AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);

	setOperationAction(ISD::STORE, MVT::v4f32, Promote);
	AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);

	setOperationAction(ISD::STORE, MVT::v8f32, Promote);
	AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);

	setOperationAction(ISD::STORE, MVT::v16f32, Promote);
	AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);

	setOperationAction(ISD::STORE, MVT::i64, Promote);
	AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);

	setOperationAction(ISD::STORE, MVT::v2i64, Promote);
	AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

	setOperationAction(ISD::STORE, MVT::f64, Promote);
	AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);

	setOperationAction(ISD::STORE, MVT::v2f64, Promote);
	AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);

	setTruncStoreAction(MVT::i64, MVT::i1, Expand);
	setTruncStoreAction(MVT::i64, MVT::i8, Expand);
	setTruncStoreAction(MVT::i64, MVT::i16, Expand);
	setTruncStoreAction(MVT::i64, MVT::i32, Expand);

	setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
	setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
	setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
	setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
	setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
	setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);

	setTruncStoreAction(MVT::f64, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f32, Expand);

	setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
	setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

	setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
	setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);

	setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
	setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);


	setOperationAction(ISD::Constant, MVT::i32, Legal);
	setOperationAction(ISD::Constant, MVT::i64, Legal);
	setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
	setOperationAction(ISD::ConstantFP, MVT::f64, Legal);

	setOperationAction(ISD::BR_JT, MVT::Other, Expand);
	setOperationAction(ISD::BRIND, MVT::Other, Expand);

	// This is totally unsupported, just custom lower to produce an error.
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);

	// Library functions. These default to Expand, but we have instructions
	// for them.
	setOperationAction(ISD::FCEIL, MVT::f32, Legal);
	setOperationAction(ISD::FEXP2, MVT::f32, Legal);
	setOperationAction(ISD::FPOW, MVT::f32, Legal);
	setOperationAction(ISD::FLOG2, MVT::f32, Legal);
	setOperationAction(ISD::FABS, MVT::f32, Legal);
	setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
	setOperationAction(ISD::FRINT, MVT::f32, Legal);
	setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
	setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);

	setOperationAction(ISD::FROUND, MVT::f32, Custom);
	setOperationAction(ISD::FROUND, MVT::f64, Custom);

	setOperationAction(ISD::FLOG, MVT::f32, Custom);
	setOperationAction(ISD::FLOG10, MVT::f32, Custom);

	if (Subtarget->has16BitInsts()) {
	setOperationAction(ISD::FLOG, MVT::f16, Custom);
	setOperationAction(ISD::FLOG10, MVT::f16, Custom);
	}

	setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
	setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

	setOperationAction(ISD::FREM, MVT::f32, Custom);
	setOperationAction(ISD::FREM, MVT::f64, Custom);

	// v_mad_f32 does not support denormals according to some sources.
	if (!Subtarget->hasFP32Denormals())
	setOperationAction(ISD::FMAD, MVT::f32, Legal);

	// Expand to fneg + fadd.
	setOperationAction(ISD::FSUB, MVT::f64, Expand);

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);

	if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
	setOperationAction(ISD::FCEIL, MVT::f64, Custom);
	setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
	setOperationAction(ISD::FRINT, MVT::f64, Custom);
	setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
	}

	if (!Subtarget->hasBFI()) {
	// fcopysign can be done in a single instruction with BFI.
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
	}

	setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
	setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);

	const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
	for (MVT VT : ScalarIntVTs) {
	// These should use [SU]DIVREM, so set them to expand
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);

	// GPU does not have divrem function for signed or unsigned.
	setOperationAction(ISD::SDIVREM, VT, Custom);
	setOperationAction(ISD::UDIVREM, VT, Custom);

	// GPU does not have [S|U]MUL_LOHI functions as a single instruction.
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);

	setOperationAction(ISD::BSWAP, VT, Expand);
	setOperationAction(ISD::CTTZ, VT, Expand);
	setOperationAction(ISD::CTLZ, VT, Expand);
	}

	if (!Subtarget->hasBCNT(32))
	setOperationAction(ISD::CTPOP, MVT::i32, Expand);

	if (!Subtarget->hasBCNT(64))
	setOperationAction(ISD::CTPOP, MVT::i64, Expand);

	// The hardware supports 32-bit ROTR, but not ROTL.
	setOperationAction(ISD::ROTL, MVT::i32, Expand);
	setOperationAction(ISD::ROTL, MVT::i64, Expand);
	setOperationAction(ISD::ROTR, MVT::i64, Expand);

	setOperationAction(ISD::MUL, MVT::i64, Expand);
	setOperationAction(ISD::MULHU, MVT::i64, Expand);
	setOperationAction(ISD::MULHS, MVT::i64, Expand);
	setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);

	setOperationAction(ISD::SMIN, MVT::i32, Legal);
	setOperationAction(ISD::UMIN, MVT::i32, Legal);
	setOperationAction(ISD::SMAX, MVT::i32, Legal);
	setOperationAction(ISD::UMAX, MVT::i32, Legal);

	if (Subtarget->hasFFBH())
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);

	if (Subtarget->hasFFBL())
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);

	setOperationAction(ISD::CTTZ, MVT::i64, Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
	setOperationAction(ISD::CTLZ, MVT::i64, Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);

	// We only really have 32-bit BFE instructions (and 16-bit on VI).
	//
	// On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
	// effort to match them now. We want this to be false for i64 cases when the
	// extraction isn't restricted to the upper or lower half. Ideally we would
	// have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
	// span the midpoint are probably relatively rare, so don't worry about them
	// for now.
	if (Subtarget->hasBFE())
	setHasExtractBitsInsn(true);

	static const MVT::SimpleValueType VectorIntTypes[] = {
	MVT::v2i32, MVT::v4i32
	};

	for (MVT VT : VectorIntTypes) {
	// Expand the following operations for the current type by default.
	setOperationAction(ISD::ADD, VT, Expand);
	setOperationAction(ISD::AND, VT, Expand);
	setOperationAction(ISD::FP_TO_SINT, VT, Expand);
	setOperationAction(ISD::FP_TO_UINT, VT, Expand);
	setOperationAction(ISD::MUL, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::OR, VT, Expand);
	setOperationAction(ISD::SHL, VT, Expand);
	setOperationAction(ISD::SRA, VT, Expand);
	setOperationAction(ISD::SRL, VT, Expand);
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	setOperationAction(ISD::SUB, VT, Expand);
	setOperationAction(ISD::SINT_TO_FP, VT, Expand);
	setOperationAction(ISD::UINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);
	setOperationAction(ISD::SDIVREM, VT, Custom);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	setOperationAction(ISD::ADDC, VT, Expand);
	setOperationAction(ISD::SUBC, VT, Expand);
	setOperationAction(ISD::ADDE, VT, Expand);
	setOperationAction(ISD::SUBE, VT, Expand);
	setOperationAction(ISD::SELECT, VT, Expand);
	setOperationAction(ISD::VSELECT, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	setOperationAction(ISD::XOR, VT, Expand);
	setOperationAction(ISD::BSWAP, VT, Expand);
	setOperationAction(ISD::CTPOP, VT, Expand);
	setOperationAction(ISD::CTTZ, VT, Expand);
	setOperationAction(ISD::CTLZ, VT, Expand);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
	setOperationAction(ISD::SETCC, VT, Expand);
	}

	static const MVT::SimpleValueType FloatVectorTypes[] = {
	MVT::v2f32, MVT::v4f32
	};

	for (MVT VT : FloatVectorTypes) {
	setOperationAction(ISD::FABS, VT, Expand);
	setOperationAction(ISD::FMINNUM, VT, Expand);
	setOperationAction(ISD::FMAXNUM, VT, Expand);
	setOperationAction(ISD::FADD, VT, Expand);
	setOperationAction(ISD::FCEIL, VT, Expand);
	setOperationAction(ISD::FCOS, VT, Expand);
	setOperationAction(ISD::FDIV, VT, Expand);
	setOperationAction(ISD::FEXP2, VT, Expand);
	setOperationAction(ISD::FLOG2, VT, Expand);
	setOperationAction(ISD::FREM, VT, Expand);
	setOperationAction(ISD::FLOG, VT, Expand);
	setOperationAction(ISD::FLOG10, VT, Expand);
	setOperationAction(ISD::FPOW, VT, Expand);
	setOperationAction(ISD::FFLOOR, VT, Expand);
	setOperationAction(ISD::FTRUNC, VT, Expand);
	setOperationAction(ISD::FMUL, VT, Expand);
	setOperationAction(ISD::FMA, VT, Expand);
	setOperationAction(ISD::FRINT, VT, Expand);
	setOperationAction(ISD::FNEARBYINT, VT, Expand);
	setOperationAction(ISD::FSQRT, VT, Expand);
	setOperationAction(ISD::FSIN, VT, Expand);
	setOperationAction(ISD::FSUB, VT, Expand);
	setOperationAction(ISD::FNEG, VT, Expand);
	setOperationAction(ISD::VSELECT, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
	setOperationAction(ISD::SETCC, VT, Expand);
	}

	// This causes using an unrolled select operation rather than expansion with
	// bit operations. This is in general better, but the alternative using BFI
	// instructions may be better if the select sources are SGPRs.
	setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
	AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);

	setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
	AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);

	// There are no libcalls of any kind.
	for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
	setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);

	setBooleanContents(ZeroOrNegativeOneBooleanContent);
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	setSchedulingPreference(Sched::RegPressure);
	setJumpIsExpensive(true);

	// FIXME: This is only partially true. If we have to do vector compares, any
	// SGPR pair can be a condition register. If we have a uniform condition, we
	// are better off doing SALU operations, where there is only one SCC. For now,
	// we don't have a way of knowing during instruction selection if a condition
	// will be uniform and we always use vector compares. Assume we are using
	// vector compares until that is fixed.
	setHasMultipleConditionRegisters(true);

	// SI at least has hardware support for floating point exceptions, but no way
	// of using or handling them is implemented. They are also optional in OpenCL
	// (Section 7.3)
	setHasFloatingPointExceptions(Subtarget->hasFPExceptions());

	PredictableSelectIsExpensive = false;

	// We want to find all load dependencies for long chains of stores to enable
	// merging into very wide vectors. The problem is with vectors with > 4
	// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
	// vectors are a legal type, even though we have to split the loads
	// usually. When we can more precisely specify load legality per address
	// space, we should be able to make FindBetterChain/MergeConsecutiveStores
	// smarter so that they can figure out what to do in 2 iterations without all
	// N > 4 stores on the same chain.
	GatherAllAliasesMaxDepth = 16;

	// memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
	// about these during lowering.
	MaxStoresPerMemcpy = 0xffffffff;
	MaxStoresPerMemmove = 0xffffffff;
	MaxStoresPerMemset = 0xffffffff;

	// Node kinds for which target DAG combines are requested.
	setTargetDAGCombine(ISD::BITCAST);
	setTargetDAGCombine(ISD::SHL);
	setTargetDAGCombine(ISD::SRA);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::MUL);
	setTargetDAGCombine(ISD::MULHU);
	setTargetDAGCombine(ISD::MULHS);
	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::SELECT_CC);
	setTargetDAGCombine(ISD::STORE);
	setTargetDAGCombine(ISD::FADD);
	setTargetDAGCombine(ISD::FSUB);
	setTargetDAGCombine(ISD::FNEG);
	setTargetDAGCombine(ISD::FABS);
	setTargetDAGCombine(ISD::AssertZext);
	setTargetDAGCombine(ISD::AssertSext);
	}

	//===----------------------------------------------------------------------===//
	// Target Information
	//===----------------------------------------------------------------------===//

	/// Opcode whitelist: returns true exactly for the opcodes listed in the
	/// switch below and false for every other opcode.
	LLVM_READNONE
	static bool fnegFoldsIntoOp(unsigned Opc) {
	  bool Folds = false;

	  switch (Opc) {
	  case ISD::FADD:
	  case ISD::FSUB:
	  case ISD::FMUL:
	  case ISD::FMA:
	  case ISD::FMAD:
	  case ISD::FMINNUM:
	  case ISD::FMAXNUM:
	  case ISD::FSIN:
	  case ISD::FTRUNC:
	  case ISD::FRINT:
	  case ISD::FNEARBYINT:
	  case AMDGPUISD::RCP:
	  case AMDGPUISD::RCP_LEGACY:
	  case AMDGPUISD::SIN_HW:
	  case AMDGPUISD::FMUL_LEGACY:
	  case AMDGPUISD::FMIN_LEGACY:
	  case AMDGPUISD::FMAX_LEGACY:
	    Folds = true;
	    break;
	  default:
	    break;
	  }

	  return Folds;
	}

	/// \returns true if the operation will definitely need to use a 64-bit
	/// encoding, and thus will use a VOP3 encoding regardless of the source
	/// modifiers.
	///
	/// That is the case when \p N has more than two operands or when \p VT is f64.
	LLVM_READONLY
	static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
	  return N->getNumOperands() > 2 || VT == MVT::f64;
	}

	// Most FP instructions support source modifiers, but this could be refined
	// slightly.
	//
	// Memory nodes and the opcodes listed below are reported as not supporting
	// them; everything else is assumed to.
	LLVM_READONLY
	static bool hasSourceMods(const SDNode *N) {
	  if (isa<MemSDNode>(N))
	    return false;

	  bool SupportsMods = true;

	  switch (N->getOpcode()) {
	  case ISD::CopyToReg:
	  case ISD::SELECT:
	  case ISD::FDIV:
	  case ISD::FREM:
	  case ISD::INLINEASM:
	  case AMDGPUISD::INTERP_P1:
	  case AMDGPUISD::INTERP_P2:
	  case AMDGPUISD::DIV_SCALE:

	  // TODO: Should really be looking at the users of the bitcast. These are
	  // problematic because bitcasts are used to legalize all stores to integer
	  // types.
	  case ISD::BITCAST:
	    SupportsMods = false;
	    break;
	  default:
	    break;
	  }

	  return SupportsMods;
	}

	/// Returns true if every user of \p N passes hasSourceMods() and at most
	/// \p CostThreshold of those users are not already forced into a VOP3
	/// encoding.
	bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
	                                                 unsigned CostThreshold) {
	  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
	  // it is truly free to use a source modifier in all cases. If there are
	  // multiple users but for each one will necessitate using VOP3, there will be
	  // a code size increase. Try to avoid increasing code size unless we know it
	  // will save on the instruction count.
	  MVT ScalarVT = N->getValueType(0).getScalarType().getSimpleVT();
	  unsigned SizeGrowingUsers = 0;

	  // XXX - Should this limit number of uses to check?
	  for (const SDNode *User : N->uses()) {
	    if (!hasSourceMods(User))
	      return false;

	    // Users that need VOP3 anyway do not count against the threshold.
	    if (opMustUseVOP3Encoding(User, ScalarVT))
	      continue;

	    ++SizeGrowingUsers;
	    if (SizeGrowingUsers > CostThreshold)
	      return false;
	  }

	  return true;
	}

	// Vector indices are always reported as i32; the DataLayout argument is
	// ignored.
	MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
	return MVT::i32;
	}

	// Every kind of select is reported as supported; SelType is not inspected.
	bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
	return true;
	}

	// The backend supports 32 and 64 bit floating point immediates, plus 16 bit
	// ones when the subtarget has 16-bit instructions. Only the scalar type of
	// VT is examined; the value of Imm is not.
	// FIXME: Why are we reporting vectors of FP immediates as legal?
	bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
	  EVT ScalarVT = VT.getScalarType();
	  return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
	          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
	}

	// We don't want to shrink f64 / f32 constants. Any other scalar type may be
	// shrunk.
	bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
	  const EVT EltVT = VT.getScalarType();

	  if (EltVT == MVT::f32)
	    return false;
	  if (EltVT == MVT::f64)
	    return false;
	  return true;
	}

	/// Decides whether narrowing the load \p N to \p NewVT is worthwhile.
	/// Narrowing to exactly 32 bits is always accepted; otherwise it is accepted
	/// only when the existing result type is already narrower than 32 bits.
	bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
	                                                 ISD::LoadExtType,
	                                                 EVT NewVT) const {
	  // If we are reducing to a 32-bit load, this is always better.
	  if (NewVT.getStoreSizeInBits() == 32)
	    return true;

	  // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
	  // extloads, so doing one requires using a buffer_load. In cases where we
	  // still couldn't use a scalar load, using the wider load shouldn't really
	  // hurt anything.

	  // If the old size already had to be an extload, there's no harm in continuing
	  // to reduce the width.
	  const EVT CurrentVT = N->getValueType(0);
	  const unsigned CurrentBits = CurrentVT.getStoreSizeInBits();
	  return CurrentBits < 32;
	}

	/// Reports whether loading as \p CastTy instead of \p LoadTy is beneficial.
	/// Both types must have the same total size. Loads whose scalar type is
	/// already i32 are never recast; otherwise the cast is considered beneficial
	/// when it widens the scalar element or the cast scalar is at least 32 bits.
	bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
	                                                   EVT CastTy) const {

	  assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());

	  if (LoadTy.getScalarType() == MVT::i32)
	    return false;

	  unsigned LScalarSize = LoadTy.getScalarSizeInBits();
	  unsigned CastScalarSize = CastTy.getScalarSizeInBits();

	  return (LScalarSize < CastScalarSize) ||
	         (CastScalarSize >= 32);
	}

	// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
	// profitable with the expansion for 64-bit since it's generally good to
	// speculate things.
	// FIXME: These should really have the size as a parameter.
	//
	// Unconditionally reports cttz as cheap to speculate.
	bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
	return true;
	}

	// Unconditionally reports ctlz as cheap to speculate.
	bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
	return true;
	}

	//===---------------------------------------------------------------------===//
	// Target Properties
	//===---------------------------------------------------------------------===//

	/// Reports whether fabs is free for the floating-point type \p VT: true for
	/// f32 and f64, and for f16 when the subtarget has 16-bit instructions.
	bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
	  assert(VT.isFloatingPoint());

	  // Packed operations do not have a fabs modifier.
	  return VT == MVT::f32 || VT == MVT::f64 ||
	         (Subtarget->has16BitInsts() && VT == MVT::f16);
	}

	/// Reports whether fneg is free for the floating-point type \p VT: true for
	/// f32 and f64, for f16 when the subtarget has 16-bit instructions, and for
	/// v2f16 when the subtarget has VOP3P instructions.
	bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
	  assert(VT.isFloatingPoint());
	  return VT == MVT::f32 || VT == MVT::f64 ||
	         (Subtarget->has16BitInsts() && VT == MVT::f16) ||
	         (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
	}

	/// Storing a vector constant is always reported as cheap; none of the
	/// arguments are consulted.
	bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
	                                                        unsigned NumElem,
	                                                        unsigned AS) const {
	  return true;
	}

	// Always answers true, independent of VecVT; see the rationale below.
	bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
	// There are few operations which truly have vector input operands. Any vector
	// operation is going to involve operations on each component, and a
	// build_vector will be a copy per element, so it always makes sense to use a
	// build_vector input in place of the extracted element to avoid a copy into a
	// super register.
	//
	// We should probably only do this if all users are extracts only, but this
	// should be the common case.
	return true;
	}

	/// A truncate from \p Source to \p Dest is free when the destination is
	/// strictly narrower than the source and its size is a multiple of 32 bits.
	bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
	  // Truncate is just accessing a subregister.
	  const unsigned FromBits = Source.getSizeInBits();
	  const unsigned ToBits = Dest.getSizeInBits();

	  if (ToBits >= FromBits)
	    return false;
	  return ToBits % 32 == 0;
	}

	bool AMDGPUTargetLowering::isTruncateFree(Type Source, Type Dest) const {
	// Truncate is just accessing a subregister.

	unsigned SrcSize = Source->getScalarSizeInBits();
	unsigned DestSize = Dest->getScalarSizeInBits();

	if (DestSize== 16 && Subtarget->has16BitInsts())
	return SrcSize >= 32;

	return DestSize < SrcSize && DestSize % 32 == 0;
	}

	bool AMDGPUTargetLowering::isZExtFree(Type Src, Type Dest) const {
	unsigned SrcSize = Src->getScalarSizeInBits();
	unsigned DestSize = Dest->getScalarSizeInBits();

	if (SrcSize == 16 && Subtarget->has16BitInsts())
	return DestSize >= 32;

	return SrcSize == 32 && DestSize == 64;
	}

	/// EVT overload: i16 extends for free to i32 or i64, and i32 extends for free
	/// to i64. Every other combination is not free.
	bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
	  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
	  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
	  // this will enable reducing 64-bit operations to 32-bit, which is always
	  // good.

	  if (Src == MVT::i16)
	    return Dest == MVT::i32 || Dest == MVT::i64;

	  return Src == MVT::i32 && Dest == MVT::i64;
	}

	// SDValue overload: forwards to the EVT overload using the value type of Val.
	bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	return isZExtFree(Val.getValueType(), VT2);
	}

	// v_mad_mix* support a conversion from f16 to f32.
	//
	// There is only one special case where this is OK to use when denormals are
	// enabled, and we don't currently handle it.
	//
	// The extension folds only for FMAD on subtargets with mad-mix instructions,
	// an f32 destination scalar, FP32 denormals disabled, and an f16 source
	// scalar.
	bool AMDGPUTargetLowering::isFPExtFoldable(unsigned Opcode,
	                                           EVT DestVT, EVT SrcVT) const {
	  if (Opcode != ISD::FMAD)
	    return false;
	  if (!Subtarget->hasMadMixInsts())
	    return false;
	  if (!(DestVT.getScalarType() == MVT::f32))
	    return false;
	  if (Subtarget->hasFP32Denormals())
	    return false;
	  return SrcVT.getScalarType() == MVT::f16;
	}

	/// Narrowing is reported as profitable only when a type wider than 32 bits is
	/// shrunk to exactly 32 bits.
	bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
	  // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
	  // limited number of native 64-bit operations. Shrinking an operation to fit
	  // in a single 32-bit register should always be helpful. As currently used,
	  // this is much less general than the name suggests, and is only used in
	  // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
	  // not profitable, and may actually be harmful.
	  if (SrcVT.getSizeInBits() <= 32)
	    return false;
	  return DestVT.getSizeInBits() == 32;
	}

	//===---------------------------------------------------------------------===//
	// TargetLowering Callbacks
	//===---------------------------------------------------------------------===//

	/// Maps the calling convention \p CC to the CCAssignFn used for call
	/// arguments. \p IsVarArg is not consulted. Any convention not listed is a
	/// fatal error.
	CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
	                                                  bool IsVarArg) {
	  switch (CC) {
	  // Kernel conventions.
	  case CallingConv::AMDGPU_KERNEL:
	  case CallingConv::SPIR_KERNEL:
	    return CC_AMDGPU_Kernel;

	  // AMDGPU_* stage conventions.
	  case CallingConv::AMDGPU_VS:
	  case CallingConv::AMDGPU_GS:
	  case CallingConv::AMDGPU_PS:
	  case CallingConv::AMDGPU_CS:
	  case CallingConv::AMDGPU_HS:
	  case CallingConv::AMDGPU_ES:
	  case CallingConv::AMDGPU_LS:
	    return CC_AMDGPU;

	  // Ordinary function conventions.
	  case CallingConv::C:
	  case CallingConv::Fast:
	  case CallingConv::Cold:
	    return CC_AMDGPU_Func;

	  default:
	    report_fatal_error("Unsupported calling convention.");
	  }
	}

	/// Maps the calling convention \p CC to the CCAssignFn used for return
	/// values. \p IsVarArg is not consulted. Any convention not listed is a fatal
	/// error.
	CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
	                                                    bool IsVarArg) {
	  switch (CC) {
	  // Kernel conventions.
	  case CallingConv::AMDGPU_KERNEL:
	  case CallingConv::SPIR_KERNEL:
	    return CC_AMDGPU_Kernel;

	  // AMDGPU_* stage conventions share the shader return convention.
	  case CallingConv::AMDGPU_VS:
	  case CallingConv::AMDGPU_GS:
	  case CallingConv::AMDGPU_PS:
	  case CallingConv::AMDGPU_CS:
	  case CallingConv::AMDGPU_HS:
	  case CallingConv::AMDGPU_ES:
	  case CallingConv::AMDGPU_LS:
	    return RetCC_SI_Shader;

	  // Ordinary function conventions.
	  case CallingConv::C:
	  case CallingConv::Fast:
	  case CallingConv::Cold:
	    return RetCC_AMDGPU_Func;

	  default:
	    report_fatal_error("Unsupported calling convention.");
	  }
	}

	/// The SelectionDAGBuilder will automatically promote function arguments
	/// with illegal types. However, this does not work for the AMDGPU targets
	/// since the function arguments are stored in memory as these illegal types.
	/// In order to handle this properly we need to get the original types sizes
	/// from the LLVM IR Function and fixup the ISD::InputArg values before
	/// passing them to AnalyzeFormalArguments()

	/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
	/// input values across multiple registers. Each item in the Ins array
	/// represents a single value that will be stored in registers. Ins[x].VT is
	/// the value type of the value that will be stored in the register, so
	/// whatever SDNode we lower the argument to needs to be this type.
	///
	/// In order to correctly lower the arguments we need to know the size of each
	/// argument. Since Ins[x].VT gives us the size of the register that will
	/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
	/// for the original function argument so that we can deduce the correct memory
	/// type to use for Ins[x]. In most cases the correct memory type will be
	/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
	/// we have a kernel argument of type v8i8, this argument will be split into
	/// 8 parts and each part will be represented by its own item in the Ins array.
	/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
	/// the argument before it was split. From this, we deduce that the memory type
	/// for each individual part is i8. We pass the memory type as LocVT to the
	/// calling convention analysis function and the register type (Ins[x].VT) as
	/// the ValVT.
	void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
	                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
	  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
	    const ISD::InputArg &In = Ins[i];
	    EVT MemVT;

	    unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);

	    if (!Subtarget->isAmdHsaOS() &&
	        (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
	      // The ABI says the caller will extend these values to 32-bits.
	      MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
	    } else if (NumRegs == 1) {
	      // This argument is not split, so the IR type is the memory type.
	      assert(!In.Flags.isSplit());
	      if (In.ArgVT.isExtended()) {
	        // We have an extended type, like i24, so we should just use the register type
	        MemVT = In.VT;
	      } else {
	        MemVT = In.ArgVT;
	      }
	    } else if (In.ArgVT.isVector() && In.VT.isVector() &&
	               In.ArgVT.getScalarType() == In.VT.getScalarType()) {
	      assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
	      // We have a vector value which has been split into a vector with
	      // the same scalar type, but fewer elements. This should handle
	      // all the floating-point vector types.
	      MemVT = In.VT;
	    } else if (In.ArgVT.isVector() &&
	               In.ArgVT.getVectorNumElements() == NumRegs) {
	      // This arg has been split so that each element is stored in a separate
	      // register.
	      MemVT = In.ArgVT.getScalarType();
	    } else if (In.ArgVT.isExtended()) {
	      // We have an extended type, like i65.
	      MemVT = In.VT;
	    } else {
	      // Divide the stored size evenly across the registers.
	      unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
	      assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
	      if (In.VT.isInteger()) {
	        MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
	      } else if (In.VT.isVector()) {
	        assert(!In.VT.getScalarType().isFloatingPoint());
	        unsigned NumElements = In.VT.getVectorNumElements();
	        assert(MemoryBits % NumElements == 0);
	        // This vector type has been split into another vector type with
	        // a different elements size.
	        EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
	                                         MemoryBits / NumElements);
	        MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
	      } else {
	        llvm_unreachable("cannot deduce memory type.");
	      }
	    }

	    // Convert one element vectors to scalar.
	    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
	      MemVT = MemVT.getScalarType();

	    if (MemVT.isExtended()) {
	      // This should really only happen if we have vec3 arguments
	      assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
	      MemVT = MemVT.getPow2VectorType(State.getContext());
	    }

	    assert(MemVT.isSimple());
	    allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
	                    State);
	  }
	}

	// Lowers a return to an AMDGPUISD::ENDPGM node chained on Chain. The calling
	// convention, vararg flag, Outs and OutVals are all ignored here.
	SDValue AMDGPUTargetLowering::LowerReturn(
	SDValue Chain, CallingConv::ID CallConv,
	bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SDLoc &DL, SelectionDAG &DAG) const {
	// FIXME: Fails for r600 tests
	//assert(!isVarArg && Outs.empty() && OutVals.empty() &&
	// "wave terminate should not have return values");
	return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
	}

	//===---------------------------------------------------------------------===//
	// Target specific lowering
	//===---------------------------------------------------------------------===//

	/// Selects the correct CCAssignFn for a given CallingConvention value.
	/// Forwards directly to AMDGPUCallLowering::CCAssignFnForCall.
	CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
	bool IsVarArg) {
	return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
	}

	/// Selects the CCAssignFn used for return values of the given calling
	/// convention. Forwards directly to AMDGPUCallLowering::CCAssignFnForReturn.
	CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
	bool IsVarArg) {
	return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
	}

	/// Builds a TokenFactor that joins \p Chain with the output chains of every
	/// load hanging off the entry node that reads a negative-index frame object
	/// overlapping the byte range of frame object \p ClobberedFI.
	SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
	                                                  SelectionDAG &DAG,
	                                                  MachineFrameInfo &MFI,
	                                                  int ClobberedFI) const {
	  SmallVector<SDValue, 8> ArgChains;
	  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
	  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

	  // Include the original chain at the beginning of the list. When this is
	  // used by target LowerCall hooks, this helps legalize find the
	  // CALLSEQ_BEGIN node.
	  ArgChains.push_back(Chain);

	  // Add a chain value for each stack argument load that overlaps the
	  // clobbered object.
	  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
	                            UE = DAG.getEntryNode().getNode()->use_end();
	       U != UE; ++U) {
	    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
	      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
	        if (FI->getIndex() < 0) {
	          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
	          int64_t InLastByte = InFirstByte;
	          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

	          // Closed-interval overlap test between the two byte ranges.
	          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
	              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
	            ArgChains.push_back(SDValue(L, 1));
	        }
	      }
	    }
	  }

	  // Build a tokenfactor for all the chains.
	  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
	}

	/// Emits an "unsupported" diagnostic for the call described by \p CLI, using
	/// \p Reason followed by the callee name (or "<unknown>"). For non-tail calls
	/// one undef value per entry of CLI.Ins is appended to \p InVals. Returns the
	/// DAG entry node.
	SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
	                                                 SmallVectorImpl<SDValue> &InVals,
	                                                 StringRef Reason) const {
	  SelectionDAG &DAG = CLI.DAG;
	  SDValue Target = CLI.Callee;

	  const Function &Caller = DAG.getMachineFunction().getFunction();

	  // Resolve a printable callee name when the callee is a symbol or a global.
	  StringRef CalleeName("<unknown>");
	  if (const ExternalSymbolSDNode *Sym = dyn_cast<ExternalSymbolSDNode>(Target))
	    CalleeName = Sym->getSymbol();
	  else if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Target))
	    CalleeName = GA->getGlobal()->getName();

	  DiagnosticInfoUnsupported Diag(
	      Caller, Reason + CalleeName, CLI.DL.getDebugLoc());
	  DAG.getContext()->diagnose(Diag);

	  if (!CLI.IsTailCall) {
	    for (const ISD::InputArg &In : CLI.Ins)
	      InVals.push_back(DAG.getUNDEF(In.VT));
	  }

	  return DAG.getEntryNode();
	}

	// Calls are not lowered here: every call is routed to lowerUnhandledCall,
	// which reports it as unsupported.
	SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
	}

	/// Reports dynamic alloca as unsupported, then returns a merge of a zero
	/// constant of the result type and operand 0 of \p Op.
	SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  const Function &Caller = DAG.getMachineFunction().getFunction();

	  DiagnosticInfoUnsupported Diag(Caller, "unsupported dynamic alloca",
	                                 SDLoc(Op).getDebugLoc());
	  DAG.getContext()->diagnose(Diag);

	  SDValue ZeroResult = DAG.getConstant(0, SDLoc(), Op.getValueType());
	  SDValue FirstOperand = Op.getOperand(0);
	  return DAG.getMergeValues({ZeroResult, FirstOperand}, SDLoc());
	}

	/// Dispatches custom lowering of \p Op to the matching Lower* helper based on
	/// its opcode. An opcode without a case here prints the node to errs() and
	/// hits llvm_unreachable.
	SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  switch (Op.getOpcode()) {
	  default:
	    Op->print(errs(), &DAG);
	    // Adjacent literals are concatenated, so the separating space must live
	    // inside one of them.
	    llvm_unreachable("Custom lowering code for this "
	                     "instruction is not implemented yet!");
	    break;
	  case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
	  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
	  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
	  case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
	  case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
	  case ISD::FREM: return LowerFREM(Op, DAG);
	  case ISD::FCEIL: return LowerFCEIL(Op, DAG);
	  case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
	  case ISD::FRINT: return LowerFRINT(Op, DAG);
	  case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
	  case ISD::FROUND: return LowerFROUND(Op, DAG);
	  case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
	  // FLOG and FLOG10 share LowerFLOG and differ only in the scale factor
	  // passed as its third argument.
	  case ISD::FLOG:
	    return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
	  case ISD::FLOG10:
	    return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
	  case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
	  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
	  case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
	  case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
	  case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
	  case ISD::CTTZ:
	  case ISD::CTTZ_ZERO_UNDEF:
	  case ISD::CTLZ:
	  case ISD::CTLZ_ZERO_UNDEF:
	    return LowerCTLZ_CTTZ(Op, DAG);
	  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
	  }
	  return Op;
	}

	/// Deliberately produces no replacement results for any opcode; \p Results is
	/// left untouched.
	void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
	                                              SmallVectorImpl<SDValue> &Results,
	                                              SelectionDAG &DAG) const {
	  switch (N->getOpcode()) {
	  case ISD::SIGN_EXTEND_INREG:
	    // Different parts of legalization seem to interpret which type of
	    // sign_extend_inreg is the one to check for custom lowering. The extended
	    // from type is what really matters, but some places check for custom
	    // lowering of the result type. This results in trying to use
	    // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
	    // nothing here and let the illegal result integer be handled normally.
	    break;
	  default:
	    break;
	  }
	}

	/// Returns true if \p GV is a GlobalVariable that has an initializer which is
	/// not an UndefValue.
	static bool hasDefinedInitializer(const GlobalValue *GV) {
	  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
	  if (!GVar || !GVar->hasInitializer())
	    return false;

	  return !isa<UndefValue>(GVar->getInitializer());
	}

	/// Lowers a global address. A LOCAL_ADDRESS global without a defined
	/// initializer is allocated through \p MFI and lowered to its offset as a
	/// constant. Every other case emits an "unsupported initializer" diagnostic
	/// and returns an empty SDValue.
	SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
	                                                 SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
	  const GlobalValue *Global = GA->getGlobal();

	  if (GA->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
	    // XXX: What does the value of G->getOffset() mean?
	    assert(GA->getOffset() == 0 &&
	           "Do not know what to do with an non-zero offset");

	    // TODO: We could emit code to handle the initialization somewhere.
	    if (!hasDefinedInitializer(Global)) {
	      unsigned LDSOffset = MFI->allocateLDSGlobal(DAG.getDataLayout(), *Global);
	      return DAG.getConstant(LDSOffset, SDLoc(Op), Op.getValueType());
	    }
	  }

	  const Function &Caller = DAG.getMachineFunction().getFunction();
	  DiagnosticInfoUnsupported Diag(
	      Caller, "unsupported initializer for address space",
	      SDLoc(Op).getDebugLoc());
	  DAG.getContext()->diagnose(Diag);
	  return SDValue();
	}

	/// Lowers CONCAT_VECTORS by extracting the elements of each operand in order
	/// and rebuilding the result as a single build vector.
	SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  SmallVector<SDValue, 8> Elts;

	  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
	    DAG.ExtractVectorElements(Op.getOperand(I), Elts);

	  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Elts);
	}

	/// Lowers EXTRACT_SUBVECTOR by extracting the requested run of elements,
	/// starting at the constant index in operand 1, and building a vector from
	/// them.
	SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  SmallVector<SDValue, 8> Elts;

	  unsigned FirstIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
	  EVT ResultVT = Op.getValueType();
	  unsigned Count = ResultVT.getVectorNumElements();

	  DAG.ExtractVectorElements(Op.getOperand(0), Elts, FirstIdx, Count);

	  return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Elts);
	}

	/// \brief Generate Min/Max node
	///
	/// Tries to turn a select on a compare of (LHS, RHS) with condition code CC
	/// into an FMIN_LEGACY / FMAX_LEGACY node. Returns an empty SDValue when the
	/// select arms (True, False) are not exactly LHS and RHS in either order,
	/// when the condition code is not an ordering comparison, or when an
	/// ordered / don't-care comparison is seen before the DAG is legalized.
	SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
	SDValue LHS, SDValue RHS,
	SDValue True, SDValue False,
	SDValue CC,
	DAGCombinerInfo &DCI) const {
	// The selected values must be the compared values themselves.
	if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
	return SDValue();

	SelectionDAG &DAG = DCI.DAG;
	ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
	switch (CCOpcode) {
	// Equality, constant and ordered/unordered tests: nothing to form.
	case ISD::SETOEQ:
	case ISD::SETONE:
	case ISD::SETUNE:
	case ISD::SETNE:
	case ISD::SETUEQ:
	case ISD::SETEQ:
	case ISD::SETFALSE:
	case ISD::SETFALSE2:
	case ISD::SETTRUE:
	case ISD::SETTRUE2:
	case ISD::SETUO:
	case ISD::SETO:
	break;
	// Unordered less-than: operands are swapped relative to the ordered case
	// below.
	case ISD::SETULE:
	case ISD::SETULT: {
	if (LHS == True)
	return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
	return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
	}
	case ISD::SETOLE:
	case ISD::SETOLT:
	case ISD::SETLE:
	case ISD::SETLT: {
	// Ordered. Assume ordered for undefined.

	// Only do this after legalization to avoid interfering with other combines
	// which might occur.
	if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
	!DCI.isCalledByLegalizer())
	return SDValue();

	// We need to permute the operands to get the correct NaN behavior. The
	// selected operand is the second one based on the failing compare with NaN,
	// so permute it based on the compare type the hardware uses.
	if (LHS == True)
	return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
	return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
	}
	// Unordered greater-than: mirror image of the unordered less-than case.
	case ISD::SETUGE:
	case ISD::SETUGT: {
	if (LHS == True)
	return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
	return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
	}
	// Ordered / don't-care greater-than: same post-legalization restriction as
	// the ordered less-than case.
	case ISD::SETGT:
	case ISD::SETGE:
	case ISD::SETOGE:
	case ISD::SETOGT: {
	if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
	!DCI.isCalledByLegalizer())
	return SDValue();

	if (LHS == True)
	return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
	return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
	}
	case ISD::SETCC_INVALID:
	llvm_unreachable("Invalid setcc condcode!");
	}
	return SDValue();
	}

	/// Bitcasts \p Op to v2i32 and returns its two i32 elements as
	/// (element 0, element 1).
	std::pair<SDValue, SDValue>
	AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc Loc(Op);

	  SDValue AsV2I32 = DAG.getNode(ISD::BITCAST, Loc, MVT::v2i32, Op);

	  const SDValue Idx0 = DAG.getConstant(0, Loc, MVT::i32);
	  const SDValue Idx1 = DAG.getConstant(1, Loc, MVT::i32);

	  SDValue LoPart =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, MVT::i32, AsV2I32, Idx0);
	  SDValue HiPart =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, MVT::i32, AsV2I32, Idx1);

	  return std::make_pair(LoPart, HiPart);
	}

	/// Bitcasts \p Op to v2i32 and returns element 0 as an i32.
	SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc Loc(Op);

	  SDValue AsV2I32 = DAG.getNode(ISD::BITCAST, Loc, MVT::v2i32, Op);
	  const SDValue Idx0 = DAG.getConstant(0, Loc, MVT::i32);
	  SDValue LoPart =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, MVT::i32, AsV2I32, Idx0);
	  return LoPart;
	}

	/// Bitcasts \p Op to v2i32 and returns element 1 as an i32.
	SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc Loc(Op);

	  SDValue AsV2I32 = DAG.getNode(ISD::BITCAST, Loc, MVT::v2i32, Op);
	  const SDValue Idx1 = DAG.getConstant(1, Loc, MVT::i32);
	  SDValue HiPart =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, MVT::i32, AsV2I32, Idx1);
	  return HiPart;
	}

	// Splits a vector load into a low-half and a high-half load and returns the
	// merged pair {concatenated value, TokenFactor of both load chains}.
	// 2-element vectors are handed to scalarizeVectorLoad instead.
	SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
	SelectionDAG &DAG) const {
	LoadSDNode *Load = cast<LoadSDNode>(Op);
	EVT VT = Op.getValueType();


	// If this is a 2 element vector, we really want to scalarize and not create
	// weird 1 element vectors.
	if (VT.getVectorNumElements() == 2)
	return scalarizeVectorLoad(Load, DAG);

	SDValue BasePtr = Load->getBasePtr();
	EVT MemVT = Load->getMemoryVT();
	SDLoc SL(Op);

	const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();

	EVT LoVT, HiVT;
	EVT LoMemVT, HiMemVT;
	SDValue Lo, Hi;

	// Split both the result type and the in-memory type into halves.
	std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
	std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
	// NOTE(review): Lo and Hi are not read again in this function.
	std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);

	// The high half starts Size bytes past the base, so its alignment is the
	// MinAlign of the base alignment and that offset.
	unsigned Size = LoMemVT.getStoreSize();
	unsigned BaseAlign = Load->getAlignment();
	unsigned HiAlign = MinAlign(BaseAlign, Size);

	SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
	Load->getChain(), BasePtr, SrcValue, LoMemVT,
	BaseAlign, Load->getMemOperand()->getFlags());
	SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
	SDValue HiLoad =
	DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
	HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
	HiMemVT, HiAlign, Load->getMemOperand()->getFlags());

	// Result 0: the recombined vector. Result 1: a chain joining both loads.
	SDValue Ops[] = {
	DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
	DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
	LoLoad.getValue(1), HiLoad.getValue(1))
	};

	return DAG.getMergeValues(Ops, SL);
	}

	// Splits a vector store into a low-half and a high-half truncating store and
	// returns a TokenFactor joining the two. 2-element vectors are handed to
	// scalarizeVectorStore instead.
	SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
	SelectionDAG &DAG) const {
	StoreSDNode *Store = cast<StoreSDNode>(Op);
	SDValue Val = Store->getValue();
	EVT VT = Val.getValueType();

	// If this is a 2 element vector, we really want to scalarize and not create
	// weird 1 element vectors.
	if (VT.getVectorNumElements() == 2)
	return scalarizeVectorStore(Store, DAG);

	EVT MemVT = Store->getMemoryVT();
	SDValue Chain = Store->getChain();
	SDValue BasePtr = Store->getBasePtr();
	SDLoc SL(Op);

	EVT LoVT, HiVT;
	EVT LoMemVT, HiMemVT;
	SDValue Lo, Hi;

	// Split the value type, the memory type and the stored value into halves.
	std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
	std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
	std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);

	// The high half is stored immediately after the low half.
	SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());

	const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
	unsigned BaseAlign = Store->getAlignment();
	unsigned Size = LoMemVT.getStoreSize();
	unsigned HiAlign = MinAlign(BaseAlign, Size);

	// Both stores hang off the original chain.
	SDValue LoStore =
	DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
	Store->getMemOperand()->getFlags());
	SDValue HiStore =
	DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
	HiMemVT, HiAlign, Store->getMemOperand()->getFlags());

	return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
	}

	// This is a shortcut for integer division because we have fast i32<->f32
	// conversions, and fast f32 reciprocal instructions. The fractional part of a
	// float is enough to accurately represent up to a 24-bit signed integer.
	//
	// Returns an empty SDValue when either operand has fewer than 9 known sign
	// bits; otherwise returns the merged pair {Div, Rem}. Sign selects the signed
	// or unsigned conversions and the final sign-extension or mask step.
	SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
	bool Sign) const {
	SDLoc DL(Op);
	EVT VT = Op.getValueType();
	SDValue LHS = Op.getOperand(0);
	SDValue RHS = Op.getOperand(1);
	MVT IntVT = MVT::i32;
	MVT FltVT = MVT::f32;

	// Bail out unless both operands have at least 9 sign bits.
	unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
	if (LHSSignBits < 9)
	return SDValue();

	unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
	if (RHSSignBits < 9)
	return SDValue();

	// DivBits is the number of significant bits in the result; signed division
	// needs one more bit.
	unsigned BitSize = VT.getSizeInBits();
	unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
	unsigned DivBits = BitSize - SignBits;
	if (Sign)
	++DivBits;

	ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
	ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;

	// jq is the correction added to the quotient; it is 1 for the unsigned case.
	SDValue jq = DAG.getConstant(1, DL, IntVT);

	if (Sign) {
	// char|short jq = ia ^ ib;
	jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);

	// jq = jq >> (bitsize - 2)
	jq = DAG.getNode(ISD::SRA, DL, VT, jq,
	DAG.getConstant(BitSize - 2, DL, VT));

	// jq = jq | 0x1
	jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
	}

	// int ia = (int)LHS;
	SDValue ia = LHS;

	// int ib = (int)RHS;
	SDValue ib = RHS;

	// float fa = (float)ia;
	SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);

	// float fb = (float)ib;
	SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);

	// float fq = fa * rcp(fb);
	SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
	fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));

	// fq = trunc(fq);
	fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);

	// float fqneg = -fq;
	SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);

	// float fr = mad(fqneg, fb, fa);
	// FMAD_FTZ is used when FP32 denormals are enabled, plain FMAD otherwise.
	unsigned OpCode = Subtarget->hasFP32Denormals() ?
	(unsigned)AMDGPUISD::FMAD_FTZ :
	(unsigned)ISD::FMAD;
	SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);

	// int iq = (int)fq;
	SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);

	// fr = fabs(fr);
	fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);

	// fb = fabs(fb);
	fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);

	EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

	// int cv = fr >= fb;
	SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);

	// jq = (cv ? jq : 0);
	jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));

	// dst = iq + jq;
	SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);

	// Rem needs compensation, it's easier to recompute it
	SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
	Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);

	// Truncate to number of bits this divide really is.
	if (Sign) {
	SDValue InRegSize
	= DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
	Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
	Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
	} else {
	SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
	Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
	Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
	}

	return DAG.getMergeValues({ Div, Rem }, DL);
	}

	void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
	SelectionDAG &DAG,
	SmallVectorImpl<SDValue> &Results) const {
	SDLoc DL(Op);
	EVT VT = Op.getValueType();

	assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");

	EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

	SDValue One = DAG.getConstant(1, DL, HalfVT);
	SDValue Zero = DAG.getConstant(0, DL, HalfVT);

	//HiLo split
	SDValue LHS = Op.getOperand(0);
	SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
	SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);

	SDValue RHS = Op.getOperand(1);
	SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
	SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);

	if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
	DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {

	SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
	LHS_Lo, RHS_Lo);

	SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
	SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});

	Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
	Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
	return;
	}

	if (isTypeLegal(MVT::i64)) {
	// Compute denominator reciprocal.
	unsigned FMAD = Subtarget->hasFP32Denormals() ?
	(unsigned)AMDGPUISD::FMAD_FTZ :
	(unsigned)ISD::FMAD;

	SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
	SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
	SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
	DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
	Cvt_Lo);
	SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
	SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
	DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
	SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
	DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
	SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
	SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
	DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
	Mul1);
	SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
	SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
	SDValue Rcp64 = DAG.getBitcast(VT,
	DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));

	SDValue Zero64 = DAG.getConstant(0, DL, VT);
	SDValue One64 = DAG.getConstant(1, DL, VT);
	SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
	SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);

	SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
	SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
	SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
	SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
	Zero);
	SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
	One);

	SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
	Mulhi1_Lo, Zero1);
	SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
	Mulhi1_Hi, Add1_Lo.getValue(1));
	SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
	SDValue Add1 = DAG.getBitcast(VT,
	DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));

	SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
	SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
	SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
	Zero);
	SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
	One);

	SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
	Mulhi2_Lo, Zero1);
	SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
	Mulhi2_Hi, Add1_Lo.getValue(1));
	SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
	Zero, Add2_Lo.getValue(1));
	SDValue Add2 = DAG.getBitcast(VT,
	DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
	SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);

	SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

	SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
	SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
	SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
	Mul3_Lo, Zero1);
	SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
	Mul3_Hi, Sub1_Lo.getValue(1));
	SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
	SDValue Sub1 = DAG.getBitcast(VT,
	DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));

	SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
	SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
	ISD::SETUGE);
	SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
	ISD::SETUGE);
	SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);

	// TODO: Here and below portions of the code can be enclosed into if/endif.
	// Currently control flow is unconditional and we have 4 selects after
	// potential endif to substitute PHIs.

	// if C3 != 0 ...
	SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
	RHS_Lo, Zero1);
	SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
	RHS_Hi, Sub1_Lo.getValue(1));
	SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
	Zero, Sub2_Lo.getValue(1));
	SDValue Sub2 = DAG.getBitcast(VT,
	DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));

	SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);

	SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
	ISD::SETUGE);
	SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
	ISD::SETUGE);
	SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);

	// if (C6 != 0)
	SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);

	SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
	RHS_Lo, Zero1);
	SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
	RHS_Hi, Sub2_Lo.getValue(1));
	SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
	Zero, Sub3_Lo.getValue(1));
	SDValue Sub3 = DAG.getBitcast(VT,
	DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));

	// endif C6
	// endif C3

	SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
	SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);

	SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
	SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);

	Results.push_back(Div);
	Results.push_back(Rem);

	return;
	}

	// r600 expansion.
	// Get Speculative values
	SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
	SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);

	SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
	SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
	REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);

	SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
	SDValue DIV_Lo = Zero;

	const unsigned halfBitWidth = HalfVT.getSizeInBits();

	for (unsigned i = 0; i < halfBitWidth; ++i) {
	const unsigned bitPos = halfBitWidth - i - 1;
	SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
	// Get value of high bit
	SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
	HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
	HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);

	// Shift
	REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
	// Add LHS high bit
	REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);

	SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
	SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);

	DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);

	// Update REM
	SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
	REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
	}

	SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
	DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
	Results.push_back(DIV);
	Results.push_back(REM);
	}

	// Lower an unsigned UDIVREM node into a merged (quotient, remainder) pair.
	//
	// i64 operands are handed to LowerUDIVREM64. i32 operands are first offered
	// to LowerDIVREM24 and fall through when it returns a null SDValue. All
	// remaining cases are expanded inline: URECIP approximates 2^32 / Den, the
	// approximation is refined using its own rounding error, and the resulting
	// quotient/remainder estimate is corrected by at most one step up or down.
	SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc DL(Op);
	EVT VT = Op.getValueType();

	if (VT == MVT::i64) {
	SmallVector<SDValue, 2> Results;
	LowerUDIVREM64(Op, DAG, Results);
	return DAG.getMergeValues(Results, DL);
	}

	if (VT == MVT::i32) {
	if (SDValue Res = LowerDIVREM24(Op, DAG, false))
	return Res;
	}

	SDValue Num = Op.getOperand(0);
	SDValue Den = Op.getOperand(1);

	// RCP = URECIP(Den) = 2^32 / Den + e
	// e is rounding error.
	SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);

	// RCP_LO = mul(RCP, Den)
	SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);

	// RCP_HI = mulhu(RCP, Den)
	SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);

	// NEG_RCP_LO = -RCP_LO
	SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	RCP_LO);

	// ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
	SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
	NEG_RCP_LO, RCP_LO,
	ISD::SETEQ);
	// Calculate the rounding error from the URECIP instruction
	// E = mulhu(ABS_RCP_LO, RCP)
	SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);

	// RCP_A_E = RCP + E
	SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);

	// RCP_S_E = RCP - E
	SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);

	// Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
	SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
	RCP_A_E, RCP_S_E,
	ISD::SETEQ);
	// Quotient = mulhu(Tmp0, Num)
	SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);

	// Num_S_Remainder = Quotient * Den
	SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);

	// Remainder = Num - Num_S_Remainder
	SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);

	// Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
	SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
	DAG.getConstant(-1, DL, VT),
	DAG.getConstant(0, DL, VT),
	ISD::SETUGE);
	// Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
	// i.e. the subtraction above did not wrap, so the estimate was not too big.
	SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
	Num_S_Remainder,
	DAG.getConstant(-1, DL, VT),
	DAG.getConstant(0, DL, VT),
	ISD::SETUGE);
	// Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
	SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
	Remainder_GE_Zero);

	// Calculate Division result:

	// Quotient_A_One = Quotient + 1
	SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
	DAG.getConstant(1, DL, VT));

	// Quotient_S_One = Quotient - 1
	SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
	DAG.getConstant(1, DL, VT));

	// Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
	SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
	Quotient, Quotient_A_One, ISD::SETEQ);

	// Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
	Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
	Quotient_S_One, Div, ISD::SETEQ);

	// Calculate Rem result:

	// Remainder_S_Den = Remainder - Den
	SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);

	// Remainder_A_Den = Remainder + Den
	SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);

	// Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
	SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
	Remainder, Remainder_S_Den, ISD::SETEQ);

	// Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
	Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
	Remainder_A_Den, Rem, ISD::SETEQ);
	// Value 0 of the merged result is the quotient, value 1 the remainder.
	SDValue Ops[2] = {
	Div,
	Rem
	};
	return DAG.getMergeValues(Ops, DL);
	}

	// Lower a signed SDIVREM node into a merged (quotient, remainder) pair.
	//
	// i32 operands are first offered to LowerDIVREM24. i64 operands whose values
	// are known to have more than 32 sign bits are divided as half-width values
	// and sign-extended back. Everything else is reduced to an unsigned UDIVREM
	// on the absolute values, with the signs re-applied afterwards.
	SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();

	  SDValue Num = Op.getOperand(0);
	  SDValue Den = Op.getOperand(1);

	  SDValue Zero = DAG.getConstant(0, DL, VT);
	  SDValue AllOnes = DAG.getConstant(-1, DL, VT);

	  // Give LowerDIVREM24 first refusal on 32-bit operations.
	  if (VT == MVT::i32) {
	    SDValue Narrow = LowerDIVREM24(Op, DAG, true);
	    if (Narrow)
	      return Narrow;
	  }

	  // Both operands are really sign-extended 32-bit values: divide the low
	  // halves and sign-extend the two results.
	  if (VT == MVT::i64 &&
	      DAG.ComputeNumSignBits(Num) > 32 &&
	      DAG.ComputeNumSignBits(Den) > 32) {
	    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());

	    SDValue NumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Num, Zero);
	    SDValue DenLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Den, Zero);
	    SDValue HalfDivRem = DAG.getNode(ISD::SDIVREM, DL,
	                                     DAG.getVTList(HalfVT, HalfVT),
	                                     NumLo, DenLo);
	    SDValue WideDiv =
	        DAG.getNode(ISD::SIGN_EXTEND, DL, VT, HalfDivRem.getValue(0));
	    SDValue WideRem =
	        DAG.getNode(ISD::SIGN_EXTEND, DL, VT, HalfDivRem.getValue(1));
	    SDValue Widened[2] = {WideDiv, WideRem};
	    return DAG.getMergeValues(Widened, DL);
	  }

	  // Sign masks: all ones when the operand is negative, zero otherwise.
	  SDValue NumSign = DAG.getSelectCC(DL, Num, Zero, AllOnes, Zero, ISD::SETLT);
	  SDValue DenSign = DAG.getSelectCC(DL, Den, Zero, AllOnes, Zero, ISD::SETLT);
	  SDValue QuotSign = DAG.getNode(ISD::XOR, DL, VT, NumSign, DenSign);
	  // The remainder takes the sign of the dividend.
	  SDValue RemSign = NumSign;

	  // abs(x) = (x + sign) ^ sign
	  Num = DAG.getNode(ISD::ADD, DL, VT, Num, NumSign);
	  Den = DAG.getNode(ISD::ADD, DL, VT, Den, DenSign);

	  Num = DAG.getNode(ISD::XOR, DL, VT, Num, NumSign);
	  Den = DAG.getNode(ISD::XOR, DL, VT, Den, DenSign);

	  SDValue UDivRem =
	      DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), Num, Den);

	  // Conditionally negate each result: (x ^ sign) - sign.
	  SDValue Quot = DAG.getNode(ISD::XOR, DL, VT, UDivRem, QuotSign);
	  SDValue Rem = DAG.getNode(ISD::XOR, DL, VT, UDivRem.getValue(1), RemSign);

	  Quot = DAG.getNode(ISD::SUB, DL, VT, Quot, QuotSign);
	  Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RemSign);

	  SDValue Signed[2] = {Quot, Rem};
	  return DAG.getMergeValues(Signed, DL);
	}

	// Expand frem as: (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
	SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc SL(Op);
	  EVT VT = Op.getValueType();
	  SDValue Numer = Op.getOperand(0);
	  SDValue Denom = Op.getOperand(1);

	  // TODO: Should this propagate fast-math-flags?

	  // trunc(x / y) * y is the part of x that is a whole multiple of y.
	  SDValue Quot = DAG.getNode(ISD::FDIV, SL, VT, Numer, Denom);
	  SDValue WholeQuot = DAG.getNode(ISD::FTRUNC, SL, VT, Quot);
	  SDValue WholePart = DAG.getNode(ISD::FMUL, SL, VT, WholeQuot, Denom);

	  return DAG.getNode(ISD::FSUB, SL, VT, Numer, WholePart);
	}

	// Expand f64 ceil in terms of ftrunc:
	//   result = trunc(src)
	//   if (src > 0.0 && src != result)
	//     result += 1.0
	SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc SL(Op);
	  SDValue Src = Op.getOperand(0);

	  SDValue Truncated = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

	  const SDValue FpZero = DAG.getConstantFP(0.0, SL, MVT::f64);
	  const SDValue FpOne = DAG.getConstantFP(1.0, SL, MVT::f64);

	  EVT CCVT =
	      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

	  // A positive input that is not already integral must be bumped by one.
	  SDValue IsPositive = DAG.getSetCC(SL, CCVT, Src, FpZero, ISD::SETOGT);
	  SDValue HasFraction = DAG.getSetCC(SL, CCVT, Src, Truncated, ISD::SETONE);
	  SDValue NeedsBump = DAG.getNode(ISD::AND, SL, CCVT, IsPositive, HasFraction);

	  SDValue Adjust = DAG.getNode(ISD::SELECT, SL, MVT::f64, NeedsBump, FpOne,
	                               FpZero);
	  // TODO: Should this propagate fast-math-flags?
	  return DAG.getNode(ISD::FADD, SL, MVT::f64, Truncated, Adjust);
	}

	// Build the unbiased exponent of an f64 from \p Hi, the i32 value the callers
	// take from the upper element of the double's v2i32 bitcast. An 11-bit field
	// starting at bit (52 - 32) of \p Hi is extracted with BFE_U32 and the bias
	// 1023 is subtracted from it. The result is an i32 node.
	static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
	                                  SelectionDAG &DAG) {
	  const unsigned NumFractBits = 52;
	  const unsigned NumExpBits = 11;

	  SDValue BiasedExp = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
	                                  Hi,
	                                  DAG.getConstant(NumFractBits - 32, SL, MVT::i32),
	                                  DAG.getConstant(NumExpBits, SL, MVT::i32));

	  return DAG.getNode(ISD::SUB, SL, MVT::i32, BiasedExp,
	                     DAG.getConstant(1023, SL, MVT::i32));
	}

	// Integer-only expansion of f64 ftrunc.
	//
	// The fraction bits that lie below the binary point (as given by the
	// unbiased exponent) are masked off. Two cases bypass the masking:
	//   exponent < 0  -> only the sign bit is kept (a signed zero),
	//   exponent > 51 -> the input is returned unchanged.
	SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc SL(Op);
	  SDValue Src = Op.getOperand(0);

	  assert(Op.getValueType() == MVT::f64);

	  const unsigned NumFractBits = 52;

	  const SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
	  const SDValue OneI32 = DAG.getConstant(1, SL, MVT::i32);

	  SDValue SrcVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

	  // The sign and exponent both live in the upper 32-bit half.
	  SDValue HiWord =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec, OneI32);

	  SDValue Exp = extractF64Exponent(HiWord, SL, DAG);

	  // Isolate the sign bit.
	  const SDValue SignMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
	  SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, HiWord, SignMask);

	  // Widen the sign back to 64 bits: {lo = 0, hi = sign}.
	  SDValue SignedZero = DAG.getBuildVector(MVT::v2i32, SL, {ZeroI32, Sign});
	  SignedZero = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignedZero);

	  SDValue SrcBits = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
	  const SDValue FractMask =
	      DAG.getConstant((UINT64_C(1) << NumFractBits) - 1, SL, MVT::i64);

	  // Shifting the fraction mask right by the exponent leaves exactly the
	  // fractional bits set; clear those in the source.
	  SDValue FractBelowPoint =
	      DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
	  SDValue KeepMask = DAG.getNOT(SL, FractBelowPoint, MVT::i64);
	  SDValue Masked = DAG.getNode(ISD::AND, SL, MVT::i64, SrcBits, KeepMask);

	  EVT CCVT =
	      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

	  const SDValue MaxFractExp =
	      DAG.getConstant(NumFractBits - 1, SL, MVT::i32);

	  SDValue ExpIsNegative = DAG.getSetCC(SL, CCVT, Exp, ZeroI32, ISD::SETLT);
	  SDValue ExpAbove51 = DAG.getSetCC(SL, CCVT, Exp, MaxFractExp, ISD::SETGT);

	  SDValue Picked = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpIsNegative,
	                               SignedZero, Masked);
	  Picked = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpAbove51, SrcBits, Picked);

	  return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Picked);
	}

	// Expand f64 frint by adding and then subtracting 2^52 carrying the sign of
	// the input. Inputs whose magnitude exceeds 0x1.fffffffffffffp+51 are
	// returned unchanged.
	SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc SL(Op);
	  SDValue Src = Op.getOperand(0);

	  assert(Op.getValueType() == MVT::f64);

	  APFloat TwoPow52(APFloat::IEEEdouble(), "0x1.0p+52");
	  SDValue Magic = DAG.getConstantFP(TwoPow52, SL, MVT::f64);
	  SDValue SignedMagic = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Magic, Src);

	  // TODO: Should this propagate fast-math-flags?

	  SDValue Biased = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, SignedMagic);
	  SDValue Rounded = DAG.getNode(ISD::FSUB, SL, MVT::f64, Biased, SignedMagic);

	  SDValue AbsSrc = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);

	  APFloat LimitVal(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
	  SDValue Limit = DAG.getConstantFP(LimitVal, SL, MVT::f64);

	  EVT CCVT =
	      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
	  SDValue TooLarge = DAG.getSetCC(SL, CCVT, AbsSrc, Limit, ISD::SETOGT);

	  return DAG.getSelect(SL, MVT::f64, TooLarge, Src, Rounded);
	}

	// Lower FNEARBYINT by re-emitting it as FRINT on the same operand and type.
	SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
	  // The two operations differ only in how they treat FP exceptions. Those
	  // aren't really meaningful for us, and OpenCL only has rint, so they are
	  // treated as equivalent.
	  SDValue Src = Op.getOperand(0);
	  EVT VT = Op.getValueType();
	  return DAG.getNode(ISD::FRINT, SDLoc(Op), VT, Src);
	}

	// XXX - May require not supporting f32 denormals?

	// Expand fround as trunc(x) + (|x - trunc(x)| >= 0.5 ? copysign(1.0, x) : 0.0).
	//
	// v2f16 is deliberately not handled here: the extra instructions to scalarize
	// and repack around the compare and vselect end up producing worse code than
	// scalarizing the whole operation.
	SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc SL(Op);
	  SDValue Src = Op.getOperand(0);
	  EVT VT = Op.getValueType();

	  SDValue Truncated = DAG.getNode(ISD::FTRUNC, SL, VT, Src);

	  // TODO: Should this propagate fast-math-flags?

	  SDValue Fraction = DAG.getNode(ISD::FSUB, SL, VT, Src, Truncated);
	  SDValue AbsFraction = DAG.getNode(ISD::FABS, SL, VT, Fraction);

	  const SDValue FpZero = DAG.getConstantFP(0.0, SL, VT);
	  const SDValue FpOne = DAG.getConstantFP(1.0, SL, VT);
	  const SDValue FpHalf = DAG.getConstantFP(0.5, SL, VT);

	  // +/-1.0 with the sign of the input: the step away from zero.
	  SDValue SignedOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, FpOne, Src);

	  EVT CCVT =
	      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

	  SDValue RoundsAway =
	      DAG.getSetCC(SL, CCVT, AbsFraction, FpHalf, ISD::SETOGE);
	  SDValue Step =
	      DAG.getNode(ISD::SELECT, SL, VT, RoundsAway, SignedOne, FpZero);

	  return DAG.getNode(ISD::FADD, SL, VT, Truncated, Step);
	}

	// Integer-only expansion of fround for f64.
	//
	// With L the raw bits of X and Exp its unbiased exponent:
	//   M = 0x000fffffffffffff >> Exp   (fraction bits below the binary point)
	//   D = 0x0008000000000000 >> Exp   (the bit just below the binary point)
	//   K = (L + ((L & M) != 0 ? D : 0)) & ~M
	// Two exponent ranges override K:
	//   Exp < 0  -> copysign(Exp == -1 ? 1.0 : 0.0, X)
	//   Exp > 51 -> X unchanged
	SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
	SDLoc SL(Op);
	SDValue X = Op.getOperand(0);

	// Raw 64-bit pattern of the input.
	SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);

	const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
	const SDValue One = DAG.getConstant(1, SL, MVT::i32);
	const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
	const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
	EVT SetCCVT =
	getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);

	SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);

	// Upper 32-bit half; extractF64Exponent reads the exponent from it.
	SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);

	SDValue Exp = extractF64Exponent(Hi, SL, DAG);

	// 52-bit fraction mask.
	const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
	MVT::i64);

	// M: fraction bits that remain below the binary point for this exponent.
	SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
	// D: top fraction bit shifted by the exponent; added to K when the masked
	// fraction is non-zero.
	SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
	DAG.getConstant(INT64_C(0x0008000000000000), SL,
	MVT::i64),
	Exp);

	// Tmp1 is true when any fractional bit is set.
	SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
	SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
	DAG.getConstant(0, SL, MVT::i64), Tmp0,
	ISD::SETNE);

	SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
	D, DAG.getConstant(0, SL, MVT::i64));
	SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);

	// Drop the fractional bits after the add and reinterpret as f64.
	K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
	K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);

	SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
	SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
	SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);

	// Result magnitude for negative exponents: 1.0 when Exp == -1, else 0.0.
	SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
	ExpEqNegOne,
	DAG.getConstantFP(1.0, SL, MVT::f64),
	DAG.getConstantFP(0.0, SL, MVT::f64));

	SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);

	// The Exp > 51 select is applied last, so it takes precedence.
	K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
	K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);

	return K;
	}

	// Dispatch fround lowering on the result type: f32 and f16 go to
	// LowerFROUND32_16, f64 goes to LowerFROUND64. Any other type is a
	// programming error and hits llvm_unreachable.
	SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
	  EVT VT = Op.getValueType();

	  // The logical-or was written as an escaped "\|\|", which is not valid C++.
	  if (VT == MVT::f32 || VT == MVT::f16)
	    return LowerFROUND32_16(Op, DAG);

	  if (VT == MVT::f64)
	    return LowerFROUND64(Op, DAG);

	  llvm_unreachable("unhandled type");
	}

	// Expand f64 floor in terms of ftrunc:
	//   result = trunc(src);
	//   if (src < 0.0 && src != result)
	//     result += -1.0.
	SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc SL(Op);
	  SDValue Src = Op.getOperand(0);

	  SDValue Truncated = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

	  const SDValue FpZero = DAG.getConstantFP(0.0, SL, MVT::f64);
	  const SDValue FpNegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);

	  EVT CCVT =
	      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);

	  // A negative input that is not already integral must be lowered by one.
	  SDValue IsNegative = DAG.getSetCC(SL, CCVT, Src, FpZero, ISD::SETOLT);
	  SDValue HasFraction = DAG.getSetCC(SL, CCVT, Src, Truncated, ISD::SETONE);
	  SDValue NeedsStep = DAG.getNode(ISD::AND, SL, CCVT, IsNegative, HasFraction);

	  SDValue Adjust = DAG.getNode(ISD::SELECT, SL, MVT::f64, NeedsStep, FpNegOne,
	                               FpZero);
	  // TODO: Should this propagate fast-math-flags?
	  return DAG.getNode(ISD::FADD, SL, MVT::f64, Truncated, Adjust);
	}

	// Lower a logarithm as FLOG2(x) multiplied by the constant
	// \p Log2BaseInverted, which the caller supplies.
	SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
	                                        double Log2BaseInverted) const {
	  SDLoc SL(Op);
	  EVT VT = Op.getValueType();
	  SDValue Src = Op.getOperand(0);

	  SDValue Log2 = DAG.getNode(ISD::FLOG2, SL, VT, Src);
	  SDValue Scale = DAG.getConstantFP(Log2BaseInverted, SL, VT);

	  return DAG.getNode(ISD::FMUL, SL, VT, Log2, Scale);
	}

	// True when \p Opc is ISD::CTLZ or ISD::CTLZ_ZERO_UNDEF.
	// (The logical-or was written as an escaped "\|\|", which is not valid C++.)
	static bool isCtlzOpc(unsigned Opc) {
	  return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
	}

	// True when \p Opc is ISD::CTTZ or ISD::CTTZ_ZERO_UNDEF.
	// (The logical-or was written as an escaped "\|\|", which is not valid C++.)
	static bool isCttzOpc(unsigned Opc) {
	  return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
	}

	// Lower CTLZ / CTTZ and their _ZERO_UNDEF forms.
	//
	// A zero-undef i32 operation maps directly onto FFBH_U32 (ctlz) or FFBL_B32
	// (cttz). Every other case is treated as a 64-bit source: it is split into
	// 32-bit halves, the zero-undef count is taken on each half, and the result
	// is selected according to which half is zero. For the non-zero-undef forms
	// an all-zero source yields 64. The returned value is zero-extended to i64.
	SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc SL(Op);
	  SDValue Src = Op.getOperand(0);
	  // The logical-or was written as an escaped "\|\|", which is not valid C++.
	  bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
	                   Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;

	  unsigned ISDOpc, NewOpc;
	  if (isCtlzOpc(Op.getOpcode())) {
	    ISDOpc = ISD::CTLZ_ZERO_UNDEF;
	    NewOpc = AMDGPUISD::FFBH_U32;
	  } else if (isCttzOpc(Op.getOpcode())) {
	    ISDOpc = ISD::CTTZ_ZERO_UNDEF;
	    NewOpc = AMDGPUISD::FFBL_B32;
	  } else
	    llvm_unreachable("Unexpected OPCode!!!");


	  if (ZeroUndef && Src.getValueType() == MVT::i32)
	    return DAG.getNode(NewOpc, SL, MVT::i32, Src);

	  // From here on the source is bitcast to v2i32, i.e. handled as 64 bits.
	  SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

	  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
	  const SDValue One = DAG.getConstant(1, SL, MVT::i32);

	  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
	  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);

	  EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
	                                   *DAG.getContext(), MVT::i32);

	  // ctlz inspects the high half first, cttz the low half.
	  SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
	  SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);

	  SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
	  SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);

	  const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
	  SDValue Add, NewOpr;
	  if (isCtlzOpc(Op.getOpcode())) {
	    Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
	    // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
	    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
	  } else {
	    Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
	    // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
	    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
	  }

	  if (!ZeroUndef) {
	    // Test if the full 64-bit input is zero.

	    // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
	    // which we probably don't want.
	    SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
	    SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
	    SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);

	    // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
	    // with the same cycles, otherwise it is slower.
	    // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
	    // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);

	    // Named Bits64 because it holds 64; the previous name shadowed the
	    // outer Bits32 constant while carrying a different value.
	    const SDValue Bits64 = DAG.getConstant(64, SL, MVT::i32);

	    // The instruction returns -1 for 0 input, but the defined intrinsic
	    // behavior is to return the number of bits.
	    NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
	                         SrcIsZero, Bits64, NewOpr);
	  }

	  return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
	}

	// Convert a 64-bit integer operand to f32 using integer operations only.
	// The f32 bit pattern is assembled from a computed exponent and the top bits
	// of the normalized value, rounded to nearest with ties to even. When
	// \p Signed is true the conversion runs on the absolute value and the result
	// is negated for negative inputs.
	SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
	bool Signed) const {
	// Unsigned
	// cul2f(ulong u)
	//{
	// uint lz = clz(u);
	// uint e = (u != 0) ? 127U + 63U - lz : 0;
	// u = (u << lz) & 0x7fffffffffffffffUL;
	// ulong t = u & 0xffffffffffUL;
	// uint v = (e << 23) | (uint)(u >> 40);
	// uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
	// return as_float(v + r);
	//}
	// Signed
	// cl2f(long l)
	//{
	// long s = l >> 63;
	// float r = cul2f((l + s) ^ s);
	// return s ? -r : r;
	//}

	SDLoc SL(Op);
	SDValue Src = Op.getOperand(0);
	SDValue L = Src;

	// S is the sign mask (all ones for negative inputs); it stays null in the
	// unsigned case and is only read again when Signed is true.
	SDValue S;
	if (Signed) {
	const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
	S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);

	// L = |L| computed as (L + S) ^ S.
	SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
	L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
	}

	// NOTE(review): SetCCVT is queried for f32 but is used below for i64
	// compares; presumably both yield the same type on this target - confirm.
	EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
	*DAG.getContext(), MVT::f32);


	SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
	SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
	SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
	LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);

	// e = (u != 0) ? 127 + 63 - lz : 0
	SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
	SDValue E = DAG.getSelect(SL, MVT::i32,
	DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
	DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
	ZeroI32);

	// u = (u << lz) & 0x7fffffffffffffff : normalize and drop the leading one.
	SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
	DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
	DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));

	// t = low 40 bits of u, the part that does not fit in the f32 mantissa.
	SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
	DAG.getConstant(0xffffffffffULL, SL, MVT::i64));

	// Despite its name, UShl is a logical right shift: u >> 40.
	SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
	U, DAG.getConstant(40, SL, MVT::i64));

	// v = (e << 23) | (uint)(u >> 40)
	SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
	DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
	DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, UShl));

	// Compare the discarded bits against the halfway point 0x8000000000.
	SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
	SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
	SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);

	SDValue One = DAG.getConstant(1, SL, MVT::i32);

	SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);

	// r = t > half ? 1 : (t == half ? v & 1 : 0); result = as_float(v + r)
	SDValue R = DAG.getSelect(SL, MVT::i32,
	RCmp,
	One,
	DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
	R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
	R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);

	if (!Signed)
	return R;

	// Negate when the sign mask is set.
	SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
	return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
	}

	// Convert a 64-bit integer operand to f64 as ldexp(cvt(hi), 32) + cvt(lo).
	// The high half is converted as signed or unsigned according to \p Signed;
	// the low half is always converted as unsigned.
	SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
	                                               bool Signed) const {
	  SDLoc SL(Op);
	  SDValue Src = Op.getOperand(0);

	  SDValue Halves = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);

	  SDValue LoWord = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Halves,
	                               DAG.getConstant(0, SL, MVT::i32));
	  SDValue HiWord = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Halves,
	                               DAG.getConstant(1, SL, MVT::i32));

	  const unsigned HiCvtOpc = Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
	  SDValue HiFP = DAG.getNode(HiCvtOpc, SL, MVT::f64, HiWord);

	  SDValue LoFP = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, LoWord);

	  // Scale the converted high half by 2^32.
	  SDValue HiScaled = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, HiFP,
	                                 DAG.getConstant(32, SL, MVT::i32));
	  // TODO: Should this propagate fast-math-flags?
	  return DAG.getNode(ISD::FADD, SL, MVT::f64, HiScaled, LoFP);
	}

	// Lower an unsigned i64 -> floating point conversion.
	//
	// f16 results (when the subtarget has 16-bit instructions) are produced by
	// converting to f32 with the same opcode and rounding down with FP_ROUND.
	// f32 and f64 results go to LowerINT_TO_FP32 / LowerINT_TO_FP64.
	SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
	         "operation should be legal");

	  // TODO: Factor out code common with LowerSINT_TO_FP.

	  const EVT ResultVT = Op.getValueType();

	  if (Subtarget->has16BitInsts() && ResultVT == MVT::f16) {
	    SDLoc DL(Op);
	    SDValue Int = Op.getOperand(0);

	    SDValue AsF32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Int);
	    SDValue RoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
	    return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, AsF32, RoundFlag);
	  }

	  if (ResultVT == MVT::f32)
	    return LowerINT_TO_FP32(Op, DAG, false);

	  assert(ResultVT == MVT::f64);
	  return LowerINT_TO_FP64(Op, DAG, false);
	}

	// Lower a signed i64 -> floating point conversion.
	//
	// f16 results (when the subtarget has 16-bit instructions) are produced by
	// converting to f32 with the same opcode and rounding down with FP_ROUND.
	// f32 and f64 results go to LowerINT_TO_FP32 / LowerINT_TO_FP64.
	SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  assert(Op.getOperand(0).getValueType() == MVT::i64 &&
	         "operation should be legal");

	  // TODO: Factor out code common with LowerUINT_TO_FP.

	  const EVT ResultVT = Op.getValueType();

	  if (Subtarget->has16BitInsts() && ResultVT == MVT::f16) {
	    SDLoc DL(Op);
	    SDValue Int = Op.getOperand(0);

	    SDValue AsF32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Int);
	    SDValue RoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
	    return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, AsF32, RoundFlag);
	  }

	  if (ResultVT == MVT::f32)
	    return LowerINT_TO_FP32(Op, DAG, true);

	  assert(ResultVT == MVT::f64);
	  return LowerINT_TO_FP64(Op, DAG, true);
	}

	// Convert an f64 operand to a 64-bit integer in two 32-bit pieces.
	//
	//   t  = trunc(src)
	//   hi = floor(t * 2^-32)
	//   lo = fma(hi, -2^32, t)
	//
	// hi is converted as signed or unsigned according to \p Signed, lo always
	// as unsigned, and the pair is reassembled as {lo, hi} into an i64.
	SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
	                                               bool Signed) const {
	  SDLoc SL(Op);
	  SDValue Src = Op.getOperand(0);

	  SDValue Whole = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);

	  // 0x3df0000000000000 is 2^-32 and 0xc1f0000000000000 is -2^32.
	  SDValue TwoPowNeg32 = DAG.getConstantFP(
	      BitsToDouble(UINT64_C(0x3df0000000000000)), SL, MVT::f64);
	  SDValue NegTwoPow32 = DAG.getConstantFP(
	      BitsToDouble(UINT64_C(0xc1f0000000000000)), SL, MVT::f64);
	  // TODO: Should this propagate fast-math-flags?
	  SDValue Scaled = DAG.getNode(ISD::FMUL, SL, MVT::f64, Whole, TwoPowNeg32);

	  SDValue HiFP = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Scaled);

	  // What is left after removing the high part.
	  SDValue LoFP = DAG.getNode(ISD::FMA, SL, MVT::f64, HiFP, NegTwoPow32, Whole);

	  const unsigned HiCvtOpc = Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
	  SDValue HiInt = DAG.getNode(HiCvtOpc, SL, MVT::i32, HiFP);
	  SDValue LoInt = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, LoFP);

	  SDValue Pair = DAG.getBuildVector(MVT::v2i32, SL, {LoInt, HiInt});

	  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Pair);
	}

	// Lower FP_TO_FP16.
	//
	// An f32 source is rewritten to the target node AMDGPUISD::FP_TO_FP16. With
	// UnsafeFPMath enabled a null SDValue is returned so the generic expansion is
	// used. Otherwise the source must be f64 and the half-precision bit pattern
	// is built with 32-bit integer operations, rounding to nearest even, and
	// returned zero-extended or truncated to the result type of Op.
	SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
	SDLoc DL(Op);
	SDValue N0 = Op.getOperand(0);

	// Convert to target node to get known bits
	if (N0.getValueType() == MVT::f32)
	return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);

	if (getTargetMachine().Options.UnsafeFPMath) {
	// There is a generic expand for FP_TO_FP16 with unsafe fast math.
	return SDValue();
	}

	assert(N0.getSimpleValueType() == MVT::f64);

	// f64 -> f16 conversion using round-to-nearest-even rounding mode.
	const unsigned ExpMask = 0x7ff;
	const unsigned ExpBiasf64 = 1023;
	const unsigned ExpBiasf16 = 15;
	SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
	SDValue One = DAG.getConstant(1, DL, MVT::i32);
	// Split the f64 bit pattern: UH = upper 32 bits, U = lower 32 bits.
	SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
	SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
	DAG.getConstant(32, DL, MVT::i64));
	UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
	U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
	// NOTE(review): the shift amount below is an i64 constant although the
	// shifted value is i32; every other i32 shift in this function uses an i32
	// amount - confirm whether this is intentional.
	SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
	DAG.getConstant(20, DL, MVT::i64));
	E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
	DAG.getConstant(ExpMask, DL, MVT::i32));
	// Subtract the fp64 exponent bias (1023) to get the real exponent and
	// add the f16 bias (15) to get the biased exponent for the f16 format.
	E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
	DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));

	// M = (UH >> 8) & 0xffe : the top mantissa bits, with bit 0 left free.
	SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
	DAG.getConstant(8, DL, MVT::i32));
	M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
	DAG.getConstant(0xffe, DL, MVT::i32));

	// Remaining mantissa bits: the low 9 bits of UH together with all of U.
	SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
	DAG.getConstant(0x1ff, DL, MVT::i32));
	MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);

	// Fold them into bit 0 of M: 1 if any of those bits is set, else 0.
	SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
	M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);

	// (M != 0 ? 0x0200 : 0) | 0x7c00;
	// I is the result selected below when E == 1039, i.e. when the f64
	// exponent field is all ones.
	SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
	DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
	Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));

	// N = M | (E << 12);
	SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
	DAG.getNode(ISD::SHL, DL, MVT::i32, E,
	DAG.getConstant(12, DL, MVT::i32)));

	// B = clamp(1-E, 0, 13);
	SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
	One, E);
	SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
	B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
	DAG.getConstant(13, DL, MVT::i32));

	// D = (M | 0x1000) >> B, with bit 0 set if the shift dropped any set bit.
	SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
	DAG.getConstant(0x1000, DL, MVT::i32));

	SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
	SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
	SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
	D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);

	// Use D when E < 1, otherwise N.
	SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
	// Round on the low three bits of V: add one when they equal 3 or exceed 5,
	// after dropping the two lowest bits.
	SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
	DAG.getConstant(0x7, DL, MVT::i32));
	V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
	DAG.getConstant(2, DL, MVT::i32));
	SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
	One, Zero, ISD::SETEQ);
	SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
	One, Zero, ISD::SETGT);
	V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
	V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);

	// E > 30 yields 0x7c00; E == 1039 yields I.
	V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
	DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
	V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
	I, V, ISD::SETEQ);

	// Extract the sign bit.
	SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
	DAG.getConstant(16, DL, MVT::i32));
	Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
	DAG.getConstant(0x8000, DL, MVT::i32));

	V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
	return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
	}

	/// Custom lowering for a floating-point to signed-integer conversion.
	///
	/// When the subtarget has 16-bit instructions and the source is f16, the
	/// source is extended to f32 and the same opcode is re-emitted on the
	/// extended value with an i64 result. An f64 -> i64 conversion is handed to
	/// LowerFP64_TO_INT with `true` as its last argument. Any other combination
	/// yields an empty SDValue.
	SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  // TODO: Factor out code common with LowerFP_TO_UINT.
	  const SDValue Src = Op.getOperand(0);
	  const EVT SrcVT = Src.getValueType();

	  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
	    SDLoc DL(Op);
	    SDValue Widened = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
	    return DAG.getNode(Op.getOpcode(), DL, MVT::i64, Widened);
	  }

	  const bool IsF64ToI64 = Op.getValueType() == MVT::i64 && SrcVT == MVT::f64;
	  if (!IsF64ToI64)
	    return SDValue();

	  return LowerFP64_TO_INT(Op, DAG, true);
	}

	/// Custom lowering for a floating-point to unsigned-integer conversion.
	///
	/// When the subtarget has 16-bit instructions and the source is f16, the
	/// source is extended to f32 and the same opcode is re-emitted on the
	/// extended value with an i64 result. An f64 -> i64 conversion is handed to
	/// LowerFP64_TO_INT with `false` as its last argument. Any other combination
	/// yields an empty SDValue.
	SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  // TODO: Factor out code common with LowerFP_TO_SINT.
	  const SDValue Src = Op.getOperand(0);
	  const EVT SrcVT = Src.getValueType();

	  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
	    SDLoc DL(Op);
	    SDValue Widened = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
	    return DAG.getNode(Op.getOpcode(), DL, MVT::i64, Widened);
	  }

	  const bool IsF64ToI64 = Op.getValueType() == MVT::i64 && SrcVT == MVT::f64;
	  if (!IsF64ToI64)
	    return SDValue();

	  return LowerFP64_TO_INT(Op, DAG, false);
	}

	/// Lower a vector SIGN_EXTEND_INREG by scalarizing it.
	///
	/// The source vector is split into its elements, each element gets its own
	/// SIGN_EXTEND_INREG using the scalar type of the node's type operand, and
	/// the results are reassembled with a build_vector of the original type.
	/// The result type of \p Op must be a vector (asserted).
	SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  const EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
	  const MVT VT = Op.getSimpleValueType();
	  const MVT EltVT = VT.getScalarType();

	  assert(VT.isVector());

	  SDLoc DL(Op);

	  // TODO: Don't scalarize on Evergreen?
	  const unsigned NumElts = VT.getVectorNumElements();
	  SmallVector<SDValue, 8> Elts;
	  DAG.ExtractVectorElements(Op.getOperand(0), Elts, 0, NumElts);

	  // The per-element extension uses the element type of the in-register type.
	  const SDValue EltExtVT = DAG.getValueType(ExtraVT.getScalarType());
	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    SDValue &Elt = Elts[Idx];
	    Elt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, EltVT, Elt, EltExtVT);
	  }

	  return DAG.getBuildVector(VT, DL, Elts);
	}

	//===----------------------------------------------------------------------===//
	// Custom DAG optimizations
	//===----------------------------------------------------------------------===//

	/// Returns true when numBitsUnsigned reports that \p Op needs at most 24
	/// bits.
	static bool isU24(SDValue Op, SelectionDAG &DAG) {
	  const unsigned BitsNeeded = AMDGPUTargetLowering::numBitsUnsigned(Op, DAG);
	  return BitsNeeded <= 24;
	}

	/// Returns true when \p Op has a type of at least 24 bits and numBitsSigned
	/// reports that its value needs fewer than 24 bits.
	static bool isI24(SDValue Op, SelectionDAG &DAG) {
	  // Types less than 24-bit should be treated as unsigned 24-bit values.
	  if (Op.getValueType().getSizeInBits() < 24)
	    return false;

	  return AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
	}

	/// Ask the generic demanded-bits simplifier to simplify operand \p OpIdx of
	/// \p Node24, given that only its low 24 bits are used.
	///
	/// \param Node24 The node whose operand is being simplified.
	/// \param OpIdx  Index of the operand of \p Node24 to simplify.
	/// \param DCI    Combiner state forwarded to SimplifyDemandedBits.
	/// \returns whatever TargetLowering::SimplifyDemandedBits returns for the
	///          request.
	static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
	                        TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Op = Node24->getOperand(OpIdx);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT VT = Op.getValueType();

	  // Only the low 24 bits of the operand are demanded by the user node.
	  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
	  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
	  return TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO);
	}

	/// Constant-fold a 32-bit bitfield extract of \p Width bits starting at bit
	/// \p Offset of \p Src0, returning the result as an i32 constant.
	///
	/// When the field ends below bit 32 it is first shifted to the top of a
	/// 32-bit word and then shifted back down in IntTy, so IntTy's signedness
	/// decides how the upper bits of the result are filled. Otherwise the
	/// source is simply shifted right by \p Offset.
	template <typename IntTy>
	static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
	                               uint32_t Width, const SDLoc &DL) {
	  // The field reaches bit 31 or beyond: a plain right shift is enough.
	  if (Width + Offset >= 32)
	    return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);

	  const uint32_t FieldAtTop =
	      static_cast<uint32_t>(Src0) << (32 - Offset - Width);
	  const IntTy Field = static_cast<IntTy>(FieldAtTop) >> (32 - Width);
	  return DAG.getConstant(Field, DL, MVT::i32);
	}

	/// Returns true if any user of \p Val is a memory node marked volatile.
	static bool hasVolatileUser(SDNode *Val) {
	  for (SDNode *User : Val->uses()) {
	    MemSDNode *Mem = dyn_cast<MemSDNode>(User);
	    if (Mem && Mem->isVolatile())
	      return true;
	  }

	  return false;
	}

	/// Decide whether a memory access of type \p VT should be rewritten by the
	/// load/store combines to use an equivalent, friendlier memory type.
	///
	/// Returns false for types whose scalar type is i32, for legal types, for
	/// types that are not byte sized, for non-vector types of 1, 2 or 4 bytes,
	/// for 3-byte types, and for types larger than 4 bytes whose size is not a
	/// multiple of 4. Returns true otherwise.
	bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
	  // i32 vectors are the canonical memory type.
	  const bool IsCanonical = VT.getScalarType() == MVT::i32;
	  if (IsCanonical || isTypeLegal(VT))
	    return false;

	  if (!VT.isByteSized())
	    return false;

	  const unsigned Bytes = VT.getStoreSize();
	  switch (Bytes) {
	  case 1:
	  case 2:
	  case 4:
	    // Small scalars are left alone; small vectors may still be combined.
	    if (!VT.isVector())
	      return false;
	    break;
	  case 3:
	    return false;
	  default:
	    break;
	  }

	  const bool IsOddLargeSize = Bytes > 4 && (Bytes % 4 != 0);
	  return !IsOddLargeSize;
	}

	// Replace a load of an illegal type with a load of a friendlier type whose
	// result is bitcast back to the original type.
	//
	// Runs only before legalization, and only for normal, non-volatile loads
	// that have no volatile users. Under-aligned loads of legal types are
	// expanded here when the target does not allow the misaligned access, and
	// left alone when the access is allowed but not fast.
	SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
	                                                 DAGCombinerInfo &DCI) const {
	  if (!DCI.isBeforeLegalize())
	    return SDValue();

	  LoadSDNode *Load = cast<LoadSDNode>(N);
	  const bool Eligible = !Load->isVolatile() && ISD::isNormalLoad(Load) &&
	                        !hasVolatileUser(Load);
	  if (!Eligible)
	    return SDValue();

	  SDLoc SL(N);
	  SelectionDAG &DAG = DCI.DAG;
	  const EVT MemVT = Load->getMemoryVT();

	  const unsigned StoreSize = MemVT.getStoreSize();
	  const unsigned Alignment = Load->getAlignment();
	  if (Alignment < StoreSize && isTypeLegal(MemVT)) {
	    bool IsFast;
	    const unsigned AddrSpace = Load->getAddressSpace();

	    // Expand unaligned loads earlier than legalization. Due to visitation order
	    // problems during legalization, the emitted instructions to pack and unpack
	    // the bytes again are not eliminated in the case of an unaligned copy.
	    if (!allowsMisalignedMemoryAccesses(MemVT, AddrSpace, Alignment,
	                                        &IsFast)) {
	      if (MemVT.isVector())
	        return scalarizeVectorLoad(Load, DAG);

	      SDValue Expanded[2];
	      std::tie(Expanded[0], Expanded[1]) = expandUnalignedLoad(Load, DAG);
	      return DAG.getMergeValues(Expanded, SL);
	    }

	    if (!IsFast)
	      return SDValue();
	  }

	  if (!shouldCombineMemoryType(MemVT))
	    return SDValue();

	  const EVT NewVT = getEquivalentMemType(*DAG.getContext(), MemVT);

	  SDValue NewLoad = DAG.getLoad(NewVT, SL, Load->getChain(),
	                                Load->getBasePtr(), Load->getMemOperand());

	  // Replace both results of the old load: the value (via the bitcast) and
	  // the chain.
	  SDValue Cast = DAG.getNode(ISD::BITCAST, SL, MemVT, NewLoad);
	  DCI.CombineTo(N, Cast, NewLoad.getValue(1));
	  return SDValue(N, 0);
	}

	// Replace a store of an illegal type with a store of the value bitcast to
	// a friendlier type.
	//
	// Runs only before legalization, and only for normal, non-volatile stores.
	// Under-aligned stores of legal types are expanded here when the target
	// does not allow the misaligned access, and left alone when the access is
	// allowed but not fast.
	SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
	                                                  DAGCombinerInfo &DCI) const {
	  if (!DCI.isBeforeLegalize())
	    return SDValue();

	  StoreSDNode *Store = cast<StoreSDNode>(N);
	  const bool Eligible = !Store->isVolatile() && ISD::isNormalStore(Store);
	  if (!Eligible)
	    return SDValue();

	  const EVT MemVT = Store->getMemoryVT();
	  const unsigned StoreSize = MemVT.getStoreSize();

	  SDLoc SL(N);
	  SelectionDAG &DAG = DCI.DAG;
	  const unsigned Alignment = Store->getAlignment();
	  if (Alignment < StoreSize && isTypeLegal(MemVT)) {
	    bool IsFast;
	    const unsigned AddrSpace = Store->getAddressSpace();

	    // Expand unaligned stores earlier than legalization. Due to visitation
	    // order problems during legalization, the emitted instructions to pack and
	    // unpack the bytes again are not eliminated in the case of an unaligned
	    // copy.
	    if (!allowsMisalignedMemoryAccesses(MemVT, AddrSpace, Alignment,
	                                        &IsFast)) {
	      return MemVT.isVector() ? scalarizeVectorStore(Store, DAG)
	                              : expandUnalignedStore(Store, DAG);
	    }

	    if (!IsFast)
	      return SDValue();
	  }

	  if (!shouldCombineMemoryType(MemVT))
	    return SDValue();

	  const EVT NewVT = getEquivalentMemType(*DAG.getContext(), MemVT);
	  SDValue StoredVal = Store->getValue();

	  //DCI.AddToWorklist(Val.getNode());

	  // This must be sampled before the bitcast below is created, because the
	  // bitcast itself becomes an additional user of the stored value.
	  const bool HasOtherUsers = !StoredVal.hasOneUse();
	  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, StoredVal);
	  if (HasOtherUsers) {
	    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, MemVT, CastVal);
	    DAG.ReplaceAllUsesOfValueWith(StoredVal, CastBack);
	  }

	  return DAG.getStore(Store->getChain(), SL, CastVal,
	                      Store->getBasePtr(), Store->getMemOperand());
	}

	/// Constant-fold a clamp of a floating-point constant to the range [0, 1].
	///
	/// Returns 0.0 for a negative constant (and for an unordered comparison
	/// result when the subtarget enables DX10 clamp), 1.0 for a constant above
	/// 1.0, and the constant operand itself otherwise. A non-constant operand
	/// yields an empty SDValue.
	SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
	                                                  DAGCombinerInfo &DCI) const {
	  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
	  if (!CSrc)
	    return SDValue();

	  const APFloat &Val = CSrc->getValueAPF();

	  APFloat Zero = APFloat::getZero(Val.getSemantics());
	  const APFloat::cmpResult VsZero = Val.compare(Zero);
	  const bool ClampsToZero =
	      VsZero == APFloat::cmpLessThan ||
	      (VsZero == APFloat::cmpUnordered && Subtarget->enableDX10Clamp());
	  if (ClampsToZero)
	    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));

	  APFloat One(Val.getSemantics(), "1.0");
	  if (Val.compare(One) == APFloat::cmpGreaterThan)
	    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));

	  // Not clamped by either bound: the constant is returned unchanged.
	  return SDValue(CSrc, 0);
	}

	// Hoist an assertsext/assertzext above a truncate of its operand:
	//
	// (vt2 (assertzext (truncate vt0:x), vt1)) ->
	// (vt2 (truncate (assertzext vt0:x, vt1)))
	//
	// The rewrite is applied only when the truncate's source type is at least
	// as wide as the asserted type; otherwise an empty SDValue is returned.
	//
	// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
	// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
	// issues.
	SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
	                                                        DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Trunc = N->getOperand(0);
	  if (Trunc.getOpcode() != ISD::TRUNCATE)
	    return SDValue();

	  SDValue AssertVTOp = N->getOperand(1);
	  const EVT AssertVT = cast<VTSDNode>(AssertVTOp)->getVT();
	  SDLoc SL(N);

	  SDValue Wide = Trunc.getOperand(0);
	  const EVT WideVT = Wide.getValueType();
	  if (!WideVT.bitsGE(AssertVT))
	    return SDValue();

	  SDValue WideAssert = DAG.getNode(N->getOpcode(), SL, WideVT, Wide, AssertVTOp);
	  return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), WideAssert);
	}
	/// Split the 64-bit value \p LHS into two 32-bit halves and apply the binary
	/// operation \p Opc to each half, using \p ValLo as the constant operand for
	/// the low half and \p ValHi for the high half. The two 32-bit results are
	/// packed into a v2i32 and bitcast back to i64.
	SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
	    DAGCombinerInfo &DCI, const SDLoc &SL,
	    unsigned Opc, SDValue LHS,
	    uint32_t ValLo, uint32_t ValHi) const {
	  SelectionDAG &DAG = DCI.DAG;

	  SDValue LoHalf, HiHalf;
	  std::tie(LoHalf, HiHalf) = split64BitValue(LHS, DAG);

	  SDValue LoConst = DAG.getConstant(ValLo, SL, MVT::i32);
	  SDValue HiConst = DAG.getConstant(ValHi, SL, MVT::i32);

	  SDValue LoResult = DAG.getNode(Opc, SL, MVT::i32, LoHalf, LoConst);
	  SDValue HiResult = DAG.getNode(Opc, SL, MVT::i32, HiHalf, HiConst);

	  // Queue the two halves for another visit. It's possible one of the new
	  // operations was eliminated, which could simplify the vector.
	  DCI.AddToWorklist(LoHalf.getNode());
	  DCI.AddToWorklist(HiHalf.getNode());

	  SDValue Packed = DAG.getBuildVector(MVT::v2i32, SL, {LoResult, HiResult});
	  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Packed);
	}

	// Target combine for a shift-left whose shift amount is a constant.
	//
	// Handles, in order:
	//  - a shift amount of zero, which folds to the unshifted operand;
	//  - an i32 (shl ([asz]ext i16:x), 16), rewritten as a bitcast of a v2i16
	//    build_vector when v2i16 is a legal type;
	//  - an i64 shl of an extended value, performed in the narrower source type
	//    when enough leading zero bits of the source are known;
	//  - an i64 shl by 32 or more, rewritten as a 32-bit shift of the low half
	//    placed in the second element of a v2i32, with zero in the first.
	// Returns an empty SDValue when none of these applies.
	SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	EVT VT = N->getValueType(0);

	// Only constant shift amounts are handled.
	ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (!RHS)
	return SDValue();

	SDValue LHS = N->getOperand(0);
	unsigned RHSVal = RHS->getZExtValue();
	// A shift by zero is the operand itself.
	if (!RHSVal)
	return LHS;

	SDLoc SL(N);
	SelectionDAG &DAG = DCI.DAG;

	switch (LHS->getOpcode()) {
	default:
	break;
	case ISD::ZERO_EXTEND:
	case ISD::SIGN_EXTEND:
	case ISD::ANY_EXTEND: {
	SDValue X = LHS->getOperand(0);

	if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
	isTypeLegal(MVT::v2i16)) {
	// Prefer build_vector as the canonical form if packed types are legal.
	// (shl ([asz]ext i16:x), 16 -> build_vector 0, x
	SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
	{ DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
	return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
	}

	// shl (ext x) => zext (shl x), if shift does not overflow int
	if (VT != MVT::i64)
	break;
	KnownBits Known;
	DAG.computeKnownBits(X, Known);
	// The narrow shift is only used when X has at least RHSVal known leading
	// zero bits.
	unsigned LZ = Known.countMinLeadingZeros();
	if (LZ < RHSVal)
	break;
	EVT XVT = X.getValueType();
	SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
	return DAG.getZExtOrTrunc(Shl, SL, VT);
	}
	}

	// Everything below applies to 64-bit shifts only.
	if (VT != MVT::i64)
	return SDValue();

	// i64 (shl x, C) -> (build_pair 0, (shl x, C -32))

	// On some subtargets, 64-bit shift is a quarter rate instruction. In the
	// common case, splitting this into a move and a 32-bit shift is faster and
	// the same code size.
	if (RHSVal < 32)
	return SDValue();

	SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);

	// Only the low 32 bits of the source can reach the result when shifting by
	// 32 or more.
	SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
	SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);

	const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

	SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
	return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
	}

	/// Combine for a 64-bit arithmetic shift right by a constant.
	///
	/// Only shift amounts of 32 and 63 are rewritten. Both become a bitcast of
	/// a v2i32 built from the high half of the source:
	///   (sra i64:x, 32) -> build_pair hi_32(x), (sra hi_32(x), 31)
	///   (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
	/// Anything else yields an empty SDValue.
	SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
	                                                DAGCombinerInfo &DCI) const {
	  if (N->getValueType(0) != MVT::i64)
	    return SDValue();

	  const ConstantSDNode *ShAmtC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!ShAmtC)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc SL(N);
	  const unsigned ShAmt = ShAmtC->getZExtValue();
	  if (ShAmt != 32 && ShAmt != 63)
	    return SDValue();

	  SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
	  SDValue SignFill = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
	                                 DAG.getConstant(31, SL, MVT::i32));

	  // The second element is the same in both cases; the first element is the
	  // original high half for a shift of 32 and the shifted value for 63.
	  SDValue FirstElt = (ShAmt == 32) ? Hi : SignFill;
	  SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {FirstElt, SignFill});
	  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
	}

	/// Combine for a 64-bit logical shift right by a constant of at least 32.
	///
	///   srl i64:x, C for C >= 32
	///   =>
	///   build_pair (srl hi_32(x), C - 32), 0
	///
	/// Other types, non-constant amounts and amounts below 32 yield an empty
	/// SDValue.
	SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
	                                                DAGCombinerInfo &DCI) const {
	  if (N->getValueType(0) != MVT::i64)
	    return SDValue();

	  const ConstantSDNode *ShAmtC = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!ShAmtC)
	    return SDValue();

	  const unsigned ShAmt = ShAmtC->getZExtValue();
	  if (ShAmt < 32)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc SL(N);

	  SDValue HiIdx = DAG.getConstant(1, SL, MVT::i32);
	  SDValue ZeroHalf = DAG.getConstant(0, SL, MVT::i32);

	  // Read the high half as element 1 of the source viewed as a v2i32.
	  SDValue AsVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
	  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
	                               AsVec, HiIdx);

	  SDValue NarrowAmt = DAG.getConstant(ShAmt - 32, SL, MVT::i32);
	  SDValue NarrowShift = DAG.getNode(ISD::SRL, SL, MVT::i32, HiHalf, NarrowAmt);

	  SDValue Pair = DAG.getBuildVector(MVT::v2i32, SL, {NarrowShift, ZeroHalf});

	  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Pair);
	}

	// Build the 24-bit multiply node for operands \p N0 and \p N1.
	//
	// For a result of at most 32 bits this is a single MUL_I24 / MUL_U24. For a
	// wider result a MUL_LOHI_I24 / MUL_LOHI_U24 node is created and its two
	// i32 results are combined into an i64 with BUILD_PAIR.
	//
	// We need to specifically handle i64 mul here to avoid unnecessary conversion
	// instructions. If we only match on the legalized i64 mul expansion,
	// SimplifyDemandedBits will be unable to remove them because there will be
	// multiple uses due to the separate mul + mulh[su].
	static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
	                        SDValue N0, SDValue N1, unsigned Size, bool Signed) {
	  if (Size > 32) {
	    // Because we want to eliminate extension instructions before the
	    // operation, we need to create a single user here (i.e. not the separate
	    // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
	    const unsigned LoHiOpc =
	        Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;

	    SDValue LoHi = DAG.getNode(LoHiOpc, SL,
	                               DAG.getVTList(MVT::i32, MVT::i32), N0, N1);

	    return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
	                       LoHi.getValue(0), LoHi.getValue(1));
	  }

	  const unsigned NarrowOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
	  return DAG.getNode(NarrowOpc, SL, MVT::i32, N0, N1);
	}

	/// Combine a scalar multiply of at most 64 bits into a 24-bit multiply when
	/// both operands are known to fit in 24 bits.
	///
	/// The unsigned form is tried first, then the signed form; each requires
	/// the matching subtarget feature. Nothing is done for vectors, for types
	/// wider than 64 bits, or for types of at most 16 bits when the subtarget
	/// has 16-bit instructions.
	SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
	                                                DAGCombinerInfo &DCI) const {
	  const EVT VT = N->getValueType(0);

	  const unsigned Size = VT.getSizeInBits();
	  if (VT.isVector() || Size > 64)
	    return SDValue();

	  // There are i16 integer mul/mad.
	  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);

	  bool Signed;
	  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG))
	    Signed = false;
	  else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG))
	    Signed = true;
	  else
	    return SDValue();

	  // Bring both operands to i32 with the extension matching the multiply.
	  if (Signed) {
	    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
	    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
	  } else {
	    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
	    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
	  }
	  SDValue Mul = getMul24(DAG, DL, N0, N1, Size, Signed);

	  // We need to use sext even for MUL_U24, because MUL_U24 is used
	  // for signed multiply of 8 and 16-bit types.
	  return DAG.getSExtOrTrunc(Mul, DL, VT);
	}

	/// Combine a scalar signed high multiply into MULHI_I24 when the subtarget
	/// has the 24-bit signed multiply and both operands pass isI24. The i32
	/// result is sign-extended or truncated to the node's type.
	SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
	                                                  DAGCombinerInfo &DCI) const {
	  const EVT VT = N->getValueType(0);

	  if (!Subtarget->hasMulI24() || VT.isVector())
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  const bool BothFit = isI24(Op0, DAG) && isI24(Op1, DAG);
	  if (!BothFit)
	    return SDValue();

	  Op0 = DAG.getSExtOrTrunc(Op0, DL, MVT::i32);
	  Op1 = DAG.getSExtOrTrunc(Op1, DL, MVT::i32);

	  SDValue Hi24 = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, Op0, Op1);
	  DCI.AddToWorklist(Hi24.getNode());
	  return DAG.getSExtOrTrunc(Hi24, DL, VT);
	}

	/// Combine a scalar unsigned high multiply of at most 32 bits into
	/// MULHI_U24 when the subtarget has the 24-bit unsigned multiply and both
	/// operands pass isU24. The i32 result is zero-extended or truncated to the
	/// node's type.
	SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
	                                                  DAGCombinerInfo &DCI) const {
	  const EVT VT = N->getValueType(0);

	  if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  const bool BothFit = isU24(Op0, DAG) && isU24(Op1, DAG);
	  if (!BothFit)
	    return SDValue();

	  Op0 = DAG.getZExtOrTrunc(Op0, DL, MVT::i32);
	  Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);

	  SDValue Hi24 = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, Op0, Op1);
	  DCI.AddToWorklist(Hi24.getNode());
	  return DAG.getZExtOrTrunc(Hi24, DL, VT);
	}

	/// Split a MUL_LOHI_I24 / MUL_LOHI_U24 node into separate low and high
	/// 24-bit multiplies, after first giving the demanded-bits simplifier a
	/// chance on each operand. If either operand was simplified, an empty
	/// SDValue is returned and no split is made in this call.
	SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;

	  // Simplify demanded bits before splitting into multiple users.
	  if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
	    return SDValue();

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  unsigned LoOpc, HiOpc;
	  if (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24) {
	    LoOpc = AMDGPUISD::MUL_I24;
	    HiOpc = AMDGPUISD::MULHI_I24;
	  } else {
	    LoOpc = AMDGPUISD::MUL_U24;
	    HiOpc = AMDGPUISD::MULHI_U24;
	  }

	  SDLoc SL(N);

	  SDValue LoPart = DAG.getNode(LoOpc, SL, MVT::i32, Op0, Op1);
	  SDValue HiPart = DAG.getNode(HiOpc, SL, MVT::i32, Op0, Op1);
	  return DAG.getMergeValues({ LoPart, HiPart }, SL);
	}

	/// Returns true if \p Val is an integer constant node with all bits set.
	static bool isNegativeOne(SDValue Val) {
	  ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Val);
	  return Const ? Const->isAllOnesValue() : false;
	}

	/// Emit the 32-bit node \p Opc on \p Op.
	///
	/// An operand that is not already i32 is zero-extended to i32 first, and
	/// the result is truncated back to the operand's type. An empty SDValue is
	/// returned when the type \p Op's type legalizes to is not i32 and, on a
	/// subtarget with 16-bit instructions, is not i16 either.
	SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
	                                          SDValue Op,
	                                          const SDLoc &DL,
	                                          unsigned Opc) const {
	  const EVT VT = Op.getValueType();
	  const EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
	  if (LegalVT != MVT::i32 && Subtarget->has16BitInsts() &&
	      LegalVT != MVT::i16)
	    return SDValue();

	  const bool NeedsResize = VT != MVT::i32;

	  SDValue Src = Op;
	  if (NeedsResize)
	    Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);

	  SDValue Result = DAG.getNode(Opc, DL, MVT::i32, Src);
	  if (NeedsResize)
	    Result = DAG.getNode(ISD::TRUNCATE, DL, VT, Result);

	  return Result;
	}

	// The native instructions return -1 on 0 input. Optimize out a select that
	// produces -1 on 0.
	//
	// \p Cond is the setcc feeding the select; \p LHS and \p RHS are the
	// select's true and false values. The fold applies only when the setcc
	// compares against the constant 0, one select arm is a ctlz/cttz of the
	// compared value, and the other arm is -1. The FFBH/FFBL opcode is chosen
	// from whichever arm holds the ctlz/cttz node. Returns an empty SDValue
	// when the pattern does not match.
	//
	// TODO: If zero is not undef, we could also do this if the output is compared
	// against the bitwidth.
	//
	// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
	SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
	                                                      SDValue LHS, SDValue RHS,
	                                                      DAGCombinerInfo &DCI) const {
	  ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
	  if (!CmpRhs || !CmpRhs->isNullValue())
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	  SDValue CmpLHS = Cond.getOperand(0);

	  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
	  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
	  if (CCOpcode == ISD::SETEQ &&
	      (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
	      RHS.getOperand(0) == CmpLHS &&
	      isNegativeOne(LHS)) {
	    unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32
	                                              : AMDGPUISD::FFBH_U32;
	    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
	  }

	  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
	  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
	  //
	  // Here the ctlz/cttz is the true value, so both the opcode test and the
	  // FFBH/FFBL choice must look at LHS, not RHS.
	  if (CCOpcode == ISD::SETNE &&
	      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
	      LHS.getOperand(0) == CmpLHS &&
	      isNegativeOne(RHS)) {
	    unsigned Opc = isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32
	                                              : AMDGPUISD::FFBH_U32;
	    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
	  }

	  return SDValue();
	}

	/// Rewrite select(Cond, Op(a), Op(b)) as Op(select(Cond, a, b)).
	///
	/// \p N1 and \p N2 are the two select arms; the first operand of each is
	/// selected between and the unary operation \p Op is applied to the result.
	/// The new select is queued on the combiner worklist.
	static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
	                                         unsigned Op,
	                                         const SDLoc &SL,
	                                         SDValue Cond,
	                                         SDValue N1,
	                                         SDValue N2) {
	  SelectionDAG &DAG = DCI.DAG;
	  const EVT VT = N1.getValueType();

	  SDValue InnerSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
	                                    N1.getOperand(0), N2.getOperand(0));
	  DCI.AddToWorklist(InnerSelect.getNode());

	  return DAG.getNode(Op, SL, VT, InnerSelect);
	}

	// Pull a free FP operation out of a select so it may fold into uses.
	//
	// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
	// select c, (fneg x), k -> fneg (select c, x, (fneg k))
	//
	// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
	// select c, (fabs x), +k -> fabs (select c, x, k)
	//
	// N is the select node (operand 0 is the condition, operands 1 and 2 are
	// the two values). Returns the rewritten value, or an empty SDValue when no
	// fold is made.
	static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
	SDValue N) {
	SelectionDAG &DAG = DCI.DAG;
	SDValue Cond = N.getOperand(0);
	SDValue LHS = N.getOperand(1);
	SDValue RHS = N.getOperand(2);

	EVT VT = N.getValueType();
	// Both arms carry the same operation: hoist it above the select.
	if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) \|\|
	(LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
	return distributeOpThroughSelect(DCI, LHS.getOpcode(),
	SDLoc(N), Cond, LHS, RHS);
	}

	// Move an fabs/fneg found in the false arm into LHS so the code below only
	// has to look at LHS. Inv records that the arms were exchanged so the new
	// select can restore the original operand order.
	bool Inv = false;
	if (RHS.getOpcode() == ISD::FABS \|\| RHS.getOpcode() == ISD::FNEG) {
	std::swap(LHS, RHS);
	Inv = true;
	}

	// TODO: Support vector constants.
	ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
	if ((LHS.getOpcode() == ISD::FNEG \|\| LHS.getOpcode() == ISD::FABS) && CRHS) {
	SDLoc SL(N);
	// If one side is an fneg/fabs and the other is a constant, we can push the
	// fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
	SDValue NewLHS = LHS.getOperand(0);
	SDValue NewRHS = RHS;

	// Careful: if the neg can be folded up, don't try to pull it back down.
	bool ShouldFoldNeg = true;

	if (NewLHS.hasOneUse()) {
	unsigned Opc = NewLHS.getOpcode();
	if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
	ShouldFoldNeg = false;
	if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
	ShouldFoldNeg = false;
	}

	if (ShouldFoldNeg) {
	// For fneg the constant is negated so the outer fneg restores it; for fabs
	// a negative constant cannot be reproduced, so give up.
	if (LHS.getOpcode() == ISD::FNEG)
	NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
	else if (CRHS->isNegative())
	return SDValue();

	// Undo the earlier arm exchange so the select keeps its original order.
	if (Inv)
	std::swap(NewLHS, NewRHS);

	SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
	Cond, NewLHS, NewRHS);
	DCI.AddToWorklist(NewSelect.getNode());
	return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
	}
	}

	return SDValue();
	}


	/// Target combine for ISD::SELECT.
	///
	/// Tries, in order:
	///  - pulling a free fneg/fabs out of the select (foldFreeOpFromSelect);
	///  - when the setcc condition has a single use and only the true value is
	///    a constant, inverting the comparison and swapping the select arms so
	///    the constant ends up as the false value;
	///  - for f32 on subtargets with legacy fmin/fmax, forming a legacy min/max
	///    (the result of combineFMinMaxLegacy is returned as-is);
	///  - removing a select that reproduces the -1-on-zero behavior of the
	///    native ctlz/cttz instructions.
	/// Returns an empty SDValue when the condition is not a setcc or nothing
	/// matched.
	SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
	                                                   DAGCombinerInfo &DCI) const {
	  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
	    return Folded;

	  SDValue Cond = N->getOperand(0);
	  if (Cond.getOpcode() != ISD::SETCC)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  SDValue LHS = Cond.getOperand(0);
	  SDValue RHS = Cond.getOperand(1);
	  SDValue CC = Cond.getOperand(2);

	  SDValue True = N->getOperand(1);
	  SDValue False = N->getOperand(2);

	  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
	    SelectionDAG &DAG = DCI.DAG;
	    if (DAG.isConstantValueOfAnyType(True) &&
	        !DAG.isConstantValueOfAnyType(False)) {
	      // Swap cmp + select pair to move constant to false input.
	      // This will allow using VOPC cndmasks more often.
	      // select (setcc x, y), k, x -> select (setccinv x, y), x, k

	      SDLoc SL(N);
	      ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
	                                            LHS.getValueType().isInteger());

	      SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
	      return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
	    }

	    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
	      SDValue MinMax
	        = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
	      // Revisit this node so we can catch min3/max3/med3 patterns.
	      //DCI.AddToWorklist(MinMax.getNode());
	      return MinMax;
	    }
	  }

	  // There's no reason to not do this if the condition has other uses.
	  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
	}

	/// Returns true if \p N is a floating-point constant, or a splat of one,
	/// whose value is zero and not negative.
	static bool isConstantFPZero(SDValue N) {
	  const ConstantFPSDNode *C = isConstOrConstSplatFP(N);
	  if (!C)
	    return false;

	  return C->isZero() && !C->isNegative();
	}

	/// Map a floating-point min/max opcode to its opposite: FMAXNUM <-> FMINNUM
	/// and FMAX_LEGACY <-> FMIN_LEGACY. Any other opcode is a programming
	/// error.
	static unsigned inverseMinMax(unsigned Opc) {
	  if (Opc == ISD::FMAXNUM)
	    return ISD::FMINNUM;
	  if (Opc == ISD::FMINNUM)
	    return ISD::FMAXNUM;
	  if (Opc == AMDGPUISD::FMAX_LEGACY)
	    return AMDGPUISD::FMIN_LEGACY;
	  if (Opc == AMDGPUISD::FMIN_LEGACY)
	    return AMDGPUISD::FMAX_LEGACY;

	  llvm_unreachable("invalid min/max opcode");
	}

	// Target combine for a floating-point negate: tries to push the negation
	// into the operation that produces the negated value.
	//
	// Returns the replacement for N, or an empty SDValue when no rewrite is
	// made. In the arithmetic and min/max cases, when the negated operation has
	// users other than N, those users are redirected to an fneg of the new
	// result.
	SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);

	unsigned Opc = N0.getOpcode();

	// If the input has multiple uses and we can either fold the negate down, or
	// the other uses cannot, give up. This both prevents unprofitable
	// transformations and infinite loops: we won't repeatedly try to fold around
	// a negate that has no 'good' form.
	if (N0.hasOneUse()) {
	// This may be able to fold into the source, but at a code size cost. Don't
	// fold if the fold into the user is free.
	if (allUsesHaveSourceMods(N, 0))
	return SDValue();
	} else {
	if (fnegFoldsIntoOp(Opc) &&
	(allUsesHaveSourceMods(N) \|\| !allUsesHaveSourceMods(N0.getNode())))
	return SDValue();
	}

	SDLoc SL(N);
	switch (Opc) {
	case ISD::FADD: {
	if (!mayIgnoreSignedZero(N0))
	return SDValue();

	// (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
	SDValue LHS = N0.getOperand(0);
	SDValue RHS = N0.getOperand(1);

	// An operand that is already an fneg is unwrapped instead of being negated
	// a second time.
	if (LHS.getOpcode() != ISD::FNEG)
	LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
	else
	LHS = LHS.getOperand(0);

	if (RHS.getOpcode() != ISD::FNEG)
	RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
	else
	RHS = RHS.getOperand(0);

	SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
	// Other users of the original fadd are redirected to an fneg of the new
	// node.
	if (!N0.hasOneUse())
	DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
	return Res;
	}
	case ISD::FMUL:
	case AMDGPUISD::FMUL_LEGACY: {
	// (fneg (fmul x, y)) -> (fmul x, (fneg y))
	// (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
	SDValue LHS = N0.getOperand(0);
	SDValue RHS = N0.getOperand(1);

	// Exactly one factor changes sign: strip an existing fneg from either
	// operand if there is one, otherwise negate the right-hand operand.
	if (LHS.getOpcode() == ISD::FNEG)
	LHS = LHS.getOperand(0);
	else if (RHS.getOpcode() == ISD::FNEG)
	RHS = RHS.getOperand(0);
	else
	RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

	SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
	if (!N0.hasOneUse())
	DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
	return Res;
	}
	case ISD::FMA:
	case ISD::FMAD: {
	if (!mayIgnoreSignedZero(N0))
	return SDValue();

	// (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
	SDValue LHS = N0.getOperand(0);
	SDValue MHS = N0.getOperand(1);
	SDValue RHS = N0.getOperand(2);

	// The product changes sign through exactly one of its two factors.
	if (LHS.getOpcode() == ISD::FNEG)
	LHS = LHS.getOperand(0);
	else if (MHS.getOpcode() == ISD::FNEG)
	MHS = MHS.getOperand(0);
	else
	MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);

	// The addend is always negated (or has its existing fneg removed).
	if (RHS.getOpcode() != ISD::FNEG)
	RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
	else
	RHS = RHS.getOperand(0);

	// NOTE(review): unlike the FADD and FMUL cases, N0's flags are not passed
	// to the new node here -- confirm whether that is intentional.
	SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
	if (!N0.hasOneUse())
	DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
	return Res;
	}
	case ISD::FMAXNUM:
	case ISD::FMINNUM:
	case AMDGPUISD::FMAX_LEGACY:
	case AMDGPUISD::FMIN_LEGACY: {
	// fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
	// fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
	// fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
	// fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

	SDValue LHS = N0.getOperand(0);
	SDValue RHS = N0.getOperand(1);

	// 0 doesn't have a negated inline immediate.
	// TODO: Shouldn't fold 1/2pi either, and should be generalized to other
	// operations.
	if (isConstantFPZero(RHS))
	return SDValue();

	SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
	SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
	unsigned Opposite = inverseMinMax(Opc);

	SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
	if (!N0.hasOneUse())
	DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
	return Res;
	}
	case ISD::FP_EXTEND:
	case ISD::FTRUNC:
	case ISD::FRINT:
	case ISD::FNEARBYINT: // XXX - Should fround be handled?
	case ISD::FSIN:
	case AMDGPUISD::RCP:
	case AMDGPUISD::RCP_LEGACY:
	case AMDGPUISD::SIN_HW: {
	SDValue CvtSrc = N0.getOperand(0);
	if (CvtSrc.getOpcode() == ISD::FNEG) {
	// (fneg (fp_extend (fneg x))) -> (fp_extend x)
	// (fneg (rcp (fneg x))) -> (rcp x)
	return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
	}

	// Pushing the negate below the operation is only done when N is the
	// operation's sole user.
	if (!N0.hasOneUse())
	return SDValue();

	// (fneg (fp_extend x)) -> (fp_extend (fneg x))
	// (fneg (rcp x)) -> (rcp (fneg x))
	SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
	return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
	}
	case ISD::FP_ROUND: {
	SDValue CvtSrc = N0.getOperand(0);

	if (CvtSrc.getOpcode() == ISD::FNEG) {
	// (fneg (fp_round (fneg x))) -> (fp_round x)
	return DAG.getNode(ISD::FP_ROUND, SL, VT,
	CvtSrc.getOperand(0), N0.getOperand(1));
	}

	if (!N0.hasOneUse())
	return SDValue();

	// (fneg (fp_round x)) -> (fp_round (fneg x))
	SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
	return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
	}
	case ISD::FP16_TO_FP: {
	// v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
	// f16, but legalization of f16 fneg ends up pulling it out of the source.
	// Put the fneg back as a legal source operation that can be matched later.
	// This declaration shadows the function-level SL; both are built from N.
	SDLoc SL(N);

	SDValue Src = N0.getOperand(0);
	EVT SrcVT = Src.getValueType();

	// fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
	SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
	DAG.getConstant(0x8000, SL, SrcVT));
	return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
	}
	default:
	return SDValue();
	}
	}

	/// Target combine for a floating-point absolute value.
	///
	/// Only one pattern is handled: when the operand is an FP16_TO_FP with a
	/// single use, the fabs is moved onto the integer source by masking with
	/// 0x7fff:
	///   fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
	/// Everything else yields an empty SDValue.
	SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
	                                                 DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Operand = N->getOperand(0);

	  if (!Operand.hasOneUse())
	    return SDValue();

	  if (Operand.getOpcode() != ISD::FP16_TO_FP)
	    return SDValue();

	  assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
	  SDLoc SL(N);
	  SDValue HalfBits = Operand.getOperand(0);
	  const EVT HalfBitsVT = HalfBits.getValueType();

	  // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
	  SDValue Masked = DAG.getNode(ISD::AND, SL, HalfBitsVT, HalfBits,
	                               DAG.getConstant(0x7fff, SL, HalfBitsVT));
	  return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), Masked);
	}

	/// Target-specific DAG combine entry point.
	///
	/// Dispatches on the opcode of \p N. Returns a replacement value when a
	/// combine applies; returns an empty SDValue when nothing was folded here
	/// (including the cases where a simplification was committed through
	/// \p DCI instead of being returned).
	SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
	                                                DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);

	  switch(N->getOpcode()) {
	  default:
	    break;
	  case ISD::BITCAST: {
	    EVT DestVT = N->getValueType(0);

	    // Push casts through vector builds. This helps avoid emitting a large
	    // number of copies when materializing floating point vector constants.
	    //
	    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
	    //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
	    if (DestVT.isVector()) {
	      SDValue Src = N->getOperand(0);
	      if (Src.getOpcode() == ISD::BUILD_VECTOR) {
	        EVT SrcVT = Src.getValueType();
	        unsigned NElts = DestVT.getVectorNumElements();

	        // Only handled when source and destination have the same element
	        // count, so each element can be cast individually.
	        if (SrcVT.getVectorNumElements() == NElts) {
	          EVT DestEltVT = DestVT.getVectorElementType();

	          SmallVector<SDValue, 8> CastedElts;
	          SDLoc SL(N);
	          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
	            SDValue Elt = Src.getOperand(I);
	            CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
	          }

	          return DAG.getBuildVector(DestVT, SL, CastedElts);
	        }
	      }
	    }

	    // The constant folds below split a 64-bit scalar constant into a v2i32
	    // build_vector, so they only make sense for a 64-bit *vector* result.
	    // (The previous '&&' let scalar 64-bit results and vectors of other
	    // sizes through.)
	    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
	      break;

	    // Fold bitcasts of constants.
	    //
	    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
	    // TODO: Generalize and move to DAGCombiner
	    SDValue Src = N->getOperand(0);
	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
	      assert(Src.getValueType() == MVT::i64);
	      SDLoc SL(N);
	      uint64_t CVal = C->getZExtValue();
	      // Build the halves as v2i32 and cast to the requested type, exactly
	      // like the floating point constant case below, so that destination
	      // types other than v2i32 get a well-typed node.
	      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
	                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
	                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
	      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
	    }

	    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
	      const APInt &Val = C->getValueAPF().bitcastToAPInt();
	      SDLoc SL(N);
	      uint64_t CVal = Val.getZExtValue();
	      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
	                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
	                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));

	      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
	    }

	    break;
	  }
	  // The shift combines are only attempted once the DAG has been legalized.
	  case ISD::SHL: {
	    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
	      break;

	    return performShlCombine(N, DCI);
	  }
	  case ISD::SRL: {
	    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
	      break;

	    return performSrlCombine(N, DCI);
	  }
	  case ISD::SRA: {
	    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
	      break;

	    return performSraCombine(N, DCI);
	  }
	  case ISD::MUL:
	    return performMulCombine(N, DCI);
	  case ISD::MULHS:
	    return performMulhsCombine(N, DCI);
	  case ISD::MULHU:
	    return performMulhuCombine(N, DCI);
	  case AMDGPUISD::MUL_I24:
	  case AMDGPUISD::MUL_U24:
	  case AMDGPUISD::MULHI_I24:
	  case AMDGPUISD::MULHI_U24: {
	    // If the first call to simplify is successful, then N may end up being
	    // deleted, so we shouldn't call simplifyI24 again.
	    simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
	    return SDValue();
	  }
	  case AMDGPUISD::MUL_LOHI_I24:
	  case AMDGPUISD::MUL_LOHI_U24:
	    return performMulLoHi24Combine(N, DCI);
	  case ISD::SELECT:
	    return performSelectCombine(N, DCI);
	  case ISD::FNEG:
	    return performFNegCombine(N, DCI);
	  case ISD::FABS:
	    return performFAbsCombine(N, DCI);
	  case AMDGPUISD::BFE_I32:
	  case AMDGPUISD::BFE_U32: {
	    assert(!N->getValueType(0).isVector() &&
	           "Vector handling of BFE not implemented");
	    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
	    if (!Width)
	      break;

	    // Only the low five bits of the width are considered; a zero width
	    // folds to the constant 0.
	    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
	    if (WidthVal == 0)
	      return DAG.getConstant(0, DL, MVT::i32);

	    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
	    if (!Offset)
	      break;

	    SDValue BitsFrom = N->getOperand(0);
	    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

	    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

	    if (OffsetVal == 0) {
	      // This is already sign / zero extended, so try to fold away extra BFEs.
	      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

	      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
	      if (OpSignBits >= SignBits)
	        return BitsFrom;

	      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
	      if (Signed) {
	        // This is a sign_extend_inreg. Replace it to take advantage of existing
	        // DAG Combines. If not eliminated, we will match back to BFE during
	        // selection.

	        // TODO(review): the original note here is garbled; it appears to say
	        // that sext_inreg of extended types is not handled well, although a
	        // single BFE could handle them.
	        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
	                           DAG.getValueType(SmallVT));
	      }

	      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
	    }

	    // Constant source: fold the whole extract at compile time.
	    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
	      if (Signed) {
	        return constantFoldBFE<int32_t>(DAG,
	                                        CVal->getSExtValue(),
	                                        OffsetVal,
	                                        WidthVal,
	                                        DL);
	      }

	      return constantFoldBFE<uint32_t>(DAG,
	                                       CVal->getZExtValue(),
	                                       OffsetVal,
	                                       WidthVal,
	                                       DL);
	    }

	    // The field reaches the top bit, so a plain right shift by the offset is
	    // used instead, except for the 16/16 case on subtargets with SDWA.
	    if ((OffsetVal + WidthVal) >= 32 &&
	        !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
	      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
	      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
	                         BitsFrom, ShiftVal);
	    }

	    // Only bits [OffsetVal, OffsetVal + WidthVal) of the source are
	    // demanded; try to simplify a single-use source accordingly.
	    if (BitsFrom.hasOneUse()) {
	      APInt Demanded = APInt::getBitsSet(32,
	                                         OffsetVal,
	                                         OffsetVal + WidthVal);

	      KnownBits Known;
	      TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                            !DCI.isBeforeLegalizeOps());
	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
	          TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
	        DCI.CommitTargetLoweringOpt(TLO);
	      }
	    }

	    break;
	  }
	  case ISD::LOAD:
	    return performLoadCombine(N, DCI);
	  case ISD::STORE:
	    return performStoreCombine(N, DCI);
	  case AMDGPUISD::CLAMP:
	    return performClampCombine(N, DCI);
	  case AMDGPUISD::RCP: {
	    // Constant fold rcp(c) as 1.0 / c in the constant's own semantics.
	    if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
	      // XXX - Should this flush denormals?
	      const APFloat &Val = CFP->getValueAPF();
	      APFloat One(Val.getSemantics(), "1.0");
	      return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
	    }

	    break;
	  }
	  case ISD::AssertZext:
	  case ISD::AssertSext:
	    return performAssertSZExtCombine(N, DCI);
	  }
	  return SDValue();
	}

	//===----------------------------------------------------------------------===//
	// Helper functions
	//===----------------------------------------------------------------------===//

	/// Make \p Reg a live-in of the current function and return a value for it.
	///
	/// If \p Reg is not yet a live-in, a fresh virtual register of class \p RC
	/// is created and registered for it; otherwise the existing live-in virtual
	/// register is reused. With \p RawReg set the register node itself is
	/// returned, otherwise a CopyFromReg chained to the entry node.
	SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
	                                                   const TargetRegisterClass *RC,
	                                                   unsigned Reg, EVT VT,
	                                                   const SDLoc &SL,
	                                                   bool RawReg) const {
	  MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();

	  unsigned LiveInVReg;
	  if (RegInfo.isLiveIn(Reg)) {
	    LiveInVReg = RegInfo.getLiveInVirtReg(Reg);
	  } else {
	    LiveInVReg = RegInfo.createVirtualRegister(RC);
	    RegInfo.addLiveIn(Reg, LiveInVReg);
	  }

	  return RawReg ? DAG.getRegister(LiveInVReg, VT)
	                : DAG.getCopyFromReg(DAG.getEntryNode(), SL, LiveInVReg, VT);
	}

	/// Load an incoming value of type \p VT from the stack at \p Offset.
	///
	/// Creates a fixed frame object of VT's store size at \p Offset and emits a
	/// load from it, chained to the entry node, with alignment 4 and the
	/// dereferenceable and invariant memory operand flags.
	SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
	                                                  EVT VT,
	                                                  const SDLoc &SL,
	                                                  int64_t Offset) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();

	  int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
	  auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
	  SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);

	  // The two flags are combined with a bitwise OR.
	  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
	                     MachineMemOperand::MODereferenceable |
	                     MachineMemOperand::MOInvariant);
	}

	/// Store \p ArgVal to the stack at \p StackPtr + \p Offset.
	///
	/// Returns the store node, chained on \p Chain, using alignment 4 and the
	/// dereferenceable memory operand flag.
	SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
	                                                   const SDLoc &SL,
	                                                   SDValue Chain,
	                                                   SDValue StackPtr,
	                                                   SDValue ArgVal,
	                                                   int64_t Offset) const {
	  MachinePointerInfo DstInfo =
	      MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
	  SDValue DstPtr = DAG.getObjectPtrOffset(SL, StackPtr, Offset);

	  return DAG.getStore(Chain, SL, ArgVal, DstPtr, DstInfo, 4,
	                      MachineMemOperand::MODereferenceable);
	}

	/// Materialize the input described by \p Arg.
	///
	/// A register argument becomes a live-in register copy (class \p RC); any
	/// other argument is loaded from its stack offset. \p Arg must be present.
	SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
	                                             const TargetRegisterClass *RC,
	                                             EVT VT, const SDLoc &SL,
	                                             const ArgDescriptor &Arg) const {
	  assert(Arg && "Attempting to load missing argument");

	  if (!Arg.isRegister())
	    return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());

	  return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
	}

	/// Return the byte offset of the implicit parameter \p Param.
	///
	/// The base is the function's ABI argument offset rounded up to the
	/// subtarget's implicit-argument-pointer alignment; GRID_DIM sits at the
	/// base and GRID_OFFSET four bytes after it.
	uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
	    const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
	  const unsigned ImplicitArgAlign = Subtarget->getAlignmentForImplicitArgPtr();
	  const uint64_t Base = alignTo(MFI->getABIArgOffset(), ImplicitArgAlign);

	  if (Param == GRID_DIM)
	    return Base;
	  if (Param == GRID_OFFSET)
	    return Base + 4;

	  llvm_unreachable("unexpected implicit parameter type");
	}

	// Expands to a switch case for AMDGPUISD::node that returns the enumerator's
	// name as a string literal (via the preprocessor stringizing operator).
	#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;

	/// Return the name of the AMDGPUISD opcode \p Opcode as a string.
	///
	/// Returns nullptr for the marker enumerators (FIRST_NUMBER,
	/// FIRST_MEM_OPCODE_NUMBER, LAST_AMDGPU_ISD_NUMBER) and for any value that
	/// has no case below.
	const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
	switch ((AMDGPUISD::NodeType)Opcode) {
	case AMDGPUISD::FIRST_NUMBER: break;
	// AMDIL DAG nodes
	NODE_NAME_CASE(UMUL);
	NODE_NAME_CASE(BRANCH_COND);

	// AMDGPU DAG nodes
	NODE_NAME_CASE(IF)
	NODE_NAME_CASE(ELSE)
	NODE_NAME_CASE(LOOP)
	NODE_NAME_CASE(CALL)
	NODE_NAME_CASE(TC_RETURN)
	NODE_NAME_CASE(TRAP)
	NODE_NAME_CASE(RET_FLAG)
	NODE_NAME_CASE(RETURN_TO_EPILOG)
	NODE_NAME_CASE(ENDPGM)
	NODE_NAME_CASE(DWORDADDR)
	NODE_NAME_CASE(FRACT)
	NODE_NAME_CASE(SETCC)
	NODE_NAME_CASE(SETREG)
	NODE_NAME_CASE(FMA_W_CHAIN)
	NODE_NAME_CASE(FMUL_W_CHAIN)
	NODE_NAME_CASE(CLAMP)
	NODE_NAME_CASE(COS_HW)
	NODE_NAME_CASE(SIN_HW)
	NODE_NAME_CASE(FMAX_LEGACY)
	NODE_NAME_CASE(FMIN_LEGACY)
	NODE_NAME_CASE(FMAX3)
	NODE_NAME_CASE(SMAX3)
	NODE_NAME_CASE(UMAX3)
	NODE_NAME_CASE(FMIN3)
	NODE_NAME_CASE(SMIN3)
	NODE_NAME_CASE(UMIN3)
	NODE_NAME_CASE(FMED3)
	NODE_NAME_CASE(SMED3)
	NODE_NAME_CASE(UMED3)
	NODE_NAME_CASE(URECIP)
	NODE_NAME_CASE(DIV_SCALE)
	NODE_NAME_CASE(DIV_FMAS)
	NODE_NAME_CASE(DIV_FIXUP)
	NODE_NAME_CASE(FMAD_FTZ)
	NODE_NAME_CASE(TRIG_PREOP)
	NODE_NAME_CASE(RCP)
	NODE_NAME_CASE(RSQ)
	NODE_NAME_CASE(RCP_LEGACY)
	NODE_NAME_CASE(RSQ_LEGACY)
	NODE_NAME_CASE(FMUL_LEGACY)
	NODE_NAME_CASE(RSQ_CLAMP)
	NODE_NAME_CASE(LDEXP)
	NODE_NAME_CASE(FP_CLASS)
	NODE_NAME_CASE(DOT4)
	NODE_NAME_CASE(CARRY)
	NODE_NAME_CASE(BORROW)
	NODE_NAME_CASE(BFE_U32)
	NODE_NAME_CASE(BFE_I32)
	NODE_NAME_CASE(BFI)
	NODE_NAME_CASE(BFM)
	NODE_NAME_CASE(FFBH_U32)
	NODE_NAME_CASE(FFBH_I32)
	NODE_NAME_CASE(FFBL_B32)
	NODE_NAME_CASE(MUL_U24)
	NODE_NAME_CASE(MUL_I24)
	NODE_NAME_CASE(MULHI_U24)
	NODE_NAME_CASE(MULHI_I24)
	NODE_NAME_CASE(MUL_LOHI_U24)
	NODE_NAME_CASE(MUL_LOHI_I24)
	NODE_NAME_CASE(MAD_U24)
	NODE_NAME_CASE(MAD_I24)
	NODE_NAME_CASE(MAD_I64_I32)
	NODE_NAME_CASE(MAD_U64_U32)
	NODE_NAME_CASE(TEXTURE_FETCH)
	NODE_NAME_CASE(EXPORT)
	NODE_NAME_CASE(EXPORT_DONE)
	NODE_NAME_CASE(R600_EXPORT)
	NODE_NAME_CASE(CONST_ADDRESS)
	NODE_NAME_CASE(REGISTER_LOAD)
	NODE_NAME_CASE(REGISTER_STORE)
	NODE_NAME_CASE(SAMPLE)
	NODE_NAME_CASE(SAMPLEB)
	NODE_NAME_CASE(SAMPLED)
	NODE_NAME_CASE(SAMPLEL)
	NODE_NAME_CASE(CVT_F32_UBYTE0)
	NODE_NAME_CASE(CVT_F32_UBYTE1)
	NODE_NAME_CASE(CVT_F32_UBYTE2)
	NODE_NAME_CASE(CVT_F32_UBYTE3)
	NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
	+ NODE_NAME_CASE(CVT_PKNORM_I16_F32)
	+ NODE_NAME_CASE(CVT_PKNORM_U16_F32)
	+ NODE_NAME_CASE(CVT_PK_I16_I32)
	+ NODE_NAME_CASE(CVT_PK_U16_U32)
	NODE_NAME_CASE(FP_TO_FP16)
	NODE_NAME_CASE(FP16_ZEXT)
	NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
	NODE_NAME_CASE(CONST_DATA_PTR)
	NODE_NAME_CASE(PC_ADD_REL_OFFSET)
	NODE_NAME_CASE(KILL)
	NODE_NAME_CASE(DUMMY_CHAIN)
	case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
	NODE_NAME_CASE(INIT_EXEC)
	NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
	NODE_NAME_CASE(SENDMSG)
	NODE_NAME_CASE(SENDMSGHALT)
	NODE_NAME_CASE(INTERP_MOV)
	NODE_NAME_CASE(INTERP_P1)
	NODE_NAME_CASE(INTERP_P2)
	NODE_NAME_CASE(STORE_MSKOR)
	NODE_NAME_CASE(LOAD_CONSTANT)
	NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
	NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
	NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
	NODE_NAME_CASE(ATOMIC_CMP_SWAP)
	NODE_NAME_CASE(ATOMIC_INC)
	NODE_NAME_CASE(ATOMIC_DEC)
	NODE_NAME_CASE(BUFFER_LOAD)
	NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
	NODE_NAME_CASE(BUFFER_STORE)
	NODE_NAME_CASE(BUFFER_STORE_FORMAT)
	NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
	NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
	NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
	NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
	NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
	NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
	NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
	NODE_NAME_CASE(BUFFER_ATOMIC_AND)
	NODE_NAME_CASE(BUFFER_ATOMIC_OR)
	NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
	NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
	case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
	}
	// Reached for the marker enumerators and for unlisted opcodes.
	return nullptr;
	}

	/// Provide a reciprocal square root estimate for \p Operand.
	///
	/// For f32 this emits an AMDGPUISD::RSQ node and sets \p RefinementSteps to
	/// 0. Every other type yields an empty SDValue. \p Enabled,
	/// \p UseOneConstNR and \p Reciprocal are not consulted.
	SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
	                                              SelectionDAG &DAG, int Enabled,
	                                              int &RefinementSteps,
	                                              bool &UseOneConstNR,
	                                              bool Reciprocal) const {
	  EVT OperandVT = Operand.getValueType();

	  // TODO: There is also f64 rsq instruction, but the documentation is less
	  // clear on its precision.
	  if (!(OperandVT == MVT::f32))
	    return SDValue();

	  RefinementSteps = 0;
	  SDLoc SL(Operand);
	  return DAG.getNode(AMDGPUISD::RSQ, SL, OperandVT, Operand);
	}

	/// Provide a reciprocal estimate for \p Operand.
	///
	/// For f32 this emits an AMDGPUISD::RCP node and sets \p RefinementSteps to
	/// 0. Every other type yields an empty SDValue. \p Enabled is not consulted.
	SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
	                                               SelectionDAG &DAG, int Enabled,
	                                               int &RefinementSteps) const {
	  EVT OperandVT = Operand.getValueType();

	  // TODO: There is also f64 rcp instruction, but the documentation is less
	  // clear on its precision.
	  if (!(OperandVT == MVT::f32))
	    return SDValue();

	  // Reciprocal, < 1 ulp error.
	  //
	  // This reciprocal approximation converges to < 0.5 ulp error with one
	  // Newton-Raphson step performed with two fused multiply-adds (FMAs).
	  RefinementSteps = 0;
	  SDLoc SL(Operand);
	  return DAG.getNode(AMDGPUISD::RCP, SL, OperandVT, Operand);
	}

	/// Compute the known-zero / known-one bits of the target node \p Op into
	/// \p Known.
	///
	/// \p Known is reset first; opcodes without a case below therefore report
	/// nothing known. \p DemandedElts is not consulted in this function.
	/// \p Depth is forwarded (incremented) when operands are analysed.
	void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
	const SDValue Op, KnownBits &Known,
	const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {

	Known.resetAll(); // Don't know anything.

	unsigned Opc = Op.getOpcode();

	switch (Opc) {
	default:
	break;
	case AMDGPUISD::CARRY:
	case AMDGPUISD::BORROW: {
	// The upper 31 bits are reported as zero; only bit 0 is left unknown.
	Known.Zero = APInt::getHighBitsSet(32, 31);
	break;
	}

	case AMDGPUISD::BFE_I32:
	case AMDGPUISD::BFE_U32: {
	// Nothing is known unless the width operand is a constant.
	ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	if (!CWidth)
	return;

	// Only the low five bits of the width are used.
	uint32_t Width = CWidth->getZExtValue() & 0x1f;

	// For the unsigned extract every bit above the field is zero. Nothing is
	// recorded for the signed variant.
	if (Opc == AMDGPUISD::BFE_U32)
	Known.Zero = APInt::getHighBitsSet(32, 32 - Width);

	break;
	}
	case AMDGPUISD::FP_TO_FP16:
	case AMDGPUISD::FP16_ZEXT: {
	unsigned BitWidth = Known.getBitWidth();

	// High bits are zero.
	Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
	break;
	}
	case AMDGPUISD::MUL_U24:
	case AMDGPUISD::MUL_I24: {
	KnownBits LHSKnown, RHSKnown;
	DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
	DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);

	// Trailing zeros of the operands add up in the product (capped at 32).
	unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
	RHSKnown.countMinTrailingZeros();
	Known.Zero.setLowBits(std::min(TrailZ, 32u));

	// Upper bound on the number of significant bits of each operand; at least
	// 8 leading sign bits are assumed for each, i.e. at most 24 value bits.
	unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u);
	unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u);
	unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
	// The product may need all 32 bits: nothing to add about the high bits.
	if (MaxValBits >= 32)
	break;
	bool Negative = false;
	if (Opc == AMDGPUISD::MUL_I24) {
	// Bit 23 is tested as the sign bit of each 24-bit operand.
	bool LHSNegative = !!(LHSKnown.One & (1 << 23));
	bool LHSPositive = !!(LHSKnown.Zero & (1 << 23));
	bool RHSNegative = !!(RHSKnown.One & (1 << 23));
	bool RHSPositive = !!(RHSKnown.Zero & (1 << 23));
	// Give up when the sign of either operand is unknown.
	if ((!LHSNegative && !LHSPositive) \|\| (!RHSNegative && !RHSPositive))
	break;
	// NOTE(review): "Positive" here only means bit 23 is known zero, which
	// includes the value 0. A negative operand times 0 is 0, so marking the
	// high bits as known one may be too strong in that case - confirm.
	Negative = (LHSNegative && RHSPositive) \|\| (LHSPositive && RHSNegative);
	}
	if (Negative)
	Known.One.setHighBits(32 - MaxValBits);
	else
	Known.Zero.setHighBits(32 - MaxValBits);
	break;
	}
	case ISD::INTRINSIC_WO_CHAIN: {
	// Operand 0 carries the intrinsic ID.
	unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	switch (IID) {
	case Intrinsic::amdgcn_mbcnt_lo:
	case Intrinsic::amdgcn_mbcnt_hi: {
	// These return at most the wavefront size - 1.
	unsigned Size = Op.getValueType().getSizeInBits();
	Known.Zero.setHighBits(Size - Subtarget->getWavefrontSizeLog2());
	break;
	}
	default:
	break;
	}
	}
	}
	}

	/// Return a lower bound on the number of sign bits of the target node
	/// \p Op.
	///
	/// Opcodes without a case below report 1, i.e. nothing is known.
	/// \p DemandedElts is not consulted. \p Depth is forwarded (incremented)
	/// when the BFE_I32 source operand is analysed.
	unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
	    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
	    unsigned Depth) const {
	  switch (Op.getOpcode()) {
	  case AMDGPUISD::BFE_I32: {
	    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	    if (!Width)
	      return 1;

	    // 32 - Width + 1 is only a valid sign-bit count for widths in [1, 32].
	    // A width of 0 would give 33 and larger widths wrap around in unsigned
	    // arithmetic, so answer conservatively for those.
	    const uint64_t WidthVal = Width->getZExtValue();
	    if (WidthVal == 0 || WidthVal > 32)
	      return 1;

	    unsigned SignBits = 32 - static_cast<unsigned>(WidthVal) + 1;
	    if (!isNullConstant(Op.getOperand(1)))
	      return SignBits;

	    // With a zero offset the source's own sign bits also carry over.
	    // TODO: Could probably figure something out with non-0 offsets.
	    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	    return std::max(SignBits, Op0SignBits);
	  }

	  case AMDGPUISD::BFE_U32: {
	    // Zero-extended field: every bit above the (5-bit masked) width is 0.
	    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
	  }

	  case AMDGPUISD::CARRY:
	  case AMDGPUISD::BORROW:
	    return 31;
	  case AMDGPUISD::FP_TO_FP16:
	  case AMDGPUISD::FP16_ZEXT:
	    return 16;
	  default:
	    return 1;
	  }
	}
	Index: head/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
	===================================================================
	--- head/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (revision 329409)
	+++ head/contrib/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h (revision 329410)
	@@ -1,481 +1,485 @@
	//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file
	/// \brief Interface definition of the TargetLowering class that is common
	/// to all AMD GPUs.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
	#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H

	#include "AMDGPU.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/TargetLowering.h"

	namespace llvm {

	class AMDGPUMachineFunction;
	class AMDGPUSubtarget;
	struct ArgDescriptor;

	class AMDGPUTargetLowering : public TargetLowering {
	private:
	/// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been
	/// legalized from a smaller type VT. Need to match pre-legalized type because
	/// the generic legalization inserts the add/sub between the select and
	/// compare.
	SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, unsigned Opc) const;

	public:
	static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG);
	static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);

	protected:
	const AMDGPUSubtarget *Subtarget;
	AMDGPUAS AMDGPUASI;

	SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
	/// \brief Split a vector store into multiple scalar stores.
	/// \returns The resulting chain.

	SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;

	SDValue LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFLOG(SDValue Op, SelectionDAG &Dag,
	double Log2BaseInverted) const;

	SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;

	SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
	SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
	SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;

	SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
	SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;

	SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;

	protected:
	bool shouldCombineMemoryType(EVT VT) const;
	SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;

	SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
	unsigned Opc, SDValue LHS,
	uint32_t ValLo, uint32_t ValHi) const;
	SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
	SDValue RHS, DAGCombinerInfo &DCI) const;
	SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;

	static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);

	virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
	SelectionDAG &DAG) const;

	/// Return 64-bit value Op as two 32-bit integers.
	std::pair<SDValue, SDValue> split64BitValue(SDValue Op,
	SelectionDAG &DAG) const;
	SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
	SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;

	/// \brief Split a vector load into 2 loads of half the vector.
	SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;

	/// \brief Split a vector store into 2 stores of half the vector.
	SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;

	SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
	SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
	void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
	SmallVectorImpl<SDValue> &Results) const;
	void analyzeFormalArgumentsCompute(CCState &State,
	const SmallVectorImpl<ISD::InputArg> &Ins) const;
	public:
	AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);

	bool mayIgnoreSignedZero(SDValue Op) const {
	if (getTargetMachine().Options.NoSignedZerosFPMath)
	return true;

	const auto Flags = Op.getNode()->getFlags();
	if (Flags.isDefined())
	return Flags.hasNoSignedZeros();

	return false;
	}

	static bool allUsesHaveSourceMods(const SDNode *N,
	unsigned CostThreshold = 4);
	bool isFAbsFree(EVT VT) const override;
	bool isFNegFree(EVT VT) const override;
	bool isTruncateFree(EVT Src, EVT Dest) const override;
	bool isTruncateFree(Type Src, Type Dest) const override;

	bool isZExtFree(Type Src, Type Dest) const override;
	bool isZExtFree(EVT Src, EVT Dest) const override;
	bool isZExtFree(SDValue Val, EVT VT2) const override;
	bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const override;

	bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;

	MVT getVectorIdxTy(const DataLayout &) const override;
	bool isSelectSupported(SelectSupportKind) const override;

	bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
	bool ShouldShrinkFPConstant(EVT VT) const override;
	bool shouldReduceLoadWidth(SDNode *Load,
	ISD::LoadExtType ExtType,
	EVT ExtVT) const override;

	bool isLoadBitCastBeneficial(EVT, EVT) const final;

	bool storeOfVectorConstantIsCheap(EVT MemVT,
	unsigned NumElem,
	unsigned AS) const override;
	bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
	bool isCheapToSpeculateCttz() const override;
	bool isCheapToSpeculateCtlz() const override;

	static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
	static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);

	SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
	SelectionDAG &DAG) const override;

	SDValue addTokenForArgument(SDValue Chain,
	SelectionDAG &DAG,
	MachineFrameInfo &MFI,
	int ClobberedFI) const;

	SDValue lowerUnhandledCall(CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals,
	StringRef Reason) const;
	SDValue LowerCall(CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const override;

	SDValue LowerDYNAMIC_STACKALLOC(SDValue Op,
	SelectionDAG &DAG) const;

	SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
	SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
	void ReplaceNodeResults(SDNode * N,
	SmallVectorImpl<SDValue> &Results,
	SelectionDAG &DAG) const override;

	SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
	SDValue RHS, SDValue True, SDValue False,
	SDValue CC, DAGCombinerInfo &DCI) const;

	const char* getTargetNodeName(unsigned Opcode) const override;

	// FIXME: Turn off MergeConsecutiveStores() before Instruction Selection
	// for AMDGPU.
	// A commit ( git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319036
	// 91177308-0d34-0410-b5e6-96231b3b80d8 ) turned on
	// MergeConsecutiveStores() before Instruction Selection for all targets.
	// Enough AMDGPU compiles go into an infinite loop ( MergeConsecutiveStores()
	// merges two stores; LegalizeStoreOps() un-merges; MergeConsecutiveStores()
	// re-merges, etc. ) to warrant turning it off for now.
	bool mergeStoresAfterLegalization() const override { return false; }

	bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
	return true;
	}
	SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
	int &RefinementSteps, bool &UseOneConstNR,
	bool Reciprocal) const override;
	SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
	int &RefinementSteps) const override;

	virtual SDNode PostISelFolding(MachineSDNode N,
	SelectionDAG &DAG) const = 0;

	/// \brief Determine which of the bits specified in \p Mask are known to be
	/// either zero or one and return them in the \p KnownZero and \p KnownOne
	/// bitsets.
	void computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth = 0) const override;

	unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth = 0) const override;

	/// \brief Helper function that adds Reg to the LiveIn list of the DAG's
	/// MachineFunction.
	///
	/// \returns a RegisterSDNode representing Reg if \p RawReg is true, otherwise
	/// a copy from the register.
	SDValue CreateLiveInRegister(SelectionDAG &DAG,
	const TargetRegisterClass *RC,
	unsigned Reg, EVT VT,
	const SDLoc &SL,
	bool RawReg = false) const;
	SDValue CreateLiveInRegister(SelectionDAG &DAG,
	const TargetRegisterClass *RC,
	unsigned Reg, EVT VT) const {
	return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()));
	}

	// Returns the raw live in register rather than a copy from it.
	SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG,
	const TargetRegisterClass *RC,
	unsigned Reg, EVT VT) const {
	return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
	}

	/// Similar to CreateLiveInRegister, except value maybe loaded from a stack
	/// slot rather than passed in a register.
	SDValue loadStackInputValue(SelectionDAG &DAG,
	EVT VT,
	const SDLoc &SL,
	int64_t Offset) const;

	SDValue storeStackInputValue(SelectionDAG &DAG,
	const SDLoc &SL,
	SDValue Chain,
	SDValue StackPtr,
	SDValue ArgVal,
	int64_t Offset) const;

	SDValue loadInputValue(SelectionDAG &DAG,
	const TargetRegisterClass *RC,
	EVT VT, const SDLoc &SL,
	const ArgDescriptor &Arg) const;

	enum ImplicitParameter {
	FIRST_IMPLICIT,
	GRID_DIM = FIRST_IMPLICIT,
	GRID_OFFSET,
	};

	/// \brief Helper function that returns the byte offset of the given
	/// type of implicit parameter.
	uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
	const ImplicitParameter Param) const;

	AMDGPUAS getAMDGPUAS() const {
	return AMDGPUASI;
	}

	MVT getFenceOperandTy(const DataLayout &DL) const override {
	return MVT::i32;
	}
	};

	namespace AMDGPUISD {

	/// Target-specific SelectionDAG opcodes. Numbering starts at
	/// ISD::BUILTIN_OP_END; the enumerators from FIRST_MEM_OPCODE_NUMBER onwards
	/// are numbered from ISD::FIRST_TARGET_MEMORY_OPCODE.
	enum NodeType : unsigned {
	// AMDIL ISD Opcodes
	FIRST_NUMBER = ISD::BUILTIN_OP_END,
	UMUL, // 32bit unsigned multiplication
	BRANCH_COND,
	// End AMDIL ISD Opcodes

	// Function call.
	CALL,
	TC_RETURN,
	TRAP,

	// Masked control flow nodes.
	IF,
	ELSE,
	LOOP,

	// A uniform kernel return that terminates the wavefront.
	ENDPGM,

	// Return to a shader part's epilog code.
	RETURN_TO_EPILOG,

	// Return with values from a non-entry function.
	RET_FLAG,

	DWORDADDR,
	FRACT,

	/// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output
	/// modifier behavior with dx10_enable.
	CLAMP,

	// This is SETCC with the full mask result which is used for a compare with a
	// result bit per item in the wavefront.
	SETCC,
	SETREG,
	// FP ops with input and output chain.
	FMA_W_CHAIN,
	FMUL_W_CHAIN,

	// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
	// Denormals handled on some parts.
	COS_HW,
	SIN_HW,
	FMAX_LEGACY,
	FMIN_LEGACY,
	FMAX3,
	SMAX3,
	UMAX3,
	FMIN3,
	SMIN3,
	UMIN3,
	FMED3,
	SMED3,
	UMED3,
	URECIP,
	DIV_SCALE,
	DIV_FMAS,
	DIV_FIXUP,
	// For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
	// treated as an illegal operation.
	FMAD_FTZ,
	TRIG_PREOP, // 1 ULP max error for f64

	// RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
	// For f64, max error 2^29 ULP, handles denormals.
	RCP,
	RSQ,
	RCP_LEGACY,
	RSQ_LEGACY,
	FMUL_LEGACY,
	RSQ_CLAMP,
	LDEXP,
	FP_CLASS,
	DOT4,
	CARRY,
	BORROW,
	BFE_U32, // Extract range of bits with zero extension to 32-bits.
	BFE_I32, // Extract range of bits with sign extension to 32-bits.
	BFI, // (src0 & src1) | (~src0 & src2)
	BFM, // Insert a range of bits into a 32-bit word.
	FFBH_U32, // ctlz with -1 if input is zero.
	FFBH_I32,
	FFBL_B32, // cttz with -1 if input is zero.
	MUL_U24,
	MUL_I24,
	MULHI_U24,
	MULHI_I24,
	MAD_U24,
	MAD_I24,
	MAD_U64_U32,
	MAD_I64_I32,
	MUL_LOHI_I24,
	MUL_LOHI_U24,
	TEXTURE_FETCH,
	EXPORT, // exp on SI+
	EXPORT_DONE, // exp on SI+ with done bit set
	R600_EXPORT,
	CONST_ADDRESS,
	REGISTER_LOAD,
	REGISTER_STORE,
	SAMPLE,
	SAMPLEB,
	SAMPLED,
	SAMPLEL,

	// These cvt_f32_ubyte* nodes need to remain consecutive and in order.
	CVT_F32_UBYTE0,
	CVT_F32_UBYTE1,
	CVT_F32_UBYTE2,
	CVT_F32_UBYTE3,

	// Convert two float 32 numbers into a single register holding two packed f16
	// with round to zero.
	CVT_PKRTZ_F16_F32,
	+ CVT_PKNORM_I16_F32,
	+ CVT_PKNORM_U16_F32,
	+ CVT_PK_I16_I32,
	+ CVT_PK_U16_U32,

	// Same as the standard node, except the high bits of the resulting integer
	// are known 0.
	FP_TO_FP16,

	// Wrapper around fp16 results that are known to zero the high bits.
	FP16_ZEXT,

	/// This node is for VLIW targets and it is used to represent a vector
	/// that is stored in consecutive registers with the same channel.
	/// For example:
	///   |X  |Y|Z|W|
	/// T0|v.x| | | |
	/// T1|v.y| | | |
	/// T2|v.z| | | |
	/// T3|v.w| | | |
	BUILD_VERTICAL_VECTOR,
	/// Pointer to the start of the shader's constant data.
	CONST_DATA_PTR,
	INIT_EXEC,
	INIT_EXEC_FROM_INPUT,
	SENDMSG,
	SENDMSGHALT,
	INTERP_MOV,
	INTERP_P1,
	INTERP_P2,
	PC_ADD_REL_OFFSET,
	KILL,
	DUMMY_CHAIN,
	// Enumerators below are numbered from ISD::FIRST_TARGET_MEMORY_OPCODE.
	FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
	STORE_MSKOR,
	LOAD_CONSTANT,
	TBUFFER_STORE_FORMAT,
	TBUFFER_STORE_FORMAT_X3,
	TBUFFER_LOAD_FORMAT,
	ATOMIC_CMP_SWAP,
	ATOMIC_INC,
	ATOMIC_DEC,
	BUFFER_LOAD,
	BUFFER_LOAD_FORMAT,
	BUFFER_STORE,
	BUFFER_STORE_FORMAT,
	BUFFER_ATOMIC_SWAP,
	BUFFER_ATOMIC_ADD,
	BUFFER_ATOMIC_SUB,
	BUFFER_ATOMIC_SMIN,
	BUFFER_ATOMIC_UMIN,
	BUFFER_ATOMIC_SMAX,
	BUFFER_ATOMIC_UMAX,
	BUFFER_ATOMIC_AND,
	BUFFER_ATOMIC_OR,
	BUFFER_ATOMIC_XOR,
	BUFFER_ATOMIC_CMPSWAP,
	// End marker; not an opcode of its own.
	LAST_AMDGPU_ISD_NUMBER
	};


	} // End namespace AMDGPUISD

	} // End namespace llvm

	#endif
	Index: head/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp (revision 329409)
	+++ head/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp (revision 329410)
	@@ -1,110 +1,128 @@
	//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file
	/// \brief Implementation of the TargetInstrInfo class that is common to all
	/// AMD GPUs.
	//
	//===----------------------------------------------------------------------===//

	#include "AMDGPUInstrInfo.h"
	#include "AMDGPURegisterInfo.h"
	#include "AMDGPUTargetMachine.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"

	using namespace llvm;

	#define GET_INSTRINFO_CTOR_DTOR
	#include "AMDGPUGenInstrInfo.inc"

	// Pin the vtable to this file.
	void AMDGPUInstrInfo::anchor() {}

	AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
	: AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
	ST(ST),
	AMDGPUASI(ST.getAMDGPUAS()) {}

	// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
	// the first 16 loads will be interleaved with the stores, and the next 16 will
	// be clustered as expected. It should really split into 2 16 store batches.
	//
	// Loads are clustered until this returns false, rather than trying to schedule
	// groups of stores. This also means we have to deal with saying different
	// address space loads should be clustered, and ones which might cause bank
	// conflicts.
	//
	// This might be deprecated so it might not be worth that much effort to fix.
	bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode Load0, SDNode Load1,
	int64_t Offset0, int64_t Offset1,
	unsigned NumLoads) const {
	assert(Offset1 > Offset0 &&
	"Second offset should be larger than first offset!");
	// If we have less than 16 loads in a row, and the offsets are within 64
	// bytes, then schedule together.

	// A cacheline is 64 bytes (for global memory).
	return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
	}

	// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
	enum SIEncodingFamily {
	SI = 0,
	VI = 1,
	SDWA = 2,
	SDWA9 = 3,
	GFX9 = 4
	};

	// Maps the subtarget generation to its default SIEncodingFamily value.
	// Only SI and VI are produced here; pseudoToMCOpcode() may override the
	// result with SDWA, SDWA9 or GFX9 based on per-instruction flags.
	static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
	switch (ST.getGeneration()) {
	case AMDGPUSubtarget::SOUTHERN_ISLANDS:
	case AMDGPUSubtarget::SEA_ISLANDS:
	return SIEncodingFamily::SI;
	// GFX9 defaults to the VI family.
	case AMDGPUSubtarget::VOLCANIC_ISLANDS:
	case AMDGPUSubtarget::GFX9:
	return SIEncodingFamily::VI;

	// FIXME: This should never be called for r600 GPUs.
	case AMDGPUSubtarget::R600:
	case AMDGPUSubtarget::R700:
	case AMDGPUSubtarget::EVERGREEN:
	case AMDGPUSubtarget::NORTHERN_ISLANDS:
	return SIEncodingFamily::SI;
	}

	// Every enumerator listed above returns, so this is only reached for a
	// generation value not handled by the switch.
	llvm_unreachable("Unknown subtarget generation!");
	}

	// Translates Opcode to the opcode returned by AMDGPU::getMCOpcode for this
	// subtarget's encoding family.
	// Returns Opcode unchanged when the lookup yields -1, and -1 when the lookup
	// yields (uint16_t)-1; otherwise returns the looked-up opcode.
	int AMDGPUInstrInfo::pseudoToMCOpcode(int Opcode) const {
	SIEncodingFamily Gen = subtargetEncodingFamily(ST);

	// Instructions flagged renamedInGFX9 use the GFX9 family on GFX9 and later
	// generations.
	if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
	ST.getGeneration() >= AMDGPUSubtarget::GFX9)
	Gen = SIEncodingFamily::GFX9;

	// The SDWA check runs last, so it takes precedence over the choice above
	// when both flags are set.
	if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
	Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
	: SIEncodingFamily::SDWA;

	int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);

	// -1 means that Opcode is already a native instruction.
	if (MCOp == -1)
	return Opcode;

	// (uint16_t)-1 means that Opcode is a pseudo instruction that has
	// no encoding in the given subtarget generation.
	if (MCOp == (uint16_t)-1)
	return -1;

	return MCOp;
	}
	+
	+// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
	+bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
	+ const Value *Ptr = MMO->getValue();
	+ // UndefValue means this is a load of a kernel input. These are uniform.
	+ // Sometimes LDS instructions have constant pointers.
	+ // If Ptr is null, then that means this mem operand contains a
	+ // PseudoSourceValue like GOT.
	+ if (!Ptr \|\| isa<UndefValue>(Ptr) \|\|
	+ isa<Constant>(Ptr) \|\| isa<GlobalValue>(Ptr))
	+ return true;
	+
	+ if (const Argument *Arg = dyn_cast<Argument>(Ptr))
	+ return AMDGPU::isArgPassedInSGPR(Arg);
	+
	+ const Instruction *I = dyn_cast<Instruction>(Ptr);
	+ return I && I->getMetadata("amdgpu.uniform");
	+}
	Index: head/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
	===================================================================
	--- head/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h (revision 329409)
	+++ head/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h (revision 329410)
	@@ -1,56 +1,58 @@
	//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file
	/// \brief Contains the definition of a TargetInstrInfo class that is common
	/// to all AMD GPUs.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
	#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H

	#include "AMDGPU.h"
	#include "Utils/AMDGPUBaseInfo.h"
	#include "llvm/CodeGen/TargetInstrInfo.h"

	#define GET_INSTRINFO_HEADER
	#include "AMDGPUGenInstrInfo.inc"
	#undef GET_INSTRINFO_HEADER

	namespace llvm {

	class AMDGPUSubtarget;
	class MachineFunction;
	class MachineInstr;
	class MachineInstrBuilder;

	class AMDGPUInstrInfo : public AMDGPUGenInstrInfo {
	private:
	const AMDGPUSubtarget &ST;

	virtual void anchor();
	protected:
	AMDGPUAS AMDGPUASI;

	public:
	explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);

	bool shouldScheduleLoadsNear(SDNode Load1, SDNode Load2,
	int64_t Offset1, int64_t Offset2,
	unsigned NumLoads) const override;

	/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
	/// Return -1 if the target-specific opcode for the pseudo instruction does
	/// not exist. If Opcode is not a pseudo instruction, this is identity.
	int pseudoToMCOpcode(int Opcode) const;
	+
	+ static bool isUniformMMO(const MachineMemOperand *MMO);
	};
	} // End llvm namespace

	#endif
	Index: head/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
	===================================================================
	--- head/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td (revision 329409)
	+++ head/contrib/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td (revision 329410)
	@@ -1,419 +1,427 @@
	//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------- tablegen --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains DAG node definitions for the AMDGPU target.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// AMDGPU DAG Profiles
	//===----------------------------------------------------------------------===//

	def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
	SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
	]>;

	def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
	[SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
	>;

	def AMDGPULdExpOp : SDTypeProfile<1, 2,
	[SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
	>;

	def AMDGPUFPClassOp : SDTypeProfile<1, 2,
	[SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
	>;

	def AMDGPUFPPackOp : SDTypeProfile<1, 2,
	[SDTCisFP<1>, SDTCisSameAs<1, 2>]
	>;

	+def AMDGPUIntPackOp : SDTypeProfile<1, 2,
	+ [SDTCisInt<1>, SDTCisSameAs<1, 2>]
	+>;
	+
	def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
	[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
	>;

	// float, float, float, vcc
	def AMDGPUFmasOp : SDTypeProfile<1, 4,
	[SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>]
	>;

	def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

	def AMDGPUIfOp : SDTypeProfile<1, 2,
	[SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
	>;

	def AMDGPUElseOp : SDTypeProfile<1, 2,
	[SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>]
	>;

	def AMDGPULoopOp : SDTypeProfile<0, 2,
	[SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
	>;

	def AMDGPUBreakOp : SDTypeProfile<1, 1,
	[SDTCisVT<0, i64>, SDTCisVT<1, i64>]
	>;

	def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
	[SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
	>;

	def AMDGPUElseBreakOp : SDTypeProfile<1, 2,
	[SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>]
	>;

	def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
	[SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
	>;

	def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;

	//===----------------------------------------------------------------------===//
	// AMDGPU DAG Nodes
	//

	def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
	def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
	def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;

	def callseq_start : SDNode<"ISD::CALLSEQ_START",
	SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
	[SDNPHasChain, SDNPOutGlue]
	>;

	def callseq_end : SDNode<"ISD::CALLSEQ_END",
	SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
	>;

	def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
	SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
	SDNPVariadic]
	>;

	def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
	>;

	def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
	SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
	[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
	>;

	// Node with one result and one operand; index 0 is constrained to iPTR.
	// NOTE(review): both constraints below name index 0, so the second is
	// redundant. It was presumably meant to constrain index 1 -- confirm
	// against the code that builds CONST_DATA_PTR before changing it.
	def AMDGPUconstdata_ptr : SDNode<
	"AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>,
	SDTCisVT<0, iPTR>]>
	>;

	// The argument to this node is a dword address.
	def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;

	// Force dependencies for vector trunc stores
	def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>;

	def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
	def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;

	// out = a - floor(a)
	def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;

	// out = 1.0 / a
	def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;

	// out = 1.0 / sqrt(a)
	def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;

	// Legacy variants of rcp (out = 1.0 / a) and rsq (out = 1.0 / sqrt(a)).
	def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
	def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;

	// out = 1.0 / sqrt(a) result clamped to +/- max_float.
	def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;

	def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;

	def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
	+def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
	+def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
	+def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
	+def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
	def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
	def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;


	def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;

	// out = max(a, b) a and b are floats, where a nan comparison fails.
	// This is not commutative because this gives the second operand:
	// x < nan ? x : nan -> nan
	// nan < x ? nan : x -> x
	def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp,
	[]
	>;

	def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
	[SDNPCommutative, SDNPAssociative]
	>;

	def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;

	// out = min(a, b) a and b are floats, where a nan comparison fails.
	def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
	[]
	>;

	// FIXME: TableGen doesn't like commutative instructions with more
	// than 2 operands. The commutative/associative properties are therefore
	// left commented out on all six three-operand min/max nodes below.
	// out = max(a, b, c) a, b and c are floats
	def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
	  [/*SDNPCommutative, SDNPAssociative*/]
	>;

	// out = max(a, b, c) a, b, and c are signed ints
	def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
	  [/*SDNPCommutative, SDNPAssociative*/]
	>;

	// out = max(a, b, c) a, b and c are unsigned ints
	def AMDGPUumax3 : SDNode<"AMDGPUISD::UMAX3", AMDGPUDTIntTernaryOp,
	  [/*SDNPCommutative, SDNPAssociative*/]
	>;

	// out = min(a, b, c) a, b and c are floats
	def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
	  [/*SDNPCommutative, SDNPAssociative*/]
	>;

	// out = min(a, b, c) a, b and c are signed ints
	def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
	  [/*SDNPCommutative, SDNPAssociative*/]
	>;

	// out = min(a, b, c) a, b and c are unsigned ints
	def AMDGPUumin3 : SDNode<"AMDGPUISD::UMIN3", AMDGPUDTIntTernaryOp,
	  [/*SDNPCommutative, SDNPAssociative*/]
	>;

	// out = (src0 + src1 > 0xFFFFFFFF) ? 1 : 0
	def AMDGPUcarry : SDNode<"AMDGPUISD::CARRY", SDTIntBinOp, []>;

	// out = (src1 > src0) ? 1 : 0
	def AMDGPUborrow : SDNode<"AMDGPUISD::BORROW", SDTIntBinOp, []>;

	// TODO: remove AMDGPUadde/AMDGPUsube when ADDCARRY/SUBCARRY get their own
	// nodes in TargetSelectionDAG.td.
	def AMDGPUadde : SDNode<"ISD::ADDCARRY", AMDGPUAddeSubeOp, []>;

	def AMDGPUsube : SDNode<"ISD::SUBCARRY", AMDGPUAddeSubeOp, []>;

	def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
	SDTCisVT<0, i64>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
	]>;

	def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;

	def AMDGPUSetRegOp : SDTypeProfile<0, 2, [
	SDTCisInt<0>, SDTCisInt<1>
	]>;

	def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
	SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;

	def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
	SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

	def AMDGPUmul : SDNode<"AMDGPUISD::FMUL_W_CHAIN", SDTFPBinOp, [
	SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;

	def AMDGPUcvt_f32_ubyte0 : SDNode<"AMDGPUISD::CVT_F32_UBYTE0",
	SDTIntToFPOp, []>;
	def AMDGPUcvt_f32_ubyte1 : SDNode<"AMDGPUISD::CVT_F32_UBYTE1",
	SDTIntToFPOp, []>;
	def AMDGPUcvt_f32_ubyte2 : SDNode<"AMDGPUISD::CVT_F32_UBYTE2",
	SDTIntToFPOp, []>;
	def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
	SDTIntToFPOp, []>;


	// urecip - This operation is a helper for integer division, it returns the
	// result of 1 / a as a fractional unsigned integer.
	// out = (2^32 / a) + e
	// e is rounding error
	def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;

	// Special case divide preop and flags.
	def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;

	// Special case divide FMA with scale and flags (src0 = Quotient,
	// src1 = Denominator, src2 = Numerator).
	def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;

	// Single or double precision division fixup.
	// Special case divide fixup and flags(src0 = Quotient, src1 =
	// Denominator, src2 = Numerator).
	def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;

	def AMDGPUfmad_ftz : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>;

	// Look Up 2.0 / pi src0 with segment select src1[4:0]
	def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;

	def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
	SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
	[SDNPHasChain, SDNPMayLoad]>;

	def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE",
	SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
	[SDNPHasChain, SDNPMayStore]>;

	// MSKOR instructions are atomic memory instructions used mainly for storing
	// 8-bit and 16-bit values. The definition is:
	//
	// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) \| src)
	//
	// src0: vec4(src, 0, 0, mask)
	// src1: dst - rat offset (aka pointer) in dwords
	def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
	SDTypeProfile<0, 2, []>,
	[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

	def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
	SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisVec<2>]>,
	[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
	SDNPMemOperand]>;

	def AMDGPUround : SDNode<"ISD::FROUND",
	SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;

	def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
	def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
	def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
	def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;

	def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
	def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;

	def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>;

	// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored
	// when performing the multiply. The result is a 32-bit value.
	def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
	[SDNPCommutative, SDNPAssociative]
	>;
	def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
	[SDNPCommutative, SDNPAssociative]
	>;

	def AMDGPUmulhi_u24 : SDNode<"AMDGPUISD::MULHI_U24", SDTIntBinOp,
	[SDNPCommutative, SDNPAssociative]
	>;
	def AMDGPUmulhi_i24 : SDNode<"AMDGPUISD::MULHI_I24", SDTIntBinOp,
	[SDNPCommutative, SDNPAssociative]
	>;

	def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp,
	[]
	>;
	def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
	[]
	>;

	def AMDGPUsmed3 : SDNode<"AMDGPUISD::SMED3", AMDGPUDTIntTernaryOp,
	[]
	>;

	def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
	[]
	>;

	def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;

	def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
	SDTypeProfile<0, 1, [SDTCisInt<0>]>,
	[SDNPHasChain, SDNPInGlue]>;

	def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT",
	SDTypeProfile<0, 2,
	[SDTCisInt<0>, SDTCisInt<1>]>,
	[SDNPHasChain, SDNPInGlue]>;

	def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
	SDTypeProfile<0, 1, [SDTCisInt<0>]>,
	[SDNPHasChain, SDNPInGlue]>;

	def AMDGPUsendmsghalt : SDNode<"AMDGPUISD::SENDMSGHALT",
	SDTypeProfile<0, 1, [SDTCisInt<0>]>,
	[SDNPHasChain, SDNPInGlue]>;

	def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV",
	SDTypeProfile<1, 3, [SDTCisFP<0>]>,
	[SDNPInGlue]>;

	def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1",
	SDTypeProfile<1, 3, [SDTCisFP<0>]>,
	[SDNPInGlue, SDNPOutGlue]>;

	def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2",
	SDTypeProfile<1, 4, [SDTCisFP<0>]>,
	[SDNPInGlue]>;


	def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
	[SDNPHasChain, SDNPSideEffect]>;

	// SI+ export
	// Type profile for the SI+ export nodes: no results and eight operands
	// (tgt, en, src0..src3, compr, vm). src1..src3 must have the same type as
	// src0, which is itself left unconstrained (i32 or f32).
	def AMDGPUExportOp : SDTypeProfile<0, 8, [
	  SDTCisInt<0>, // i8 tgt
	  SDTCisInt<1>, // i8 en
	  // i32 or f32 src0
	  SDTCisSameAs<3, 2>, // f32 src1
	  SDTCisSameAs<4, 2>, // f32 src2
	  SDTCisSameAs<5, 2>, // f32 src3
	  SDTCisInt<6>, // i1 compr
	  // skip done
	  // vm is the eighth operand (index 7); index 1 is already constrained as
	  // 'en' above.
	  SDTCisInt<7> // i1 vm

	]>;

	def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp,
	[SDNPHasChain, SDNPMayStore]>;

	def AMDGPUexport_done: SDNode<"AMDGPUISD::EXPORT_DONE", AMDGPUExportOp,
	[SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;


	def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;

	def R600_EXPORT: SDNode<"AMDGPUISD::R600_EXPORT", R600ExportOp,
	[SDNPHasChain, SDNPSideEffect]>;

	//===----------------------------------------------------------------------===//
	// Flow Control Profile Types
	//===----------------------------------------------------------------------===//
	// Branch instruction where second and third are basic blocks
	def SDTIL_BRCond : SDTypeProfile<0, 2, [
	SDTCisVT<0, OtherVT>
	]>;

	//===----------------------------------------------------------------------===//
	// Flow Control DAG Nodes
	//===----------------------------------------------------------------------===//
	def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;

	//===----------------------------------------------------------------------===//
	// Call/Return DAG Nodes
	//===----------------------------------------------------------------------===//
	def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
	[SDNPHasChain, SDNPOptInGlue]>;

	def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

	def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
	>;
	Index: head/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (revision 329409)
	+++ head/contrib/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (revision 329410)
	@@ -1,227 +1,227 @@
	//===- AMDGPURegisterBankInfo.cpp -------------------------------- C++ --==//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	/// \file
	/// This file implements the targeting of the RegisterBankInfo class for
	/// AMDGPU.
	/// \todo This should be generated by TableGen.
	//===----------------------------------------------------------------------===//

	#include "AMDGPURegisterBankInfo.h"
	#include "AMDGPUInstrInfo.h"
	#include "SIRegisterInfo.h"
	#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
	#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/TargetSubtargetInfo.h"
	#include "llvm/IR/Constants.h"

	#define GET_TARGET_REGBANK_IMPL
	#include "AMDGPUGenRegisterBank.inc"

	// This file will be TableGen'ed at some point.
	#include "AMDGPUGenRegisterBankInfo.def"

	using namespace llvm;

	// Stores TRI (downcast to SIRegisterInfo) and, on the first construction
	// only, checks that the SGPR/VGPR bank IDs resolve to the expected
	// register bank objects.
	AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
	: AMDGPUGenRegisterBankInfo(),
	TRI(static_cast<const SIRegisterInfo*>(&TRI)) {

	// HACK: Until this is fully tablegen'd
	// Function-local static: the checks below run once, not once per instance.
	static bool AlreadyInit = false;
	if (AlreadyInit)
	return;

	AlreadyInit = true;

	// The (void) casts keep the references "used" when assert() compiles away.
	const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
	(void)RBSGPR;
	assert(&RBSGPR == &AMDGPU::SGPRRegBank);

	const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
	(void)RBVGPR;
	assert(&RBVGPR == &AMDGPU::VGPRRegBank);

	}

	// Cost of copying Size bits between banks A and B. No AMDGPU-specific
	// adjustment is made; the call is forwarded to the base implementation.
	unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A,
	const RegisterBank &B,
	unsigned Size) const {
	return RegisterBankInfo::copyCost(A, B, Size);
	}

	// Returns the SGPR bank when TRI classifies RC as an SGPR class, and the
	// VGPR bank for every other register class.
	const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
	const TargetRegisterClass &RC) const {

	if (TRI->isSGPRClass(&RC))
	return getRegBank(AMDGPU::SGPRRegBankID);

	return getRegBank(AMDGPU::VGPRRegBankID);
	}

	// Returns the alternative bank mappings for MI. For G_LOAD three
	// two-operand mappings are offered (value bank / pointer bank):
	// SGPR/SGPR, VGPR/VGPR and VGPR/SGPR. All other opcodes defer to the base
	// class.
	RegisterBankInfo::InstructionMappings
	AMDGPURegisterBankInfo::getInstrAlternativeMappings(
	const MachineInstr &MI) const {

	const MachineFunction &MF = *MI.getParent()->getParent();
	const MachineRegisterInfo &MRI = MF.getRegInfo();

	// Size in bits of operand 0; used for the value operand of each mapping.
	unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

	InstructionMappings AltMappings;
	switch (MI.getOpcode()) {
	case TargetOpcode::G_LOAD: {
	// FIXME: Should we be hard coding the size for these mappings?
	// NOTE(review): the leading arguments (1/2/3, then 1) are presumably the
	// mapping ID and cost -- confirm against RegisterBankInfo. The pointer
	// operand is hard-coded to 64 bits in all three mappings.
	const InstructionMapping &SSMapping = getInstructionMapping(
	1, 1, getOperandsMapping(
	{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
	AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
	2); // Num Operands
	AltMappings.push_back(&SSMapping);

	const InstructionMapping &VVMapping = getInstructionMapping(
	2, 1, getOperandsMapping(
	{AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
	AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
	2); // Num Operands
	AltMappings.push_back(&VVMapping);

	// FIXME: Should this be the pointer-size (64-bits) or the size of the
	// register that will hold the buffer resource (128-bits)?
	const InstructionMapping &VSMapping = getInstructionMapping(
	3, 1, getOperandsMapping(
	{AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
	AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
	2); // Num Operands
	AltMappings.push_back(&VSMapping);

	return AltMappings;

	}
	default:
	break;
	}
	return RegisterBankInfo::getInstrAlternativeMappings(MI);
	}

	// Applies the chosen mapping; no target-specific handling is done here and
	// every instruction goes through applyDefaultMapping.
	void AMDGPURegisterBankInfo::applyMappingImpl(
	const OperandsMapper &OpdMapper) const {
	return applyDefaultMapping(OpdMapper);
	}

	// Returns true when MI passes hasOneMemOperand() and that memory operand is
	// reported uniform by isUniformMMO(); returns false for any instruction
	// that fails the hasOneMemOperand() check.
	static bool isInstrUniform(const MachineInstr &MI) {
	if (!MI.hasOneMemOperand())
	return false;

	// isUniformMMO takes a pointer, so dereference the iterator to get it.
	const MachineMemOperand *MMO = *MI.memoperands_begin();
	- return AMDGPU::isUniformMMO(MMO);
	+ return AMDGPUInstrInfo::isUniformMMO(MMO);
	}

	// Builds the mapping for a load: operand 0 (the loaded value) and operand 1
	// (the pointer) are both placed in the SGPR bank when isInstrUniform(MI)
	// holds, and both in the VGPR bank otherwise. Any further operands keep the
	// null entry the vector was constructed with.
	const RegisterBankInfo::InstructionMapping &
	AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {

	const MachineFunction &MF = *MI.getParent()->getParent();
	const MachineRegisterInfo &MRI = MF.getRegInfo();
	SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
	// Sizes in bits of the value and pointer registers.
	unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
	unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);

	const ValueMapping *ValMapping;
	const ValueMapping *PtrMapping;

	if (isInstrUniform(MI)) {
	// We have a uniform instruction so we want to use an SMRD load
	ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
	PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
	} else {
	ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
	// FIXME: What would happen if we used SGPRRegBankID here?
	PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
	}

	OpdsMapping[0] = ValMapping;
	OpdsMapping[1] = PtrMapping;
	const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
	1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
	return Mapping;

	// FIXME: Do we want to add a mapping for FLAT load, or should we just
	// handle that during instruction selection?
	}

	// Returns the register-bank mapping for MI.
	//
	// The generic getInstrMappingImpl() result is used when it is valid.
	// Otherwise G_CONSTANT, G_GEP, G_STORE and G_LOAD get opcode-specific
	// mappings, and every other opcode falls back to mapping each operand to the
	// SGPR bank.
	const RegisterBankInfo::InstructionMapping &
	AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
	  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);

	  if (Mapping.isValid())
	    return Mapping;

	  const MachineFunction &MF = *MI.getParent()->getParent();
	  const MachineRegisterInfo &MRI = MF.getRegInfo();
	  // One slot per operand, constructed up front; slots a case does not assign
	  // keep their initial value.
	  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

	  bool IsComplete = true;
	  switch (MI.getOpcode()) {
	  default:
	    IsComplete = false;
	    break;
	  case AMDGPU::G_CONSTANT: {
	    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
	    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
	    break;
	  }
	  case AMDGPU::G_GEP: {
	    // Every register operand goes to the SGPR bank at its own size.
	    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
	      if (!MI.getOperand(i).isReg())
	        continue;

	      unsigned Size = MRI.getType(MI.getOperand(i).getReg()).getSizeInBits();
	      OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
	    }
	    break;
	  }
	  case AMDGPU::G_STORE: {
	    assert(MI.getOperand(0).isReg());
	    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
	    // FIXME: We need to specify a different reg bank once scalar stores
	    // are supported.
	    const ValueMapping *ValMapping =
	        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
	    // FIXME: Depending on the type of store, the pointer could be in
	    // the SGPR Reg bank.
	    // FIXME: Pointer size should be based on the address space.
	    const ValueMapping *PtrMapping =
	        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);

	    OpdsMapping[0] = ValMapping;
	    OpdsMapping[1] = PtrMapping;
	    break;
	  }

	  case AMDGPU::G_LOAD:
	    return getInstrMappingForLoad(MI);
	  }

	  if (!IsComplete) {
	    unsigned BankID = AMDGPU::SGPRRegBankID;

	    unsigned Size = 0;
	    for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) {
	      // If the operand is not a register default to the size of the previous
	      // operand.
	      // FIXME: Can't we pull the types from the MachineInstr rather than the
	      // operands.
	      if (MI.getOperand(Idx).isReg())
	        Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI);
	      // OpdsMapping already holds getNumOperands() entries, so write the
	      // slot for this operand. Appending would leave slots [0, N) null and
	      // put the computed mappings past the range the instruction mapping
	      // reads.
	      OpdsMapping[Idx] = AMDGPU::getValueMapping(BankID, Size);
	    }
	  }
	  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
	                               MI.getNumOperands());
	}
	Index: head/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp (revision 329409)
	+++ head/contrib/llvm/lib/Target/AMDGPU/SIISelLowering.cpp (revision 329410)
	@@ -1,7290 +1,7332 @@
	//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file
	/// \brief Custom DAG lowering for SI
	//
	//===----------------------------------------------------------------------===//

	#ifdef _MSC_VER
	// Provide M_PI.
	#define _USE_MATH_DEFINES
	#endif

	#include "SIISelLowering.h"
	#include "AMDGPU.h"
	#include "AMDGPUIntrinsicInfo.h"
	#include "AMDGPUSubtarget.h"
	#include "AMDGPUTargetMachine.h"
	#include "SIDefines.h"
	#include "SIInstrInfo.h"
	#include "SIMachineFunctionInfo.h"
	#include "SIRegisterInfo.h"
	#include "Utils/AMDGPUBaseInfo.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/CodeGen/Analysis.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/DAGCombine.h"
	#include "llvm/CodeGen/ISDOpcodes.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/TargetCallingConv.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Type.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Target/TargetOptions.h"
	#include <cassert>
	#include <cmath>
	#include <cstdint>
	#include <iterator>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;

	// Debug/statistics category name used by the macros in this file.
	#define DEBUG_TYPE "si-lower"

	// Counter described as "Number of tail calls".
	STATISTIC(NumTailCalls, "Number of tail calls");

	// Command-line flag "amdgpu-vgpr-index-mode" (defaults to false): request
	// GPR indexing mode instead of movrel for vector indexing.
	static cl::opt<bool> EnableVGPRIndexMode(
	"amdgpu-vgpr-index-mode",
	cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
	cl::init(false));

	// Command-line flag "amdgpu-frame-index-zero-bits" (defaults to 5, marked
	// cl::ReallyHidden): how many high bits of a frame index are assumed zero.
	static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
	"amdgpu-frame-index-zero-bits",
	cl::desc("High bits of frame index assumed to be zero"),
	cl::init(5),
	cl::ReallyHidden);

	// Scan upward from SGPR0 across the SGPR_32 register class and return the
	// first register that CCInfo has not marked as allocated. Reaching the end
	// of the class without finding one is treated as unreachable.
	static unsigned findFirstFreeSGPR(CCState &CCInfo) {
	  const unsigned SGPRCount = AMDGPU::SGPR_32RegClass.getNumRegs();
	  unsigned Idx = 0;
	  while (Idx < SGPRCount) {
	    const unsigned Candidate = AMDGPU::SGPR0 + Idx;
	    if (!CCInfo.isAllocated(Candidate))
	      return Candidate;
	    ++Idx;
	  }
	  llvm_unreachable("Cannot allocate sgpr");
	}

	// Construct the SI lowering object. After forwarding TM and STI to the
	// AMDGPUTargetLowering base, this registers the register class used for each
	// value type, records how each (operation, type) pair is legalized, lists the
	// node kinds that get target DAG combines, and selects the scheduling
	// preference. Statement order matters: a later setOperationAction call for
	// the same (operation, type) pair replaces an earlier one.
	SITargetLowering::SITargetLowering(const TargetMachine &TM,
	const SISubtarget &STI)
	: AMDGPUTargetLowering(TM, STI) {
	// Register classes for the always-available value types.
	addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
	addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

	addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
	addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

	addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
	addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
	addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

	addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
	addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

	addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
	addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

	addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
	addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

	addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
	addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

	// 16-bit scalar types are only registered when the subtarget has 16-bit
	// instructions.
	if (Subtarget->has16BitInsts()) {
	addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
	addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
	}

	// Packed 16-bit vector types are only registered when the subtarget has
	// VOP3P instructions.
	if (Subtarget->hasVOP3PInsts()) {
	addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
	addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
	}

	computeRegisterProperties(STI.getRegisterInfo());

	// We need to custom lower vector loads and stores from local memory
	// (i1 loads and stores are marked Custom here as well).
	setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
	setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
	setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
	setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
	setOperationAction(ISD::LOAD, MVT::i1, Custom);

	setOperationAction(ISD::STORE, MVT::v2i32, Custom);
	setOperationAction(ISD::STORE, MVT::v4i32, Custom);
	setOperationAction(ISD::STORE, MVT::v8i32, Custom);
	setOperationAction(ISD::STORE, MVT::v16i32, Custom);
	setOperationAction(ISD::STORE, MVT::i1, Custom);

	// Truncating stores of i32 vectors to i16/i8 element vectors are expanded.
	setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
	setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
	setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
	setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
	setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
	setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
	setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
	setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
	setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
	setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);

	setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
	setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
	setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);

	// f64 selects are performed as i64 selects.
	setOperationAction(ISD::SELECT, MVT::i1, Promote);
	setOperationAction(ISD::SELECT, MVT::i64, Custom);
	setOperationAction(ISD::SELECT, MVT::f64, Promote);
	AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

	setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);

	setOperationAction(ISD::SETCC, MVT::i1, Promote);
	setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
	setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
	AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

	setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
	setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

	// Intrinsic nodes are custom lowered for the listed result types.
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
	+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);

	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

	setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
	setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);

	setOperationAction(ISD::BRCOND, MVT::Other, Custom);
	setOperationAction(ISD::BR_CC, MVT::i1, Expand);
	setOperationAction(ISD::BR_CC, MVT::i32, Expand);
	setOperationAction(ISD::BR_CC, MVT::i64, Expand);
	setOperationAction(ISD::BR_CC, MVT::f32, Expand);
	setOperationAction(ISD::BR_CC, MVT::f64, Expand);

	setOperationAction(ISD::UADDO, MVT::i32, Legal);
	setOperationAction(ISD::USUBO, MVT::i32, Legal);

	setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
	setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);

	// Compiled out: the i64 carry operations are not marked Legal.
	#if 0
	setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
	setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
	#endif

	//setOperationAction(ISD::ADDC, MVT::i64, Expand);
	//setOperationAction(ISD::SUBC, MVT::i64, Expand);

	// We only support LOAD/STORE and vector manipulation ops for vectors
	// with > 4 elements.
	// NOTE(review): the list below also includes the two-element types v2i64
	// and v2f64.
	for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
	MVT::v2i64, MVT::v2f64}) {
	for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
	switch (Op) {
	// These opcodes keep whatever action is already recorded for VT.
	case ISD::LOAD:
	case ISD::STORE:
	case ISD::BUILD_VECTOR:
	case ISD::BITCAST:
	case ISD::EXTRACT_VECTOR_ELT:
	case ISD::INSERT_VECTOR_ELT:
	case ISD::INSERT_SUBVECTOR:
	case ISD::EXTRACT_SUBVECTOR:
	case ISD::SCALAR_TO_VECTOR:
	break;
	case ISD::CONCAT_VECTORS:
	setOperationAction(Op, VT, Custom);
	break;
	// Every other opcode is expanded for VT.
	default:
	setOperationAction(Op, VT, Expand);
	break;
	}
	}
	}

	// TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
	// is expanded to avoid having two separate loops in case the index is a VGPR.

	// Most operations are naturally 32-bit vector operations. We only support
	// load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
	for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
	setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
	AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

	setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
	AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

	setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
	AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

	setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
	AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
	}

	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

	// Avoid stack access for these.
	// TODO: Generalize to more vector types.
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

	// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
	// and output demarshalling
	setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
	setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

	// We can't return success/failure, only the old value,
	// let LLVM add the comparison
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

	// Address space casts are custom lowered only when the subtarget reports a
	// flat address space.
	if (getSubtarget()->hasFlatAddressSpace()) {
	setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
	setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
	}

	setOperationAction(ISD::BSWAP, MVT::i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

	// On SI this is s_memtime and s_memrealtime on VI.
	setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
	setOperationAction(ISD::TRAP, MVT::Other, Custom);
	setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);

	setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);

	// f64 FTRUNC/FCEIL/FRINT are marked Legal from SEA_ISLANDS onward.
	if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
	setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
	setOperationAction(ISD::FCEIL, MVT::f64, Legal);
	setOperationAction(ISD::FRINT, MVT::f64, Legal);
	}

	setOperationAction(ISD::FFLOOR, MVT::f64, Legal);

	setOperationAction(ISD::FSIN, MVT::f32, Custom);
	setOperationAction(ISD::FCOS, MVT::f32, Custom);
	setOperationAction(ISD::FDIV, MVT::f32, Custom);
	setOperationAction(ISD::FDIV, MVT::f64, Custom);

	// Actions for the 16-bit scalar types (i16 and f16).
	if (Subtarget->has16BitInsts()) {
	setOperationAction(ISD::Constant, MVT::i16, Legal);

	setOperationAction(ISD::SMIN, MVT::i16, Legal);
	setOperationAction(ISD::SMAX, MVT::i16, Legal);

	setOperationAction(ISD::UMIN, MVT::i16, Legal);
	setOperationAction(ISD::UMAX, MVT::i16, Legal);

	setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
	AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);

	setOperationAction(ISD::ROTR, MVT::i16, Promote);
	setOperationAction(ISD::ROTL, MVT::i16, Promote);

	setOperationAction(ISD::SDIV, MVT::i16, Promote);
	setOperationAction(ISD::UDIV, MVT::i16, Promote);
	setOperationAction(ISD::SREM, MVT::i16, Promote);
	setOperationAction(ISD::UREM, MVT::i16, Promote);

	setOperationAction(ISD::BSWAP, MVT::i16, Promote);
	setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);

	setOperationAction(ISD::CTTZ, MVT::i16, Promote);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
	setOperationAction(ISD::CTLZ, MVT::i16, Promote);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);

	setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);

	setOperationAction(ISD::BR_CC, MVT::i16, Expand);

	setOperationAction(ISD::LOAD, MVT::i16, Custom);

	setTruncStoreAction(MVT::i64, MVT::i16, Expand);

	setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
	AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
	setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
	AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

	setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
	setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
	setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
	setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

	// F16 - Constant Actions.
	setOperationAction(ISD::ConstantFP, MVT::f16, Legal);

	// F16 - Load/Store Actions.
	setOperationAction(ISD::LOAD, MVT::f16, Promote);
	AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
	setOperationAction(ISD::STORE, MVT::f16, Promote);
	AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

	// F16 - VOP1 Actions.
	setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
	setOperationAction(ISD::FCOS, MVT::f16, Promote);
	setOperationAction(ISD::FSIN, MVT::f16, Promote);
	setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
	setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
	setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
	setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
	setOperationAction(ISD::FROUND, MVT::f16, Custom);

	// F16 - VOP2 Actions.
	setOperationAction(ISD::BR_CC, MVT::f16, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
	setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
	setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
	setOperationAction(ISD::FDIV, MVT::f16, Custom);

	// F16 - VOP3 Actions.
	setOperationAction(ISD::FMA, MVT::f16, Legal);
	// FMAD is only marked Legal when f16 denormals are not enabled.
	if (!Subtarget->hasFP16Denormals())
	setOperationAction(ISD::FMAD, MVT::f16, Legal);
	}

	// Actions for the packed 16-bit vector types (v2i16 and v2f16).
	if (Subtarget->hasVOP3PInsts()) {
	// Start by expanding everything except the load/store and vector
	// manipulation opcodes; selected operations are re-enabled below.
	for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
	for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
	switch (Op) {
	case ISD::LOAD:
	case ISD::STORE:
	case ISD::BUILD_VECTOR:
	case ISD::BITCAST:
	case ISD::EXTRACT_VECTOR_ELT:
	case ISD::INSERT_VECTOR_ELT:
	case ISD::INSERT_SUBVECTOR:
	case ISD::EXTRACT_SUBVECTOR:
	case ISD::SCALAR_TO_VECTOR:
	break;
	case ISD::CONCAT_VECTORS:
	setOperationAction(Op, VT, Custom);
	break;
	default:
	setOperationAction(Op, VT, Expand);
	break;
	}
	}
	}

	// XXX - Do these do anything? Vector constants turn into build_vector.
	setOperationAction(ISD::Constant, MVT::v2i16, Legal);
	setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);

	// Loads, stores, bitwise ops and selects on the packed types are promoted
	// to i32.
	setOperationAction(ISD::STORE, MVT::v2i16, Promote);
	AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
	setOperationAction(ISD::STORE, MVT::v2f16, Promote);
	AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);

	setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
	AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
	setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
	AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);

	setOperationAction(ISD::AND, MVT::v2i16, Promote);
	AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
	setOperationAction(ISD::OR, MVT::v2i16, Promote);
	AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
	setOperationAction(ISD::XOR, MVT::v2i16, Promote);
	AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
	setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
	AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
	setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
	AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);

	setOperationAction(ISD::ADD, MVT::v2i16, Legal);
	setOperationAction(ISD::SUB, MVT::v2i16, Legal);
	setOperationAction(ISD::MUL, MVT::v2i16, Legal);
	setOperationAction(ISD::SHL, MVT::v2i16, Legal);
	setOperationAction(ISD::SRL, MVT::v2i16, Legal);
	setOperationAction(ISD::SRA, MVT::v2i16, Legal);
	setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
	setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
	setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
	setOperationAction(ISD::UMAX, MVT::v2i16, Legal);

	setOperationAction(ISD::FADD, MVT::v2f16, Legal);
	setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
	setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
	setOperationAction(ISD::FMA, MVT::v2f16, Legal);
	setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
	setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);

	// This isn't really legal, but this avoids the legalizer unrolling it (and
	// allows matching fneg (fabs x) patterns)
	setOperationAction(ISD::FABS, MVT::v2f16, Legal);

	// NOTE(review): these two repeat the Custom setting already made for
	// EXTRACT_VECTOR_ELT on v2i16/v2f16 earlier in this constructor.
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

	setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
	setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
	} else {
	// Without VOP3P instructions, selects on the packed types are custom
	// lowered instead of promoted.
	setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
	setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
	}

	for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
	setOperationAction(ISD::SELECT, VT, Custom);
	}

	// Node kinds for which target-specific DAG combines are requested.
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::ADDCARRY);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::SUBCARRY);
	setTargetDAGCombine(ISD::FADD);
	setTargetDAGCombine(ISD::FSUB);
	setTargetDAGCombine(ISD::FMINNUM);
	setTargetDAGCombine(ISD::FMAXNUM);
	setTargetDAGCombine(ISD::SMIN);
	setTargetDAGCombine(ISD::SMAX);
	setTargetDAGCombine(ISD::UMIN);
	setTargetDAGCombine(ISD::UMAX);
	setTargetDAGCombine(ISD::SETCC);
	setTargetDAGCombine(ISD::AND);
	setTargetDAGCombine(ISD::OR);
	setTargetDAGCombine(ISD::XOR);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::UINT_TO_FP);
	setTargetDAGCombine(ISD::FCANONICALIZE);
	setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
	setTargetDAGCombine(ISD::BUILD_VECTOR);

	// All memory operations. Some folding on the pointer operand is done to help
	// matching the constant offsets in the addressing modes.
	setTargetDAGCombine(ISD::LOAD);
	setTargetDAGCombine(ISD::STORE);
	setTargetDAGCombine(ISD::ATOMIC_LOAD);
	setTargetDAGCombine(ISD::ATOMIC_STORE);
	setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
	setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
	setTargetDAGCombine(ISD::ATOMIC_SWAP);
	setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
	setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
	setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
	setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
	setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
	setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
	setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
	setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
	setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
	setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);

	setSchedulingPreference(Sched::RegPressure);
	}

	// Return the stored Subtarget pointer, statically cast to SISubtarget.
	const SISubtarget *SITargetLowering::getSubtarget() const {
	  const auto *SIST = static_cast<const SISubtarget *>(Subtarget);
	  return SIST;
	}

	//===----------------------------------------------------------------------===//
	// TargetLowering queries
	//===----------------------------------------------------------------------===//

	// Report every shuffle mask as illegal; both arguments are ignored.
	bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
	  // Some vector types are legal on SI, but vector operations are not.
	  // Rejecting all shuffles makes scalarizing those operations the preferred
	  // outcome.
	  const bool AnyShuffleLegal = false;
	  return AnyShuffleLegal;
	}

	// Describe the memory access performed by the call CI to the target
	// intrinsic IntrID. For a recognized intrinsic this fills in fields of Info
	// (node opcode, memory value type, pointer value, and load/store/volatile/
	// dereferenceable flags; the alignment only in some cases) and returns true.
	// Any other intrinsic ID returns false and leaves Info untouched.
	bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	const CallInst &CI,
	MachineFunction &MF,
	unsigned IntrID) const {
	switch (IntrID) {
	// Atomic inc/dec: a read-modify-write through the pointer in operand 0.
	case Intrinsic::amdgcn_atomic_inc:
	case Intrinsic::amdgcn_atomic_dec: {
	Info.opc = ISD::INTRINSIC_W_CHAIN;
	Info.memVT = MVT::getVT(CI.getType());
	Info.ptrVal = CI.getOperand(0);
	Info.align = 0;
	Info.flags = MachineMemOperand::MOLoad \| MachineMemOperand::MOStore;

	// Treat the access as volatile unless operand 4 is a constant zero.
	const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
	if (!Vol \|\| !Vol->isZero())
	Info.flags \|= MachineMemOperand::MOVolatile;

	return true;
	}

	// Image load.
	case Intrinsic::amdgcn_image_load:
	case Intrinsic::amdgcn_image_load_mip:

	// Sample.
	case Intrinsic::amdgcn_image_sample:
	case Intrinsic::amdgcn_image_sample_cl:
	case Intrinsic::amdgcn_image_sample_d:
	case Intrinsic::amdgcn_image_sample_d_cl:
	case Intrinsic::amdgcn_image_sample_l:
	case Intrinsic::amdgcn_image_sample_b:
	case Intrinsic::amdgcn_image_sample_b_cl:
	case Intrinsic::amdgcn_image_sample_lz:
	case Intrinsic::amdgcn_image_sample_cd:
	case Intrinsic::amdgcn_image_sample_cd_cl:

	// Sample with comparison.
	case Intrinsic::amdgcn_image_sample_c:
	case Intrinsic::amdgcn_image_sample_c_cl:
	case Intrinsic::amdgcn_image_sample_c_d:
	case Intrinsic::amdgcn_image_sample_c_d_cl:
	case Intrinsic::amdgcn_image_sample_c_l:
	case Intrinsic::amdgcn_image_sample_c_b:
	case Intrinsic::amdgcn_image_sample_c_b_cl:
	case Intrinsic::amdgcn_image_sample_c_lz:
	case Intrinsic::amdgcn_image_sample_c_cd:
	case Intrinsic::amdgcn_image_sample_c_cd_cl:

	// Sample with offsets.
	case Intrinsic::amdgcn_image_sample_o:
	case Intrinsic::amdgcn_image_sample_cl_o:
	case Intrinsic::amdgcn_image_sample_d_o:
	case Intrinsic::amdgcn_image_sample_d_cl_o:
	case Intrinsic::amdgcn_image_sample_l_o:
	case Intrinsic::amdgcn_image_sample_b_o:
	case Intrinsic::amdgcn_image_sample_b_cl_o:
	case Intrinsic::amdgcn_image_sample_lz_o:
	case Intrinsic::amdgcn_image_sample_cd_o:
	case Intrinsic::amdgcn_image_sample_cd_cl_o:

	// Sample with comparison and offsets.
	case Intrinsic::amdgcn_image_sample_c_o:
	case Intrinsic::amdgcn_image_sample_c_cl_o:
	case Intrinsic::amdgcn_image_sample_c_d_o:
	case Intrinsic::amdgcn_image_sample_c_d_cl_o:
	case Intrinsic::amdgcn_image_sample_c_l_o:
	case Intrinsic::amdgcn_image_sample_c_b_o:
	case Intrinsic::amdgcn_image_sample_c_b_cl_o:
	case Intrinsic::amdgcn_image_sample_c_lz_o:
	case Intrinsic::amdgcn_image_sample_c_cd_o:
	case Intrinsic::amdgcn_image_sample_c_cd_cl_o:

	// Basic gather4
	case Intrinsic::amdgcn_image_gather4:
	case Intrinsic::amdgcn_image_gather4_cl:
	case Intrinsic::amdgcn_image_gather4_l:
	case Intrinsic::amdgcn_image_gather4_b:
	case Intrinsic::amdgcn_image_gather4_b_cl:
	case Intrinsic::amdgcn_image_gather4_lz:

	// Gather4 with comparison
	case Intrinsic::amdgcn_image_gather4_c:
	case Intrinsic::amdgcn_image_gather4_c_cl:
	case Intrinsic::amdgcn_image_gather4_c_l:
	case Intrinsic::amdgcn_image_gather4_c_b:
	case Intrinsic::amdgcn_image_gather4_c_b_cl:
	case Intrinsic::amdgcn_image_gather4_c_lz:

	// Gather4 with offsets
	case Intrinsic::amdgcn_image_gather4_o:
	case Intrinsic::amdgcn_image_gather4_cl_o:
	case Intrinsic::amdgcn_image_gather4_l_o:
	case Intrinsic::amdgcn_image_gather4_b_o:
	case Intrinsic::amdgcn_image_gather4_b_cl_o:
	case Intrinsic::amdgcn_image_gather4_lz_o:

	// Gather4 with comparison and offsets
	case Intrinsic::amdgcn_image_gather4_c_o:
	case Intrinsic::amdgcn_image_gather4_c_cl_o:
	case Intrinsic::amdgcn_image_gather4_c_l_o:
	case Intrinsic::amdgcn_image_gather4_c_b_o:
	case Intrinsic::amdgcn_image_gather4_c_b_cl_o:
	case Intrinsic::amdgcn_image_gather4_c_lz_o: {
	// All image reads above: a dereferenceable load of the call's result type.
	// The pointer value is the image pseudo source value built from argument 1
	// (presumably the image resource operand -- confirm against the intrinsic
	// definitions).
	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
	Info.opc = ISD::INTRINSIC_W_CHAIN;
	Info.memVT = MVT::getVT(CI.getType());
	Info.ptrVal = MFI->getImagePSV(
	*MF.getSubtarget<SISubtarget>().getInstrInfo(),
	CI.getArgOperand(1));
	Info.align = 0;
	Info.flags = MachineMemOperand::MOLoad \|
	MachineMemOperand::MODereferenceable;
	return true;
	}
	// Image stores: the memory type is the type of argument 0 (the stored
	// value) and the image pseudo source value is built from argument 2.
	case Intrinsic::amdgcn_image_store:
	case Intrinsic::amdgcn_image_store_mip: {
	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
	Info.opc = ISD::INTRINSIC_VOID;
	Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
	Info.ptrVal = MFI->getImagePSV(
	*MF.getSubtarget<SISubtarget>().getInstrInfo(),
	CI.getArgOperand(2));
	Info.flags = MachineMemOperand::MOStore \|
	MachineMemOperand::MODereferenceable;
	Info.align = 0;
	return true;
	}
	// Image atomics: load + store, always volatile. Info.align is not assigned
	// in this case.
	case Intrinsic::amdgcn_image_atomic_swap:
	case Intrinsic::amdgcn_image_atomic_add:
	case Intrinsic::amdgcn_image_atomic_sub:
	case Intrinsic::amdgcn_image_atomic_smin:
	case Intrinsic::amdgcn_image_atomic_umin:
	case Intrinsic::amdgcn_image_atomic_smax:
	case Intrinsic::amdgcn_image_atomic_umax:
	case Intrinsic::amdgcn_image_atomic_and:
	case Intrinsic::amdgcn_image_atomic_or:
	case Intrinsic::amdgcn_image_atomic_xor:
	case Intrinsic::amdgcn_image_atomic_inc:
	case Intrinsic::amdgcn_image_atomic_dec: {
	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
	Info.opc = ISD::INTRINSIC_W_CHAIN;
	Info.memVT = MVT::getVT(CI.getType());
	Info.ptrVal = MFI->getImagePSV(
	*MF.getSubtarget<SISubtarget>().getInstrInfo(),
	CI.getArgOperand(2));

	Info.flags = MachineMemOperand::MOLoad \|
	MachineMemOperand::MOStore \|
	MachineMemOperand::MODereferenceable;

	// XXX - Should this be volatile without known ordering?
	Info.flags \|= MachineMemOperand::MOVolatile;
	return true;
	}
	// Image compare-and-swap: same as the other image atomics, except the
	// image pseudo source value is built from argument 3.
	case Intrinsic::amdgcn_image_atomic_cmpswap: {
	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
	Info.opc = ISD::INTRINSIC_W_CHAIN;
	Info.memVT = MVT::getVT(CI.getType());
	Info.ptrVal = MFI->getImagePSV(
	*MF.getSubtarget<SISubtarget>().getInstrInfo(),
	CI.getArgOperand(3));

	Info.flags = MachineMemOperand::MOLoad \|
	MachineMemOperand::MOStore \|
	MachineMemOperand::MODereferenceable;

	// XXX - Should this be volatile without known ordering?
	Info.flags \|= MachineMemOperand::MOVolatile;
	return true;
	}
	// Buffer loads: a dereferenceable load of the call's result type; the
	// buffer pseudo source value is built from argument 0.
	case Intrinsic::amdgcn_tbuffer_load:
	case Intrinsic::amdgcn_buffer_load:
	case Intrinsic::amdgcn_buffer_load_format: {
	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
	Info.opc = ISD::INTRINSIC_W_CHAIN;
	Info.ptrVal = MFI->getBufferPSV(
	*MF.getSubtarget<SISubtarget>().getInstrInfo(),
	CI.getArgOperand(0));
	Info.memVT = MVT::getVT(CI.getType());
	Info.flags = MachineMemOperand::MOLoad \|
	MachineMemOperand::MODereferenceable;

	// There is a constant offset component, but there are additional register
	// offsets which could break AA if we set the offset to anything non-0.
	return true;
	}
	// Buffer stores: the memory type is the type of argument 0 (the stored
	// value); the buffer pseudo source value is built from argument 1.
	case Intrinsic::amdgcn_tbuffer_store:
	case Intrinsic::amdgcn_buffer_store:
	case Intrinsic::amdgcn_buffer_store_format: {
	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
	Info.opc = ISD::INTRINSIC_VOID;
	Info.ptrVal = MFI->getBufferPSV(
	*MF.getSubtarget<SISubtarget>().getInstrInfo(),
	CI.getArgOperand(1));
	Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
	Info.flags = MachineMemOperand::MOStore \|
	MachineMemOperand::MODereferenceable;
	return true;
	}
	// Buffer atomics: volatile load + store; the buffer pseudo source value is
	// built from argument 1.
	case Intrinsic::amdgcn_buffer_atomic_swap:
	case Intrinsic::amdgcn_buffer_atomic_add:
	case Intrinsic::amdgcn_buffer_atomic_sub:
	case Intrinsic::amdgcn_buffer_atomic_smin:
	case Intrinsic::amdgcn_buffer_atomic_umin:
	case Intrinsic::amdgcn_buffer_atomic_smax:
	case Intrinsic::amdgcn_buffer_atomic_umax:
	case Intrinsic::amdgcn_buffer_atomic_and:
	case Intrinsic::amdgcn_buffer_atomic_or:
	case Intrinsic::amdgcn_buffer_atomic_xor: {
	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
	Info.opc = ISD::INTRINSIC_W_CHAIN;
	Info.ptrVal = MFI->getBufferPSV(
	*MF.getSubtarget<SISubtarget>().getInstrInfo(),
	CI.getArgOperand(1));
	Info.memVT = MVT::getVT(CI.getType());
	Info.flags = MachineMemOperand::MOLoad \|
	MachineMemOperand::MOStore \|
	MachineMemOperand::MODereferenceable \|
	MachineMemOperand::MOVolatile;
	return true;
	}
	// Buffer compare-and-swap: same as the other buffer atomics, except the
	// buffer pseudo source value is built from argument 2.
	case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
	Info.opc = ISD::INTRINSIC_W_CHAIN;
	Info.ptrVal = MFI->getBufferPSV(
	*MF.getSubtarget<SISubtarget>().getInstrInfo(),
	CI.getArgOperand(2));
	Info.memVT = MVT::getVT(CI.getType());
	Info.flags = MachineMemOperand::MOLoad \|
	MachineMemOperand::MOStore \|
	MachineMemOperand::MODereferenceable \|
	MachineMemOperand::MOVolatile;
	return true;
	}
	// Not an intrinsic this function describes.
	default:
	return false;
	}
	}

	// For amdgcn_atomic_inc / amdgcn_atomic_dec, append the intrinsic's first
	// argument (the pointer) to Ops, set AccessTy to the intrinsic's result
	// type, and return true. Every other intrinsic returns false and leaves
	// both outputs untouched.
	bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
	                                            SmallVectorImpl<Value*> &Ops,
	                                            Type *&AccessTy) const {
	  const Intrinsic::ID IID = II->getIntrinsicID();
	  if (IID != Intrinsic::amdgcn_atomic_inc &&
	      IID != Intrinsic::amdgcn_atomic_dec)
	    return false;

	  Value *PtrArg = II->getArgOperand(0);
	  AccessTy = II->getType();
	  Ops.push_back(PtrArg);
	  return true;
	}

	// Decide whether AM can be used with flat instructions. A scaled register
	// is never accepted; the immediate offset must be zero unless the subtarget
	// has flat instruction offsets, in which case it must fit in 12 unsigned
	// bits.
	bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
	  const bool HasInstOffsets = Subtarget->hasFlatInstOffsets();

	  if (AM.Scale != 0)
	    return false;

	  // Flat instructions without offset support only take the register
	  // address.
	  if (!HasInstOffsets)
	    return AM.BaseOffs == 0;

	  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
	  // the sign bit is ignored and is treated as a 12-bit unsigned offset, so
	  // only "r + i" with such an offset is accepted.
	  return isUInt<12>(AM.BaseOffs);
	}

	// Decide whether AM can be used for a global memory access, depending on
	// which instruction family the subtarget is expected to use.
	bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
	  // With flat global instructions: unscaled, 13-bit signed offset.
	  if (Subtarget->hasFlatGlobalInsts()) {
	    const bool OffsetFits = isInt<13>(AM.BaseOffs);
	    return OffsetFits && AM.Scale == 0;
	  }

	  // MUBUF rules apply only when addr64 is available and flat-for-global
	  // has not been requested.
	  if (Subtarget->hasAddr64() && !Subtarget->useFlatForGlobal())
	    return isLegalMUBUFAddressingMode(AM);

	  // Assume that we will use FLAT for all global memory accesses
	  // on VI.
	  // FIXME: This assumption is currently wrong. On VI we still use
	  // MUBUF instructions for the r + i addressing mode. As currently
	  // implemented, the MUBUF instructions only work on buffer < 4GB.
	  // It may be possible to support > 4GB buffers with MUBUF instructions,
	  // by setting the stride value in the resource descriptor which would
	  // increase the size limit to (stride * 4GB). However, this is risky,
	  // because it has never been validated.
	  return isLegalFlatAddressingMode(AM);
	}

	// Decide whether AM can be used with MUBUF / MTBUF instructions.
	//
	// These instructions have a 12-bit unsigned byte offset, and additionally
	// can do r + r + i with addr64. 32-bit has more addressing mode options.
	// Depending on the resource constant, it can also do
	// (i64 r0) + (i32 r1) * (i14 i).
	//
	// Private arrays end up using a scratch buffer most of the time, so those
	// are assumed to use MUBUF instructions as well. Scratch loads / stores are
	// currently implemented as mubuf instructions with the offen bit set, so
	// they differ slightly from the normal addr64 form.
	bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
	  // FIXME: Since we can split immediate into soffset and immediate offset,
	  // would it make sense to allow any immediate?
	  if (!isUInt<12>(AM.BaseOffs))
	    return false;

	  // Scale 0: r + i or just i, depending on HasBaseReg.
	  if (AM.Scale == 0)
	    return true;

	  // Scale 1: we have r + r or r + i.
	  if (AM.Scale == 1)
	    return true;

	  // Scale 2: 2 * r is usable as r + r, and 2 * r + i as r + r + i, but
	  // 2 * r + r (a base register on top) is rejected.
	  if (AM.Scale == 2)
	    return !AM.HasBaseReg;

	  // Any other n * r is not allowed.
	  return false;
	}

	// Decide whether addressing mode AM is legal for an access of type Ty in
	// address space AS. A mode with a global base (AM.BaseGV) is always
	// rejected; otherwise the rules depend on the address space: global,
	// private, flat and unknown spaces defer to the matching helper, while
	// constant and local/region spaces check the offset range and scale here.
	// An address space not handled below hits llvm_unreachable. DL is only used
	// to query the store size of Ty; I is not used in this function.
	bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
	const AddrMode &AM, Type *Ty,
	unsigned AS, Instruction *I) const {
	// No global is ever allowed as a base.
	if (AM.BaseGV)
	return false;

	if (AS == AMDGPUASI.GLOBAL_ADDRESS)
	return isLegalGlobalAddressingMode(AM);

	if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
	// If the offset isn't a multiple of 4, it probably isn't going to be
	// correctly aligned.
	// FIXME: Can we get the real alignment here?
	if (AM.BaseOffs % 4 != 0)
	return isLegalMUBUFAddressingMode(AM);

	// There are no SMRD extloads, so if we have to do a small type access we
	// will use a MUBUF load.
	// FIXME?: We also need to do this if unaligned, but we don't know the
	// alignment here.
	if (DL.getTypeStoreSize(Ty) < 4)
	return isLegalGlobalAddressingMode(AM);

	// The permitted offset range depends on the hardware generation.
	if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
	// SMRD instructions have an 8-bit, dword offset on SI.
	if (!isUInt<8>(AM.BaseOffs / 4))
	return false;
	} else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
	// On CI+, this can also be a 32-bit literal constant offset. If it fits
	// in 8-bits, it can use a smaller encoding.
	if (!isUInt<32>(AM.BaseOffs / 4))
	return false;
	} else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
	// On VI, these use the SMEM format and the offset is 20-bit in bytes.
	if (!isUInt<20>(AM.BaseOffs))
	return false;
	} else
	llvm_unreachable("unhandled generation");

	if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
	return true;

	// r + r (a single unscaled index on top of a base register).
	if (AM.Scale == 1 && AM.HasBaseReg)
	return true;

	return false;

	} else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
	return isLegalMUBUFAddressingMode(AM);
	} else if (AS == AMDGPUASI.LOCAL_ADDRESS \|\|
	AS == AMDGPUASI.REGION_ADDRESS) {
	// Basic, single offset DS instructions allow a 16-bit unsigned immediate
	// field.
	// XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
	// an 8-bit dword offset but we don't know the alignment here.
	if (!isUInt<16>(AM.BaseOffs))
	return false;

	if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
	return true;

	// r + r (a single unscaled index on top of a base register).
	if (AM.Scale == 1 && AM.HasBaseReg)
	return true;

	return false;
	} else if (AS == AMDGPUASI.FLAT_ADDRESS \|\|
	AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
	// For an unknown address space, this usually means that this is for some
	// reason being used for pure arithmetic, and not based on some addressing
	// computation. We don't have instructions that compute pointers with any
	// addressing modes, so treat them as having no offset like flat
	// instructions.
	return isLegalFlatAddressingMode(AM);
	} else {
	llvm_unreachable("unhandled address space");
	}
	}

	// Returns whether a merged store of type MemVT is acceptable in address space
	// AS. The size cap is 128 bits for global/flat, 8 * getMaxPrivateElementSize()
	// bits for private, and 64 bits for local; every other address space is
	// accepted unconditionally. DAG is not consulted.
	bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
	                                        const SelectionDAG &DAG) const {
	  const bool IsGlobalOrFlat =
	      AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS;
	  if (IsGlobalOrFlat)
	    return MemVT.getSizeInBits() <= 4 * 32;

	  if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
	    unsigned PrivateLimitBits = 8 * getSubtarget()->getMaxPrivateElementSize();
	    return MemVT.getSizeInBits() <= PrivateLimitBits;
	  }

	  if (AS == AMDGPUASI.LOCAL_ADDRESS)
	    return MemVT.getSizeInBits() <= 2 * 32;

	  return true;
	}

	// Decides whether an access of type VT with alignment Align is permitted in
	// AddrSpace. When IsFast is non-null it is always written: first to false,
	// then overwritten on the paths below that reach a more specific answer.
	bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
	                                                      unsigned AddrSpace,
	                                                      unsigned Align,
	                                                      bool *IsFast) const {
	  if (IsFast)
	    *IsFast = false;

	  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
	  // which isn't a simple VT.
	  // Until MVT is extended to handle this, simply check for the size and
	  // rely on the condition below: allow accesses if the size is a multiple of 4.
	  if (VT == MVT::Other)
	    return false;
	  if (VT.getSizeInBits() > 1024 && VT.getStoreSize() > 16)
	    return false;

	  const bool DwordAligned = (Align % 4 == 0);

	  if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
	      AddrSpace == AMDGPUASI.REGION_ADDRESS) {
	    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
	    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
	    // with adjacent offsets.
	    if (IsFast)
	      *IsFast = DwordAligned;
	    return DwordAligned;
	  }

	  // FIXME: We have to be conservative here and assume that flat operations
	  // will access scratch. If we had access to the IR function, then we
	  // could determine if any private memory was used in the function.
	  if (!Subtarget->hasUnalignedScratchAccess()) {
	    if (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
	        AddrSpace == AMDGPUASI.FLAT_ADDRESS)
	      return false;
	  }

	  if (Subtarget->hasUnalignedBufferAccess()) {
	    // If we have a uniform constant load, it still requires using a slow
	    // buffer instruction if unaligned.
	    if (IsFast) {
	      if (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS)
	        *IsFast = DwordAligned;
	      else
	        *IsFast = true;
	    }
	    return true;
	  }

	  // Smaller than dword value must be aligned.
	  if (VT.bitsLT(MVT::i32))
	    return false;

	  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
	  // byte-address are ignored, thus forcing Dword alignment.
	  // This applies to private, global, and constant memory.
	  if (IsFast)
	    *IsFast = true;

	  return VT.bitsGT(MVT::i32) && DwordAligned;
	}

	// Picks the value type used to expand a memory operation of Size bytes.
	// Only Size and DstAlign are inspected: with a destination aligned to at least
	// 4, v4i32 is chosen for 16 bytes or more and v2i32 for 8 bytes or more.
	// MVT::Other is returned otherwise to request the default choice.
	EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
	                                          unsigned SrcAlign, bool IsMemset,
	                                          bool ZeroMemset,
	                                          bool MemcpyStrSrc,
	                                          MachineFunction &MF) const {
	  // FIXME: Should account for address space here.

	  // The default fallback uses the private pointer size as a guess for a type to
	  // use. Make sure we switch these to 64-bit accesses.
	  const bool DstDwordAligned = DstAlign >= 4;
	  if (DstDwordAligned) {
	    if (Size >= 16) // XXX: Should only do for global
	      return MVT::v4i32;
	    if (Size >= 8)
	      return MVT::v2i32;
	  }

	  // Use the default.
	  return MVT::Other;
	}

	// True when AS is one of the global, flat or constant address spaces of the
	// supplied AMDGPUAS description.
	static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
	  if (AS == AMDGPUASI.GLOBAL_ADDRESS)
	    return true;
	  if (AS == AMDGPUASI.FLAT_ADDRESS)
	    return true;
	  return AS == AMDGPUASI.CONSTANT_ADDRESS;
	}

	// A cast is treated as a no-op only when both the source and the destination
	// address space satisfy isFlatGlobalAddrSpace.
	bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
	                                           unsigned DestAS) const {
	  if (!isFlatGlobalAddrSpace(SrcAS, AMDGPUASI))
	    return false;
	  return isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
	}

	// Returns true when the IR value behind N's memory operand is an Instruction
	// carrying "amdgpu.noclobber" metadata. N must be a MemSDNode (cast<> is used).
	bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
	  const MemSDNode *MemNode = cast<MemSDNode>(N);
	  const Value *Ptr = MemNode->getMemOperand()->getValue();
	  // The memory operand may have no IR value attached, in which case Ptr is
	  // null; dyn_cast<> asserts on a null argument, so use the null-tolerant form.
	  const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
	  return I && I->getMetadata("amdgpu.noclobber");
	}

	// Any cast out of the flat address space is considered cheap; every other cast
	// is cheap only if isNoopAddrSpaceCast says it is a no-op.
	bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
	                                            unsigned DestAS) const {
	  // Flat -> private/local is a simple truncate.
	  // Flat -> global is no-op
	  const bool FromFlat = (SrcAS == AMDGPUASI.FLAT_ADDRESS);
	  return FromFlat || isNoopAddrSpaceCast(SrcAS, DestAS);
	}

	// Forwards the memory operand of N to isUniformMMO and returns its answer.
	// N must be a MemSDNode (cast<> is used, not dyn_cast<>).
	// The "-"/"+" pair below is the hunk of this patch: it changes the qualifier
	// of isUniformMMO from AMDGPU:: to AMDGPUInstrInfo::.
	bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
	const MemSDNode *MemNode = cast<MemSDNode>(N);

	- return AMDGPU::isUniformMMO(MemNode->getMemOperand());
	+ return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
	}

	// Requests TypeSplitVector for vectors with more than one element whose scalar
	// type is at most 16 bits wide; all other types defer to the base class.
	TargetLoweringBase::LegalizeTypeAction
	SITargetLowering::getPreferredVectorAction(EVT VT) const {
	  const bool MultiElement = VT.getVectorNumElements() != 1;
	  if (MultiElement && VT.getScalarType().bitsLE(MVT::i16))
	    return TypeSplitVector;

	  return TargetLoweringBase::getPreferredVectorAction(VT);
	}

	// Unconditionally answers true; neither Imm nor Ty is inspected.
	bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	Type *Ty) const {
	// FIXME: Could be smarter if called for vector constants.
	return true;
	}

	// Reports whether VT is a desirable type for opcode Op.
	//  - i16 on a subtarget with 16-bit instructions: only LOAD, STORE, AND, OR,
	//    XOR and SELECT are desirable.
	//  - i1 SETCC is never desirable.
	//  - Everything else defers to TargetLowering::isTypeDesirableForOp.
	bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
	  const bool Native16Bit = Subtarget->has16BitInsts() && VT == MVT::i16;
	  if (!Native16Bit) {
	    // SimplifySetCC uses this function to determine whether or not it should
	    // create setcc with i1 operands. We don't have instructions for i1 setcc.
	    if (VT == MVT::i1 && Op == ISD::SETCC)
	      return false;

	    return TargetLowering::isTypeDesirableForOp(Op, VT);
	  }

	  switch (Op) {
	  case ISD::LOAD:
	  case ISD::STORE:
	  // These operations are done with 32-bit instructions anyway.
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	  case ISD::SELECT:
	    // TODO: Extensions?
	    return true;
	  default:
	    return false;
	  }
	}

	// Builds the address "kernarg segment pointer + Offset". The base is copied
	// from the live-in virtual register of the preloaded KERNARG_SEGMENT_PTR and
	// the sum is computed in the pointer type of the constant address space.
	SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
	                                                   const SDLoc &SL,
	                                                   SDValue Chain,
	                                                   uint64_t Offset) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

	  // Only the descriptor is used; the register class is fetched alongside it.
	  const ArgDescriptor *InputPtrReg;
	  const TargetRegisterClass *RC;
	  std::tie(InputPtrReg, RC) =
	      Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);

	  const DataLayout &DL = DAG.getDataLayout();
	  MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);

	  MachineRegisterInfo &MRI = MF.getRegInfo();
	  SDValue BasePtr = DAG.getCopyFromReg(
	      Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

	  SDValue OffsetVal = DAG.getConstant(Offset, SL, PtrVT);
	  return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr, OffsetVal);
	}

	// Returns a pointer into the kernarg segment at the offset that
	// getImplicitParameterOffset reports for FIRST_IMPLICIT, chained on the DAG
	// entry node.
	SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
	                                            const SDLoc &SL) const {
	  auto FuncInfo = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
	  const uint64_t ImplicitOffset =
	      getImplicitParameterOffset(FuncInfo, FIRST_IMPLICIT);
	  return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), ImplicitOffset);
	}

	// Converts Val, loaded as MemVT, to the argument type VT.
	// If Arg is given, carries a sext/zext flag and VT is narrower than MemVT, an
	// AssertZext/AssertSext node is inserted first (zext takes precedence).
	// Floating-point MemVT is then fp-extended/truncated; integers are sign- or
	// zero-extended/truncated according to Signed.
	SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
	                                         const SDLoc &SL, SDValue Val,
	                                         bool Signed,
	                                         const ISD::InputArg *Arg) const {
	  const bool HasExtFlag =
	      Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt());
	  if (HasExtFlag && VT.bitsLT(MemVT)) {
	    unsigned AssertOpc =
	        Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
	    Val = DAG.getNode(AssertOpc, SL, MemVT, Val, DAG.getValueType(VT));
	  }

	  if (MemVT.isFloatingPoint())
	    return getFPExtOrFPTrunc(DAG, Val, SL, VT);

	  if (Signed)
	    return DAG.getSExtOrTrunc(Val, SL, VT);

	  return DAG.getZExtOrTrunc(Val, SL, VT);
	}

	// Loads one kernel argument from the kernarg segment.
	// The value is loaded as MemVT from "kernarg pointer + Offset" using the ABI
	// alignment of MemVT's IR type, with the non-temporal, dereferenceable and
	// invariant flags set, and is then converted to VT by convertArgType.
	// Returns a merged pair: {converted value, chain result of the load}.
	SDValue SITargetLowering::lowerKernargMemParameter(
	    SelectionDAG &DAG, EVT VT, EVT MemVT,
	    const SDLoc &SL, SDValue Chain,
	    uint64_t Offset, bool Signed,
	    const ISD::InputArg *Arg) const {
	  const DataLayout &DL = DAG.getDataLayout();
	  // Ty is consumed as a Type pointer below, and the context is passed by
	  // reference, so both dereference markers are required here.
	  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
	  PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
	  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

	  unsigned Align = DL.getABITypeAlignment(Ty);

	  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
	  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
	                             MachineMemOperand::MONonTemporal |
	                             MachineMemOperand::MODereferenceable |
	                             MachineMemOperand::MOInvariant);

	  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
	  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
	}

	// Materialises an argument that was assigned a stack location.
	// A byval argument yields the frame index of a mutable fixed object of its
	// byval size. Any other argument gets an immutable fixed object sized for
	// VA's value type and is loaded from it, with the extension kind and memory
	// type chosen from VA.getLocInfo().
	SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
	                                              const SDLoc &SL, SDValue Chain,
	                                              const ISD::InputArg &Arg) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();

	  if (Arg.Flags.isByVal()) {
	    unsigned ByValSize = Arg.Flags.getByValSize();
	    int ByValFI = MFI.CreateFixedObject(ByValSize, VA.getLocMemOffset(), false);
	    return DAG.getFrameIndex(ByValFI, MVT::i32);
	  }

	  unsigned SlotOffset = VA.getLocMemOffset();
	  unsigned SlotSize = VA.getValVT().getStoreSize();
	  int FI = MFI.CreateFixedObject(SlotSize, SlotOffset, true);

	  // Create load nodes to retrieve arguments from the stack.
	  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);

	  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
	  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
	  MVT MemVT = VA.getValVT();

	  switch (VA.getLocInfo()) {
	  case CCValAssign::BCvt:
	    MemVT = VA.getLocVT();
	    break;
	  case CCValAssign::SExt:
	    ExtType = ISD::SEXTLOAD;
	    break;
	  case CCValAssign::ZExt:
	    ExtType = ISD::ZEXTLOAD;
	    break;
	  case CCValAssign::AExt:
	    ExtType = ISD::EXTLOAD;
	    break;
	  default:
	    break;
	  }

	  return DAG.getExtLoad(ExtType, SL, VA.getLocVT(), Chain, FIN,
	                        MachinePointerInfo::getFixedStack(MF, FI), MemVT);
	}

	// Looks up the descriptor and register class recorded in MFI for the
	// preloaded value PVID and returns a live-in register node of type VT for it.
	SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
	                                            const SIMachineFunctionInfo &MFI,
	                                            EVT VT,
	                                            AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
	  const ArgDescriptor *Desc;
	  const TargetRegisterClass *RegClass;
	  std::tie(Desc, RegClass) = MFI.getPreloadedValue(PVID);

	  return CreateLiveInRegister(DAG, RegClass, Desc->getRegister(), VT);
	}

	// Pre-processes the incoming arguments of a shader.
	//  - Splits:  receives the argument list, with every vector argument expanded
	//             into one entry per element of its original IR parameter type.
	//  - Skipped: bit I is set for each argument of Ins that is dropped.
	//  - Info:    PS input allocation/enable bits are updated as inputs are seen.
	// PSInputNum counts pixel-shader inputs; it advances only for AMDGPU_PS
	// arguments that are neither inreg nor byval, and stops being consulted once
	// it exceeds 15.
	static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
	CallingConv::ID CallConv,
	ArrayRef<ISD::InputArg> Ins,
	BitVector &Skipped,
	FunctionType *FType,
	SIMachineFunctionInfo *Info) {
	for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
	const ISD::InputArg &Arg = Ins[I];

	// First check if it's a PS input addr.
	if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
	!Arg.Flags.isByVal() && PSInputNum <= 15) {

	// An input that is unused and not already allocated is dropped entirely;
	// it still consumes a PS input number.
	if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
	// We can safely skip PS inputs.
	Skipped.set(I);
	++PSInputNum;
	continue;
	}

	// Otherwise the input is allocated, and enabled only if it is used.
	Info->markPSInputAllocated(PSInputNum);
	if (Arg.Used)
	Info->markPSInputEnabled(PSInputNum);

	++PSInputNum;
	}

	// Second split vertices into their elements.
	if (Arg.VT.isVector()) {
	ISD::InputArg NewArg = Arg;
	NewArg.Flags.setSplit();
	NewArg.VT = Arg.VT.getVectorElementType();

	// We REALLY want the ORIGINAL number of vertex elements here, e.g. a
	// three or five element vertex only needs three or five registers,
	// NOT four or eight.
	Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
	unsigned NumElements = ParamType->getVectorNumElements();

	// Each pushed copy carries the running byte offset of its element.
	for (unsigned J = 0; J != NumElements; ++J) {
	Splits.push_back(NewArg);
	NewArg.PartOffset += NewArg.VT.getStoreSize();
	}
	} else {
	Splits.push_back(Arg);
	}
	}
	}

	// Allocate special inputs passed in VGPRs.
	// For each work-item ID that Info reports as present, the fixed register
	// (X -> VGPR0, Y -> VGPR1, Z -> VGPR2) is added as a function live-in, marked
	// allocated in CCInfo, and recorded in Info as a register descriptor.
	static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
	                                           MachineFunction &MF,
	                                           const SIRegisterInfo &TRI,
	                                           SIMachineFunctionInfo &Info) {
	  // Shared steps for one ID: live-in, allocate, wrap in a descriptor.
	  auto ClaimVGPR = [&](unsigned Reg) {
	    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);

	    CCInfo.AllocateReg(Reg);
	    return ArgDescriptor::createRegister(Reg);
	  };

	  if (Info.hasWorkItemIDX())
	    Info.setWorkItemIDX(ClaimVGPR(AMDGPU::VGPR0));

	  if (Info.hasWorkItemIDY())
	    Info.setWorkItemIDY(ClaimVGPR(AMDGPU::VGPR1));

	  if (Info.hasWorkItemIDZ())
	    Info.setWorkItemIDZ(ClaimVGPR(AMDGPU::VGPR2));
	}

	// Try to allocate a VGPR at the end of the argument list, or if no argument
	// VGPRs are left allocating a stack slot.
	// Only the first 32 registers of VGPR_32 are candidates. The result is a
	// register descriptor (the register is also added as a live-in) or a stack
	// descriptor for a 4-byte, 4-byte-aligned slot.
	static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
	  ArrayRef<MCPhysReg> Candidates =
	      makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
	  const unsigned FirstFree = CCInfo.getFirstUnallocated(Candidates);

	  if (FirstFree != Candidates.size()) {
	    unsigned Reg = Candidates[FirstFree];
	    Reg = CCInfo.AllocateReg(Reg);
	    assert(Reg != AMDGPU::NoRegister);

	    MachineFunction &MF = CCInfo.getMachineFunction();
	    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
	    return ArgDescriptor::createRegister(Reg);
	  }

	  // Spill to stack required.
	  int64_t StackOffset = CCInfo.AllocateStack(4, 4);
	  return ArgDescriptor::createStack(StackOffset);
	}

	// Claims the first register among the first 32 entries of RC that CCInfo has
	// not allocated yet, adds it as a live-in of class RC and returns it as a
	// register descriptor. Terminates via report_fatal_error if none is free.
	// NOTE(review): NumArgRegs is never read; the candidate count is the literal
	// 32 for every caller. Confirm whether that is intended.
	static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
	                                             const TargetRegisterClass *RC,
	                                             unsigned NumArgRegs) {
	  ArrayRef<MCPhysReg> Candidates = makeArrayRef(RC->begin(), 32);
	  const unsigned FirstFree = CCInfo.getFirstUnallocated(Candidates);
	  if (FirstFree == Candidates.size())
	    report_fatal_error("ran out of SGPRs for arguments");

	  unsigned Reg = Candidates[FirstFree];
	  Reg = CCInfo.AllocateReg(Reg);
	  assert(Reg != AMDGPU::NoRegister);

	  CCInfo.getMachineFunction().addLiveIn(Reg, RC);
	  return ArgDescriptor::createRegister(Reg);
	}

	// Allocates an input register from SGPR_32 via allocateSGPR32InputImpl,
	// passing 32 as the register count.
	static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
	return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
	}

	// Allocates an input register from SGPR_64 via allocateSGPR32InputImpl,
	// passing 16 as the register count.
	static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
	return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
	}

	// Assigns a location to each work-item ID (X, then Y, then Z) that Info
	// reports as present, using allocateVGPR32Input for each, and records the
	// resulting descriptor in Info. MF and TRI are not used here.
	static void allocateSpecialInputVGPRs(CCState &CCInfo,
	MachineFunction &MF,
	const SIRegisterInfo &TRI,
	SIMachineFunctionInfo &Info) {
	if (Info.hasWorkItemIDX())
	Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));

	if (Info.hasWorkItemIDY())
	Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));

	if (Info.hasWorkItemIDZ())
	Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
	}

	// Assigns SGPRs to the special inputs that Info reports as present and stores
	// each descriptor in Info's argument info. Allocation happens in the order
	// written below: dispatch ptr, queue ptr, kernarg segment ptr, dispatch ID
	// (64-bit each), work-group IDs X/Y/Z (32-bit each), implicit arg ptr (64-bit).
	// MF and TRI are not used here.
	static void allocateSpecialInputSGPRs(CCState &CCInfo,
	MachineFunction &MF,
	const SIRegisterInfo &TRI,
	SIMachineFunctionInfo &Info) {
	auto &ArgInfo = Info.getArgInfo();

	// TODO: Unify handling with private memory pointers.

	if (Info.hasDispatchPtr())
	ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);

	if (Info.hasQueuePtr())
	ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);

	if (Info.hasKernargSegmentPtr())
	ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);

	if (Info.hasDispatchID())
	ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);

	// flat_scratch_init is not applicable for non-kernel functions.

	if (Info.hasWorkGroupIDX())
	ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);

	if (Info.hasWorkGroupIDY())
	ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);

	if (Info.hasWorkGroupIDZ())
	ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);

	if (Info.hasImplicitArgPtr())
	ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
	}

	// Allocate special inputs passed in user SGPRs.
	// For every input that Info reports as present, the register returned by the
	// matching Info.add*() call is added as a function live-in and marked
	// allocated in CCInfo. The inputs are visited in the order written below; the
	// private segment buffer uses the SGPR_128 class, all others SGPR_64.
	static void allocateHSAUserSGPRs(CCState &CCInfo,
	MachineFunction &MF,
	const SIRegisterInfo &TRI,
	SIMachineFunctionInfo &Info) {
	if (Info.hasImplicitBufferPtr()) {
	unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
	MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
	CCInfo.AllocateReg(ImplicitBufferPtrReg);
	}

	// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
	if (Info.hasPrivateSegmentBuffer()) {
	unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
	MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
	CCInfo.AllocateReg(PrivateSegmentBufferReg);
	}

	if (Info.hasDispatchPtr()) {
	unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
	MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
	CCInfo.AllocateReg(DispatchPtrReg);
	}

	if (Info.hasQueuePtr()) {
	unsigned QueuePtrReg = Info.addQueuePtr(TRI);
	MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
	CCInfo.AllocateReg(QueuePtrReg);
	}

	if (Info.hasKernargSegmentPtr()) {
	unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
	MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
	CCInfo.AllocateReg(InputPtrReg);
	}

	if (Info.hasDispatchID()) {
	unsigned DispatchIDReg = Info.addDispatchID(TRI);
	MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
	CCInfo.AllocateReg(DispatchIDReg);
	}

	if (Info.hasFlatScratchInit()) {
	unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
	MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
	CCInfo.AllocateReg(FlatScratchInitReg);
	}

	// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
	// these from the dispatch pointer.
	}

	// Allocate special input registers that are initialized per-wave.
	// Work-group IDs X/Y/Z and the work-group info register are each obtained from
	// the matching Info.add*() call, added as SReg_32_XM0 live-ins and marked
	// allocated in CCInfo. The private segment wave byte offset is handled last
	// and depends on IsShader. CallConv is not used here.
	static void allocateSystemSGPRs(CCState &CCInfo,
	MachineFunction &MF,
	SIMachineFunctionInfo &Info,
	CallingConv::ID CallConv,
	bool IsShader) {
	if (Info.hasWorkGroupIDX()) {
	unsigned Reg = Info.addWorkGroupIDX();
	MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
	CCInfo.AllocateReg(Reg);
	}

	if (Info.hasWorkGroupIDY()) {
	unsigned Reg = Info.addWorkGroupIDY();
	MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
	CCInfo.AllocateReg(Reg);
	}

	if (Info.hasWorkGroupIDZ()) {
	unsigned Reg = Info.addWorkGroupIDZ();
	MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
	CCInfo.AllocateReg(Reg);
	}

	if (Info.hasWorkGroupInfo()) {
	unsigned Reg = Info.addWorkGroupInfo();
	MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
	CCInfo.AllocateReg(Reg);
	}

	if (Info.hasPrivateSegmentWaveByteOffset()) {
	// Scratch wave offset passed in system SGPR.
	unsigned PrivateSegmentWaveByteOffsetReg;

	// Shaders reuse the register already recorded in Info, falling back to the
	// first free SGPR; all other entry points take a fresh one from Info.
	if (IsShader) {
	PrivateSegmentWaveByteOffsetReg =
	Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

	// This is true if the scratch wave byte offset doesn't have a fixed
	// location.
	if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
	PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
	Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
	}
	} else
	PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

	MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
	CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
	}
	}

	// Chooses the scratch resource register and the scratch wave offset register
	// for MF and records them in Info (setScratchRSrcReg / setScratchWaveOffsetReg).
	// The choice depends on whether the function has stack objects or calls, on
	// the optimisation level (CodeGenOpt::None is treated as having stack
	// objects), and on whether the subtarget reports the code object V2 ABI.
	// Also sets Info's has-non-spill-stack-objects flag when stack objects exist.
	static void reservePrivateMemoryRegs(const TargetMachine &TM,
	MachineFunction &MF,
	const SIRegisterInfo &TRI,
	SIMachineFunctionInfo &Info) {
	// Now that we've figured out where the scratch register inputs are, see if
	// we should reserve the arguments and use them directly.
	MachineFrameInfo &MFI = MF.getFrameInfo();
	bool HasStackObjects = MFI.hasStackObjects();

	// Record that we know we have non-spill stack objects so we don't need to
	// check all stack objects later.
	if (HasStackObjects)
	Info.setHasNonSpillStackObjects(true);

	// Everything live out of a block is spilled with fast regalloc, so it's
	// almost certain that spilling will be required.
	if (TM.getOptLevel() == CodeGenOpt::None)
	HasStackObjects = true;

	// For now assume stack access is needed in any callee functions, so we need
	// the scratch registers to pass in.
	bool RequiresStackAccess = HasStackObjects \|\| MFI.hasCalls();

	const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
	if (ST.isAmdCodeObjectV2(MF)) {
	if (RequiresStackAccess) {
	// If we have stack objects, we unquestionably need the private buffer
	// resource. For the Code Object V2 ABI, this will be the first 4 user
	// SGPR inputs. We can reserve those and use them directly.

	unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
	AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
	Info.setScratchRSrcReg(PrivateSegmentBufferReg);

	if (MFI.hasCalls()) {
	// If we have calls, we need to keep the frame register in a register
	// that won't be clobbered by a call, so ensure it is copied somewhere.

	// This is not a problem for the scratch wave offset, because the same
	// registers are reserved in all functions.

	// FIXME: Nothing is really ensuring this is a call preserved register,
	// it's just selected from the end so it happens to be.
	unsigned ReservedOffsetReg
	= TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
	Info.setScratchWaveOffsetReg(ReservedOffsetReg);
	} else {
	unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
	AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
	Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
	}
	} else {
	unsigned ReservedBufferReg
	= TRI.reservedPrivateSegmentBufferReg(MF);
	unsigned ReservedOffsetReg
	= TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);

	// We tentatively reserve the last registers (skipping the last two
	// which may contain VCC). After register allocation, we'll replace
	// these with the ones immediately after those which were really
	// allocated. In the prologue copies will be inserted from the argument
	// to these reserved registers.
	Info.setScratchRSrcReg(ReservedBufferReg);
	Info.setScratchWaveOffsetReg(ReservedOffsetReg);
	}
	} else {
	unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

	// Without HSA, relocations are used for the scratch pointer and the
	// buffer resource setup is always inserted in the prologue. Scratch wave
	// offset is still in an input SGPR.
	Info.setScratchRSrcReg(ReservedBufferReg);

	// Note that this branch tests HasStackObjects, not RequiresStackAccess.
	if (HasStackObjects && !MFI.hasCalls()) {
	unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
	AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
	Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
	} else {
	unsigned ReservedOffsetReg
	= TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
	Info.setScratchWaveOffsetReg(ReservedOffsetReg);
	}
	}
	}

	// Split CSR handling is supported for every function that is not an entry
	// function according to its SIMachineFunctionInfo.
	bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
	  const bool IsEntry = MF->getInfo<SIMachineFunctionInfo>()->isEntryFunction();
	  return !IsEntry;
	}

	// Intentionally empty: no work is done on Entry here.
	void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {

	}

	// Saves and restores the callee-saved registers that are handled via copies.
	// For every register in the list returned by getCalleeSavedRegsViaCopy, a COPY
	// into a fresh virtual register is inserted at the start of Entry and a COPY
	// back is inserted before the first terminator of each block in Exits.
	// Does nothing when the register list is null. Only registers contained in
	// SReg_64 or SReg_32 are expected; anything else is llvm_unreachable.
	void SITargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // I walks the register array by pointer; the walk stops at the first zero
	  // entry, so both the declaration and the condition must dereference.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    const TargetRegisterClass *RC = nullptr;
	    if (AMDGPU::SReg_64RegClass.contains(*I))
	      RC = &AMDGPU::SGPR_64RegClass;
	    else if (AMDGPU::SReg_32RegClass.contains(*I))
	      RC = &AMDGPU::SGPR_32RegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    unsigned NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	// Lowers the incoming formal arguments in Ins to SDValues appended to InVals
	// and returns the chain to continue from.
	//
	// Outline:
	//  - Graphics shaders on an HSA OS are diagnosed as unsupported and lowering
	//    stops with the entry node.
	//  - Shader arguments are pre-processed by processShaderInputArgs; for
	//    AMDGPU_PS a PS input is force-enabled when none would be.
	//  - Entry functions get their special VGPR and user SGPR inputs allocated
	//    before the regular arguments are assigned locations.
	//  - Each argument is then materialised: entry-function memory arguments are
	//    loaded from the kernarg segment, other memory arguments from the stack,
	//    and register arguments are copied from live-in registers and adjusted
	//    according to their CCValAssign::LocInfo. Skipped arguments become UNDEF.
	//  - Finally the remaining special inputs / system SGPRs are allocated, the
	//    argument info is published to AMDGPUArgumentUsageInfo and the stack
	//    argument area size is recorded in the function info.
	//
	// Returns Chain unchanged when no load was created, otherwise a TokenFactor of
	// the chains of all created loads.
	SDValue SITargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();

	  MachineFunction &MF = DAG.getMachineFunction();
	  FunctionType *FType = MF.getFunction().getFunctionType();
	  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
	  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

	  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
	    const Function &Fn = MF.getFunction();
	    DiagnosticInfoUnsupported NoGraphicsHSA(
	        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
	    DAG.getContext()->diagnose(NoGraphicsHSA);
	    return DAG.getEntryNode();
	  }

	  // Create stack objects that are used for emitting debugger prologue if
	  // "amdgpu-debugger-emit-prologue" attribute was specified.
	  if (ST.debuggerEmitPrologue())
	    createDebuggerPrologueStackObjects(MF);

	  SmallVector<ISD::InputArg, 16> Splits;
	  SmallVector<CCValAssign, 16> ArgLocs;
	  BitVector Skipped(Ins.size());
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());

	  bool IsShader = AMDGPU::isShader(CallConv);
	  bool IsKernel = AMDGPU::isKernel(CallConv);
	  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);

	  if (!IsEntryFunc) {
	    // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
	    // this when allocating argument fixed offsets.
	    CCInfo.AllocateStack(4, 4);
	  }

	  if (IsShader) {
	    processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);

	    // At least one interpolation mode must be enabled or else the GPU will
	    // hang.
	    //
	    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
	    // set PSInputAddr, the user wants to enable some bits after the compilation
	    // based on run-time states. Since we can't know what the final PSInputEna
	    // will look like, we shouldn't do anything here and the user should take
	    // responsibility for the correct programming.
	    //
	    // Otherwise, the following restrictions apply:
	    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
	    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
	    //   enabled too.
	    if (CallConv == CallingConv::AMDGPU_PS) {
	      if ((Info->getPSInputAddr() & 0x7F) == 0 ||
	          ((Info->getPSInputAddr() & 0xF) == 0 &&
	           Info->isPSInputAllocated(11))) {
	        CCInfo.AllocateReg(AMDGPU::VGPR0);
	        CCInfo.AllocateReg(AMDGPU::VGPR1);
	        Info->markPSInputAllocated(0);
	        Info->markPSInputEnabled(0);
	      }
	      if (Subtarget->isAmdPalOS()) {
	        // For isAmdPalOS, the user does not enable some bits after compilation
	        // based on run-time states; the register values being generated here are
	        // the final ones set in hardware. Therefore we need to apply the
	        // workaround to PSInputAddr and PSInputEnable together. (The case where
	        // a bit is set in PSInputAddr but not PSInputEnable is where the
	        // frontend set up an input arg for a particular interpolation mode, but
	        // nothing uses that input arg. Really we should have an earlier pass
	        // that removes such an arg.)
	        unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
	        if ((PsInputBits & 0x7F) == 0 ||
	            ((PsInputBits & 0xF) == 0 &&
	             (PsInputBits >> 11 & 1)))
	          Info->markPSInputEnabled(
	              countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
	      }
	    }

	    assert(!Info->hasDispatchPtr() &&
	           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
	           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
	           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
	           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
	           !Info->hasWorkItemIDZ());
	  } else if (IsKernel) {
	    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
	  } else {
	    Splits.append(Ins.begin(), Ins.end());
	  }

	  if (IsEntryFunc) {
	    // Both helpers take the register info and function info by reference.
	    allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
	    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
	  }

	  if (IsKernel) {
	    analyzeFormalArgumentsCompute(CCInfo, Ins);
	  } else {
	    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
	    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
	  }

	  SmallVector<SDValue, 16> Chains;

	  // ArgIdx walks ArgLocs; it advances independently of i because skipped
	  // arguments have no location and shader vectors consume several.
	  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
	    const ISD::InputArg &Arg = Ins[i];
	    if (Skipped[i]) {
	      InVals.push_back(DAG.getUNDEF(Arg.VT));
	      continue;
	    }

	    CCValAssign &VA = ArgLocs[ArgIdx++];
	    MVT VT = VA.getLocVT();

	    if (IsEntryFunc && VA.isMemLoc()) {
	      VT = Ins[i].VT;
	      EVT MemVT = VA.getLocVT();

	      const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
	                              VA.getLocMemOffset();
	      Info->setABIArgOffset(Offset + MemVT.getStoreSize());

	      // The first 36 bytes of the input buffer contains information about
	      // thread group and global sizes.
	      SDValue Arg = lowerKernargMemParameter(
	          DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
	      Chains.push_back(Arg.getValue(1));

	      auto *ParamTy =
	          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
	      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
	          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
	        // On SI local pointers are just offsets into LDS, so they are always
	        // less than 16-bits. On CI and newer they could potentially be
	        // real pointers, so we can't guarantee their size.
	        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
	                          DAG.getValueType(MVT::i16));
	      }

	      InVals.push_back(Arg);
	      continue;
	    } else if (!IsEntryFunc && VA.isMemLoc()) {
	      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
	      InVals.push_back(Val);
	      if (!Arg.Flags.isByVal())
	        Chains.push_back(Val.getValue(1));
	      continue;
	    }

	    assert(VA.isRegLoc() && "Parameter must be in a register!");

	    unsigned Reg = VA.getLocReg();
	    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
	    EVT ValVT = VA.getValVT();

	    Reg = MF.addLiveIn(Reg, RC);
	    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

	    if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
	      // The return object should be reasonably addressable.

	      // FIXME: This helps when the return is a real sret. If it is a
	      // automatically inserted sret (i.e. CanLowerReturn returns false), an
	      // extra copy is inserted in SelectionDAGBuilder which obscures this.
	      unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
	      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
	          DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
	    }

	    // If this is an 8 or 16-bit value, it is really passed promoted
	    // to 32 bits. Insert an assert[sz]ext to capture this, then
	    // truncate to the right size.
	    switch (VA.getLocInfo()) {
	    case CCValAssign::Full:
	      break;
	    case CCValAssign::BCvt:
	      Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
	      break;
	    case CCValAssign::SExt:
	      Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
	                        DAG.getValueType(ValVT));
	      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
	      break;
	    case CCValAssign::ZExt:
	      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
	                        DAG.getValueType(ValVT));
	      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
	      break;
	    case CCValAssign::AExt:
	      Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
	      break;
	    default:
	      llvm_unreachable("Unknown loc info!");
	    }

	    if (IsShader && Arg.VT.isVector()) {
	      // Build a vector from the registers
	      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
	      unsigned NumElements = ParamType->getVectorNumElements();

	      SmallVector<SDValue, 4> Regs;
	      Regs.push_back(Val);
	      for (unsigned j = 1; j != NumElements; ++j) {
	        Reg = ArgLocs[ArgIdx++].getLocReg();
	        Reg = MF.addLiveIn(Reg, RC);

	        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
	        Regs.push_back(Copy);
	      }

	      // Fill up the missing vector elements
	      NumElements = Arg.VT.getVectorNumElements() - NumElements;
	      Regs.append(NumElements, DAG.getUNDEF(VT));

	      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
	      continue;
	    }

	    InVals.push_back(Val);
	  }

	  if (!IsEntryFunc) {
	    // Special inputs come after user arguments.
	    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
	  }

	  // Start adding system SGPRs.
	  if (IsEntryFunc) {
	    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
	  } else {
	    CCInfo.AllocateReg(Info->getScratchRSrcReg());
	    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
	    CCInfo.AllocateReg(Info->getFrameOffsetReg());
	    allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
	  }

	  auto &ArgUsageInfo =
	      DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
	  ArgUsageInfo.setFuncArgInfo(MF.getFunction(), Info->getArgInfo());

	  unsigned StackArgSize = CCInfo.getNextStackOffset();
	  Info->setBytesInStackArgArea(StackArgSize);

	  return Chains.empty() ? Chain :
	    DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
	}

	// TODO: If return values can't fit in registers, we should return as many as
	// possible in registers before passing on stack.
	//
	// Entry-function calling conventions always answer true. For every other
	// convention the answer is whatever CCState::CheckReturn reports for Outs
	// under the return assignment function of CallConv.
	bool SITargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv,
	    MachineFunction &MF, bool IsVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    LLVMContext &Context) const {
	  // Replacing returns with sret/stack usage doesn't make sense for shaders.
	  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
	  // for shaders. Vector types should be explicitly handled by CC.
	  if (!AMDGPU::isEntryFunctionCC(CallConv)) {
	    SmallVector<CCValAssign, 16> RVLocs;
	    CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
	    return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
	  }

	  return true;
	}

	/// Lower a return of \p OutVals (described by \p Outs) into a target return
	/// node.
	///
	/// - Kernel calling conventions are delegated to
	///   AMDGPUTargetLowering::LowerReturn.
	/// - For shaders, vector outputs are split into one scalar output per
	///   original element before the calling convention is applied.
	/// - Non-entry functions additionally copy the return address into its
	///   physical register and list the registers from
	///   getCalleeSavedRegsViaCopy as operands of the return node.
	///
	/// The resulting node is ENDPGM for a shader that returns nothing,
	/// RETURN_TO_EPILOG for any other shader, and RET_FLAG otherwise.
	SDValue
	SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SDLoc &DL, SelectionDAG &DAG) const {
	MachineFunction &MF = DAG.getMachineFunction();
	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

	// Kernels use the generic AMDGPU return lowering.
	if (AMDGPU::isKernel(CallConv)) {
	return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
	OutVals, DL, DAG);
	}

	bool IsShader = AMDGPU::isShader(CallConv);

	// Record on the function info whether anything is returned; a shader that
	// returns nothing ends the wave instead of returning.
	Info->setIfReturnsVoid(Outs.size() == 0);
	bool IsWaveEnd = Info->returnsVoid() && IsShader;

	SmallVector<ISD::OutputArg, 48> Splits;
	SmallVector<SDValue, 48> SplitVals;

	// Split vectors into their elements.
	for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
	const ISD::OutputArg &Out = Outs[i];

	if (IsShader && Out.VT.isVector()) {
	MVT VT = Out.VT.getVectorElementType();
	ISD::OutputArg NewOut = Out;
	NewOut.Flags.setSplit();
	NewOut.VT = VT;

	// We want the original number of vector elements here, e.g.
	// three or five, not four or eight.
	unsigned NumElements = Out.ArgVT.getVectorNumElements();

	// Emit one scalar output per element, advancing PartOffset by the element
	// store size after each one.
	for (unsigned j = 0; j != NumElements; ++j) {
	SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
	DAG.getConstant(j, DL, MVT::i32));
	SplitVals.push_back(Elem);
	Splits.push_back(NewOut);
	NewOut.PartOffset += NewOut.VT.getStoreSize();
	}
	} else {
	SplitVals.push_back(OutVals[i]);
	Splits.push_back(Out);
	}
	}

	// CCValAssign - represent the assignment of the return value to a location.
	SmallVector<CCValAssign, 48> RVLocs;

	// CCState - Info about the registers and stack slots.
	CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	*DAG.getContext());

	// Analyze outgoing return values.
	CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));

	SDValue Flag;
	SmallVector<SDValue, 48> RetOps;
	RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

	// Add return address for callable functions.
	if (!Info->isEntryFunction()) {
	const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
	SDValue ReturnAddrReg = CreateLiveInRegister(
	DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

	// FIXME: Should be able to use a vreg here, but need a way to prevent it
	// from being allocated to a CSR.

	SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
	MVT::i64);

	Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
	Flag = Chain.getValue(1);

	RetOps.push_back(PhysReturnAddrReg);
	}

	// Copy the result values into the output registers.
	// Note: realRVLocIdx is incremented in lock step with i, so both indices
	// always hold the same value in this loop.
	for (unsigned i = 0, realRVLocIdx = 0;
	i != RVLocs.size();
	++i, ++realRVLocIdx) {
	CCValAssign &VA = RVLocs[i];
	assert(VA.isRegLoc() && "Can only return in registers!");
	// TODO: Partially return in registers if return values don't fit.

	SDValue Arg = SplitVals[realRVLocIdx];

	// Copied from other backends.
	switch (VA.getLocInfo()) {
	case CCValAssign::Full:
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::AExt:
	Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	default:
	llvm_unreachable("Unknown loc info!");
	}

	// Thread the copy through both the chain and the glue value so the copies
	// stay attached to the return node built below.
	Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
	}

	// FIXME: Does sret work properly?
	if (!Info->isEntryFunction()) {
	const SIRegisterInfo *TRI
	= static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
	const MCPhysReg *I =
	TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
	// The list may be null; otherwise it is walked until a zero entry is found.
	if (I) {
	for (; *I; ++I) {
	if (AMDGPU::SReg_64RegClass.contains(*I))
	RetOps.push_back(DAG.getRegister(*I, MVT::i64));
	else if (AMDGPU::SReg_32RegClass.contains(*I))
	RetOps.push_back(DAG.getRegister(*I, MVT::i32));
	else
	llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	}
	}
	}

	// Update chain and glue.
	RetOps[0] = Chain;
	if (Flag.getNode())
	RetOps.push_back(Flag);

	// Default to ending the program; only a non-wave-end return picks a real
	// return opcode.
	unsigned Opc = AMDGPUISD::ENDPGM;
	if (!IsWaveEnd)
	Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
	return DAG.getNode(Opc, DL, MVT::Other, RetOps);
	}

	/// Copy the values returned by a call out of their assigned physical
	/// registers, convert each one back to its value type, and append the results
	/// to \p InVals.
	///
	/// \p Chain and \p InFlag are threaded through every register copy.
	/// Results assigned to memory are not supported and abort with a fatal error.
	/// \returns the chain after the last copy.
	SDValue SITargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
	    SDValue ThisVal) const {
	  CCAssignFn *AssignResult = CCAssignFnForReturn(CallConv, IsVarArg);

	  // Work out where the calling convention places each returned value.
	  SmallVector<CCValAssign, 16> ResultLocs;
	  CCState State(CallConv, IsVarArg, DAG.getMachineFunction(), ResultLocs,
	                *DAG.getContext());
	  State.AnalyzeCallResult(Ins, AssignResult);

	  for (const CCValAssign &Loc : ResultLocs) {
	    // Only register results can be handled.
	    if (!Loc.isRegLoc()) {
	      if (Loc.isMemLoc())
	        report_fatal_error("TODO: return values in memory");
	      llvm_unreachable("unknown argument location type");
	    }

	    // Read the physreg; results 1 and 2 of the copy are the new chain and
	    // glue.
	    SDValue Result = DAG.getCopyFromReg(Chain, DL, Loc.getLocReg(),
	                                        Loc.getLocVT(), InFlag);
	    Chain = Result.getValue(1);
	    InFlag = Result.getValue(2);

	    // Undo whatever promotion the calling convention applied.
	    switch (Loc.getLocInfo()) {
	    case CCValAssign::Full:
	      break;
	    case CCValAssign::BCvt:
	      Result = DAG.getNode(ISD::BITCAST, DL, Loc.getValVT(), Result);
	      break;
	    case CCValAssign::ZExt: {
	      SDValue Asserted =
	          DAG.getNode(ISD::AssertZext, DL, Loc.getLocVT(), Result,
	                      DAG.getValueType(Loc.getValVT()));
	      Result = DAG.getNode(ISD::TRUNCATE, DL, Loc.getValVT(), Asserted);
	      break;
	    }
	    case CCValAssign::SExt: {
	      SDValue Asserted =
	          DAG.getNode(ISD::AssertSext, DL, Loc.getLocVT(), Result,
	                      DAG.getValueType(Loc.getValVT()));
	      Result = DAG.getNode(ISD::TRUNCATE, DL, Loc.getValVT(), Asserted);
	      break;
	    }
	    case CCValAssign::AExt:
	      Result = DAG.getNode(ISD::TRUNCATE, DL, Loc.getValVT(), Result);
	      break;
	    default:
	      llvm_unreachable("Unknown loc info!");
	    }

	    InVals.push_back(Result);
	  }

	  return Chain;
	}

	/// Pass the special (implicit) inputs the callee uses, in addition to the
	/// explicit user arguments present in the IR.
	///
	/// For every preloaded value the callee's argument info asks for, the
	/// caller's copy is loaded (or, for the implicit arg pointer when the caller
	/// has no incoming copy, obtained from getImplicitArgPtr). It is then either
	/// queued in \p RegsToPass or stored to the outgoing stack area, with the
	/// store appended to \p MemOpChains.
	void SITargetLowering::passSpecialInputs(
	    CallLoweringInfo &CLI,
	    const SIMachineFunctionInfo &Info,
	    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
	    SmallVectorImpl<SDValue> &MemOpChains,
	    SDValue Chain,
	    SDValue StackPtr) const {
	  // If we don't have a call site, this was a call inserted by
	  // legalization. These can never use special inputs.
	  if (!CLI.CS)
	    return;

	  const Function *Target = CLI.CS.getCalledFunction();
	  assert(Target);

	  SelectionDAG &DAG = CLI.DAG;
	  const SDLoc &DL = CLI.DL;

	  const SISubtarget *ST = getSubtarget();
	  const SIRegisterInfo *TRI = ST->getRegisterInfo();

	  auto &UsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
	  const AMDGPUFunctionArgInfo &CalleeInfo =
	      UsageInfo.lookupFuncArgInfo(*Target);
	  const AMDGPUFunctionArgInfo &CallerInfo = Info.getArgInfo();

	  // TODO: Unify with private memory register handling. This is complicated by
	  // the fact that at least in kernels, the input argument is not necessarily
	  // in the same location as the input.
	  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
	    AMDGPUFunctionArgInfo::DISPATCH_PTR,
	    AMDGPUFunctionArgInfo::QUEUE_PTR,
	    AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
	    AMDGPUFunctionArgInfo::DISPATCH_ID,
	    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
	    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
	    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
	    AMDGPUFunctionArgInfo::WORKITEM_ID_X,
	    AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
	    AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
	    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
	  };

	  for (auto InputID : InputRegs) {
	    // Where (if anywhere) the callee expects this input.
	    const ArgDescriptor *CalleeSlot;
	    const TargetRegisterClass *ArgRC;
	    std::tie(CalleeSlot, ArgRC) = CalleeInfo.getPreloadedValue(InputID);
	    if (!CalleeSlot)
	      continue;

	    // Where (if anywhere) the caller received the same input.
	    const ArgDescriptor *CallerSlot;
	    const TargetRegisterClass *CallerRC;
	    std::tie(CallerSlot, CallerRC) = CallerInfo.getPreloadedValue(InputID);
	    assert(CallerRC == ArgRC);

	    // All special arguments are ints for now.
	    EVT ArgVT;
	    if (TRI->getSpillSize(*ArgRC) == 8)
	      ArgVT = MVT::i64;
	    else
	      ArgVT = MVT::i32;

	    SDValue Forwarded;
	    if (!CallerSlot) {
	      // The implicit arg ptr is special because it doesn't have a
	      // corresponding input for kernels, and is computed from the kernarg
	      // segment pointer.
	      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
	      Forwarded = getImplicitArgPtr(DAG, DL);
	    } else {
	      Forwarded = loadInputValue(DAG, ArgRC, ArgVT, DL, *CallerSlot);
	    }

	    if (!CalleeSlot->isRegister()) {
	      // Stack-passed input: store it at the callee's expected offset.
	      SDValue SlotStore =
	          storeStackInputValue(DAG, DL, Chain, StackPtr, Forwarded,
	                               CalleeSlot->getStackOffset());
	      MemOpChains.push_back(SlotStore);
	      continue;
	    }

	    RegsToPass.emplace_back(CalleeSlot->getRegister(), Forwarded);
	  }
	}

	/// Return true if \p CC is the fast calling convention, the only convention
	/// this predicate accepts.
	static bool canGuaranteeTCO(CallingConv::ID CC) {
	  const bool IsFastCC = (CallingConv::Fast == CC);
	  return IsFastCC;
	}

	/// Return true if we might ever do TCO for calls with this calling convention:
	/// that is, for the C convention and for any convention accepted by
	/// canGuaranteeTCO.
	static bool mayTailCallThisCC(CallingConv::ID CC) {
	  if (CC == CallingConv::C)
	    return true;

	  return canGuaranteeTCO(CC);
	}

	/// Decide whether a call to a function using \p CalleeCC may be emitted as a
	/// tail call from the current function.
	///
	/// The checks run in order and the first failing one rejects the call:
	/// callee convention, caller has a call-preserved mask, guaranteed-TCO mode,
	/// varargs, byval caller arguments, result compatibility, preserved-register
	/// compatibility, stack argument size, and finally parametersInCSRMatch.
	bool SITargetLowering::isEligibleForTailCallOptimization(
	    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    const SmallVectorImpl<SDValue> &OutVals,
	    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
	  if (!mayTailCallThisCC(CalleeCC))
	    return false;

	  MachineFunction &MF = DAG.getMachineFunction();
	  const Function &Caller = MF.getFunction();
	  const CallingConv::ID CallerCC = Caller.getCallingConv();
	  const SIRegisterInfo *RegInfo = getSubtarget()->getRegisterInfo();
	  const uint32_t *CallerMask = RegInfo->getCallPreservedMask(MF, CallerCC);

	  // Kernels aren't callable, and don't have a live in return address so it
	  // doesn't make sense to do a tail call with entry functions.
	  if (CallerMask == nullptr)
	    return false;

	  const bool SameCC = (CallerCC == CalleeCC);

	  // In guaranteed-TCO mode the answer depends only on the conventions.
	  if (DAG.getTarget().Options.GuaranteedTailCallOpt)
	    return canGuaranteeTCO(CalleeCC) && SameCC;

	  // TODO: Can we handle var args?
	  if (IsVarArg)
	    return false;

	  // Any byval argument of the caller rules out a tail call.
	  for (auto AI = Caller.arg_begin(), AE = Caller.arg_end(); AI != AE; ++AI) {
	    if (AI->hasByValAttr())
	      return false;
	  }

	  LLVMContext &Ctx = *DAG.getContext();

	  // Check that the call results are passed in the same way.
	  const bool SameResults = CCState::resultsCompatible(
	      CalleeCC, CallerCC, MF, Ctx, Ins,
	      CCAssignFnForCall(CalleeCC, IsVarArg),
	      CCAssignFnForCall(CallerCC, IsVarArg));
	  if (!SameResults)
	    return false;

	  // The callee has to preserve all registers the caller needs to preserve.
	  if (!SameCC) {
	    const uint32_t *CalleeMask = RegInfo->getCallPreservedMask(MF, CalleeCC);
	    const bool CalleeKeepsEnough =
	        RegInfo->regmaskSubsetEqual(CallerMask, CalleeMask);
	    if (!CalleeKeepsEnough)
	      return false;
	  }

	  // Nothing more to check if the callee is taking no arguments.
	  if (Outs.empty())
	    return true;

	  SmallVector<CCValAssign, 16> OperandLocs;
	  CCState State(CalleeCC, IsVarArg, MF, OperandLocs, Ctx);
	  State.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));

	  // If the stack arguments for this call do not fit into our own save area
	  // then the call cannot be made tail.
	  // TODO: Is this really necessary?
	  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
	  const bool FitsInArgArea =
	      !(State.getNextStackOffset() > FuncInfo->getBytesInStackArgArea());
	  if (!FitsInArgArea)
	    return false;

	  const MachineRegisterInfo &MRI = MF.getRegInfo();
	  return parametersInCSRMatch(MRI, CallerMask, OperandLocs, OutVals);
	}

	bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	if (!CI->isTailCall())
	return false;

	const Function *ParentFn = CI->getParent()->getParent();
	if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
	return false;

	auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
	return (Attr.getValueAsString() != "true");
	}

	// The wave scratch offset register is used as the global base pointer.
	//
	/// Lower the outgoing call described by \p CLI into either an AMDGPUISD::CALL
	/// node followed by the result copies (appended to \p InVals), or an
	/// AMDGPUISD::TC_RETURN node when the call is emitted as a tail call.
	///
	/// Variadic calls, indirect calls and required (guaranteed) tail calls are
	/// routed to lowerUnhandledCall. Calls without an IR call site (inserted by
	/// legalization) are rejected with a fatal error, because the callee lookup
	/// below needs a real call site.
	///
	/// \returns the chain produced by the lowered call sequence.
	SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
	                                    SmallVectorImpl<SDValue> &InVals) const {
	  SelectionDAG &DAG = CLI.DAG;
	  const SDLoc &DL = CLI.DL;
	  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
	  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
	  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
	  SDValue Chain = CLI.Chain;
	  SDValue Callee = CLI.Callee;
	  bool &IsTailCall = CLI.IsTailCall;
	  CallingConv::ID CallConv = CLI.CallConv;
	  bool IsVarArg = CLI.IsVarArg;
	  bool IsSibCall = false;
	  bool IsThisReturn = false;
	  MachineFunction &MF = DAG.getMachineFunction();

	  if (IsVarArg) {
	    return lowerUnhandledCall(CLI, InVals,
	                              "unsupported call to variadic function ");
	  }

	  // Calls created by legalization carry no call site. Querying the called
	  // function on an empty call site is invalid, so fail cleanly instead.
	  if (!CLI.CS)
	    report_fatal_error("unsupported libcall legalization");

	  if (!CLI.CS.getCalledFunction()) {
	    return lowerUnhandledCall(CLI, InVals,
	                              "unsupported indirect call to function ");
	  }

	  if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
	    return lowerUnhandledCall(CLI, InVals,
	                              "unsupported required tail call to function ");
	  }

	  // The first 4 bytes are reserved for the callee's emergency stack slot.
	  const unsigned CalleeUsableStackOffset = 4;

	  if (IsTailCall) {
	    IsTailCall = isEligibleForTailCallOptimization(
	      Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
	    if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
	      report_fatal_error("failed to perform tail call elimination on a call "
	                         "site marked musttail");
	    }

	    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;

	    // A sibling call is one where we're under the usual C ABI and not planning
	    // to change that but can still do a tail call:
	    if (!TailCallOpt && IsTailCall)
	      IsSibCall = true;

	    if (IsTailCall)
	      ++NumTailCalls;
	  }

	  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
	    // FIXME: Remove this hack for function pointer types after removing
	    // support of old address space mapping. In the new address space
	    // mapping the pointer in default address space is 64 bit, therefore
	    // does not need this hack.
	    if (Callee.getValueType() == MVT::i32) {
	      const GlobalValue *GV = GA->getGlobal();
	      Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(), false,
	                                    GA->getTargetFlags());
	    }
	  }
	  assert(Callee.getValueType() == MVT::i64);

	  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

	  // Analyze operands of the call, assigning locations to each operand.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
	  CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
	  CCInfo.AnalyzeCallOperands(Outs, AssignFn);

	  // Get a count of how many bytes are to be pushed on the stack.
	  unsigned NumBytes = CCInfo.getNextStackOffset();

	  if (IsSibCall) {
	    // Since we're not changing the ABI to make this a tail call, the memory
	    // operands are already available in the caller's incoming argument space.
	    NumBytes = 0;
	  }

	  // FPDiff is the byte offset of the call's argument area from the callee's.
	  // Stores to callee stack arguments will be placed in FixedStackSlots offset
	  // by this amount for a tail call. In a sibling call it must be 0 because the
	  // caller will deallocate the entire stack and the callee still expects its
	  // arguments to begin at SP+0. Completely unused for non-tail calls.
	  int32_t FPDiff = 0;
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

	  SDValue CallerSavedFP;

	  // Adjust the stack pointer for the new arguments...
	  // These operations are automatically eliminated by the prolog/epilog pass
	  if (!IsSibCall) {
	    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);

	    unsigned OffsetReg = Info->getScratchWaveOffsetReg();

	    // In the HSA case, this should be an identity copy.
	    SDValue ScratchRSrcReg
	      = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
	    RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);

	    // TODO: Don't hardcode these registers and get from the callee function.
	    SDValue ScratchWaveOffsetReg
	      = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
	    RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);

	    if (!Info->isEntryFunction()) {
	      // Avoid clobbering this function's FP value. In the current convention
	      // callee will overwrite this, so do save/restore around the call site.
	      CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
	                                         Info->getFrameOffsetReg(), MVT::i32);
	    }
	  }

	  // Stack pointer relative accesses are done by changing the offset SGPR. This
	  // is just the VGPR offset component.
	  SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);

	  SmallVector<SDValue, 8> MemOpChains;
	  MVT PtrVT = MVT::i32;

	  // Walk the register/memloc assignments, inserting copies/loads.
	  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
	       ++i, ++realArgIdx) {
	    CCValAssign &VA = ArgLocs[i];
	    SDValue Arg = OutVals[realArgIdx];

	    // Promote the value if needed.
	    switch (VA.getLocInfo()) {
	    case CCValAssign::Full:
	      break;
	    case CCValAssign::BCvt:
	      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::ZExt:
	      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::SExt:
	      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::AExt:
	      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
	      break;
	    case CCValAssign::FPExt:
	      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
	      break;
	    default:
	      llvm_unreachable("Unknown loc info!");
	    }

	    if (VA.isRegLoc()) {
	      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	    } else {
	      assert(VA.isMemLoc());

	      SDValue DstAddr;
	      MachinePointerInfo DstInfo;

	      unsigned LocMemOffset = VA.getLocMemOffset();
	      int32_t Offset = LocMemOffset;

	      SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);

	      if (IsTailCall) {
	        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
	        unsigned OpSize = Flags.isByVal() ?
	          Flags.getByValSize() : VA.getValVT().getStoreSize();

	        Offset = Offset + FPDiff;
	        int FI = MFI.CreateFixedObject(OpSize, Offset, true);

	        DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
	                                         StackPtr);
	        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);

	        // Make sure any stack arguments overlapping with where we're storing
	        // are loaded before this eventual operation. Otherwise they'll be
	        // clobbered.

	        // FIXME: Why is this really necessary? This seems to just result in a
	        // lot of code to copy the stack and write them back to the same
	        // locations, which are supposed to be immutable?
	        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
	      } else {
	        DstAddr = PtrOff;
	        DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
	      }

	      if (Outs[i].Flags.isByVal()) {
	        SDValue SizeNode =
	            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
	        SDValue Cpy = DAG.getMemcpy(
	          Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
	          /*isVol = */ false, /*AlwaysInline = */ true,
	          /*isTailCall = */ false, DstInfo,
	          MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
	            *DAG.getContext(), AMDGPUASI.PRIVATE_ADDRESS))));

	        MemOpChains.push_back(Cpy);
	      } else {
	        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
	        MemOpChains.push_back(Store);
	      }
	    }
	  }

	  // Copy special input registers after user input arguments.
	  passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);

	  if (!MemOpChains.empty())
	    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

	  // Build a sequence of copy-to-reg nodes chained together with token chain
	  // and flag operands which copy the outgoing args into the appropriate regs.
	  SDValue InFlag;
	  for (auto &RegToPass : RegsToPass) {
	    Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
	                             RegToPass.second, InFlag);
	    InFlag = Chain.getValue(1);
	  }


	  SDValue PhysReturnAddrReg;
	  if (IsTailCall) {
	    // Since the return is being combined with the call, we need to pass on the
	    // return address.

	    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
	    SDValue ReturnAddrReg = CreateLiveInRegister(
	      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);

	    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
	                                        MVT::i64);
	    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
	    InFlag = Chain.getValue(1);
	  }

	  // We don't usually want to end the call-sequence here because we would tidy
	  // the frame up after the call, however in the ABI-changing tail-call case
	  // we've carefully laid out the parameters so that when sp is reset they'll be
	  // in the correct location.
	  if (IsTailCall && !IsSibCall) {
	    Chain = DAG.getCALLSEQ_END(Chain,
	                               DAG.getTargetConstant(NumBytes, DL, MVT::i32),
	                               DAG.getTargetConstant(0, DL, MVT::i32),
	                               InFlag, DL);
	    InFlag = Chain.getValue(1);
	  }

	  std::vector<SDValue> Ops;
	  Ops.push_back(Chain);
	  Ops.push_back(Callee);

	  if (IsTailCall) {
	    // Each tail call may have to adjust the stack by a different amount, so
	    // this information must travel along with the operation for eventual
	    // consumption by emitEpilogue.
	    Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));

	    Ops.push_back(PhysReturnAddrReg);
	  }

	  // Add argument registers to the end of the list so that they are known live
	  // into the call.
	  for (auto &RegToPass : RegsToPass) {
	    Ops.push_back(DAG.getRegister(RegToPass.first,
	                                  RegToPass.second.getValueType()));
	  }

	  // Add a register mask operand representing the call-preserved registers.

	  const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
	  assert(Mask && "Missing call preserved mask for calling convention");
	  Ops.push_back(DAG.getRegisterMask(Mask));

	  if (InFlag.getNode())
	    Ops.push_back(InFlag);

	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	  // If we're doing a tail call, use a TC_RETURN here rather than an
	  // actual call instruction.
	  if (IsTailCall) {
	    MFI.setHasTailCall();
	    return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
	  }

	  // Returns a chain and a flag for retval copy to use.
	  SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
	  Chain = Call.getValue(0);
	  InFlag = Call.getValue(1);

	  // Restore the frame offset register saved before the call (only set for
	  // non-entry callers).
	  if (CallerSavedFP) {
	    SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
	    Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
	    InFlag = Chain.getValue(1);
	  }

	  uint64_t CalleePopBytes = NumBytes;
	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
	                             DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
	                             InFlag, DL);
	  if (!Ins.empty())
	    InFlag = Chain.getValue(1);

	  // Handle result values, copying them out of physregs into vregs that we
	  // return.
	  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
	                         InVals, IsThisReturn,
	                         IsThisReturn ? OutVals[0] : SDValue());
	}

	/// Map a register name to its AMDGPU register number.
	///
	/// Recognised names are m0, exec, exec_lo, exec_hi, flat_scratch,
	/// flat_scratch_lo and flat_scratch_hi. A fatal error is reported when the
	/// name is unknown, when the register overlaps FLAT_SCR on a
	/// SOUTHERN_ISLANDS subtarget, or when the bit size of \p VT does not match
	/// the register (32 bits for the halves and m0, 64 bits for exec and
	/// flat_scratch).
	unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
	                                             SelectionDAG &DAG) const {
	  const unsigned Found = StringSwitch<unsigned>(RegName)
	    .Case("m0", AMDGPU::M0)
	    .Case("exec", AMDGPU::EXEC)
	    .Case("exec_lo", AMDGPU::EXEC_LO)
	    .Case("exec_hi", AMDGPU::EXEC_HI)
	    .Case("flat_scratch", AMDGPU::FLAT_SCR)
	    .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
	    .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
	    .Default(AMDGPU::NoRegister);

	  if (Found == AMDGPU::NoRegister) {
	    report_fatal_error(Twine("invalid register name \""
	                             + StringRef(RegName) + "\"."));
	  }

	  const bool IsSouthernIslands =
	      Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS;
	  if (IsSouthernIslands &&
	      Subtarget->getRegisterInfo()->regsOverlap(Found, AMDGPU::FLAT_SCR)) {
	    report_fatal_error(Twine("invalid register \""
	                             + StringRef(RegName) + "\" for subtarget."));
	  }

	  // Width, in bits, that the requested type must have for this register.
	  unsigned RequiredBits = 0;
	  switch (Found) {
	  case AMDGPU::M0:
	  case AMDGPU::EXEC_LO:
	  case AMDGPU::EXEC_HI:
	  case AMDGPU::FLAT_SCR_LO:
	  case AMDGPU::FLAT_SCR_HI:
	    RequiredBits = 32;
	    break;
	  case AMDGPU::EXEC:
	  case AMDGPU::FLAT_SCR:
	    RequiredBits = 64;
	    break;
	  default:
	    llvm_unreachable("missing register type checking");
	  }

	  if (VT.getSizeInBits() == RequiredBits)
	    return Found;

	  report_fatal_error(Twine("invalid type for register \""
	                           + StringRef(RegName) + "\"."));
	}

	// If kill is not the last instruction, split the block so kill is always a
	// proper terminator.
	//
	// Everything after \p MI is moved into a new block inserted right after
	// \p BB, which also takes over BB's successors. MI is then switched to its
	// terminator form. Returns the block holding the instructions that followed
	// MI: \p BB itself when there were none, otherwise the new block.
	MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
	                                                    MachineBasicBlock *BB) const {
	  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

	  // Position of the first instruction following the kill.
	  MachineBasicBlock::iterator AfterKill(&MI);
	  ++AfterKill;

	  MachineBasicBlock *ContinueBB = BB;

	  // Only bother with a new block when something follows the kill.
	  if (AfterKill != BB->end()) {
	    MachineFunction *Fn = BB->getParent();
	    MachineBasicBlock *TailBB =
	        Fn->CreateMachineBasicBlock(BB->getBasicBlock());

	    Fn->insert(++MachineFunction::iterator(BB), TailBB);
	    TailBB->splice(TailBB->begin(), BB, AfterKill, BB->end());

	    TailBB->transferSuccessorsAndUpdatePHIs(BB);
	    BB->addSuccessor(TailBB);

	    ContinueBB = TailBB;
	  }

	  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
	  return ContinueBB;
	}

	// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
	// wavefront. If the value is uniform and just happens to be in a VGPR, this
	// will only do one iteration. In the worst case, this will loop 64 times.
	//
	// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
	//
	// Emits the loop skeleton into \p LoopBB, in this order:
	//   1. PHIs for the result (\p PhiReg, fed by \p InitReg from \p OrigBB and
	//      \p ResultReg from the loop) and for the saved exec mask.
	//   2. V_READFIRSTLANE_B32 of the index and a V_CMP_EQ_U32 against it.
	//   3. Either S_SET_GPR_IDX_IDX (\p UseGPRIdxMode) or a write of M0, with
	//      \p Offset added when it is non-zero.
	//   4. S_AND_SAVEEXEC_B64, S_XOR_B64 on EXEC, and S_CBRANCH_EXECNZ back to
	//      \p LoopBB.
	// Returns an iterator to the S_XOR_B64 instruction.
	static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
	const SIInstrInfo *TII,
	MachineRegisterInfo &MRI,
	MachineBasicBlock &OrigBB,
	MachineBasicBlock &LoopBB,
	const DebugLoc &DL,
	const MachineOperand &IdxReg,
	unsigned InitReg,
	unsigned ResultReg,
	unsigned PhiReg,
	unsigned InitSaveExecReg,
	int Offset,
	bool UseGPRIdxMode) {
	// All instructions are inserted before this iterator, taken at the start of
	// LoopBB.
	MachineBasicBlock::iterator I = LoopBB.begin();

	unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
	unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
	unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
	unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

	// Result value: InitReg on entry from OrigBB, ResultReg on the back edge.
	BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
	.addReg(InitReg)
	.addMBB(&OrigBB)
	.addReg(ResultReg)
	.addMBB(&LoopBB);

	// Exec mask: InitSaveExecReg on entry, NewExec on the back edge.
	BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
	.addReg(InitSaveExecReg)
	.addMBB(&OrigBB)
	.addReg(NewExec)
	.addMBB(&LoopBB);

	// Read the next variant <- also loop target.
	BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
	.addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

	// Compare the just read M0 value to all possible Idx values.
	BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
	.addReg(CurrentIdxReg)
	.addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());

	if (UseGPRIdxMode) {
	// NOTE(review): this local shadows the IdxReg parameter for the rest of
	// this branch.
	unsigned IdxReg;
	if (Offset == 0) {
	IdxReg = CurrentIdxReg;
	} else {
	IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
	BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
	.addReg(CurrentIdxReg, RegState::Kill)
	.addImm(Offset);
	}

	MachineInstr *SetIdx =
	BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
	.addReg(IdxReg, RegState::Kill);
	// NOTE(review): operand 2 is presumably an implicit operand added from the
	// instruction description -- confirm against the S_SET_GPR_IDX_IDX
	// definition.
	SetIdx->getOperand(2).setIsUndef();
	} else {
	// Move index from VCC into M0
	if (Offset == 0) {
	BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
	.addReg(CurrentIdxReg, RegState::Kill);
	} else {
	BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
	.addReg(CurrentIdxReg, RegState::Kill)
	.addImm(Offset);
	}
	}

	// Update EXEC, save the original EXEC value to VCC.
	BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
	.addReg(CondReg, RegState::Kill);

	// Register allocation hint tying NewExec to CondReg.
	MRI.setSimpleHint(NewExec, CondReg);

	// Update EXEC, switch all done bits to 0 and all todo bits to 1.
	MachineInstr *InsertPt =
	BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
	.addReg(AMDGPU::EXEC)
	.addReg(NewExec);

	// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
	// s_cbranch_scc0?

	// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
	BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
	.addMBB(&LoopBB);

	// Callers get the position of the S_XOR_B64 built above.
	return InsertPt->getIterator();
	}

	// This has slightly sub-optimal regalloc when the source vector is killed by
	// the read. The register allocator does not understand that the kill is
	// per-workitem, so is kept alive for the whole loop so we end up not re-using a
	// subregister from it, using 1 more VGPR than necessary. This was saved when
	// this was expanded after register allocation.
	//
	// Splits \p MBB at \p MI into MBB -> LoopBB -> RemainderBB, saves EXEC before
	// the loop and restores it at the start of RemainderBB, and fills LoopBB via
	// emitLoadM0FromVGPRLoop using the `idx` operand of \p MI. \p MI and
	// everything after it end up in RemainderBB. Returns the insertion point
	// reported by emitLoadM0FromVGPRLoop.
	static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
	                                                  MachineBasicBlock &MBB,
	                                                  MachineInstr &MI,
	                                                  unsigned InitResultReg,
	                                                  unsigned PhiReg,
	                                                  int Offset,
	                                                  bool UseGPRIdxMode) {
	  MachineFunction *MF = MBB.getParent();
	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  const DebugLoc &DL = MI.getDebugLoc();
	  MachineBasicBlock::iterator I(&MI);

	  unsigned DstReg = MI.getOperand(0).getReg();
	  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
	  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

	  // TmpExec only serves as the loop-entry input of the exec PHI.
	  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);

	  // Save the EXEC mask
	  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
	    .addReg(AMDGPU::EXEC);

	  // To insert the loop we need to split the block. Move everything after this
	  // point to a new block, and insert a new empty block between the two.
	  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
	  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
	  MachineFunction::iterator MBBI(MBB);
	  ++MBBI;

	  MF->insert(MBBI, LoopBB);
	  MF->insert(MBBI, RemainderBB);

	  LoopBB->addSuccessor(LoopBB);
	  LoopBB->addSuccessor(RemainderBB);

	  // Move the rest of the block into a new block.
	  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
	  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

	  MBB.addSuccessor(LoopBB);

	  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

	  // emitLoadM0FromVGPRLoop takes the loop block and the index operand by
	  // reference, so both pointers must be dereferenced here.
	  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
	                                      InitResultReg, DstReg, PhiReg, TmpExec,
	                                      Offset, UseGPRIdxMode);

	  // Restore the EXEC mask saved before the loop.
	  MachineBasicBlock::iterator First = RemainderBB->begin();
	  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
	    .addReg(SaveExec);

	  return InsPt;
	}

	// Returns subreg index, offset
	//
	// When \p Offset addresses an element inside the vector (the register size of
	// \p SuperRC counted in 32-bit units), it is folded into the subregister
	// index and the returned offset is 0. Otherwise sub0 is returned together
	// with the unmodified offset. \p VecReg is not used.
	static std::pair<unsigned, int>
	computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
	                            const TargetRegisterClass *SuperRC,
	                            unsigned VecReg,
	                            int Offset) {
	  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

	  // Skip out of bounds offsets, or else we would end up using an undefined
	  // register.
	  if (Offset >= NumElts || Offset < 0)
	    return std::make_pair(AMDGPU::sub0, Offset);

	  return std::make_pair(AMDGPU::sub0 + Offset, 0);
	}

	// Return true if the index is an SGPR and was set.
	static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
	MachineRegisterInfo &MRI,
	MachineInstr &MI,
	int Offset,
	bool UseGPRIdxMode,
	bool IsIndirectSrc) {
	MachineBasicBlock *MBB = MI.getParent();
	const DebugLoc &DL = MI.getDebugLoc();
	MachineBasicBlock::iterator I(&MI);

	const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
	const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());

	assert(Idx->getReg() != AMDGPU::NoRegister);

	if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
	return false;

	if (UseGPRIdxMode) {
	unsigned IdxMode = IsIndirectSrc ?
	VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
	if (Offset == 0) {
	MachineInstr *SetOn =
	BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
	.add(*Idx)
	.addImm(IdxMode);

	SetOn->getOperand(3).setIsUndef();
	} else {
	unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
	BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
	.add(*Idx)
	.addImm(Offset);
	MachineInstr *SetOn =
	BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
	.addReg(Tmp, RegState::Kill)
	.addImm(IdxMode);

	SetOn->getOperand(3).setIsUndef();
	}

	return true;
	}

	if (Offset == 0) {
	BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
	.add(*Idx);
	} else {
	BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
	.add(*Idx)
	.addImm(Offset);
	}

	return true;
	}

	// Expands an indirect-source pseudo (MI) that reads one 32-bit element of the
	// vector register named by its `src` operand into operand 0.
	//
	// When the index is an SGPR the expansion stays in MBB and &MBB is returned.
	// Control flow needs to be inserted if indexing with a VGPR: in that case the
	// block is split via loadM0FromVGPR and the loop block holding the emitted
	// move is returned. MI is erased on both paths.
	static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
	MachineBasicBlock &MBB,
	const SISubtarget &ST) {
	const SIInstrInfo *TII = ST.getInstrInfo();
	const SIRegisterInfo &TRI = TII->getRegisterInfo();
	MachineFunction *MF = MBB.getParent();
	MachineRegisterInfo &MRI = MF->getRegInfo();

	unsigned Dst = MI.getOperand(0).getReg();
	unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
	int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

	const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);

	// Fold an in-range constant offset into the subregister index; Offset is
	// overwritten with whatever part could not be folded.
	unsigned SubReg;
	std::tie(SubReg, Offset)
	= computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);

	bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);

	// SGPR index: the index setup was emitted before MI, so only the move itself
	// (and, in GPR index mode, the matching S_SET_GPR_IDX_OFF) is needed here.
	if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
	MachineBasicBlock::iterator I(&MI);
	const DebugLoc &DL = MI.getDebugLoc();

	if (UseGPRIdxMode) {
	// TODO: Look at the uses to avoid the copy. This may require rescheduling
	// to avoid interfering with other uses, so probably requires a new
	// optimization pass.
	BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
	.addReg(SrcReg, RegState::Undef, SubReg)
	.addReg(SrcReg, RegState::Implicit)
	.addReg(AMDGPU::M0, RegState::Implicit);
	BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
	} else {
	BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
	.addReg(SrcReg, RegState::Undef, SubReg)
	.addReg(SrcReg, RegState::Implicit);
	}

	MI.eraseFromParent();

	return &MBB;
	}

	// Index is not an SGPR: fall back to the loop-based expansion.
	const DebugLoc &DL = MI.getDebugLoc();
	MachineBasicBlock::iterator I(&MI);

	unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

	// InitReg is given an IMPLICIT_DEF and handed to loadM0FromVGPR as the
	// initial value.
	BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);

	if (UseGPRIdxMode) {
	// Index mode is switched on before MI and off right after it; both are
	// emitted before the block is split by loadM0FromVGPR below.
	MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
	.addImm(0) // Reset inside loop.
	.addImm(VGPRIndexMode::SRC0_ENABLE);
	SetOn->getOperand(3).setIsUndef();

	// Disable again after the loop.
	BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
	}

	// InsPt is the insertion point returned by loadM0FromVGPR; its parent block
	// is where the indexed move has to be placed.
	auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
	MachineBasicBlock *LoopBB = InsPt->getParent();

	if (UseGPRIdxMode) {
	BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
	.addReg(SrcReg, RegState::Undef, SubReg)
	.addReg(SrcReg, RegState::Implicit)
	.addReg(AMDGPU::M0, RegState::Implicit);
	} else {
	BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
	.addReg(SrcReg, RegState::Undef, SubReg)
	.addReg(SrcReg, RegState::Implicit);
	}

	MI.eraseFromParent();

	return LoopBB;
	}

	// Selects the V_MOVRELD_B32 pseudo opcode that matches the bit width of the
	// vector register class VecRC. Only 32/64/128/256/512-bit classes are
	// supported; any other width is treated as unreachable.
	static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
	                                 const TargetRegisterClass *VecRC) {
	  const unsigned SizeInBits = TRI.getRegSizeInBits(*VecRC);

	  if (SizeInBits == 32) // 4 bytes
	    return AMDGPU::V_MOVRELD_B32_V1;
	  if (SizeInBits == 64) // 8 bytes
	    return AMDGPU::V_MOVRELD_B32_V2;
	  if (SizeInBits == 128) // 16 bytes
	    return AMDGPU::V_MOVRELD_B32_V4;
	  if (SizeInBits == 256) // 32 bytes
	    return AMDGPU::V_MOVRELD_B32_V8;
	  if (SizeInBits == 512) // 64 bytes
	    return AMDGPU::V_MOVRELD_B32_V16;

	  llvm_unreachable("unsupported size for MOVRELD pseudos");
	}

	// Expands an indirect-destination pseudo (MI) that writes the `val` operand
	// into one 32-bit element of the vector named by the `src` operand, producing
	// the updated vector in operand 0.
	//
	// Three expansions are possible:
	//   - no index register: a plain INSERT_SUBREG in MBB;
	//   - SGPR index: an indexed move in MBB;
	//   - otherwise: the block is split via loadM0FromVGPR and the indexed move is
	//     placed in the returned loop block.
	// MI is erased on every path. Returns the block containing the expansion.
	static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
	MachineBasicBlock &MBB,
	const SISubtarget &ST) {
	const SIInstrInfo *TII = ST.getInstrInfo();
	const SIRegisterInfo &TRI = TII->getRegisterInfo();
	MachineFunction *MF = MBB.getParent();
	MachineRegisterInfo &MRI = MF->getRegInfo();

	unsigned Dst = MI.getOperand(0).getReg();
	const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
	const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
	const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
	int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
	const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());

	// This can be an immediate, but will be folded later.
	assert(Val->getReg());

	// Fold an in-range constant offset into the subregister index; Offset is
	// overwritten with whatever part could not be folded.
	unsigned SubReg;
	std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
	SrcVec->getReg(),
	Offset);
	bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);

	// No index register at all: the element is fully described by SubReg, so a
	// static INSERT_SUBREG is enough.
	if (Idx->getReg() == AMDGPU::NoRegister) {
	MachineBasicBlock::iterator I(&MI);
	const DebugLoc &DL = MI.getDebugLoc();

	assert(Offset == 0);

	BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
	.add(*SrcVec)
	.add(*Val)
	.addImm(SubReg);

	MI.eraseFromParent();
	return &MBB;
	}

	// SGPR index: the index setup was emitted before MI, so only the indexed
	// write (and, in GPR index mode, S_SET_GPR_IDX_OFF) is emitted here.
	if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
	MachineBasicBlock::iterator I(&MI);
	const DebugLoc &DL = MI.getDebugLoc();

	if (UseGPRIdxMode) {
	BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
	.addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
	.add(*Val)
	.addReg(Dst, RegState::ImplicitDefine)
	.addReg(SrcVec->getReg(), RegState::Implicit)
	.addReg(AMDGPU::M0, RegState::Implicit);

	BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
	} else {
	const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));

	// The pseudo takes the element position as an immediate relative to sub0.
	BuildMI(MBB, I, DL, MovRelDesc)
	.addReg(Dst, RegState::Define)
	.addReg(SrcVec->getReg())
	.add(*Val)
	.addImm(SubReg - AMDGPU::sub0);
	}

	MI.eraseFromParent();
	return &MBB;
	}

	// Index is not an SGPR: loop-based expansion. Kill flags on the value
	// register are cleared before it is used by the instruction built in the loop
	// block.
	if (Val->isReg())
	MRI.clearKillFlags(Val->getReg());

	const DebugLoc &DL = MI.getDebugLoc();

	if (UseGPRIdxMode) {
	MachineBasicBlock::iterator I(&MI);

	// Index mode is switched on before MI and off right after it; both are
	// emitted before the block is split by loadM0FromVGPR below.
	MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
	.addImm(0) // Reset inside loop.
	.addImm(VGPRIndexMode::DST_ENABLE);
	SetOn->getOperand(3).setIsUndef();

	// Disable again after the loop.
	BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
	}

	// PhiReg has the vector's register class; the source vector is passed to
	// loadM0FromVGPR as the initial value.
	unsigned PhiReg = MRI.createVirtualRegister(VecRC);

	auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
	Offset, UseGPRIdxMode);
	MachineBasicBlock *LoopBB = InsPt->getParent();

	if (UseGPRIdxMode) {
	BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
	.addReg(PhiReg, RegState::Undef, SubReg) // vdst
	.add(*Val) // src0
	.addReg(Dst, RegState::ImplicitDefine)
	.addReg(PhiReg, RegState::Implicit)
	.addReg(AMDGPU::M0, RegState::Implicit);
	} else {
	const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));

	BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
	.addReg(Dst, RegState::Define)
	.addReg(PhiReg)
	.add(*Val)
	.addImm(SubReg - AMDGPU::sub0);
	}

	MI.eraseFromParent();

	return LoopBB;
	}

	/// Expands the SI pseudo instructions that need custom insertion.
	///
	/// \param MI the pseudo instruction to expand.
	/// \param BB the block currently containing \p MI.
	/// \returns the block in which code following the expansion should be
	/// placed. Most cases return \p BB; the indirect-access and kill cases return
	/// whatever block their helper produces. Opcodes not handled here are
	/// forwarded to AMDGPUTargetLowering.
	MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
	    MachineInstr &MI, MachineBasicBlock *BB) const {

	  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
	  MachineFunction *MF = BB->getParent();
	  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

	  if (TII->isMIMG(MI)) {
	    if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
	      report_fatal_error("missing mem operand from MIMG instruction");
	    }
	    // MIMG instructions must already carry a memoperand here so that they
	    // aren't assumed to be ordered memory instructions; nothing else needs
	    // to be done for them.

	    return BB;
	  }

	  switch (MI.getOpcode()) {
	  case AMDGPU::S_ADD_U64_PSEUDO:
	  case AMDGPU::S_SUB_U64_PSEUDO: {
	    // Split the 64-bit scalar add/sub into a low half and a carry/borrow
	    // consuming high half, then recombine with REG_SEQUENCE.
	    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
	    const DebugLoc &DL = MI.getDebugLoc();

	    MachineOperand &Dest = MI.getOperand(0);
	    MachineOperand &Src0 = MI.getOperand(1);
	    MachineOperand &Src1 = MI.getOperand(2);

	    unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
	    unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

	    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
	        Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
	        &AMDGPU::SReg_32_XM0RegClass);
	    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
	        Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
	        &AMDGPU::SReg_32_XM0RegClass);

	    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
	        Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
	        &AMDGPU::SReg_32_XM0RegClass);
	    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
	        Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
	        &AMDGPU::SReg_32_XM0RegClass);

	    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

	    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
	    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
	    BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
	        .add(Src0Sub0)
	        .add(Src1Sub0);
	    BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
	        .add(Src0Sub1)
	        .add(Src1Sub1);
	    BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
	        .addReg(DestSub0)
	        .addImm(AMDGPU::sub0)
	        .addReg(DestSub1)
	        .addImm(AMDGPU::sub1);
	    MI.eraseFromParent();
	    return BB;
	  }
	  case AMDGPU::SI_INIT_M0: {
	    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
	            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
	        .add(MI.getOperand(0));
	    MI.eraseFromParent();
	    return BB;
	  }
	  case AMDGPU::SI_INIT_EXEC:
	    // This should be before all vector instructions.
	    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
	            AMDGPU::EXEC)
	        .addImm(MI.getOperand(0).getImm());
	    MI.eraseFromParent();
	    return BB;

	  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
	    // Extract the thread count from an SGPR input and set EXEC accordingly.
	    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
	    //
	    // S_BFE_U32 count, input, {shift, 7}
	    // S_BFM_B64 exec, count, 0
	    // S_CMP_EQ_U32 count, 64
	    // S_CMOV_B64 exec, -1
	    MachineInstr *FirstMI = &*BB->begin();
	    MachineRegisterInfo &MRI = MF->getRegInfo();
	    unsigned InputReg = MI.getOperand(0).getReg();
	    unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
	    bool Found = false;

	    // Move the COPY of the input reg to the beginning, so that we can use it.
	    for (auto I = BB->begin(); I != &MI; I++) {
	      if (I->getOpcode() != TargetOpcode::COPY ||
	          I->getOperand(0).getReg() != InputReg)
	        continue;

	      if (I == FirstMI) {
	        // The COPY is already first; insert the new code right after it.
	        FirstMI = &*++BB->begin();
	      } else {
	        I->removeFromParent();
	        BB->insert(FirstMI, &*I);
	      }
	      Found = true;
	      break;
	    }
	    assert(Found);
	    (void)Found;

	    // This should be before all vector instructions.
	    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
	        .addReg(InputReg)
	        .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
	    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
	            AMDGPU::EXEC)
	        .addReg(CountReg)
	        .addImm(0);
	    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
	        .addReg(CountReg, RegState::Kill)
	        .addImm(64);
	    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
	            AMDGPU::EXEC)
	        .addImm(-1);
	    MI.eraseFromParent();
	    return BB;
	  }

	  case AMDGPU::GET_GROUPSTATICSIZE: {
	    DebugLoc DL = MI.getDebugLoc();
	    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
	        .add(MI.getOperand(0))
	        .addImm(MFI->getLDSSize());
	    MI.eraseFromParent();
	    return BB;
	  }
	  case AMDGPU::SI_INDIRECT_SRC_V1:
	  case AMDGPU::SI_INDIRECT_SRC_V2:
	  case AMDGPU::SI_INDIRECT_SRC_V4:
	  case AMDGPU::SI_INDIRECT_SRC_V8:
	  case AMDGPU::SI_INDIRECT_SRC_V16:
	    return emitIndirectSrc(MI, *BB, *getSubtarget());
	  case AMDGPU::SI_INDIRECT_DST_V1:
	  case AMDGPU::SI_INDIRECT_DST_V2:
	  case AMDGPU::SI_INDIRECT_DST_V4:
	  case AMDGPU::SI_INDIRECT_DST_V8:
	  case AMDGPU::SI_INDIRECT_DST_V16:
	    return emitIndirectDst(MI, *BB, *getSubtarget());
	  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
	  case AMDGPU::SI_KILL_I1_PSEUDO:
	    return splitKillBlock(MI, BB);
	  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
	    // Select each 32-bit half separately using a copy of the condition,
	    // then recombine the halves.
	    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

	    unsigned Dst = MI.getOperand(0).getReg();
	    unsigned Src0 = MI.getOperand(1).getReg();
	    unsigned Src1 = MI.getOperand(2).getReg();
	    const DebugLoc &DL = MI.getDebugLoc();
	    unsigned SrcCond = MI.getOperand(3).getReg();

	    unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	    unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
	    unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

	    BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
	        .addReg(SrcCond);
	    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
	        .addReg(Src0, 0, AMDGPU::sub0)
	        .addReg(Src1, 0, AMDGPU::sub0)
	        .addReg(SrcCondCopy);
	    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
	        .addReg(Src0, 0, AMDGPU::sub1)
	        .addReg(Src1, 0, AMDGPU::sub1)
	        .addReg(SrcCondCopy);

	    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
	        .addReg(DstLo)
	        .addImm(AMDGPU::sub0)
	        .addReg(DstHi)
	        .addImm(AMDGPU::sub1);
	    MI.eraseFromParent();
	    return BB;
	  }
	  case AMDGPU::SI_BR_UNDEF: {
	    const DebugLoc &DL = MI.getDebugLoc();
	    MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
	                           .add(MI.getOperand(0));
	    Br->getOperand(1).setIsUndef(true); // read undef SCC
	    MI.eraseFromParent();
	    return BB;
	  }
	  case AMDGPU::ADJCALLSTACKUP:
	  case AMDGPU::ADJCALLSTACKDOWN: {
	    // Append an implicit def and an implicit use of the stack pointer offset
	    // register to the existing instruction; MI itself is kept.
	    const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
	    MachineInstrBuilder MIB(*MF, &MI);
	    MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
	        .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
	    return BB;
	  }
	  case AMDGPU::SI_CALL_ISEL:
	  case AMDGPU::SI_TCRETURN_ISEL: {
	    const DebugLoc &DL = MI.getDebugLoc();
	    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

	    // Recover the callee global from the instruction defining the address
	    // operand, so it can be attached to the final call/return.
	    MachineRegisterInfo &MRI = MF->getRegInfo();
	    unsigned GlobalAddrReg = MI.getOperand(0).getReg();
	    MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
	    assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);

	    const GlobalValue *G = PCRel->getOperand(1).getGlobal();

	    MachineInstrBuilder MIB;
	    if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
	      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
	                .add(MI.getOperand(0))
	                .addGlobalAddress(G);
	    } else {
	      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
	                .add(MI.getOperand(0))
	                .addGlobalAddress(G);

	      // There is an additional imm operand for tcreturn, but it should be in the
	      // right place already.
	    }

	    // Carry over the remaining operands and the memory operands unchanged.
	    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
	      MIB.add(MI.getOperand(I));

	    MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
	    MI.eraseFromParent();
	    return BB;
	  }
	  default:
	    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
	  }
	}

	/// Reports bit-preserving FP logic for \p VT exactly when the scalar element
	/// type of \p VT is legal on this target.
	bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
	  const EVT EltVT = VT.getScalarType();
	  return isTypeLegal(EltVT);
	}

	/// Always enables aggressive FMA fusion, regardless of \p VT.
	bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
	  // Enabling this currently forces various fsub combinations to be unfolded
	  // into fma with free fneg'd operands. That is worthwhile whenever fast FMA
	  // is available (see isFMAFasterThanFMulAndFAdd).
	  //
	  // Even where fma is quarter rate (f64, with add / sub at best half rate),
	  // most of these combines look cycle neutral while reducing instruction
	  // count / code size.
	  const bool AlwaysFuse = true;
	  return AlwaysFuse;
	}

	/// Result type of a SETCC on \p VT: i1 for scalars, and a vector of i1 with
	/// the same element count for vectors.
	EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
	                                         EVT VT) const {
	  if (VT.isVector())
	    return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());

	  return MVT::i1;
	}

	/// Shift amounts are i16 when shifting an i16 value and i32 otherwise.
	MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
	  // TODO: Should i16 be used always if legal? For now it would force VALU
	  // shifts.
	  if (VT == MVT::i16)
	    return MVT::i16;

	  return MVT::i32;
	}

	// Whether fma beats a separate multiply and add depends on the device, since
	// devices differ in their rates for fma and for f64 operations in general.
	//
	// f64: v_fma_f64 and v_mul_f64 always cost the same number of cycles as each
	// other on a given device (the count itself varies by device), so fma is
	// always profitable.
	//
	// f32: v_fma_f32 takes 4 or 16 cycles depending on the device, so it only
	// pays off on full rate devices. Normally v_mad_f32 is the better choice: it
	// is always full rate and returns the same result as the separate operations,
	// so it can be selected even without fused FP ops. We therefore lie and say
	// fma is not faster for f32. v_mad_f32 does not support denormals though, so
	// fma is reported as faster on fast-fma devices when denormals are required.
	//
	// f16: reported as faster when 16-bit instructions and f16 denormals are both
	// available. Every other type reports false.
	bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
	  const EVT EltVT = VT.getScalarType();
	  const auto EltTy = EltVT.getSimpleVT().SimpleTy;

	  if (EltTy == MVT::f64)
	    return true;

	  if (EltTy == MVT::f32) {
	    // Full rate f32 mad is preferred over fma, but it cannot be used when
	    // denormals must be supported, so only report fma in that situation.
	    return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
	  }

	  if (EltTy == MVT::f16)
	    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();

	  return false;
	}

	//===----------------------------------------------------------------------===//
	// Custom DAG Lowering Operations
	//===----------------------------------------------------------------------===//

	/// Dispatches custom lowering for \p Op to the matching lowering routine.
	/// Opcodes without an SI-specific handler are forwarded to
	/// AMDGPUTargetLowering::LowerOperation.
	SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	  switch (Op.getOpcode()) {
	  case ISD::BRCOND:
	    return LowerBRCOND(Op, DAG);
	  case ISD::LOAD: {
	    SDValue Lowered = LowerLOAD(Op, DAG);
	    // A non-null result must carry exactly two values.
	    assert(!(Lowered.getNode() &&
	             Lowered.getNode()->getNumValues() != 2) &&
	           "Load should return a value and a chain");
	    return Lowered;
	  }
	  case ISD::FSIN:
	  case ISD::FCOS:
	    return LowerTrig(Op, DAG);
	  case ISD::SELECT:
	    return LowerSELECT(Op, DAG);
	  case ISD::FDIV:
	    return LowerFDIV(Op, DAG);
	  case ISD::ATOMIC_CMP_SWAP:
	    return LowerATOMIC_CMP_SWAP(Op, DAG);
	  case ISD::STORE:
	    return LowerSTORE(Op, DAG);
	  case ISD::GlobalAddress: {
	    MachineFunction &Func = DAG.getMachineFunction();
	    SIMachineFunctionInfo *FuncInfo = Func.getInfo<SIMachineFunctionInfo>();
	    return LowerGlobalAddress(FuncInfo, Op, DAG);
	  }
	  case ISD::INTRINSIC_WO_CHAIN:
	    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
	  case ISD::INTRINSIC_W_CHAIN:
	    return LowerINTRINSIC_W_CHAIN(Op, DAG);
	  case ISD::INTRINSIC_VOID:
	    return LowerINTRINSIC_VOID(Op, DAG);
	  case ISD::ADDRSPACECAST:
	    return lowerADDRSPACECAST(Op, DAG);
	  case ISD::INSERT_VECTOR_ELT:
	    return lowerINSERT_VECTOR_ELT(Op, DAG);
	  case ISD::EXTRACT_VECTOR_ELT:
	    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
	  case ISD::FP_ROUND:
	    return lowerFP_ROUND(Op, DAG);
	  case ISD::TRAP:
	  case ISD::DEBUGTRAP:
	    return lowerTRAP(Op, DAG);
	  default:
	    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
	  }
	  return SDValue();
	}

	// Produces replacement values for node N and appends them to Results.
	//
	// Handled opcodes:
	//   - INSERT_VECTOR_ELT / EXTRACT_VECTOR_ELT: pushes the result of the
	//     matching lower* routine when it returns a non-null value.
	//   - INTRINSIC_WO_CHAIN: the packed-conversion intrinsics are turned into
	//     the corresponding AMDGPUISD node producing i32, which is then bitcast
	//     to the two-element vector type (v2f16 for cvt_pkrtz, v2i16 for the
	//     pknorm/pk variants). Other intrinsics push nothing.
	//   - SELECT: performed on the equivalent memory type (widened to i32 when
	//     narrower) and bitcast back to the original type.
	// Any other opcode leaves Results untouched.
	void SITargetLowering::ReplaceNodeResults(SDNode *N,
	SmallVectorImpl<SDValue> &Results,
	SelectionDAG &DAG) const {
	switch (N->getOpcode()) {
	case ISD::INSERT_VECTOR_ELT: {
	if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
	Results.push_back(Res);
	return;
	}
	case ISD::EXTRACT_VECTOR_ELT: {
	if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
	Results.push_back(Res);
	return;
	}
	case ISD::INTRINSIC_WO_CHAIN: {
	// Operand 0 holds the intrinsic ID.
	unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
	- if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
	+ switch (IID) {
	+ case Intrinsic::amdgcn_cvt_pkrtz: {
	SDValue Src0 = N->getOperand(1);
	SDValue Src1 = N->getOperand(2);
	SDLoc SL(N);
	SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
	Src0, Src1);
	Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
	return;
	}
	+ case Intrinsic::amdgcn_cvt_pknorm_i16:
	+ case Intrinsic::amdgcn_cvt_pknorm_u16:
	+ case Intrinsic::amdgcn_cvt_pk_i16:
	+ case Intrinsic::amdgcn_cvt_pk_u16: {
	+ SDValue Src0 = N->getOperand(1);
	+ SDValue Src1 = N->getOperand(2);
	+ SDLoc SL(N);
	+ unsigned Opcode;
	+
	+ if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
	+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
	+ else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
	+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
	+ else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
	+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
	+ else
	+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
	+
	+ SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
	+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
	+ return;
	+ }
	+ }
	break;
	}
	case ISD::SELECT: {
	SDLoc SL(N);
	EVT VT = N->getValueType(0);
	EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
	SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
	SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));

	// Types narrower than 32 bits are any-extended to i32 for the select and
	// truncated back afterwards.
	EVT SelectVT = NewVT;
	if (NewVT.bitsLT(MVT::i32)) {
	LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
	RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
	SelectVT = MVT::i32;
	}

	SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
	N->getOperand(0), LHS, RHS);

	if (NewVT != SelectVT)
	NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
	Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
	return;
	}
	default:
	break;
	}
	}

	/// \brief Helper for LowerBRCOND: returns the first user of \p Value whose
	/// opcode is \p Opcode, or nullptr if there is none. Only uses of exactly
	/// this result value are considered, not uses of the node's other results.
	static SDNode *findUser(SDValue Value, unsigned Opcode) {
	  SDNode *Def = Value.getNode();
	  SDNode::use_iterator UI = Def->use_begin();
	  SDNode::use_iterator UE = Def->use_end();

	  while (UI != UE) {
	    const bool UsesThisResult = UI.getUse().get() == Value;
	    if (UsesThisResult && UI->getOpcode() == Opcode)
	      return *UI;
	    ++UI;
	  }

	  return nullptr;
	}

	unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
	if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
	switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
	case Intrinsic::amdgcn_if:
	return AMDGPUISD::IF;
	case Intrinsic::amdgcn_else:
	return AMDGPUISD::ELSE;
	case Intrinsic::amdgcn_loop:
	return AMDGPUISD::LOOP;
	case Intrinsic::amdgcn_end_cf:
	llvm_unreachable("should not occur");
	default:
	return 0;
	}
	}

	// break, if_break, else_break are all only used as inputs to loop, not
	// directly as branch conditions.
	return 0;
	}

	/// Creates the fixed stack objects used when emitting the debugger prologue
	/// and records their frame indices in the function info.
	///
	/// The debugger prologue writes work group IDs and work item IDs to scratch
	/// memory at fixed locations:
	///   offset  0: work group ID x
	///   offset  4: work group ID y
	///   offset  8: work group ID z
	///   offset 16: work item ID x
	///   offset 20: work item ID y
	///   offset 24: work item ID z
	void SITargetLowering::createDebuggerPrologueStackObjects(
	    MachineFunction &MF) const {
	  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

	  // One 4-byte slot per dimension for each kind of ID; the group ID slot is
	  // created before the item ID slot.
	  for (unsigned Dim = 0; Dim != 3; ++Dim) {
	    const int GroupIDSlot =
	        MF.getFrameInfo().CreateFixedObject(4, Dim * 4, true);
	    FuncInfo->setDebuggerWorkGroupIDStackObjectIndex(Dim, GroupIDSlot);

	    const int ItemIDSlot =
	        MF.getFrameInfo().CreateFixedObject(4, Dim * 4 + 16, true);
	    FuncInfo->setDebuggerWorkItemIDStackObjectIndex(Dim, ItemIDSlot);
	  }
	}

	bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
	const Triple &TT = getTargetMachine().getTargetTriple();
	return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
	AMDGPU::shouldEmitConstantsToTextSection(TT);
	}

	/// A GOT relocation is used for \p GV when it is in the global or constant
	/// address space, does not use a fixup, and cannot be assumed DSO-local.
	bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
	  if (GV->getType()->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
	      GV->getType()->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS)
	    return false;

	  if (shouldEmitFixup(GV))
	    return false;

	  return !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
	}

	/// A PC-relative relocation is used for \p GV when neither a fixup nor a GOT
	/// relocation applies.
	bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
	  if (shouldEmitFixup(GV))
	    return false;

	  return !shouldEmitGOTReloc(GV);
	}

	/// Rewrites a BRCOND whose condition comes from a control flow intrinsic so
	/// that the intrinsic receives the branch destination as its last operand.
	/// When the condition is not negated through a SETCC, the branch target is
	/// also swapped with that of the unconditional BR using this BRCOND.
	///
	/// \returns \p BRCOND unchanged when the condition is not a control flow
	/// intrinsic (a uniform branch); otherwise the chain that replaces it.
	SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
	                                      SelectionDAG &DAG) const {
	  SDLoc DL(BRCOND);

	  SDNode *Intr = BRCOND.getOperand(1).getNode();
	  SDValue Target = BRCOND.getOperand(2);
	  SDNode *BR = nullptr;
	  SDNode *SetCC = nullptr;

	  if (Intr->getOpcode() == ISD::SETCC) {
	    // As long as we negate the condition everything is fine
	    SetCC = Intr;
	    Intr = SetCC->getOperand(0).getNode();

	  } else {
	    // Get the target from BR if we don't negate the condition
	    BR = findUser(BRCOND, ISD::BR);
	    // findUser returns nullptr when no BR uses this BRCOND; fail loudly in
	    // asserts builds rather than dereferencing a null node below.
	    assert(BR && "brcond missing unconditional branch user");
	    Target = BR->getOperand(1);
	  }

	  // FIXME: This changes the types of the intrinsics instead of introducing new
	  // nodes with the correct types.
	  // e.g. llvm.amdgcn.loop

	  // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
	  // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>

	  unsigned CFNode = isCFIntrinsic(Intr);
	  if (CFNode == 0) {
	    // This is a uniform branch so we don't need to legalize.
	    return BRCOND;
	  }

	  bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
	                   Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;

	  // The only SETCC form expected here is "intrinsic result != 1".
	  assert(!SetCC ||
	         (SetCC->getConstantOperandVal(1) == 1 &&
	          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
	              ISD::SETNE));

	  // operands of the new intrinsic call
	  SmallVector<SDValue, 4> Ops;
	  if (HaveChain)
	    Ops.push_back(BRCOND.getOperand(0));

	  // Skip the chain (if any) and the intrinsic ID, keep the remaining
	  // operands, then append the branch destination.
	  Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
	  Ops.push_back(Target);

	  // The new node produces every result of the intrinsic except the first.
	  ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());

	  // build the new intrinsic call
	  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();

	  if (!HaveChain) {
	    SDValue Ops[] = {
	      SDValue(Result, 0),
	      BRCOND.getOperand(0)
	    };

	    Result = DAG.getMergeValues(Ops, DL).getNode();
	  }

	  if (BR) {
	    // Give the branch instruction our target
	    SDValue Ops[] = {
	      BR->getOperand(0),
	      BRCOND.getOperand(2)
	    };
	    SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
	    DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
	    BR = NewBR.getNode();
	  }

	  SDValue Chain = SDValue(Result, Result->getNumValues() - 1);

	  // Copy the intrinsic results to registers
	  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
	    SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
	    if (!CopyToReg)
	      continue;

	    // Result i of the old intrinsic is result i - 1 of the new node.
	    Chain = DAG.getCopyToReg(
	      Chain, DL,
	      CopyToReg->getOperand(1),
	      SDValue(Result, i - 1),
	      SDValue());

	    DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
	  }

	  // Remove the old intrinsic from the chain
	  DAG.ReplaceAllUsesOfValueWith(
	    SDValue(Intr, Intr->getNumValues() - 1),
	    Intr->getOperand(0));

	  return Chain;
	}

	/// Converts the floating point value \p Op to type \p VT.
	///
	/// Emits FP_EXTEND when the source type is no wider than \p VT, and FP_ROUND
	/// when it is wider. ISD::FTRUNC is not used for the narrowing case because
	/// it rounds to an integral value within the same type and does not change
	/// the type.
	SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
	                                            SDValue Op,
	                                            const SDLoc &DL,
	                                            EVT VT) const {
	  if (Op.getValueType().bitsLE(VT))
	    return DAG.getNode(ISD::FP_EXTEND, DL, VT, Op);

	  // The second FP_ROUND operand is the "no precision lost" flag; 0 makes no
	  // such promise.
	  return DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
	                     DAG.getTargetConstant(0, DL, MVT::i32));
	}

	/// Custom lowering of FP_ROUND to f16. Only an f64 source is rewritten, as
	/// FP_TO_FP16 (i32) -> TRUNCATE (i16) -> BITCAST (f16); any other source
	/// type returns \p Op unchanged.
	SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
	  assert(Op.getValueType() == MVT::f16 &&
	         "Do not know how to custom lower FP_ROUND for non-f16 type");

	  SDValue Input = Op.getOperand(0);
	  if (Input.getValueType() == MVT::f64) {
	    SDLoc DL(Op);

	    SDValue HalfBitsI32 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Input);
	    SDValue HalfBitsI16 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, HalfBitsI32);
	    return DAG.getNode(ISD::BITCAST, DL, MVT::f16, HalfBitsI16);
	  }

	  return Op;
	}

	/// Lowers ISD::TRAP and ISD::DEBUGTRAP.
	///
	/// With the HSA trap handler ABI and an enabled trap handler, the queue
	/// pointer is copied into SGPR0_SGPR1 and an AMDGPUISD::TRAP node carrying
	/// the trap ID is emitted. Otherwise a trap becomes ENDPGM, while a debug
	/// trap only emits an "unsupported" warning and returns the incoming chain.
	SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc SL(Op);
	  MachineFunction &MF = DAG.getMachineFunction();
	  SDValue Chain = Op.getOperand(0);

	  const bool IsDebugTrap = Op.getOpcode() == ISD::DEBUGTRAP;
	  unsigned TrapID = IsDebugTrap ?
	    SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap;

	  const bool UseHsaTrapHandler =
	      Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
	      Subtarget->isTrapHandlerEnabled();

	  if (UseHsaTrapHandler) {
	    SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
	    unsigned QueuePtrReg = FuncInfo->getQueuePtrUserSGPR();
	    assert(QueuePtrReg != AMDGPU::NoRegister);

	    SDValue QueuePtr = CreateLiveInRegister(
	      DAG, &AMDGPU::SReg_64RegClass, QueuePtrReg, MVT::i64);

	    SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);

	    SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
	                                     QueuePtr, SDValue());

	    // Chain, trap ID, the register holding the queue pointer, and the glue
	    // produced by the copy.
	    SDValue TrapOps[] = {
	      ToReg,
	      DAG.getTargetConstant(TrapID, SL, MVT::i16),
	      SGPR01,
	      ToReg.getValue(1)
	    };

	    return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, TrapOps);
	  }

	  // No usable trap handler: TrapID is one of exactly two values here.
	  if (!IsDebugTrap)
	    return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);

	  DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
	                                   "debugtrap handler not supported",
	                                   Op.getDebugLoc(),
	                                   DS_Warning);
	  LLVMContext &Ctx = MF.getFunction().getContext();
	  Ctx.diagnose(NoTrap);
	  return Chain;
	}

	/// Returns a 32-bit value holding the aperture base for address space \p AS.
	/// The value is selected by comparing \p AS against the local address space;
	/// every other value of \p AS takes the "private" variant.
	///
	/// On subtargets with aperture registers the value is read with S_GETREG_B32
	/// and shifted into position. Otherwise it is loaded from the queue
	/// descriptor reached through the queue pointer user SGPR.
	SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
	                                             SelectionDAG &DAG) const {
	  // FIXME: Use inline constants (src_{shared, private}_base) instead.
	  if (Subtarget->hasApertureRegs()) {
	    unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
	        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
	        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
	    unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
	        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
	        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
	    // Pack register id, bit offset and (width - 1) into the hwreg immediate.
	    unsigned Encoding =
	        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
	        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
	        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

	    SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
	    SDValue ApertureReg = SDValue(
	        DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
	    // The extracted field is shifted left by its own width (WidthM1 + 1).
	    SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
	    return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
	  }

	  MachineFunction &MF = DAG.getMachineFunction();
	  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
	  unsigned UserSGPR = Info->getQueuePtrUserSGPR();
	  assert(UserSGPR != AMDGPU::NoRegister);

	  SDValue QueuePtr = CreateLiveInRegister(
	    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

	  // Offset into amd_queue_t for group_segment_aperture_base_hi /
	  // private_segment_aperture_base_hi.
	  uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;

	  SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);

	  // TODO: Use custom target PseudoSourceValue.
	  // TODO: We should use the value from the IR intrinsic call, but it might not
	  // be available and how do we get it?
	  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
	                                              AMDGPUASI.CONSTANT_ADDRESS));

	  MachinePointerInfo PtrInfo(V, StructOffset);
	  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
	                     MinAlign(64, StructOffset),
	                     MachineMemOperand::MODereferenceable |
	                         MachineMemOperand::MOInvariant);
	}

	/// Lowers an address space cast between the flat address space and the
	/// local/private segments.
	///
	/// flat -> segment keeps the low 32 bits, mapping the flat null pointer to
	/// the segment's null value. segment -> flat pairs the 32-bit pointer with
	/// the segment aperture to form a 64-bit address, mapping the segment null
	/// value to the flat null pointer. Any other cast is reported as invalid and
	/// lowered to UNDEF.
	SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  SDLoc SL(Op);
	  const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);

	  SDValue Src = ASC->getOperand(0);
	  SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);

	  const AMDGPUTargetMachine &TM =
	    static_cast<const AMDGPUTargetMachine &>(getTargetMachine());

	  // flat -> local/private
	  if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
	    unsigned DestAS = ASC->getDestAddressSpace();

	    if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
	        DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
	      unsigned NullVal = TM.getNullPointerValue(DestAS);
	      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
	      SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
	      SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);

	      return DAG.getNode(ISD::SELECT, SL, MVT::i32,
	                         NonNull, Ptr, SegmentNullPtr);
	    }
	  }

	  // local/private -> flat
	  if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
	    unsigned SrcAS = ASC->getSrcAddressSpace();

	    if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
	        SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
	      unsigned NullVal = TM.getNullPointerValue(SrcAS);
	      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);

	      SDValue NonNull
	        = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);

	      // Low element is the segment pointer, high element is the aperture.
	      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
	      SDValue CvtPtr
	        = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);

	      return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
	                         DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
	                         FlatNullPtr);
	    }
	  }

	  // global <-> flat are no-ops and never emitted.

	  const MachineFunction &MF = DAG.getMachineFunction();
	  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
	    MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
	  DAG.getContext()->diagnose(InvalidAddrSpaceCast);

	  return DAG.getUNDEF(ASC->getValueType(0));
	}

	// Custom lowering for INSERT_VECTOR_ELT with a non-constant index.
	//
	// The vector is handled as two 16-bit lanes packed into one 32-bit value
	// (it is bitcast to i32, and the inserted element to i16). The insert is
	// expressed as a bitfield insert built from generic bit operations:
	//
	//   mask   = 0xffff << (idx * 16)
	//   result = (mask & splat(val)) | (~mask & vec)
	//
	// Returns a null SDValue for a constant index, leaving that case to the
	// default handling.
	SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  SDValue Idx = Op.getOperand(2);
	  if (isa<ConstantSDNode>(Idx))
	    return SDValue();

	  // Avoid stack access for dynamic indexing.
	  SDLoc SL(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));

	  // v_bfi_b32 (v_bfm_b32 16, (shl idx, 4)), (splat val), vec
	  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);

	  // Replicate the 16-bit value into both lanes so that masking with BFM
	  // selects it in whichever lane the index names. Without this the value
	  // only ever sits in lane 0 and inserting into lane 1 would write zero.
	  SDValue HiVal = DAG.getNode(ISD::SHL, SL, MVT::i32, ExtVal,
	                              DAG.getConstant(16, SL, MVT::i32));
	  SDValue SplatVal = DAG.getNode(ISD::OR, SL, MVT::i32, ExtVal, HiVal);

	  // Convert vector index to bit-index: idx * 16 == idx << 4. Shifting the
	  // index left by 16 would give a bit offset of 65536 for lane 1, which is
	  // out of range for a 32-bit shift.
	  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
	                                  DAG.getConstant(4, SL, MVT::i32));

	  SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);

	  // 16-bit wide mask positioned over the selected lane.
	  SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
	                            DAG.getConstant(0xffff, SL, MVT::i32),
	                            ScaledIdx);

	  SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, SplatVal);
	  SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
	                            DAG.getNOT(SL, BFM, MVT::i32), BCVec);

	  SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
	  return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
	}

	// Custom lowering for EXTRACT_VECTOR_ELT on a vector that is handled as two
	// 16-bit lanes packed into one 32-bit value (the vector is bitcast to i32).
	//
	// The extract-element combine is tried first. Otherwise the element is
	// obtained by shifting the packed value right by the lane's bit offset,
	// truncating to i16 when the result type is narrower than 32 bits, and
	// bitcasting to the result type.
	SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  SDLoc SL(Op);

	  EVT ResultVT = Op.getValueType();
	  SDValue Vec = Op.getOperand(0);
	  SDValue Idx = Op.getOperand(1);

	  DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

	  // Make sure we do any optimizations that will make it easier to fold
	  // source modifiers before obscuring it with bit operations.

	  // XXX - Why doesn't this get called when vector_shuffle is expanded?
	  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
	    return Combined;

	  if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
	    SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);

	    // Lane 1 lives in the high half; lane 0 needs no shift.
	    if (CIdx->getZExtValue() == 1) {
	      Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
	                           DAG.getConstant(16, SL, MVT::i32));
	    } else {
	      assert(CIdx->getZExtValue() == 0);
	    }

	    if (ResultVT.bitsLT(MVT::i32))
	      Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
	    return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
	  }

	  // Convert vector index to bit-index: idx * 16 == idx << 4. Shifting the
	  // index left by 16 would give a bit offset of 65536 for lane 1, which is
	  // out of range for a 32-bit shift.
	  SDValue Log2EltBits = DAG.getConstant(4, SL, MVT::i32);
	  SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Log2EltBits);

	  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
	  SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);

	  SDValue Result = Elt;
	  if (ResultVT.bitsLT(MVT::i32))
	    Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);

	  return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
	}

	// Returns true when a constant offset may be folded into the global address
	// node GA: the global must be in the global or constant address space and
	// must not need a GOT relocation.
	bool
	SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
	// We can fold offsets for anything that doesn't require a GOT relocation.
	return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS \|\|
	GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
	!shouldEmitGOTReloc(GA->getGlobal());
	}

	// Builds an AMDGPUISD::PC_ADD_REL_OFFSET node of type PtrVT addressing
	// GV + Offset relative to the program counter.
	//
	// The node takes two 32-bit target global address operands (low and high
	// half). GAFlags is applied to the low operand; the high operand gets
	// GAFlags + 1 unless GAFlags is MO_NONE, in which case both use MO_NONE.
	static SDValue
	buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
	                        const SDLoc &DL, unsigned Offset, EVT PtrVT,
	                        unsigned GAFlags = SIInstrInfo::MO_NONE) {
	  // PC_ADD_REL_OFFSET is lowered to one of these sequences.
	  //
	  // For constant address space:
	  //   s_getpc_b64 s[0:1]
	  //   s_add_u32 s0, s0, $symbol
	  //   s_addc_u32 s1, s1, 0
	  //
	  // For global address space:
	  //   s_getpc_b64 s[0:1]
	  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
	  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
	  //
	  // In both, s_getpc_b64 yields the address of the s_add_u32 instruction,
	  // and fixups or relocations replace the $symbol operands with a literal
	  // pc-relative offset (split into low and high 32 bits in the second
	  // form). That offset is measured from the encoding of the $symbol operand
	  // to the global variable.
	  //
	  // The value wanted is the distance from the s_getpc result to the global,
	  // but the $symbol encoding begins 4 bytes after the start of s_add_u32.
	  // The relocated literal is therefore 4 bytes too small, and 4 is added to
	  // the requested offset to compensate.
	  const unsigned BiasedOffset = Offset + 4;

	  // MO_NONE is used unchanged for both halves; any other flag value is
	  // bumped by one for the high half.
	  const unsigned HiFlags =
	      (GAFlags != SIInstrInfo::MO_NONE) ? GAFlags + 1 : GAFlags;

	  SDValue LoHalf =
	      DAG.getTargetGlobalAddress(GV, DL, MVT::i32, BiasedOffset, GAFlags);
	  SDValue HiHalf =
	      DAG.getTargetGlobalAddress(GV, DL, MVT::i32, BiasedOffset, HiFlags);
	  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, LoHalf, HiHalf);
	}

	// Custom lowering for a global address node.
	//
	// Globals outside the constant and global address spaces that are not
	// functions are handed to AMDGPUTargetLowering::LowerGlobalAddress. For the
	// rest, the address is formed pc-relatively: directly when a fixup or a
	// pc-relative relocation can be used, otherwise by loading the address from
	// a pc-relative GOT slot.
	SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
	SDValue Op,
	SelectionDAG &DAG) const {
	GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
	const GlobalValue *GV = GSD->getGlobal();

	if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
	GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
	// FIXME: It isn't correct to rely on the type of the pointer. This should
	// be removed when address space 0 is 64-bit.
	!GV->getType()->getElementType()->isFunctionTy())
	return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

	SDLoc DL(GSD);
	EVT PtrVT = Op.getValueType();

	if (shouldEmitFixup(GV))
	return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
	else if (shouldEmitPCReloc(GV))
	return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
	SIInstrInfo::MO_REL32);

	// GOT path: compute the pc-relative address of the GOT entry (offset 0),
	// then load the global's address from it.
	SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
	SIInstrInfo::MO_GOTPCREL32);

	// getTypeForEVT takes the context by reference and returns a Type pointer,
	// which PointerType::get below consumes.
	Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
	PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
	const DataLayout &DataLayout = DAG.getDataLayout();
	unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
	// FIXME: Use a PseudoSourceValue once those can be assigned an address space.
	MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

	// The GOT load is marked dereferenceable and invariant.
	return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
	MachineMemOperand::MODereferenceable \|
	MachineMemOperand::MOInvariant);
	}

	// Emits the SI_INIT_M0 pseudo that writes V into m0, chained after Chain.
	//
	// The machine node has two results, a chain (MVT::Other) and glue
	// (MVT::Glue). Result 0 is returned; callers obtain the glue with
	// getValue(1) on the returned value.
	SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
	                                   const SDLoc &DL, SDValue V) const {
	  // Why a pseudo instead of the obvious alternatives:
	  //  - S_MOV_B32 cannot be used directly, because there is no way to specify
	  //    m0 as the destination register.
	  //  - CopyToReg cannot be used, because MachineCSE won't combine COPY
	  //    instructions, so we would end up with redundant moves to m0.
	  // The pseudo ensures we emit s_mov_b32 with m0 as the direct result.
	  return SDValue(DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other,
	                                    MVT::Glue, V, Chain),
	                 0);
	}

	// Loads a 32-bit kernel argument at Offset and wraps it in an AssertZext
	// node stating that only the low VT-sized bits can be non-zero. Op is used
	// only for its source location.
	SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
	                                                 SDValue Op,
	                                                 MVT VT,
	                                                 unsigned Offset) const {
	  SDLoc SL(Op);

	  SDValue Loaded = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
	                                            DAG.getEntryNode(), Offset, false);

	  // The local size values will have the hi 16-bits as zero.
	  SDValue NarrowVT = DAG.getValueType(VT);
	  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Loaded, NarrowVT);
	}

	// Reports a "non-hsa intrinsic with hsa target" unsupported-feature
	// diagnostic at DL for the current function, and returns an undef value of
	// type VT to stand in for the intrinsic's result.
	static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
	                                        EVT VT) {
	  const auto &Fn = DAG.getMachineFunction().getFunction();
	  DiagnosticInfoUnsupported Diag(Fn, "non-hsa intrinsic with hsa target",
	                                 DL.getDebugLoc());
	  DAG.getContext()->diagnose(Diag);
	  return DAG.getUNDEF(VT);
	}

	// Reports an "intrinsic not supported on subtarget" unsupported-feature
	// diagnostic at DL for the current function, and returns an undef value of
	// type VT to stand in for the intrinsic's result.
	static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
	                                         EVT VT) {
	  const auto &Fn = DAG.getMachineFunction().getFunction();
	  DiagnosticInfoUnsupported Diag(Fn, "intrinsic not supported on subtarget",
	                                 DL.getDebugLoc());
	  DAG.getContext()->diagnose(Diag);
	  return DAG.getUNDEF(VT);
	}

	// Custom lowering for intrinsic nodes that carry no chain.
	//
	// Operand 0 holds the intrinsic ID as a constant; the intrinsic's own
	// arguments start at operand 1. Each handled intrinsic is turned into a
	// preloaded-argument read, a kernel-argument load, or a target node.
	// Intrinsics not listed in the switch are returned unchanged (Op), and a few
	// cases return a null SDValue to fall back to the default handling.
	SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	SelectionDAG &DAG) const {
	MachineFunction &MF = DAG.getMachineFunction();
	auto MFI = MF.getInfo<SIMachineFunctionInfo>();

	EVT VT = Op.getValueType();
	SDLoc DL(Op);
	unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	// TODO: Should this propagate fast-math-flags?

	switch (IntrinsicID) {
	case Intrinsic::amdgcn_implicit_buffer_ptr: {
	if (getSubtarget()->isAmdCodeObjectV2(MF))
	return emitNonHSAIntrinsicError(DAG, DL, VT);
	return getPreloadedValue(DAG, *MFI, VT,
	AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
	}
	case Intrinsic::amdgcn_dispatch_ptr:
	case Intrinsic::amdgcn_queue_ptr: {
	// Diagnosed and lowered to undef unless isAmdCodeObjectV2(MF) holds.
	if (!Subtarget->isAmdCodeObjectV2(MF)) {
	DiagnosticInfoUnsupported BadIntrin(
	MF.getFunction(), "unsupported hsa intrinsic without hsa target",
	DL.getDebugLoc());
	DAG.getContext()->diagnose(BadIntrin);
	return DAG.getUNDEF(VT);
	}

	auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
	AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
	return getPreloadedValue(DAG, *MFI, VT, RegID);
	}
	case Intrinsic::amdgcn_implicitarg_ptr: {
	if (MFI->isEntryFunction())
	return getImplicitArgPtr(DAG, DL);
	return getPreloadedValue(DAG, *MFI, VT,
	AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
	}
	case Intrinsic::amdgcn_kernarg_segment_ptr: {
	return getPreloadedValue(DAG, *MFI, VT,
	AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
	}
	case Intrinsic::amdgcn_dispatch_id: {
	return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
	}
	case Intrinsic::amdgcn_rcp:
	return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
	case Intrinsic::amdgcn_rsq:
	return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
	case Intrinsic::amdgcn_rsq_legacy:
	if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
	return emitRemovedIntrinsicError(DAG, DL, VT);

	return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
	case Intrinsic::amdgcn_rcp_legacy:
	if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
	return emitRemovedIntrinsicError(DAG, DL, VT);
	return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
	case Intrinsic::amdgcn_rsq_clamp: {
	if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
	return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));

	// From VOLCANIC_ISLANDS on, the clamp is built explicitly: rsq followed by
	// fminnum/fmaxnum against the largest finite values of either sign.
	Type *Type = VT.getTypeForEVT(*DAG.getContext());
	APFloat Max = APFloat::getLargest(Type->getFltSemantics());
	APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);

	SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
	SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
	DAG.getConstantFP(Max, DL, VT));
	return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
	DAG.getConstantFP(Min, DL, VT));
	}
	// The r600_read_* size intrinsics below are rejected on an HSA OS and
	// otherwise read from fixed kernel-argument offsets.
	case Intrinsic::r600_read_ngroups_x:
	if (Subtarget->isAmdHsaOS())
	return emitNonHSAIntrinsicError(DAG, DL, VT);

	return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
	SI::KernelInputOffsets::NGROUPS_X, false);
	case Intrinsic::r600_read_ngroups_y:
	if (Subtarget->isAmdHsaOS())
	return emitNonHSAIntrinsicError(DAG, DL, VT);

	return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
	SI::KernelInputOffsets::NGROUPS_Y, false);
	case Intrinsic::r600_read_ngroups_z:
	if (Subtarget->isAmdHsaOS())
	return emitNonHSAIntrinsicError(DAG, DL, VT);

	return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
	SI::KernelInputOffsets::NGROUPS_Z, false);
	case Intrinsic::r600_read_global_size_x:
	if (Subtarget->isAmdHsaOS())
	return emitNonHSAIntrinsicError(DAG, DL, VT);

	return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
	SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
	case Intrinsic::r600_read_global_size_y:
	if (Subtarget->isAmdHsaOS())
	return emitNonHSAIntrinsicError(DAG, DL, VT);

	return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
	SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
	case Intrinsic::r600_read_global_size_z:
	if (Subtarget->isAmdHsaOS())
	return emitNonHSAIntrinsicError(DAG, DL, VT);

	return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
	SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
	case Intrinsic::r600_read_local_size_x:
	if (Subtarget->isAmdHsaOS())
	return emitNonHSAIntrinsicError(DAG, DL, VT);

	return lowerImplicitZextParam(DAG, Op, MVT::i16,
	SI::KernelInputOffsets::LOCAL_SIZE_X);
	case Intrinsic::r600_read_local_size_y:
	if (Subtarget->isAmdHsaOS())
	return emitNonHSAIntrinsicError(DAG, DL, VT);

	return lowerImplicitZextParam(DAG, Op, MVT::i16,
	SI::KernelInputOffsets::LOCAL_SIZE_Y);
	case Intrinsic::r600_read_local_size_z:
	if (Subtarget->isAmdHsaOS())
	return emitNonHSAIntrinsicError(DAG, DL, VT);

	return lowerImplicitZextParam(DAG, Op, MVT::i16,
	SI::KernelInputOffsets::LOCAL_SIZE_Z);
	case Intrinsic::amdgcn_workgroup_id_x:
	case Intrinsic::r600_read_tgid_x:
	return getPreloadedValue(DAG, *MFI, VT,
	AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
	case Intrinsic::amdgcn_workgroup_id_y:
	case Intrinsic::r600_read_tgid_y:
	return getPreloadedValue(DAG, *MFI, VT,
	AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
	case Intrinsic::amdgcn_workgroup_id_z:
	case Intrinsic::r600_read_tgid_z:
	return getPreloadedValue(DAG, *MFI, VT,
	AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
	// Note: the second case label sits inside the braces opened by the first;
	// both labels reach the same return.
	case Intrinsic::amdgcn_workitem_id_x: {
	case Intrinsic::r600_read_tidig_x:
	return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
	SDLoc(DAG.getEntryNode()),
	MFI->getArgInfo().WorkItemIDX);
	}
	case Intrinsic::amdgcn_workitem_id_y:
	case Intrinsic::r600_read_tidig_y:
	return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
	SDLoc(DAG.getEntryNode()),
	MFI->getArgInfo().WorkItemIDY);
	case Intrinsic::amdgcn_workitem_id_z:
	case Intrinsic::r600_read_tidig_z:
	return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
	SDLoc(DAG.getEntryNode()),
	MFI->getArgInfo().WorkItemIDZ);
	case AMDGPUIntrinsic::SI_load_const: {
	SDValue Ops[] = {
	Op.getOperand(1),
	Op.getOperand(2)
	};

	// Memory operand with no pointer info: an invariant, dereferenceable load
	// of VT's store size with alignment 4.
	MachineMemOperand *MMO = MF.getMachineMemOperand(
	MachinePointerInfo(),
	MachineMemOperand::MOLoad \| MachineMemOperand::MODereferenceable \|
	MachineMemOperand::MOInvariant,
	VT.getStoreSize(), 4);
	return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
	Op->getVTList(), Ops, VT, MMO);
	}
	case Intrinsic::amdgcn_fdiv_fast:
	return lowerFDIV_FAST(Op, DAG);
	// The interp intrinsics first copy their last operand into m0 and glue the
	// interpolation node to that copy.
	case Intrinsic::amdgcn_interp_mov: {
	SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
	SDValue Glue = M0.getValue(1);
	return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3), Glue);
	}
	case Intrinsic::amdgcn_interp_p1: {
	SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
	SDValue Glue = M0.getValue(1);
	return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3), Glue);
	}
	case Intrinsic::amdgcn_interp_p2: {
	SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
	SDValue Glue = SDValue(M0.getNode(), 1);
	return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
	Glue);
	}
	case Intrinsic::amdgcn_sin:
	return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));

	case Intrinsic::amdgcn_cos:
	return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));

	case Intrinsic::amdgcn_log_clamp: {
	// Before VOLCANIC_ISLANDS: fall back to the default handling. From then
	// on: diagnose and lower to undef.
	if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
	return SDValue();

	DiagnosticInfoUnsupported BadIntrin(
	MF.getFunction(), "intrinsic not supported on subtarget",
	DL.getDebugLoc());
	DAG.getContext()->diagnose(BadIntrin);
	return DAG.getUNDEF(VT);
	}
	case Intrinsic::amdgcn_ldexp:
	return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
	Op.getOperand(1), Op.getOperand(2));

	case Intrinsic::amdgcn_fract:
	return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));

	case Intrinsic::amdgcn_class:
	return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
	Op.getOperand(1), Op.getOperand(2));
	case Intrinsic::amdgcn_div_fmas:
	return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
	Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
	Op.getOperand(4));

	case Intrinsic::amdgcn_div_fixup:
	return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
	Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

	case Intrinsic::amdgcn_trig_preop:
	return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
	Op.getOperand(1), Op.getOperand(2));
	case Intrinsic::amdgcn_div_scale: {
	// 3rd parameter required to be a constant.
	const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
	if (!Param)
	return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);

	// Translate to the operands expected by the machine instruction. The
	// first parameter must be the same as the first instruction.
	SDValue Numerator = Op.getOperand(1);
	SDValue Denominator = Op.getOperand(2);

	// Note this order is opposite of the machine instruction's operations,
	// which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
	// intrinsic has the numerator as the first operand to match a normal
	// division operation.

	SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;

	return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
	Denominator, Numerator);
	}
	case Intrinsic::amdgcn_icmp: {
	// A non-constant or out-of-range predicate operand lowers to undef.
	const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
	if (!CD)
	return DAG.getUNDEF(VT);

	int CondCode = CD->getSExtValue();
	if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE \|\|
	CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
	return DAG.getUNDEF(VT);

	ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
	ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
	return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
	Op.getOperand(2), DAG.getCondCode(CCOpcode));
	}
	case Intrinsic::amdgcn_fcmp: {
	// A non-constant or out-of-range predicate operand lowers to undef.
	const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
	if (!CD)
	return DAG.getUNDEF(VT);

	int CondCode = CD->getSExtValue();
	if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE \|\|
	CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
	return DAG.getUNDEF(VT);

	FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
	ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
	return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
	Op.getOperand(2), DAG.getCondCode(CCOpcode));
	}
	case Intrinsic::amdgcn_fmed3:
	return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
	Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
	case Intrinsic::amdgcn_fmul_legacy:
	return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
	Op.getOperand(1), Op.getOperand(2));
	case Intrinsic::amdgcn_sffbh:
	return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
	case Intrinsic::amdgcn_sbfe:
	return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
	Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
	case Intrinsic::amdgcn_ubfe:
	return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
	Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
	- case Intrinsic::amdgcn_cvt_pkrtz: {
	- // FIXME: Stop adding cast if v2f16 legal.
	+ case Intrinsic::amdgcn_cvt_pkrtz:
	+ case Intrinsic::amdgcn_cvt_pknorm_i16:
	+ case Intrinsic::amdgcn_cvt_pknorm_u16:
	+ case Intrinsic::amdgcn_cvt_pk_i16:
	+ case Intrinsic::amdgcn_cvt_pk_u16: {
	+ // FIXME: Stop adding cast if v2f16/v2i16 are legal.
	EVT VT = Op.getValueType();
	- SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
	+ unsigned Opcode;
	+
	+ if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
	+ Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
	+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
	+ Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
	+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
	+ Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
	+ else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
	+ Opcode = AMDGPUISD::CVT_PK_I16_I32;
	+ else
	+ Opcode = AMDGPUISD::CVT_PK_U16_U32;
	+
	+ SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
	Op.getOperand(1), Op.getOperand(2));
	return DAG.getNode(ISD::BITCAST, DL, VT, Node);
	}
	case Intrinsic::amdgcn_wqm: {
	SDValue Src = Op.getOperand(1);
	return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
	0);
	}
	case Intrinsic::amdgcn_wwm: {
	SDValue Src = Op.getOperand(1);
	return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
	0);
	}
	case Intrinsic::amdgcn_image_getlod:
	case Intrinsic::amdgcn_image_getresinfo: {
	// The dmask operand is at index 3 for getresinfo and index 4 for getlod.
	unsigned Idx = (IntrinsicID == Intrinsic::amdgcn_image_getresinfo) ? 3 : 4;

	// Replace dmask with everything disabled with undef.
	const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(Idx));
	if (!DMask \|\| DMask->isNullValue())
	return DAG.getUNDEF(Op.getValueType());
	return SDValue();
	}
	default:
	return Op;
	}
	}

	// Custom lowering for chained intrinsic nodes that produce a value.
	//
	// Operand 0 is the chain and operand 1 holds the intrinsic ID as a constant;
	// the intrinsic's own arguments start at operand 2. The handled atomic and
	// buffer intrinsics are rebuilt as target memory-intrinsic nodes that reuse
	// the original node's memory operand. Image sample intrinsics are only
	// inspected for a disabled dmask. Everything else returns a null SDValue.
	SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
	SelectionDAG &DAG) const {
	unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
	SDLoc DL(Op);

	switch (IntrID) {
	case Intrinsic::amdgcn_atomic_inc:
	case Intrinsic::amdgcn_atomic_dec: {
	MemSDNode *M = cast<MemSDNode>(Op);
	unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
	AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
	SDValue Ops[] = {
	M->getOperand(0), // Chain
	M->getOperand(2), // Ptr
	M->getOperand(3) // Value
	};

	return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
	M->getMemoryVT(), M->getMemOperand());
	}
	case Intrinsic::amdgcn_buffer_load:
	case Intrinsic::amdgcn_buffer_load_format: {
	SDValue Ops[] = {
	Op.getOperand(0), // Chain
	Op.getOperand(2), // rsrc
	Op.getOperand(3), // vindex
	Op.getOperand(4), // offset
	Op.getOperand(5), // glc
	Op.getOperand(6) // slc
	};

	unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
	AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
	EVT VT = Op.getValueType();
	// The memory VT passed below is the integer form of the result type.
	EVT IntVT = VT.changeTypeToInteger();

	auto *M = cast<MemSDNode>(Op);
	return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
	M->getMemOperand());
	}
	case Intrinsic::amdgcn_tbuffer_load: {
	MemSDNode *M = cast<MemSDNode>(Op);
	SDValue Ops[] = {
	Op.getOperand(0), // Chain
	Op.getOperand(2), // rsrc
	Op.getOperand(3), // vindex
	Op.getOperand(4), // voffset
	Op.getOperand(5), // soffset
	Op.getOperand(6), // offset
	Op.getOperand(7), // dfmt
	Op.getOperand(8), // nfmt
	Op.getOperand(9), // glc
	Op.getOperand(10) // slc
	};

	EVT VT = Op.getValueType();

	return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
	Op->getVTList(), Ops, VT, M->getMemOperand());
	}
	case Intrinsic::amdgcn_buffer_atomic_swap:
	case Intrinsic::amdgcn_buffer_atomic_add:
	case Intrinsic::amdgcn_buffer_atomic_sub:
	case Intrinsic::amdgcn_buffer_atomic_smin:
	case Intrinsic::amdgcn_buffer_atomic_umin:
	case Intrinsic::amdgcn_buffer_atomic_smax:
	case Intrinsic::amdgcn_buffer_atomic_umax:
	case Intrinsic::amdgcn_buffer_atomic_and:
	case Intrinsic::amdgcn_buffer_atomic_or:
	case Intrinsic::amdgcn_buffer_atomic_xor: {
	SDValue Ops[] = {
	Op.getOperand(0), // Chain
	Op.getOperand(2), // vdata
	Op.getOperand(3), // rsrc
	Op.getOperand(4), // vindex
	Op.getOperand(5), // offset
	Op.getOperand(6) // slc
	};
	EVT VT = Op.getValueType();

	auto *M = cast<MemSDNode>(Op);
	unsigned Opcode = 0;

	// Map each buffer atomic intrinsic to its target node opcode; all share
	// the operand layout above.
	switch (IntrID) {
	case Intrinsic::amdgcn_buffer_atomic_swap:
	Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
	break;
	case Intrinsic::amdgcn_buffer_atomic_add:
	Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
	break;
	case Intrinsic::amdgcn_buffer_atomic_sub:
	Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
	break;
	case Intrinsic::amdgcn_buffer_atomic_smin:
	Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
	break;
	case Intrinsic::amdgcn_buffer_atomic_umin:
	Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
	break;
	case Intrinsic::amdgcn_buffer_atomic_smax:
	Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
	break;
	case Intrinsic::amdgcn_buffer_atomic_umax:
	Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
	break;
	case Intrinsic::amdgcn_buffer_atomic_and:
	Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
	break;
	case Intrinsic::amdgcn_buffer_atomic_or:
	Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
	break;
	case Intrinsic::amdgcn_buffer_atomic_xor:
	Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
	break;
	default:
	llvm_unreachable("unhandled atomic opcode");
	}

	return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
	M->getMemOperand());
	}

	case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
	SDValue Ops[] = {
	Op.getOperand(0), // Chain
	Op.getOperand(2), // src
	Op.getOperand(3), // cmp
	Op.getOperand(4), // rsrc
	Op.getOperand(5), // vindex
	Op.getOperand(6), // offset
	Op.getOperand(7) // slc
	};
	EVT VT = Op.getValueType();
	auto *M = cast<MemSDNode>(Op);

	return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
	Op->getVTList(), Ops, VT, M->getMemOperand());
	}

	// Basic sample.
	case Intrinsic::amdgcn_image_sample:
	case Intrinsic::amdgcn_image_sample_cl:
	case Intrinsic::amdgcn_image_sample_d:
	case Intrinsic::amdgcn_image_sample_d_cl:
	case Intrinsic::amdgcn_image_sample_l:
	case Intrinsic::amdgcn_image_sample_b:
	case Intrinsic::amdgcn_image_sample_b_cl:
	case Intrinsic::amdgcn_image_sample_lz:
	case Intrinsic::amdgcn_image_sample_cd:
	case Intrinsic::amdgcn_image_sample_cd_cl:

	// Sample with comparison.
	case Intrinsic::amdgcn_image_sample_c:
	case Intrinsic::amdgcn_image_sample_c_cl:
	case Intrinsic::amdgcn_image_sample_c_d:
	case Intrinsic::amdgcn_image_sample_c_d_cl:
	case Intrinsic::amdgcn_image_sample_c_l:
	case Intrinsic::amdgcn_image_sample_c_b:
	case Intrinsic::amdgcn_image_sample_c_b_cl:
	case Intrinsic::amdgcn_image_sample_c_lz:
	case Intrinsic::amdgcn_image_sample_c_cd:
	case Intrinsic::amdgcn_image_sample_c_cd_cl:

	// Sample with offsets.
	case Intrinsic::amdgcn_image_sample_o:
	case Intrinsic::amdgcn_image_sample_cl_o:
	case Intrinsic::amdgcn_image_sample_d_o:
	case Intrinsic::amdgcn_image_sample_d_cl_o:
	case Intrinsic::amdgcn_image_sample_l_o:
	case Intrinsic::amdgcn_image_sample_b_o:
	case Intrinsic::amdgcn_image_sample_b_cl_o:
	case Intrinsic::amdgcn_image_sample_lz_o:
	case Intrinsic::amdgcn_image_sample_cd_o:
	case Intrinsic::amdgcn_image_sample_cd_cl_o:

	// Sample with comparison and offsets.
	case Intrinsic::amdgcn_image_sample_c_o:
	case Intrinsic::amdgcn_image_sample_c_cl_o:
	case Intrinsic::amdgcn_image_sample_c_d_o:
	case Intrinsic::amdgcn_image_sample_c_d_cl_o:
	case Intrinsic::amdgcn_image_sample_c_l_o:
	case Intrinsic::amdgcn_image_sample_c_b_o:
	case Intrinsic::amdgcn_image_sample_c_b_cl_o:
	case Intrinsic::amdgcn_image_sample_c_lz_o:
	case Intrinsic::amdgcn_image_sample_c_cd_o:
	case Intrinsic::amdgcn_image_sample_c_cd_cl_o: {
	// Replace dmask with everything disabled with undef.
	// Operand 5 is read as the dmask; a non-constant dmask is treated the
	// same as a zero one. The result pairs undef with the incoming chain.
	const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
	if (!DMask \|\| DMask->isNullValue()) {
	SDValue Undef = DAG.getUNDEF(Op.getValueType());
	return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
	}

	return SDValue();
	}
	default:
	return SDValue();
	}
	}

	// Custom lowering for chained intrinsic nodes that produce no value.
	//
	// Operand 0 is the chain and operand 1 holds the intrinsic ID as a constant;
	// the intrinsic's own arguments start at operand 2. Exports, message sends,
	// exec initialisation, kill, barrier and buffer stores are rebuilt as
	// target nodes. Intrinsics not listed in the switch are returned unchanged
	// (Op).
	SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc DL(Op);
	SDValue Chain = Op.getOperand(0);
	unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
	MachineFunction &MF = DAG.getMachineFunction();

	switch (IntrinsicID) {
	case Intrinsic::amdgcn_exp: {
	// tgt, en, done and vm must be constants (cast<> asserts otherwise).
	const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
	const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
	const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
	const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));

	const SDValue Ops[] = {
	Chain,
	DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
	DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
	Op.getOperand(4), // src0
	Op.getOperand(5), // src1
	Op.getOperand(6), // src2
	Op.getOperand(7), // src3
	DAG.getTargetConstant(0, DL, MVT::i1), // compr
	DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
	};

	// A non-zero "done" operand selects the EXPORT_DONE node.
	unsigned Opc = Done->isNullValue() ?
	AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
	return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
	}
	case Intrinsic::amdgcn_exp_compr: {
	const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
	const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
	SDValue Src0 = Op.getOperand(4);
	SDValue Src1 = Op.getOperand(5);
	const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
	const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));

	// The compressed form carries two sources, each bitcast to f32; src2 and
	// src3 are undef and the compr flag is set.
	SDValue Undef = DAG.getUNDEF(MVT::f32);
	const SDValue Ops[] = {
	Chain,
	DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
	DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
	DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
	DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
	Undef, // src2
	Undef, // src3
	DAG.getTargetConstant(1, DL, MVT::i1), // compr
	DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
	};

	unsigned Opc = Done->isNullValue() ?
	AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
	return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
	}
	case Intrinsic::amdgcn_s_sendmsg:
	case Intrinsic::amdgcn_s_sendmsghalt: {
	unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
	AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
	// Operand 3 is copied into m0 first; the send node is chained and glued
	// to that copy.
	Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
	SDValue Glue = Chain.getValue(1);
	return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
	Op.getOperand(2), Glue);
	}
	case Intrinsic::amdgcn_init_exec: {
	return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
	Op.getOperand(2));
	}
	case Intrinsic::amdgcn_init_exec_from_input: {
	return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
	Op.getOperand(2), Op.getOperand(3));
	}
	case AMDGPUIntrinsic::AMDGPU_kill: {
	SDValue Src = Op.getOperand(2);
	if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
	// A non-negative constant folds the kill away: only the chain remains.
	if (!K->isNegative())
	return Chain;

	// A negative constant is replaced by the bit pattern of -1.0f.
	SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
	return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
	}

	SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
	return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
	}
	case Intrinsic::amdgcn_s_barrier: {
	// When optimizing, a barrier in a function whose maximum flat work-group
	// size fits in one wavefront is replaced by the WAVE_BARRIER machine node.
	if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
	const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
	unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
	if (WGSize <= ST.getWavefrontSize())
	return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
	Op.getOperand(0)), 0);
	}
	return SDValue();
	};
	case AMDGPUIntrinsic::SI_tbuffer_store: {

	// Extract vindex and voffset from vaddr as appropriate
	const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
	const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
	SDValue VAddr = Op.getOperand(5);

	SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);

	assert(!(OffEn->isOne() && IdxEn->isOne()) &&
	"Legacy intrinsic doesn't support both offset and index - use new version");

	// vaddr is used as the index or the offset depending on which enable bit
	// is set; the other one is zero.
	SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
	SDValue VOffset = OffEn->isOne() ? VAddr : Zero;

	// Deal with the vec-3 case
	const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
	auto Opcode = NumChannels->getZExtValue() == 3 ?
	AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;

	// Note: in this legacy form rsrc is operand 2 and vdata is operand 3.
	SDValue Ops[] = {
	Chain,
	Op.getOperand(3), // vdata
	Op.getOperand(2), // rsrc
	VIndex,
	VOffset,
	Op.getOperand(6), // soffset
	Op.getOperand(7), // inst_offset
	Op.getOperand(8), // dfmt
	Op.getOperand(9), // nfmt
	Op.getOperand(12), // glc
	Op.getOperand(13), // slc
	};

	assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
	"Value of tfe other than zero is unsupported");

	// Memory VT is the type of vdata (operand 3 here).
	EVT VT = Op.getOperand(3).getValueType();
	MachineMemOperand *MMO = MF.getMachineMemOperand(
	MachinePointerInfo(),
	MachineMemOperand::MOStore,
	VT.getStoreSize(), 4);
	return DAG.getMemIntrinsicNode(Opcode, DL,
	Op->getVTList(), Ops, VT, MMO);
	}

	case Intrinsic::amdgcn_tbuffer_store: {
	SDValue Ops[] = {
	Chain,
	Op.getOperand(2), // vdata
	Op.getOperand(3), // rsrc
	Op.getOperand(4), // vindex
	Op.getOperand(5), // voffset
	Op.getOperand(6), // soffset
	Op.getOperand(7), // offset
	Op.getOperand(8), // dfmt
	Op.getOperand(9), // nfmt
	Op.getOperand(10), // glc
	Op.getOperand(11) // slc
	};
	// NOTE(review): the memory VT is taken from operand 3, which the operand
	// list above labels rsrc, not from vdata (operand 2). This looks carried
	// over from the legacy case where vdata is operand 3 - confirm intent.
	EVT VT = Op.getOperand(3).getValueType();
	MachineMemOperand *MMO = MF.getMachineMemOperand(
	MachinePointerInfo(),
	MachineMemOperand::MOStore,
	VT.getStoreSize(), 4);
	return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
	Op->getVTList(), Ops, VT, MMO);
	}

	case Intrinsic::amdgcn_buffer_store:
	case Intrinsic::amdgcn_buffer_store_format: {
	SDValue Ops[] = {
	Chain,
	Op.getOperand(2), // vdata
	Op.getOperand(3), // rsrc
	Op.getOperand(4), // vindex
	Op.getOperand(5), // offset
	Op.getOperand(6), // glc
	Op.getOperand(7) // slc
	};
	// NOTE(review): as above, the memory VT comes from operand 3 (labelled
	// rsrc) rather than vdata (operand 2) - confirm intent.
	EVT VT = Op.getOperand(3).getValueType();
	MachineMemOperand *MMO = MF.getMachineMemOperand(
	MachinePointerInfo(),
	MachineMemOperand::MOStore \|
	MachineMemOperand::MODereferenceable,
	VT.getStoreSize(), 4);

	unsigned Opcode = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
	AMDGPUISD::BUFFER_STORE :
	AMDGPUISD::BUFFER_STORE_FORMAT;
	return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, MMO);
	}

	default:
	return Op;
	}
	}

	// Custom lowering for loads.
	//
	// Returns an empty SDValue when the load is left untouched, otherwise the
	// replacement:
	//  * non-extending loads narrower than 32 bits are widened to a 32-bit
	//    extending load followed by a truncate (an i16 load is kept when i16 is
	//    a legal type);
	//  * vector loads rejected by allowsMemoryAccess() are expanded with
	//    expandUnalignedLoad();
	//  * the remaining vector loads are kept, split or scalarized depending on
	//    the address space and the number of elements.
	SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
	SDLoc DL(Op);
	LoadSDNode *Load = cast<LoadSDNode>(Op);
	ISD::LoadExtType ExtType = Load->getExtensionType();
	EVT MemVT = Load->getMemoryVT();

	if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
	if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
	return SDValue();

	// FIXME: Copied from PPC
	// First, load into 32 bits, then truncate back to the memory type.

	SDValue Chain = Load->getChain();
	SDValue BasePtr = Load->getBasePtr();
	MachineMemOperand *MMO = Load->getMemOperand();

	// An i1 is read from memory as a byte; everything else reaching this point
	// is read as 16 bits.
	EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;

	SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
	BasePtr, RealMemVT, MMO);

	// Result 0 is the truncated value, result 1 is the chain of the new load.
	SDValue Ops[] = {
	DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
	NewLD.getValue(1)
	};

	return DAG.getMergeValues(Ops, DL);
	}

	// Scalar loads of 32 bits or more need no custom handling.
	if (!MemVT.isVector())
	return SDValue();

	assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
	"Custom lowering for non-i32 vectors hasn't been implemented.");

	unsigned AS = Load->getAddressSpace();
	if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
	AS, Load->getAlignment())) {
	SDValue Ops[2];
	std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
	return DAG.getMergeValues(Ops, DL);
	}

	MachineFunction &MF = DAG.getMachineFunction();
	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
	// If there is a possibility that flat instructions access scratch memory
	// then we need to use the same legalization rules we use for private.
	// NOTE(review): after this remap AS is never FLAT_ADDRESS, so the
	// FLAT_ADDRESS comparison further down looks redundant - confirm.
	if (AS == AMDGPUASI.FLAT_ADDRESS)
	AS = MFI->hasFlatScratchInit() ?
	AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;

	unsigned NumElements = MemVT.getVectorNumElements();
	if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
	// Uniform constant-address loads are kept as they are.
	if (isMemOpUniform(Load))
	return SDValue();
	// Non-uniform loads will be selected to MUBUF instructions, so they
	// have the same legalization requirements as global and private
	// loads.
	//
	}
	if (AS == AMDGPUASI.CONSTANT_ADDRESS \|\| AS == AMDGPUASI.GLOBAL_ADDRESS) {
	// With the scalarize-global behavior enabled, a uniform, non-volatile load
	// whose memory operand is not clobbered is also kept as it is.
	if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
	!Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
	return SDValue();
	// Non-uniform loads will be selected to MUBUF instructions, so they
	// have the same legalization requirements as global and private
	// loads.
	//
	}
	if (AS == AMDGPUASI.CONSTANT_ADDRESS \|\| AS == AMDGPUASI.GLOBAL_ADDRESS \|\|
	AS == AMDGPUASI.FLAT_ADDRESS) {
	if (NumElements > 4)
	return SplitVectorLoad(Op, DAG);
	// v4 loads are supported for private and global memory.
	return SDValue();
	}
	if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
	// Depending on the setting of the private_element_size field in the
	// resource descriptor, we can only make private accesses up to a certain
	// size.
	switch (Subtarget->getMaxPrivateElementSize()) {
	case 4:
	return scalarizeVectorLoad(Load, DAG);
	case 8:
	if (NumElements > 2)
	return SplitVectorLoad(Op, DAG);
	return SDValue();
	case 16:
	// Same as global/flat
	if (NumElements > 4)
	return SplitVectorLoad(Op, DAG);
	return SDValue();
	default:
	llvm_unreachable("unsupported private_element_size");
	}
	} else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
	if (NumElements > 2)
	return SplitVectorLoad(Op, DAG);

	// Two-element local loads are kept.
	if (NumElements == 2)
	return SDValue();

	// If properly aligned, if we split we might be able to use ds_read_b64.
	return SplitVectorLoad(Op, DAG);
	}
	// Any other address space: no custom lowering.
	return SDValue();
	}

	// Custom-lower an i64 select by viewing both value operands as v2i32,
	// selecting the low and the high dword separately with the same condition,
	// and bitcasting the rebuilt v2i32 back to i64. For any other result type an
	// empty SDValue is returned.
	SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
	  if (Op.getValueType() != MVT::i64)
	    return SDValue();

	  SDLoc SL(Op);
	  SDValue Pred = Op.getOperand(0);

	  // Element indices of the low and the high dword.
	  SDValue LoIdx = DAG.getConstant(0, SL, MVT::i32);
	  SDValue HiIdx = DAG.getConstant(1, SL, MVT::i32);

	  SDValue TrueVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
	  SDValue FalseVec =
	      DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(2));

	  // Low half.
	  SDValue TrueLo =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, TrueVec, LoIdx);
	  SDValue FalseLo =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, FalseVec, LoIdx);
	  SDValue SelLo = DAG.getSelect(SL, MVT::i32, Pred, TrueLo, FalseLo);

	  // High half.
	  SDValue TrueHi =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, TrueVec, HiIdx);
	  SDValue FalseHi =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, FalseVec, HiIdx);
	  SDValue SelHi = DAG.getSelect(SL, MVT::i32, Pred, TrueHi, FalseHi);

	  SDValue Halves = DAG.getBuildVector(MVT::v2i32, SL, {SelLo, SelHi});
	  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Halves);
	}

	// Catch division cases where we can use shortcuts with rcp and rsq
	// instructions.
	//
	// Op is the fdiv node (operand 0 is the numerator, operand 1 the
	// denominator). Returns the replacement node, or an empty SDValue when no
	// shortcut applies:
	//  * 1.0 / sqrt(x) -> rsq(x)
	//  * 1.0 / x       -> rcp(x)
	//  * -1.0 / x      -> rcp(fneg x)
	//  * x / y         -> x * rcp(y)   (only when unsafe math is allowed)
	SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc SL(Op);
	SDValue LHS = Op.getOperand(0);
	SDValue RHS = Op.getOperand(1);
	EVT VT = Op.getValueType();
	const SDNodeFlags Flags = Op->getFlags();
	// "Unsafe" is granted either globally through the target options or per
	// node through its fast-math flags.
	bool Unsafe = DAG.getTarget().Options.UnsafeFPMath \|\|
	Flags.hasUnsafeAlgebra() \|\| Flags.hasAllowReciprocal();

	// Without unsafe math, f32 with denormals enabled gets no shortcut at all.
	if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
	return SDValue();

	if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
	if (Unsafe \|\| VT == MVT::f32 \|\| VT == MVT::f16) {
	if (CLHS->isExactlyValue(1.0)) {
	// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
	// the CI documentation has a worst case error of 1 ulp.
	// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
	// use it as long as we aren't trying to use denormals.
	//
	// v_rcp_f16 and v_rsq_f16 DO support denormals.

	// 1.0 / sqrt(x) -> rsq(x)

	// XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
	// error seems really high at 2^29 ULP.
	if (RHS.getOpcode() == ISD::FSQRT)
	return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));

	// 1.0 / x -> rcp(x)
	return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
	}

	// Same as for 1.0, but expand the sign out of the constant.
	if (CLHS->isExactlyValue(-1.0)) {
	// -1.0 / x -> rcp (fneg x)
	SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
	return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
	}
	}
	}

	if (Unsafe) {
	// Turn into multiply by the reciprocal.
	// x / y -> x * (1.0 / y)
	// The multiply inherits the flags of the original fdiv.
	SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
	return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
	}

	return SDValue();
	}

	// Build a binary FP node for A and B. When GlueChain produces a single value
	// the plain Opcode node is returned. Otherwise GlueChain must produce three
	// values; its results 1 and 2 are threaded through as chain and glue and the
	// chained variant of the opcode is emitted (only ISD::FMUL has one).
	static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
	                          EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
	  if (GlueChain->getNumValues() <= 1)
	    return DAG.getNode(Opcode, SL, VT, A, B);

	  assert(GlueChain->getNumValues() == 3);

	  SDVTList ChainedVTs = DAG.getVTList(VT, MVT::Other, MVT::Glue);
	  if (Opcode != ISD::FMUL)
	    llvm_unreachable("no chain equivalent for opcode");

	  SDValue InChain = GlueChain.getValue(1);
	  SDValue InGlue = GlueChain.getValue(2);
	  return DAG.getNode(AMDGPUISD::FMUL_W_CHAIN, SL, ChainedVTs, InChain, A, B,
	                     InGlue);
	}

	// Build a ternary FP node for A, B and C. When GlueChain produces a single
	// value the plain Opcode node is returned. Otherwise GlueChain must produce
	// three values; its results 1 and 2 are threaded through as chain and glue
	// and the chained variant of the opcode is emitted (only ISD::FMA has one).
	static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
	                           EVT VT, SDValue A, SDValue B, SDValue C,
	                           SDValue GlueChain) {
	  if (GlueChain->getNumValues() <= 1)
	    return DAG.getNode(Opcode, SL, VT, A, B, C);

	  assert(GlueChain->getNumValues() == 3);

	  SDVTList ChainedVTs = DAG.getVTList(VT, MVT::Other, MVT::Glue);
	  if (Opcode != ISD::FMA)
	    llvm_unreachable("no chain equivalent for opcode");

	  SDValue InChain = GlueChain.getValue(1);
	  SDValue InGlue = GlueChain.getValue(2);
	  return DAG.getNode(AMDGPUISD::FMA_W_CHAIN, SL, ChainedVTs, InChain, A, B, C,
	                     InGlue);
	}

	// Lower an f16 fdiv. The rcp/rsq shortcuts are tried first; otherwise both
	// operands are extended to f32, the quotient is formed as
	// numerator * rcp(denominator), rounded back to f16 and passed through
	// DIV_FIXUP together with the original f16 operands.
	SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Shortcut = lowerFastUnsafeFDIV(Op, DAG);
	  if (Shortcut)
	    return Shortcut;

	  SDLoc DL(Op);
	  SDValue Num = Op.getOperand(0);
	  SDValue Den = Op.getOperand(1);

	  SDValue NumF32 = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Num);
	  SDValue DenF32 = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Den);

	  SDValue RcpDen = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, DenF32);
	  SDValue QuotF32 = DAG.getNode(ISD::FMUL, DL, MVT::f32, NumF32, RcpDen);

	  SDValue RoundFlag = DAG.getTargetConstant(0, DL, MVT::i32);
	  SDValue QuotF16 =
	      DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, QuotF32, RoundFlag);

	  return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, MVT::f16, QuotF16, Den, Num);
	}

	// Faster 2.5 ULP division that does not support denormals.
	//
	// Operand 1 of Op is the numerator and operand 2 the denominator. When
	// |denominator| exceeds 2^96 (bit pattern 0x6f800000) it is pre-scaled by
	// 2^-32 (bit pattern 0x2f800000) before taking the reciprocal, and the same
	// factor is multiplied back into the final product; otherwise the factor
	// is 1.0.
	SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue Num = Op.getOperand(1);
	  SDValue Den = Op.getOperand(2);

	  SDValue AbsDen = DAG.getNode(ISD::FABS, DL, MVT::f32, Den);

	  const APFloat HugeVal(BitsToFloat(0x6f800000));
	  const SDValue Huge = DAG.getConstantFP(HugeVal, DL, MVT::f32);

	  const APFloat DownScaleVal(BitsToFloat(0x2f800000));
	  const SDValue DownScale = DAG.getConstantFP(DownScaleVal, DL, MVT::f32);

	  const SDValue NoScale = DAG.getConstantFP(1.0, DL, MVT::f32);

	  EVT CmpVT =
	      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);

	  SDValue IsHuge = DAG.getSetCC(DL, CmpVT, AbsDen, Huge, ISD::SETOGT);

	  SDValue Factor =
	      DAG.getNode(ISD::SELECT, DL, MVT::f32, IsHuge, DownScale, NoScale);

	  // TODO: Should this propagate fast-math-flags?
	  SDValue ScaledDen = DAG.getNode(ISD::FMUL, DL, MVT::f32, Den, Factor);

	  // rcp does not support denormals.
	  SDValue Recip = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, ScaledDen);

	  SDValue Quot = DAG.getNode(ISD::FMUL, DL, MVT::f32, Num, Recip);

	  return DAG.getNode(ISD::FMUL, DL, MVT::f32, Factor, Quot);
	}

	// Lower an f32 fdiv.
	//
	// The rcp/rsq shortcuts of lowerFastUnsafeFDIV are tried first. Otherwise
	// the division is expanded into DIV_SCALE of both operands, an approximate
	// reciprocal, a chain of FMA/FMUL refinement steps, DIV_FMAS and DIV_FIXUP.
	// When the subtarget does not have FP32 denormals enabled, the refinement
	// steps are bracketed by two SETREG nodes that write the denormal-mode
	// bitfield, and they are emitted in their chained/glued form so that they
	// stay between those two writes.
	SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
	if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
	return FastLowered;

	SDLoc SL(Op);
	SDValue LHS = Op.getOperand(0);
	SDValue RHS = Op.getOperand(1);

	const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);

	// DIV_SCALE yields the scaled value plus an i1 flag.
	SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

	SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
	RHS, RHS, LHS);
	SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
	LHS, RHS, LHS);

	// Denominator is scaled to not be denormal, so using rcp is ok.
	SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
	DenominatorScaled);
	SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
	DenominatorScaled);

	// Hardware-register operand for SETREG: MODE register, bit offset 4,
	// width-minus-one 1.
	const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE \|
	(4 << AMDGPU::Hwreg::OFFSET_SHIFT_) \|
	(1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

	const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);

	if (!Subtarget->hasFP32Denormals()) {
	SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
	const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
	SL, MVT::i32);
	SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
	DAG.getEntryNode(),
	EnableDenormValue, BitField);
	// Bundle the negated scale with the chain and glue of the SETREG. The
	// resulting three-value node makes getFPTernOp/getFPBinOp below pick the
	// *_W_CHAIN opcodes and thread chain and glue from one step to the next.
	SDValue Ops[3] = {
	NegDivScale0,
	EnableDenorm.getValue(0),
	EnableDenorm.getValue(1)
	};

	NegDivScale0 = DAG.getMergeValues(Ops, SL);
	}

	// The last argument of each helper call is the node whose chain/glue
	// results (if present) are consumed by the new node.
	SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
	ApproxRcp, One, NegDivScale0);

	SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
	ApproxRcp, Fma0);

	SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
	Fma1, Fma1);

	SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
	NumeratorScaled, Mul);

	SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);

	SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
	NumeratorScaled, Fma3);

	if (!Subtarget->hasFP32Denormals()) {
	// Write the flush mode back after the last refinement step, consuming the
	// chain and glue produced by Fma4.
	const SDValue DisableDenormValue =
	DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
	SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
	Fma4.getValue(1),
	DisableDenormValue,
	BitField,
	Fma4.getValue(2));

	// Tie the second SETREG into the DAG root so that it is not dropped.
	SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
	DisableDenorm, DAG.getRoot());
	DAG.setRoot(OutputChain);
	}

	// The i1 result of the numerator DIV_SCALE is the scale flag for DIV_FMAS.
	SDValue Scale = NumeratorScaled.getValue(1);
	SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
	Fma4, Fma1, Fma3, Scale);

	return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
	}

	// Lower an f64 fdiv.
	//
	// With the global UnsafeFPMath option set, the result of
	// lowerFastUnsafeFDIV is returned directly. Otherwise the division X / Y is
	// expanded into DIV_SCALE of both operands, a reciprocal of the scaled
	// denominator, a sequence of FMA/FMUL refinement steps, DIV_FMAS and
	// DIV_FIXUP.
	SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
	if (DAG.getTarget().Options.UnsafeFPMath)
	return lowerFastUnsafeFDIV(Op, DAG);

	SDLoc SL(Op);
	SDValue X = Op.getOperand(0);
	SDValue Y = Op.getOperand(1);

	const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

	// DIV_SCALE yields the scaled value plus an i1 flag.
	SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

	// Scaled denominator.
	SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

	SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

	SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

	SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

	SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

	SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

	// Scaled numerator.
	SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

	SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
	SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

	SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
	NegDivScale0, Mul, DivScale1);

	SDValue Scale;

	if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
	// Workaround a hardware bug on SI where the condition output from div_scale
	// is not usable.

	// Index of the high dword in the v2i32 view of an f64.
	const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

	// Figure out the scale to use for div_fmas.
	SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
	SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
	SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
	SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

	SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
	SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

	SDValue Scale0Hi
	= DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
	SDValue Scale1Hi
	= DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

	// Compare the high dword of each operand with the high dword of its
	// scaled counterpart; the scale flag is the XOR of the two comparisons.
	SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
	SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
	Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
	} else {
	// Use the i1 result of the numerator DIV_SCALE directly.
	Scale = DivScale1.getValue(1);
	}

	SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
	Fma4, Fma3, Mul, Scale);

	return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
	}

	// Dispatch an fdiv to the lowering routine for its result type (f16, f32 or
	// f64). Any other type is unexpected here.
	SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
	  const EVT DivVT = Op.getValueType();

	  if (DivVT == MVT::f16)
	    return LowerFDIV16(Op, DAG);
	  if (DivVT == MVT::f32)
	    return LowerFDIV32(Op, DAG);
	  if (DivVT == MVT::f64)
	    return LowerFDIV64(Op, DAG);

	  llvm_unreachable("Unexpected type for fdiv");
	}

	// Custom lowering for stores.
	//
	//  * An i1 store becomes a truncating store of the value converted to i32.
	//  * Everything else must be a vector store of i32 elements. It is expanded
	//    with expandUnalignedStore() when allowsMemoryAccess() rejects it, and
	//    otherwise kept, split or scalarized depending on the address space and
	//    the number of elements.
	SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
	SDLoc DL(Op);
	StoreSDNode *Store = cast<StoreSDNode>(Op);
	EVT VT = Store->getMemoryVT();

	if (VT == MVT::i1) {
	return DAG.getTruncStore(Store->getChain(), DL,
	DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
	Store->getBasePtr(), MVT::i1, Store->getMemOperand());
	}

	assert(VT.isVector() &&
	Store->getValue().getValueType().getScalarType() == MVT::i32);

	unsigned AS = Store->getAddressSpace();
	if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
	AS, Store->getAlignment())) {
	return expandUnalignedStore(Store, DAG);
	}

	MachineFunction &MF = DAG.getMachineFunction();
	SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
	// If there is a possibility that flat instructions access scratch memory
	// then we need to use the same legalization rules we use for private.
	// NOTE(review): after this remap AS is never FLAT_ADDRESS, so the
	// FLAT_ADDRESS comparison below looks redundant - confirm.
	if (AS == AMDGPUASI.FLAT_ADDRESS)
	AS = MFI->hasFlatScratchInit() ?
	AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;

	unsigned NumElements = VT.getVectorNumElements();
	if (AS == AMDGPUASI.GLOBAL_ADDRESS \|\|
	AS == AMDGPUASI.FLAT_ADDRESS) {
	// Up to four elements are kept; wider stores are split.
	if (NumElements > 4)
	return SplitVectorStore(Op, DAG);
	return SDValue();
	} else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
	// The largest private access depends on the subtarget's maximum private
	// element size (in bytes).
	switch (Subtarget->getMaxPrivateElementSize()) {
	case 4:
	return scalarizeVectorStore(Store, DAG);
	case 8:
	if (NumElements > 2)
	return SplitVectorStore(Op, DAG);
	return SDValue();
	case 16:
	if (NumElements > 4)
	return SplitVectorStore(Op, DAG);
	return SDValue();
	default:
	llvm_unreachable("unsupported private_element_size");
	}
	} else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
	if (NumElements > 2)
	return SplitVectorStore(Op, DAG);

	// Two-element local stores are returned unchanged (note: Op itself, not an
	// empty SDValue as in the other address spaces).
	if (NumElements == 2)
	return Op;

	// If properly aligned, if we split we might be able to use ds_write_b64.
	return SplitVectorStore(Op, DAG);
	} else {
	llvm_unreachable("unhandled address space");
	}
	}

	// Lower FSIN/FCOS to the hardware SIN_HW/COS_HW nodes. The argument is first
	// multiplied by 0.5/M_PI and reduced with FRACT before being handed to the
	// hardware node.
	SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc SL(Op);
	  EVT Ty = Op.getValueType();
	  SDValue Angle = Op.getOperand(0);

	  // TODO: Should this propagate fast-math-flags?
	  SDValue InvTwoPi = DAG.getConstantFP(0.5/M_PI, SL, Ty);
	  SDValue Turns = DAG.getNode(ISD::FMUL, SL, Ty, Angle, InvTwoPi);
	  SDValue Fract = DAG.getNode(AMDGPUISD::FRACT, SL, Ty, Turns);

	  unsigned HWOpc;
	  switch (Op.getOpcode()) {
	  case ISD::FSIN:
	    HWOpc = AMDGPUISD::SIN_HW;
	    break;
	  case ISD::FCOS:
	    HWOpc = AMDGPUISD::COS_HW;
	    break;
	  default:
	    llvm_unreachable("Wrong trig opcode");
	  }

	  return DAG.getNode(HWOpc, SDLoc(Op), Ty, Fract);
	}

	// Lower an atomic compare-and-swap. Nodes for which isFlatGlobalAddrSpace()
	// is false are returned unchanged. Otherwise the new value and the compare
	// value are packed, in that order, into a two-element vector and the node is
	// rebuilt as an AMDGPUISD::ATOMIC_CMP_SWAP memory intrinsic.
	SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
	  AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
	  assert(AtomicNode->isCompareAndSwap());

	  // No custom lowering required for local address space
	  const unsigned AddrSpace = AtomicNode->getAddressSpace();
	  if (!isFlatGlobalAddrSpace(AddrSpace, AMDGPUASI))
	    return Op;

	  // Non-local address space requires custom lowering for atomic compare
	  // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
	  SDLoc SL(Op);
	  SDValue InChain = Op.getOperand(0);
	  SDValue Ptr = Op.getOperand(1);
	  SDValue CmpVal = Op.getOperand(2);
	  SDValue SwapVal = Op.getOperand(3);

	  EVT ResVT = Op.getValueType();
	  MVT ElemVT = ResVT.getSimpleVT();
	  MVT PairVT = MVT::getVectorVT(ElemVT, 2);

	  SDValue Packed = DAG.getBuildVector(PairVT, SL, {SwapVal, CmpVal});
	  SDValue NewOps[] = { InChain, Ptr, Packed };

	  return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, SL,
	                                 Op->getVTList(), NewOps, ResVT,
	                                 AtomicNode->getMemOperand());
	}

	//===----------------------------------------------------------------------===//
	// Custom DAG optimizations
	//===----------------------------------------------------------------------===//

	// Combine an integer-to-f32 conversion whose i32 source is known to have its
	// upper 24 bits clear into AMDGPUISD::CVT_F32_UBYTE0. Only done after vector
	// operations have been legalized; returns an empty SDValue otherwise.
	SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
	                                                     DAGCombinerInfo &DCI) const {
	  EVT ResVT = N->getValueType(0);
	  if (ResVT.getScalarType() != MVT::f32)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Src = N->getOperand(0);

	  // TODO: We could try to match extracting the higher bytes, which would be
	  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
	  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
	  // about in practice.
	  if (!DCI.isAfterLegalizeVectorOps())
	    return SDValue();
	  if (Src.getValueType() != MVT::i32)
	    return SDValue();
	  if (!DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24)))
	    return SDValue();

	  SDLoc SL(N);
	  SDValue Converted = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, SL, ResVT, Src);
	  DCI.AddToWorklist(Converted.getNode());
	  return Converted;
	}

	// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
	//
	// This is a variant of
	// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
	//
	// The normal DAG combiner will do this, but only if the add has one use since
	// that would increase the number of instructions.
	//
	// This prevents us from seeing a constant offset that can be folded into a
	// memory instruction's addressing mode. If we know the resulting add offset of
	// a pointer can be folded into an addressing offset, we can replace the pointer
	// operand with the add of new constant offset. This eliminates one of the uses,
	// and may allow the remaining use to also be simplified.
	//
	// N is the shl node used as a pointer, AddrSpace and MemVT describe the
	// memory access that uses it. Returns the rewritten pointer, or an empty
	// SDValue when the pattern does not match or the shifted constant is not a
	// legal addressing-mode offset.
	SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
	                                               unsigned AddrSpace,
	                                               EVT MemVT,
	                                               DAGCombinerInfo &DCI) const {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);

	  // We only do this to handle cases where it's profitable when there are
	  // multiple uses of the add, so defer to the standard combine.
	  const unsigned N0Opc = N0.getOpcode();
	  if (N0Opc != ISD::ADD && N0Opc != ISD::OR)
	    return SDValue();
	  if (N0->hasOneUse())
	    return SDValue();

	  const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
	  if (!CN1)
	    return SDValue();

	  const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	  if (!CAdd)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;

	  // If the resulting offset is too large, we can't fold it into the addressing
	  // mode offset.
	  APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
	  // getTypeForEVT() takes the context by reference and returns a pointer to
	  // the IR type.
	  LLVMContext &Ctx = *DAG.getContext();
	  Type *Ty = MemVT.getTypeForEVT(Ctx);

	  AddrMode AM;
	  AM.HasBaseReg = true;
	  AM.BaseOffs = Offset.getSExtValue();
	  if (!isLegalAddressingMode(DAG.getDataLayout(), AM, Ty, AddrSpace))
	    return SDValue();

	  SDLoc SL(N);
	  EVT VT = N->getValueType(0);

	  SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
	  // The offset has the bit width of the shifted value, so the constant must
	  // be created with VT (not a fixed i32) to be a valid operand of the add.
	  SDValue COffset = DAG.getConstant(Offset, SL, VT);

	  // The new add may keep nuw only if the shl had it and the inner operation
	  // could not wrap either (an OR, or an ADD that itself carried nuw).
	  const bool InnerNoWrap =
	      N0Opc == ISD::OR ? true : N0->getFlags().hasNoUnsignedWrap();
	  SDNodeFlags Flags;
	  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() && InnerNoWrap);

	  return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
	}

	// Try to rewrite the base pointer of a memory node when it is a shl that
	// performSHLPtrCombine can turn into (add (shl x, c2), offset). On success
	// the node's operands are updated in place and the node is returned;
	// otherwise an empty SDValue is returned.
	SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
	                                                  DAGCombinerInfo &DCI) const {
	  SDValue BasePtr = N->getBasePtr();

	  // TODO: We could also do this for multiplies.
	  if (BasePtr.getOpcode() != ISD::SHL)
	    return SDValue();

	  SDValue Rewritten = performSHLPtrCombine(BasePtr.getNode(),
	                                           N->getAddressSpace(),
	                                           N->getMemoryVT(), DCI);
	  if (!Rewritten)
	    return SDValue();

	  // The pointer is operand 2 of a store and operand 1 of every other memory
	  // node handled here.
	  const unsigned PtrIdx = N->getOpcode() == ISD::STORE ? 2 : 1;

	  SmallVector<SDValue, 8> Operands(N->op_begin(), N->op_end());
	  Operands[PtrIdx] = Rewritten;
	  return SDValue(DCI.DAG.UpdateNodeOperands(N, Operands), 0);
	}

	// Returns true when the bit operation Opc with the 32-bit constant Val is
	// trivially reducible: AND/OR with all-zeros or all-ones, XOR with zero.
	static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
	  switch (Opc) {
	  case ISD::AND:
	  case ISD::OR:
	    if (Val == 0)
	      return true;
	    return Val == 0xffffffff;
	  case ISD::XOR:
	    return Val == 0;
	  default:
	    return false;
	  }
	}

	// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
	// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
	// integer combine opportunities since most 64-bit operations are decomposed
	// this way. TODO: We won't want this for SALU especially if it is an inline
	// immediate.
	//
	// The split is done when either 32-bit half of the constant makes the
	// operation trivially reducible, or when the constant has a single use and is
	// not an inline constant. Otherwise an empty SDValue is returned.
	SDValue SITargetLowering::splitBinaryBitConstantOp(
	    DAGCombinerInfo &DCI,
	    const SDLoc &SL,
	    unsigned Opc, SDValue LHS,
	    const ConstantSDNode *CRHS) const {
	  const uint64_t Imm = CRHS->getZExtValue();
	  const uint32_t ImmLo = Lo_32(Imm);
	  const uint32_t ImmHi = Hi_32(Imm);
	  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

	  bool ShouldSplit = bitOpWithConstantIsReducible(Opc, ImmLo);
	  if (!ShouldSplit)
	    ShouldSplit = bitOpWithConstantIsReducible(Opc, ImmHi);
	  if (!ShouldSplit) {
	    // If we need to materialize a 64-bit immediate, it will be split up later
	    // anyway. Avoid creating the harder to understand 64-bit immediate
	    // materialization.
	    ShouldSplit =
	        CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue());
	  }

	  if (!ShouldSplit)
	    return SDValue();

	  return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ImmLo, ImmHi);
	}

	// Returns true if the argument is a boolean value which is not serialized
	// into memory or an argument and does not require v_cndmask_b32 to be
	// deserialized. That is the case for an i1 produced by SETCC, AND, OR, XOR or
	// FP_CLASS.
	static bool isBoolSGPR(SDValue V) {
	  if (V.getValueType() != MVT::i1)
	    return false;

	  switch (V.getOpcode()) {
	  case ISD::SETCC:
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	  case AMDGPUISD::FP_CLASS:
	    return true;
	  default:
	    return false;
	  }
	}

	// Target combines for ISD::AND (not run before legalization):
	//  1. i64 AND with a constant is split into two 32-bit operations when
	//     splitBinaryBitConstantOp() decides so.
	//  2. i32 (and (srl x, c), mask) with an 8- or 16-bit shifted mask becomes a
	//     BFE_U32 followed by a shift, on subtargets with SDWA.
	//  3. (and (fcmp ord x, x), (fcmp une (fabs x), inf)) becomes an FP_CLASS
	//     test.
	//  4. i32 (and x, (sext i1 cc)) becomes (select cc, x, 0) when cc passes
	//     isBoolSGPR().
	// Returns an empty SDValue when nothing applies.
	SDValue SITargetLowering::performAndCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	if (DCI.isBeforeLegalize())
	return SDValue();

	SelectionDAG &DAG = DCI.DAG;
	EVT VT = N->getValueType(0);
	SDValue LHS = N->getOperand(0);
	SDValue RHS = N->getOperand(1);


	const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
	if (VT == MVT::i64 && CRHS) {
	if (SDValue Split
	= splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
	return Split;
	}

	if (CRHS && VT == MVT::i32) {
	// and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
	// nb = number of trailing zeroes in mask
	// It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
	// given that we are selecting 8 or 16 bit fields starting at byte boundary.
	uint64_t Mask = CRHS->getZExtValue();
	unsigned Bits = countPopulation(Mask);
	// Require a contiguous run of exactly 8 or 16 set bits that does not start
	// at bit 0.
	if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
	(Bits == 8 \|\| Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
	if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
	unsigned Shift = CShift->getZExtValue();
	unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
	// Bit position of the field inside the unshifted source value.
	unsigned Offset = NB + Shift;
	if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
	SDLoc SL(N);
	SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
	LHS->getOperand(0),
	DAG.getConstant(Offset, SL, MVT::i32),
	DAG.getConstant(Bits, SL, MVT::i32));
	// Record that only the low Bits bits of the extracted field can be set.
	EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
	SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
	DAG.getValueType(NarrowVT));
	SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
	DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
	return Shl;
	}
	}
	}
	}

	// (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
	// fp_class x, ~(s_nan \| q_nan \| n_infinity \| p_infinity)
	// NOTE(review): the early "return SDValue()" statements in this section also
	// skip the sign-extend fold at the end of the function - confirm intended.
	if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
	ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
	ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

	SDValue X = LHS.getOperand(0);
	SDValue Y = RHS.getOperand(0);
	if (Y.getOpcode() != ISD::FABS \|\| Y.getOperand(0) != X)
	return SDValue();

	if (LCC == ISD::SETO) {
	// The ordered compare must be of x against itself.
	if (X != LHS.getOperand(1))
	return SDValue();

	if (RCC == ISD::SETUNE) {
	// The right-hand compare must be against +infinity.
	const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
	if (!C1 \|\| !C1->isInfinity() \|\| C1->isNegative())
	return SDValue();

	const uint32_t Mask = SIInstrFlags::N_NORMAL \|
	SIInstrFlags::N_SUBNORMAL \|
	SIInstrFlags::N_ZERO \|
	SIInstrFlags::P_ZERO \|
	SIInstrFlags::P_SUBNORMAL \|
	SIInstrFlags::P_NORMAL;

	// Compile-time check that Mask is the 10-bit complement of the NaN and
	// infinity classes.
	static_assert(((~(SIInstrFlags::S_NAN \|
	SIInstrFlags::Q_NAN \|
	SIInstrFlags::N_INFINITY \|
	SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
	"mask not equal");

	SDLoc DL(N);
	return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
	X, DAG.getConstant(Mask, DL, MVT::i32));
	}
	}
	}

	if (VT == MVT::i32 &&
	(RHS.getOpcode() == ISD::SIGN_EXTEND \|\| LHS.getOpcode() == ISD::SIGN_EXTEND)) {
	// and x, (sext cc from i1) => select cc, x, 0
	// Canonicalize so that the sign-extend is on the right-hand side.
	if (RHS.getOpcode() != ISD::SIGN_EXTEND)
	std::swap(LHS, RHS);
	if (isBoolSGPR(RHS.getOperand(0)))
	return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
	LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
	}

	return SDValue();
	}

	// Target combines for ISD::OR:
	//  * i1:  or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 \| c2)
	//  * i64: or with a zero-extended i32 only touches the low half, so it is
	//         done as a 32-bit OR and the high half is passed through;
	//  * i64: or with a constant is split via splitBinaryBitConstantOp().
	// Returns an empty SDValue when nothing applies.
	SDValue SITargetLowering::performOrCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	SDValue LHS = N->getOperand(0);
	SDValue RHS = N->getOperand(1);

	EVT VT = N->getValueType(0);
	if (VT == MVT::i1) {
	// or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 \| c2)
	if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
	RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
	// Both class tests must inspect the same value.
	SDValue Src = LHS.getOperand(0);
	if (Src != RHS.getOperand(0))
	return SDValue();

	const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
	const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
	if (!CLHS \|\| !CRHS)
	return SDValue();

	// Only 10 bits are used.
	static const uint32_t MaxMask = 0x3ff;

	uint32_t NewMask = (CLHS->getZExtValue() \| CRHS->getZExtValue()) & MaxMask;
	SDLoc DL(N);
	return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
	Src, DAG.getConstant(NewMask, DL, MVT::i32));
	}

	return SDValue();
	}

	if (VT != MVT::i64)
	return SDValue();

	// TODO: This could be a generic combine with a predicate for extracting the
	// high half of an integer being free.

	// (or i64:x, (zero_extend i32:y)) ->
	// i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
	// Canonicalize so that a lone zero_extend ends up on the right-hand side.
	if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
	RHS.getOpcode() != ISD::ZERO_EXTEND)
	std::swap(LHS, RHS);

	if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
	SDValue ExtSrc = RHS.getOperand(0);
	EVT SrcVT = ExtSrc.getValueType();
	if (SrcVT == MVT::i32) {
	SDLoc SL(N);
	SDValue LowLHS, HiBits;
	std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
	SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

	// Queue the new pieces so that they get combined as well.
	DCI.AddToWorklist(LowOr.getNode());
	DCI.AddToWorklist(HiBits.getNode());

	SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
	LowOr, HiBits);
	return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
	}
	}

	// NOTE(review): the constant is taken from the node's original operand 1,
	// while LHS may have been swapped with RHS above - confirm the two always
	// agree when this point is reached with a constant operand.
	const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (CRHS) {
	if (SDValue Split
	= splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
	return Split;
	}

	return SDValue();
	}

	// Target combine for ISD::XOR: an i64 xor with a constant right-hand side is
	// handed to splitBinaryBitConstantOp(). Returns an empty SDValue when the
	// node is not i64, the right-hand side is not a constant, or no split is
	// produced.
	SDValue SITargetLowering::performXorCombine(SDNode *N,
	                                            DAGCombinerInfo &DCI) const {
	  if (N->getValueType(0) != MVT::i64)
	    return SDValue();

	  SDValue Src = N->getOperand(0);
	  const ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!Imm)
	    return SDValue();

	  SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, Src, Imm);
	  if (Split)
	    return Split;

	  return SDValue();
	}

	// Returns true for opcodes that will be lowered with a final instruction
	// that zeros the high result bits.
	// XXX - probably only need to list legal operations.
	static bool fp16SrcZerosHighBits(unsigned Opc) {
	  switch (Opc) {
	  // Generic arithmetic and conversions.
	  case ISD::FADD:
	  case ISD::FSUB:
	  case ISD::FMUL:
	  case ISD::FDIV:
	  case ISD::FREM:
	  case ISD::FMA:
	  case ISD::FMAD:
	  case ISD::FCANONICALIZE:
	  case ISD::FP_ROUND:
	  case ISD::UINT_TO_FP:
	  case ISD::SINT_TO_FP:
	  // Fabs is lowered to a bit operation, but it's an and which will clear the
	  // high bits anyway.
	  case ISD::FABS:
	  // Math functions.
	  case ISD::FSQRT:
	  case ISD::FSIN:
	  case ISD::FCOS:
	  case ISD::FPOWI:
	  case ISD::FPOW:
	  case ISD::FLOG:
	  case ISD::FLOG2:
	  case ISD::FLOG10:
	  case ISD::FEXP:
	  case ISD::FEXP2:
	  // Rounding and min/max.
	  case ISD::FCEIL:
	  case ISD::FTRUNC:
	  case ISD::FRINT:
	  case ISD::FNEARBYINT:
	  case ISD::FROUND:
	  case ISD::FFLOOR:
	  case ISD::FMINNUM:
	  case ISD::FMAXNUM:
	  // Target nodes.
	  case AMDGPUISD::FRACT:
	  case AMDGPUISD::CLAMP:
	  case AMDGPUISD::COS_HW:
	  case AMDGPUISD::SIN_HW:
	  case AMDGPUISD::FMIN3:
	  case AMDGPUISD::FMAX3:
	  case AMDGPUISD::FMED3:
	  case AMDGPUISD::FMAD_FTZ:
	  case AMDGPUISD::RCP:
	  case AMDGPUISD::RSQ:
	  case AMDGPUISD::LDEXP:
	    return true;
	  default:
	    break;
	  }

	  // fcopysign, select and others may be lowered to 32-bit bit operations
	  // which don't zero the high bits.
	  return false;
	}

	/// Target combine for ISD::ZERO_EXTEND.
	///
	/// On subtargets with 16-bit instructions, and only once the combine level
	/// has reached AfterLegalizeDAG, rewrites
	///   (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
	/// when $src is produced by an opcode accepted by fp16SrcZerosHighBits.
	/// Returns an empty SDValue otherwise.
	SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
	                                                   DAGCombinerInfo &DCI) const {
	  const bool Applicable = Subtarget->has16BitInsts() &&
	                          DCI.getDAGCombineLevel() >= AfterLegalizeDAG;
	  if (!Applicable)
	    return SDValue();

	  EVT ResVT = N->getValueType(0);
	  if (ResVT != MVT::i32)
	    return SDValue();

	  SDValue Narrow = N->getOperand(0);
	  if (Narrow.getValueType() != MVT::i16)
	    return SDValue();

	  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
	  if (Narrow.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  SDValue FPSrc = Narrow.getOperand(0);
	  if (FPSrc.getValueType() != MVT::f16 ||
	      !fp16SrcZerosHighBits(FPSrc.getOpcode()))
	    return SDValue();

	  return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), ResVT, FPSrc);
	}

	/// Target combine for AMDGPUISD::FP_CLASS (value in operand 0, test mask in
	/// operand 1). Folds a constant-zero mask to the i1 constant 0 and an undef
	/// value operand to an i1 undef; otherwise returns an empty SDValue.
	SDValue SITargetLowering::performClassCombine(SDNode *N,
	                                              DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;

	  // fp_class x, 0 -> false
	  const ConstantSDNode *ConstMask = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (ConstMask && ConstMask->isNullValue())
	    return DAG.getConstant(0, SDLoc(N), MVT::i1);

	  // fp_class undef, mask -> undef
	  return N->getOperand(0).isUndef() ? DAG.getUNDEF(MVT::i1) : SDValue();
	}

	/// Returns true if the target lowering reports no floating-point exceptions,
	/// or if \p Op is known never to be a NaN at all.
	static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
	  const bool HasFPExceptions =
	      DAG.getTargetLoweringInfo().hasFloatingPointExceptions();
	  return !HasFPExceptions || DAG.isKnownNeverNaN(Op);
	}

	/// Conservatively decides whether \p Op is already in canonical form, based
	/// on the opcode that produced it.
	///
	/// \p MaxDepth bounds the recursion through operations whose result is only
	/// canonical if their inputs are (fneg/fabs and the min/max family).
	/// Opcodes not listed here yield false.
	static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
	                            const SISubtarget *ST, unsigned MaxDepth=5) {
	  // Depth-limited check of a single operand of Op.
	  auto OperandIsCanonical = [&](unsigned Idx) {
	    return (MaxDepth > 0) &&
	           isCanonicalized(DAG, Op.getOperand(Idx), ST, MaxDepth - 1);
	  };

	  switch (Op.getOpcode()) {
	  // Results of these standard FP operations are already canonical; they will
	  // flush denorms if required.
	  case ISD::FADD:
	  case ISD::FSUB:
	  case ISD::FMUL:
	  case ISD::FSQRT:
	  case ISD::FCEIL:
	  case ISD::FFLOOR:
	  case ISD::FMA:
	  case ISD::FMAD:
	  case ISD::FCANONICALIZE:
	    return true;

	  case ISD::FP_ROUND: {
	    const bool ResultIsF16 = Op.getValueType().getScalarType() == MVT::f16;
	    return !ResultIsF16 || ST->hasFP16Denormals();
	  }

	  case ISD::FP_EXTEND: {
	    const bool SourceIsF16 =
	        Op.getOperand(0).getValueType().getScalarType() == MVT::f16;
	    return !SourceIsF16 || ST->hasFP16Denormals();
	  }

	  case ISD::FP16_TO_FP:
	  case ISD::FP_TO_FP16:
	    return ST->hasFP16Denormals();

	  // It can/will be lowered or combined as a bit operation, so the answer
	  // depends entirely on the input.
	  case ISD::FNEG:
	  case ISD::FABS:
	    return OperandIsCanonical(0);

	  case ISD::FSIN:
	  case ISD::FCOS:
	  case ISD::FSINCOS:
	    return Op.getValueType().getScalarType() != MVT::f16;

	  // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
	  // For such targets the inputs have to be checked recursively.
	  case ISD::FMINNUM:
	  case ISD::FMAXNUM:
	  case ISD::FMINNAN:
	  case ISD::FMAXNAN: {
	    const bool HandledByHardware = ST->supportsMinMaxDenormModes() &&
	                                   DAG.isKnownNeverNaN(Op.getOperand(0)) &&
	                                   DAG.isKnownNeverNaN(Op.getOperand(1));
	    if (HandledByHardware)
	      return true;

	    return OperandIsCanonical(0) && OperandIsCanonical(1);
	  }

	  case ISD::ConstantFP: {
	    // A constant is canonical unless it is a denormal or a signaling NaN.
	    auto Val = cast<ConstantFPSDNode>(Op)->getValueAPF();
	    if (Val.isDenormal())
	      return false;
	    return !(Val.isNaN() && Val.isSignaling());
	  }

	  default:
	    return false;
	  }
	}

	/// Target combine for ISD::FCANONICALIZE.
	///
	/// Non-constant input: the canonicalize is dropped (its operand is returned)
	/// when denormals are enabled for the scalar type and the input is known
	/// never NaN, or when the input is recognised by isCanonicalized and
	/// signaling NaNs are not a concern. Otherwise nothing changes.
	///
	/// Constant (or constant splat) input: the operation is constant folded.
	SDValue SITargetLowering::performFCanonicalizeCombine(
	    SDNode *N,
	    DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));

	  if (!CFP) {
	    SDValue Src = N->getOperand(0);
	    EVT ScalarVT = Src.getValueType().getScalarType();
	    auto ST = getSubtarget();

	    const bool DenormalsEnabled =
	        (ScalarVT == MVT::f32 && ST->hasFP32Denormals()) ||
	        (ScalarVT == MVT::f64 && ST->hasFP64Denormals()) ||
	        (ScalarVT == MVT::f16 && ST->hasFP16Denormals());
	    if (DenormalsEnabled && DAG.isKnownNeverNaN(Src))
	      return Src;

	    bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
	    const bool SNanHarmless = IsIEEEMode || isKnownNeverSNan(DAG, Src);
	    if (SNanHarmless && isCanonicalized(DAG, Src, ST))
	      return Src;

	    return SDValue();
	  }

	  const APFloat &C = CFP->getValueAPF();
	  EVT VT = N->getValueType(0);

	  // Flush denormals to 0 if not enabled for the scalar type.
	  if (C.isDenormal()) {
	    EVT SVT = VT.getScalarType();
	    const bool MustFlush =
	        (SVT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
	        (SVT == MVT::f64 && !Subtarget->hasFP64Denormals()) ||
	        (SVT == MVT::f16 && !Subtarget->hasFP16Denormals());
	    if (MustFlush)
	      return DAG.getConstantFP(0.0, SDLoc(N), VT);
	  }

	  if (C.isNaN()) {
	    APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());

	    // A signaling NaN is quieted, and a quiet NaN with a non-canonical bit
	    // pattern is replaced; both produce the canonical quiet NaN.
	    //
	    // TODO: Can we use -1 as the canonical NaN value since it's an inline
	    // immediate?
	    if (C.isSignaling() ||
	        C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
	      return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
	  }

	  // The constant is already canonical.
	  return N->getOperand(0);
	}

	/// Maps a two-operand min/max opcode to the matching three-operand AMDGPUISD
	/// opcode. Any other opcode hits llvm_unreachable.
	static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
	  switch (Opc) {
	  // Minimums.
	  case ISD::FMINNUM: return AMDGPUISD::FMIN3;
	  case ISD::SMIN:    return AMDGPUISD::SMIN3;
	  case ISD::UMIN:    return AMDGPUISD::UMIN3;
	  // Maximums.
	  case ISD::FMAXNUM: return AMDGPUISD::FMAX3;
	  case ISD::SMAX:    return AMDGPUISD::SMAX3;
	  case ISD::UMAX:    return AMDGPUISD::UMAX3;
	  default:
	    llvm_unreachable("Not a min/max opcode");
	  }
	}

	/// Tries to turn min(max(x, K0), K1) with constants K0 < K1 into a med3.
	///
	/// \p Op0 is the inner max node (its operand 1 must be the constant K0) and
	/// \p Op1 is the outer constant K1. \p Signed selects signed vs. unsigned
	/// comparison, med3 opcode and extension. Returns an empty SDValue if either
	/// bound is not a constant or K0 >= K1.
	SDValue SITargetLowering::performIntMed3ImmCombine(
	    SelectionDAG &DAG, const SDLoc &SL,
	    SDValue Op0, SDValue Op1, bool Signed) const {
	  ConstantSDNode *UpperK = dyn_cast<ConstantSDNode>(Op1);
	  if (!UpperK)
	    return SDValue();

	  ConstantSDNode *LowerK = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
	  if (!LowerK)
	    return SDValue();

	  // The lower bound has to be strictly below the upper bound.
	  const APInt &LowerVal = LowerK->getAPIntValue();
	  const APInt &UpperVal = UpperK->getAPIntValue();
	  const bool BoundsOutOfOrder =
	      Signed ? LowerVal.sge(UpperVal) : LowerVal.uge(UpperVal);
	  if (BoundsOutOfOrder)
	    return SDValue();

	  EVT VT = LowerK->getValueType(0);
	  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;

	  const bool NativeMed3 =
	      VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16());
	  if (NativeMed3) {
	    return DAG.getNode(Med3Opc, SL, VT, Op0.getOperand(0),
	                       SDValue(LowerK, 0), SDValue(UpperK, 0));
	  }

	  // If there isn't a 16-bit med3 operation, convert to 32-bit: extend all
	  // three inputs, take the 32-bit med3 and truncate the result back.
	  MVT WideVT = MVT::i32;
	  unsigned ExtOpc = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;

	  SDValue WideX = DAG.getNode(ExtOpc, SL, WideVT, Op0->getOperand(0));
	  SDValue WideLower = DAG.getNode(ExtOpc, SL, WideVT, Op0->getOperand(1));
	  SDValue WideUpper = DAG.getNode(ExtOpc, SL, WideVT, Op1);

	  SDValue WideMed3 =
	      DAG.getNode(Med3Opc, SL, WideVT, WideX, WideLower, WideUpper);
	  return DAG.getNode(ISD::TRUNCATE, SL, VT, WideMed3);
	}

	/// Returns \p Op as a ConstantFPSDNode if it is one, or the FP splat constant
	/// of a build_vector; nullptr in every other case.
	static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
	  if (ConstantFPSDNode *Scalar = dyn_cast<ConstantFPSDNode>(Op))
	    return Scalar;

	  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op);
	  return BV ? BV->getConstantFPSplatNode() : nullptr;
	}

	/// Floating-point counterpart of the integer med3 immediate combine.
	///
	/// \p Op0 is the inner max node whose operand 1 must be the (splat) constant
	/// K0; \p Op1 must be the (splat) constant K1. Produces a CLAMP for the
	/// [0.0, 1.0] range when dx10_clamp is enabled, an FMED3 where the type
	/// allows it, and an empty SDValue otherwise.
	SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
	                                                  const SDLoc &SL,
	                                                  SDValue Op0,
	                                                  SDValue Op1) const {
	  ConstantFPSDNode *UpperK = getSplatConstantFP(Op1);
	  if (!UpperK)
	    return SDValue();

	  ConstantFPSDNode *LowerK = getSplatConstantFP(Op0.getOperand(1));
	  if (!LowerK)
	    return SDValue();

	  // Give up when K0 compares greater than K1 (NaN inputs should have folded
	  // away by now).
	  if (LowerK->getValueAPF().compare(UpperK->getValueAPF()) ==
	      APFloat::cmpGreaterThan)
	    return SDValue();

	  // TODO: Check IEEE bit enabled?
	  EVT VT = Op0.getValueType();

	  // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
	  // hardware fmed3 behavior converting to a min.
	  // FIXME: Should this be allowing -0.0?
	  if (Subtarget->enableDX10Clamp() && UpperK->isExactlyValue(1.0) &&
	      LowerK->isExactlyValue(0.0))
	    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));

	  // med3 for f16 is only available on gfx9+, and not available for v2f16.
	  const bool HasMed3ForType =
	      VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16());
	  if (!HasMed3ForType)
	    return SDValue();

	  // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
	  // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
	  // then give the other result, which is different from med3 with a NaN
	  // input.
	  SDValue X = Op0.getOperand(0);
	  if (!isKnownNeverSNan(DAG, X))
	    return SDValue();

	  return DAG.getNode(AMDGPUISD::FMED3, SL, LowerK->getValueType(0),
	                     X, SDValue(LowerK, 0), SDValue(UpperK, 0));
	}

	/// Target combine for the two-operand min/max nodes.
	///
	/// Tries, in order:
	///   * max(max(a, b), c) / min(min(a, b), c) (and the commuted forms)
	///     -> max3 / min3
	///   * integer min(max(x, K0), K1) -> med3(x, K0, K1)
	///   * fminnum(fmaxnum(x, K0), K1) -> fmed3 / clamp
	/// Returns an empty SDValue if none applies.
	SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
	                                               DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;

	  EVT VT = N->getValueType(0);
	  unsigned Opc = N->getOpcode();
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  // Only fold when the inner op has a single use; otherwise this would just
	  // increase register pressure for no benefit.
	  const bool CanFormMin3Max3 =
	      Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
	      VT != MVT::f64 &&
	      ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16());

	  if (CanFormMin3Max3) {
	    // max(max(a, b), c) -> max3(a, b, c)
	    // min(min(a, b), c) -> min3(a, b, c)
	    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
	      SDLoc DL(N);
	      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL,
	                         N->getValueType(0), Op0.getOperand(0),
	                         Op0.getOperand(1), Op1);
	    }

	    // Try commuted.
	    // max(a, max(b, c)) -> max3(a, b, c)
	    // min(a, min(b, c)) -> min3(a, b, c)
	    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
	      SDLoc DL(N);
	      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL,
	                         N->getValueType(0), Op0, Op1.getOperand(0),
	                         Op1.getOperand(1));
	    }
	  }

	  const unsigned InnerOpc = Op0.getOpcode();

	  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
	  const bool IsSignedClamp = Opc == ISD::SMIN && InnerOpc == ISD::SMAX;
	  const bool IsUnsignedClamp = Opc == ISD::UMIN && InnerOpc == ISD::UMAX;
	  if ((IsSignedClamp || IsUnsignedClamp) && Op0.hasOneUse()) {
	    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1,
	                                                IsSignedClamp))
	      return Med3;
	  }

	  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
	  const bool IsFPClamp =
	      (Opc == ISD::FMINNUM && InnerOpc == ISD::FMAXNUM) ||
	      (Opc == AMDGPUISD::FMIN_LEGACY && InnerOpc == AMDGPUISD::FMAX_LEGACY);
	  const bool FPTypeSupported =
	      VT == MVT::f32 || VT == MVT::f64 ||
	      (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
	      (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts());
	  if (IsFPClamp && FPTypeSupported && Op0.hasOneUse()) {
	    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
	      return Res;
	  }

	  return SDValue();
	}

	/// Returns true if \p A and \p B are both scalar FP constants and together
	/// are exactly 0.0 and 1.0, in either order.
	static bool isClampZeroToOne(SDValue A, SDValue B) {
	  ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A);
	  if (!CA)
	    return false;

	  ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B);
	  if (!CB)
	    return false;

	  // FIXME: Should this be allowing -0.0?
	  if (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0))
	    return true;
	  return CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0);
	}

	/// Target combine for AMDGPUISD::FMED3: recognises a median against the
	/// constants 0.0 and 1.0 and replaces it with AMDGPUISD::CLAMP.
	// FIXME: Should only worry about snans for version with chain.
	SDValue SITargetLowering::performFMed3Combine(SDNode *N,
	                                              DAGCombinerInfo &DCI) const {
	  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
	  // NaNs. With a NaN input, the order of the operands may change the result.
	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);
	  SDLoc SL(N);

	  SDValue A = N->getOperand(0);
	  SDValue B = N->getOperand(1);
	  SDValue C = N->getOperand(2);

	  // const_a, const_b, x -> clamp is safe in all cases including signaling
	  // nans.
	  // FIXME: Should this be allowing -0.0?
	  if (isClampZeroToOne(A, B))
	    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, C);

	  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
	  // handling no dx10-clamp?
	  if (!Subtarget->enableDX10Clamp())
	    return SDValue();

	  // If NaNs are clamped to 0, we are free to reorder the inputs. Three
	  // conditional neighbour swaps move FP constants towards the last operands.
	  auto SwapIfOnlyLeftIsConstant = [](SDValue &Left, SDValue &Right) {
	    if (isa<ConstantFPSDNode>(Left) && !isa<ConstantFPSDNode>(Right))
	      std::swap(Left, Right);
	  };
	  SwapIfOnlyLeftIsConstant(A, B);
	  SwapIfOnlyLeftIsConstant(B, C);
	  SwapIfOnlyLeftIsConstant(A, B);

	  if (isClampZeroToOne(B, C))
	    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, A);

	  return SDValue();
	}

	/// Target combine for AMDGPUISD::CVT_PKRTZ_F16_F32: if both source operands
	/// are undef the whole node folds to undef of its result type.
	SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
	                                                 DAGCombinerInfo &DCI) const {
	  const bool BothUndef =
	      N->getOperand(0).isUndef() && N->getOperand(1).isUndef();
	  if (!BothUndef)
	    return SDValue();

	  return DCI.DAG.getUNDEF(N->getValueType(0));
	}

	/// Target combine for ISD::EXTRACT_VECTOR_ELT:
	///   extract_vector_elt (fneg v), idx -> fneg (extract_vector_elt v, idx)
	/// applied only when allUsesHaveSourceMods(N) holds.
	SDValue SITargetLowering::performExtractVectorEltCombine(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SDValue Vec = N->getOperand(0);
	  SelectionDAG &DAG = DCI.DAG;

	  if (Vec.getOpcode() != ISD::FNEG || !allUsesHaveSourceMods(N))
	    return SDValue();

	  SDLoc SL(N);
	  EVT EltVT = N->getValueType(0);
	  SDValue Idx = N->getOperand(1);

	  // Extract from the un-negated vector, then negate the scalar.
	  SDValue PlainElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
	                                 Vec.getOperand(0), Idx);
	  return DAG.getNode(ISD::FNEG, SL, EltVT, PlainElt);
	}

	/// If \p Hi is a bitcast from f16 and \p Lo is an integer constant or undef,
	/// rewrites both in place (\p Lo becomes a bitcast of itself to f16, \p Hi
	/// becomes the bitcast's f16 source) and returns true. Otherwise leaves both
	/// untouched and returns false.
	static bool convertBuildVectorCastElt(SelectionDAG &DAG,
	                                      SDValue &Lo, SDValue &Hi) {
	  const bool HiIsCastFromF16 =
	      Hi.getOpcode() == ISD::BITCAST &&
	      Hi.getOperand(0).getValueType() == MVT::f16;
	  if (!HiIsCastFromF16)
	    return false;

	  if (!isa<ConstantSDNode>(Lo) && !Lo.isUndef())
	    return false;

	  Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
	  Hi = Hi.getOperand(0);
	  return true;
	}

	/// Target combine for ISD::BUILD_VECTOR producing v2i16, applied only when
	/// v2i16 is a legal type:
	///   v2i16 build_vector (const|undef), (bitcast f16:$x)
	///     -> bitcast (v2f16 build_vector const|undef, $x)
	/// and the same with the two elements in the opposite roles.
	SDValue SITargetLowering::performBuildVectorCombine(
	    SDNode *N, DAGCombinerInfo &DCI) const {
	  SDLoc SL(N);

	  if (!isTypeLegal(MVT::v2i16))
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);
	  if (VT != MVT::v2i16)
	    return SDValue();

	  SDValue Elt0 = N->getOperand(0);
	  SDValue Elt1 = N->getOperand(1);

	  // Element 1 is the f16 bitcast, element 0 the constant/undef ...
	  bool Converted = convertBuildVectorCastElt(DAG, Elt0, Elt1);
	  // ... or the other way around. The element order is preserved either way.
	  if (!Converted)
	    Converted = convertBuildVectorCastElt(DAG, Elt1, Elt0);
	  if (!Converted)
	    return SDValue();

	  SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Elt0, Elt1 });
	  return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
	}

	/// Picks the fused multiply-add opcode to use when combining \p N0 and
	/// \p N1: ISD::FMAD, ISD::FMA, or 0 if no fusion should be done.
	unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
	                                          const SDNode *N0,
	                                          const SDNode *N1) const {
	  EVT VT = N0->getValueType(0);

	  // Only use mad if we are not trying to support denormals. v_mad_f32 does
	  // not support denormals ever.
	  const bool F32WithoutDenormals =
	      VT == MVT::f32 && !Subtarget->hasFP32Denormals();
	  const bool F16WithoutDenormals =
	      VT == MVT::f16 && !Subtarget->hasFP16Denormals();
	  if (F32WithoutDenormals || F16WithoutDenormals)
	    return ISD::FMAD;

	  // FMA needs fusion to be permitted (globally or via flags on both nodes)
	  // and to be profitable for this type.
	  const TargetOptions &Options = DAG.getTarget().Options;
	  const bool FusionPermitted =
	      Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
	      (N0->getFlags().hasUnsafeAlgebra() &&
	       N1->getFlags().hasUnsafeAlgebra());
	  if (FusionPermitted && isFMAFasterThanFMulAndFAdd(VT))
	    return ISD::FMA;

	  return 0;
	}

	/// Builds a MAD_I64_I32 (if \p Signed) or MAD_U64_U32 node over \p N0, \p N1
	/// and \p N2 with result types (i64, i1), and returns its value truncated
	/// to \p VT.
	static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
	                           EVT VT,
	                           SDValue N0, SDValue N1, SDValue N2,
	                           bool Signed) {
	  const unsigned Opcode =
	      Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
	  SDVTList ResultTys = DAG.getVTList(MVT::i64, MVT::i1);
	  SDValue Wide = DAG.getNode(Opcode, SL, ResultTys, N0, N1, N2);
	  return DAG.getNode(ISD::TRUNCATE, SL, VT, Wide);
	}

	/// Target combine for ISD::ADD.
	///
	/// Two independent families of folds are attempted:
	///   1. For scalar results wider than 32 and at most 64 bits, on subtargets
	///      with mad64_32: add (mul a, b), c -> mad_[iu]64_[iu]32 a, b, c when
	///      both multiplicands are known to fit in 32 bits.
	///   2. For i32 results: folding an extended boolean, or an addcarry with a
	///      zero second operand, into a single ADDCARRY/SUBCARRY node.
	/// Returns an empty SDValue if nothing applies.
	SDValue SITargetLowering::performAddCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	EVT VT = N->getValueType(0);
	SDLoc SL(N);
	SDValue LHS = N->getOperand(0);
	SDValue RHS = N->getOperand(1);

	if ((LHS.getOpcode() == ISD::MUL \|\| RHS.getOpcode() == ISD::MUL)
	&& Subtarget->hasMad64_32() &&
	!VT.isVector() && VT.getScalarSizeInBits() > 32 &&
	VT.getScalarSizeInBits() <= 64) {
	// Addition commutes, so normalise the multiply onto the left-hand side.
	if (LHS.getOpcode() != ISD::MUL)
	std::swap(LHS, RHS);

	SDValue MulLHS = LHS.getOperand(0);
	SDValue MulRHS = LHS.getOperand(1);
	SDValue AddRHS = RHS;

	// TODO: Maybe restrict if SGPR inputs.
	// Unsigned form: both multiplicands need at most 32 significant bits.
	if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
	numBitsUnsigned(MulRHS, DAG) <= 32) {
	MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
	MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
	AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
	return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
	}

	// Signed form. Note the strict '<' here versus '<=' in the unsigned check.
	if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
	MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
	MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
	AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
	return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
	}

	// VT is wider than 32 bits here, so the i32-only folds below cannot apply.
	return SDValue();
	}

	if (VT != MVT::i32)
	return SDValue();

	// add x, zext (setcc) => addcarry x, 0, setcc
	// add x, sext (setcc) => subcarry x, 0, setcc
	// Normalise so that the extend / addcarry candidate ends up in RHS.
	unsigned Opc = LHS.getOpcode();
	if (Opc == ISD::ZERO_EXTEND \|\| Opc == ISD::SIGN_EXTEND \|\|
	Opc == ISD::ANY_EXTEND \|\| Opc == ISD::ADDCARRY)
	std::swap(RHS, LHS);

	Opc = RHS.getOpcode();
	switch (Opc) {
	default: break;
	case ISD::ZERO_EXTEND:
	case ISD::SIGN_EXTEND:
	case ISD::ANY_EXTEND: {
	auto Cond = RHS.getOperand(0);
	// Only fold when the extended value passes isBoolSGPR.
	if (!isBoolSGPR(Cond))
	break;
	SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
	SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
	// A sign-extended boolean is 0 or -1, hence the subtract-with-borrow form;
	// zero- and any-extend both use the add-with-carry form.
	Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
	return DAG.getNode(Opc, SL, VTList, Args);
	}
	case ISD::ADDCARRY: {
	// add x, (addcarry y, 0, cc) => addcarry x, y, cc
	auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
	if (!C \|\| C->getZExtValue() != 0) break;
	SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
	return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
	}
	}
	return SDValue();
	}

	/// Target combine for ISD::SUB on i32:
	///   sub (subcarry x, 0, cc), y => subcarry x, y, cc
	///
	/// Subtraction is not commutative, so only a SUBCARRY in the left-hand
	/// operand is matched. Swapping the operands to also match
	/// `sub y, (subcarry x, 0, cc)` would compute x - y - cc instead of
	/// y - (x - cc) and is therefore not done.
	///
	/// Returns an empty SDValue if the pattern does not match.
	SDValue SITargetLowering::performSubCombine(SDNode *N,
	                                            DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);

	  if (VT != MVT::i32)
	    return SDValue();

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  if (LHS.getOpcode() != ISD::SUBCARRY)
	    return SDValue();

	  // The inner subcarry must subtract a constant zero so that y can take the
	  // place of its second operand.
	  auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
	  if (!C || C->getZExtValue() != 0)
	    return SDValue();

	  SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
	  return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
	}

	/// Target combine for ISD::ADDCARRY / ISD::SUBCARRY on i32 with a constant
	/// zero second operand:
	///   addcarry (add x, y), 0, cc => addcarry x, y, cc
	///   subcarry (sub x, y), 0, cc => subcarry x, y, cc
	SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
	                                                 DAGCombinerInfo &DCI) const {
	  if (N->getValueType(0) != MVT::i32)
	    return SDValue();

	  auto ZeroOp = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!ZeroOp || ZeroOp->getZExtValue() != 0)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Inner = N->getOperand(0);

	  const unsigned InnerOpc = Inner.getOpcode();
	  const unsigned OuterOpc = N->getOpcode();
	  const bool FoldsAdd = InnerOpc == ISD::ADD && OuterOpc == ISD::ADDCARRY;
	  const bool FoldsSub = InnerOpc == ISD::SUB && OuterOpc == ISD::SUBCARRY;
	  if (!FoldsAdd && !FoldsSub)
	    return SDValue();

	  // Re-emit the carry op directly on the inner operands, keeping the
	  // original carry-in.
	  SDValue Args[] = { Inner.getOperand(0), Inner.getOperand(1),
	                     N->getOperand(2) };
	  return DAG.getNode(OuterOpc, SDLoc(N), N->getVTList(), Args);
	}

	/// Target combine for ISD::FADD, run only once the combine level has reached
	/// AfterLegalizeDAG:
	///   fadd (fadd (a, a), b) -> mad 2.0, a, b
	///   fadd (b, fadd (a, a)) -> mad 2.0, a, b
	/// The fused opcode comes from getFusedOpcode; no fold happens if that
	/// returns 0.
	SDValue SITargetLowering::performFAddCombine(SDNode *N,
	                                             DAGCombinerInfo &DCI) const {
	  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);
	  SDLoc SL(N);

	  // These should really be instruction patterns, but writing patterns with
	  // source modifiers is a pain.

	  // If Doubled is (fadd a, a) and fusion is allowed, build the fused node
	  // computing a * 2.0 + Addend; otherwise yield an empty SDValue.
	  auto FuseDoubledOperand = [&](SDValue Doubled, SDValue Addend) -> SDValue {
	    if (Doubled.getOpcode() != ISD::FADD)
	      return SDValue();

	    SDValue A = Doubled.getOperand(0);
	    if (A != Doubled.getOperand(1))
	      return SDValue();

	    unsigned FusedOp = getFusedOpcode(DAG, N, Doubled.getNode());
	    if (FusedOp == 0)
	      return SDValue();

	    const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
	    return DAG.getNode(FusedOp, SL, VT, A, Two, Addend);
	  };

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  // Left operand first, then the commuted form.
	  if (SDValue Fused = FuseDoubledOperand(Op0, Op1))
	    return Fused;
	  return FuseDoubledOperand(Op1, Op0);
	}

	/// Target combine for scalar ISD::FSUB, run only once the combine level has
	/// reached AfterLegalizeDAG:
	///   (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
	///   (fsub c, (fadd a, a)) -> mad -2.0, a, c
	///
	/// This tries to get the fneg folded into a source modifier, undoing generic
	/// DAG combines. The fused opcode comes from getFusedOpcode; no fold happens
	/// if that returns 0.
	SDValue SITargetLowering::performFSubCombine(SDNode *N,
	                                             DAGCombinerInfo &DCI) const {
	  if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc SL(N);
	  EVT VT = N->getValueType(0);
	  assert(!VT.isVector());

	  SDValue Minuend = N->getOperand(0);
	  SDValue Subtrahend = N->getOperand(1);

	  // True if V has the shape (fadd a, a).
	  auto IsDoubling = [](SDValue V) {
	    return V.getOpcode() == ISD::FADD && V.getOperand(0) == V.getOperand(1);
	  };

	  // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
	  if (IsDoubling(Minuend)) {
	    if (unsigned FusedOp = getFusedOpcode(DAG, N, Minuend.getNode())) {
	      const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
	      SDValue NegC = DAG.getNode(ISD::FNEG, SL, VT, Subtrahend);
	      return DAG.getNode(FusedOp, SL, VT, Minuend.getOperand(0), Two, NegC);
	    }
	  }

	  // (fsub c, (fadd a, a)) -> mad -2.0, a, c
	  if (IsDoubling(Subtrahend)) {
	    if (unsigned FusedOp = getFusedOpcode(DAG, N, Subtrahend.getNode())) {
	      const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
	      return DAG.getNode(FusedOp, SL, VT, Subtrahend.getOperand(0), NegTwo,
	                         Minuend);
	    }
	  }

	  return SDValue();
	}

	/// Target combine for ISD::SETCC.
	///
	/// Two folds are attempted:
	///   * An i32 comparison of (sext i1 cc) against the constants 0 or -1 is
	///     reduced to cc or (xor cc, -1). A constant on the left-hand side is
	///     first moved to the right, swapping the condition code accordingly.
	///   * The isinf idiom (fcmp oeq (fabs x), +inf) becomes an FP_CLASS test
	///     for either infinity. This is restricted to f32, f64, and f16 on
	///     subtargets that have 16-bit instructions.
	/// Returns an empty SDValue if nothing applies.
	SDValue SITargetLowering::performSetCCCombine(SDNode *N,
	                                              DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc SL(N);

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = LHS.getValueType();
	  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();

	  // Canonicalize an integer constant operand onto the right-hand side.
	  auto CRHS = dyn_cast<ConstantSDNode>(RHS);
	  if (!CRHS) {
	    CRHS = dyn_cast<ConstantSDNode>(LHS);
	    if (CRHS) {
	      std::swap(LHS, RHS);
	      CC = getSetCCSwappedOperands(CC);
	    }
	  }

	  if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
	      isBoolSGPR(LHS.getOperand(0))) {
	    // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
	    // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
	    // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
	    // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
	    if ((CRHS->isAllOnesValue() &&
	         (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
	        (CRHS->isNullValue() &&
	         (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
	      return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
	                         DAG.getConstant(-1, SL, MVT::i1));
	    if ((CRHS->isAllOnesValue() &&
	         (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
	        (CRHS->isNullValue() &&
	         (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
	      return LHS.getOperand(0);
	  }

	  // The remaining fold is floating-point only. f16 is accepted only when the
	  // subtarget has 16-bit instructions; every other non-f32/f64 type is
	  // rejected regardless of the subtarget.
	  if (VT != MVT::f32 && VT != MVT::f64 &&
	      (!Subtarget->has16BitInsts() || VT != MVT::f16))
	    return SDValue();

	  // Match isinf pattern
	  // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
	  if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
	    const ConstantFPSDNode *FPConst = dyn_cast<ConstantFPSDNode>(RHS);
	    if (!FPConst)
	      return SDValue();

	    const APFloat &APF = FPConst->getValueAPF();
	    if (APF.isInfinity() && !APF.isNegative()) {
	      unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
	      return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
	                         DAG.getConstant(Mask, SL, MVT::i32));
	    }
	  }

	  return SDValue();
	}

	/// Target combine for the AMDGPUISD::CVT_F32_UBYTE0..3 nodes.
	///
	/// First tries to fold a constant right shift of the source into the byte
	/// index of the conversion. Failing that, asks the generic demanded-bits
	/// machinery to simplify the source given that only the selected byte is
	/// read. That second path commits any simplification through DCI and always
	/// returns an empty SDValue.
	SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	SDLoc SL(N);
	// Byte index selected by this node, derived from the opcode's distance to
	// CVT_F32_UBYTE0.
	unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

	SDValue Src = N->getOperand(0);
	// Look through an optional zero_extend when searching for the shift.
	SDValue Srl = N->getOperand(0);
	if (Srl.getOpcode() == ISD::ZERO_EXTEND)
	Srl = Srl.getOperand(0);

	// TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
	if (Srl.getOpcode() == ISD::SRL) {
	// cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
	// cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
	// cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x

	if (const ConstantSDNode *C =
	dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
	// Note: the shifted value is converted to i32 before the offset check
	// below, so this node is created even if the fold is then rejected.
	Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
	EVT(MVT::i32));

	// Combined bit offset of the byte within the unshifted value; it must be
	// byte aligned and inside the 32-bit source.
	unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
	if (SrcOffset < 32 && SrcOffset % 8 == 0) {
	return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
	MVT::f32, Srl);
	}
	}
	}

	// Only the 8 bits of the selected byte are demanded from the source.
	APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

	KnownBits Known;
	TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	!DCI.isBeforeLegalizeOps());
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) \|\|
	TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
	DCI.CommitTargetLoweringOpt(TLO);
	}

	return SDValue();
	}

	/// Entry point for SI-specific DAG combines: dispatches on the opcode of
	/// \p N to the matching perform*Combine helper.
	///
	/// Opcodes without a case, and cases that `break` out of the switch, are
	/// forwarded to AMDGPUTargetLowering::PerformDAGCombine.
	SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	switch (N->getOpcode()) {
	default:
	return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
	case ISD::ADD:
	return performAddCombine(N, DCI);
	case ISD::SUB:
	return performSubCombine(N, DCI);
	case ISD::ADDCARRY:
	case ISD::SUBCARRY:
	return performAddCarrySubCarryCombine(N, DCI);
	case ISD::FADD:
	return performFAddCombine(N, DCI);
	case ISD::FSUB:
	return performFSubCombine(N, DCI);
	case ISD::SETCC:
	return performSetCCCombine(N, DCI);
	case ISD::FMAXNUM:
	case ISD::FMINNUM:
	case ISD::SMAX:
	case ISD::SMIN:
	case ISD::UMAX:
	case ISD::UMIN:
	case AMDGPUISD::FMIN_LEGACY:
	case AMDGPUISD::FMAX_LEGACY: {
	// The min/max combines only run once the combine level has reached
	// AfterLegalizeDAG and when optimizing; otherwise fall back to the base
	// class.
	if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
	getTargetMachine().getOptLevel() > CodeGenOpt::None)
	return performMinMaxCombine(N, DCI);
	break;
	}
	case ISD::LOAD:
	case ISD::STORE:
	case ISD::ATOMIC_LOAD:
	case ISD::ATOMIC_STORE:
	case ISD::ATOMIC_CMP_SWAP:
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
	case ISD::ATOMIC_SWAP:
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_AND:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_NAND:
	case ISD::ATOMIC_LOAD_MIN:
	case ISD::ATOMIC_LOAD_MAX:
	case ISD::ATOMIC_LOAD_UMIN:
	case ISD::ATOMIC_LOAD_UMAX:
	case AMDGPUISD::ATOMIC_INC:
	case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
	// Memory node combines are skipped before legalization.
	if (DCI.isBeforeLegalize())
	break;
	return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
	case ISD::AND:
	return performAndCombine(N, DCI);
	case ISD::OR:
	return performOrCombine(N, DCI);
	case ISD::XOR:
	return performXorCombine(N, DCI);
	case ISD::ZERO_EXTEND:
	return performZeroExtendCombine(N, DCI);
	case AMDGPUISD::FP_CLASS:
	return performClassCombine(N, DCI);
	case ISD::FCANONICALIZE:
	return performFCanonicalizeCombine(N, DCI);
	case AMDGPUISD::FRACT:
	case AMDGPUISD::RCP:
	case AMDGPUISD::RSQ:
	case AMDGPUISD::RCP_LEGACY:
	case AMDGPUISD::RSQ_LEGACY:
	case AMDGPUISD::RSQ_CLAMP:
	case AMDGPUISD::LDEXP: {
	// For these nodes an undef first operand is returned as the result.
	SDValue Src = N->getOperand(0);
	if (Src.isUndef())
	return Src;
	break;
	}
	case ISD::SINT_TO_FP:
	case ISD::UINT_TO_FP:
	return performUCharToFloatCombine(N, DCI);
	case AMDGPUISD::CVT_F32_UBYTE0:
	case AMDGPUISD::CVT_F32_UBYTE1:
	case AMDGPUISD::CVT_F32_UBYTE2:
	case AMDGPUISD::CVT_F32_UBYTE3:
	return performCvtF32UByteNCombine(N, DCI);
	case AMDGPUISD::FMED3:
	return performFMed3Combine(N, DCI);
	case AMDGPUISD::CVT_PKRTZ_F16_F32:
	return performCvtPkRTZCombine(N, DCI);
	case ISD::SCALAR_TO_VECTOR: {
	SelectionDAG &DAG = DCI.DAG;
	EVT VT = N->getValueType(0);

	// v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
	if (VT == MVT::v2i16 \|\| VT == MVT::v2f16) {
	SDLoc SL(N);
	SDValue Src = N->getOperand(0);
	EVT EltVT = Src.getValueType();
	// An f16 source is first reinterpreted as i16 so it can be any-extended.
	if (EltVT == MVT::f16)
	Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);

	SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
	return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
	}

	break;
	}
	case ISD::EXTRACT_VECTOR_ELT:
	return performExtractVectorEltCombine(N, DCI);
	case ISD::BUILD_VECTOR:
	return performBuildVectorCombine(N, DCI);
	}
	// Reached by every case above that used `break`.
	return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
	}

	/// \brief Helper function for adjustWritemask: maps the sub-register indices
	/// sub0..sub3 to lanes 0..3. Any other index maps to lane 0.
	static unsigned SubIdx2Lane(unsigned Idx) {
	  switch (Idx) {
	  case AMDGPU::sub1:
	    return 1;
	  case AMDGPU::sub2:
	    return 2;
	  case AMDGPU::sub3:
	    return 3;
	  default:
	    // AMDGPU::sub0 and every unrecognised index.
	    return 0;
	  }
	}

	/// \brief Adjust the writemask of MIMG instructions
	///
	/// Inspects the users of result 0 of \p Node. If every such user is an
	/// EXTRACT_SUBREG and not all enabled components are read, the node is
	/// replaced by a narrower variant whose dmask covers only the components
	/// actually used, and the users are rewritten to the new, packed
	/// sub-register indices.
	///
	/// Returns \p Node when nothing was changed and nullptr once users have
	/// been rewritten.
	///
	/// NOTE(review): as rendered here the signature has no pointer declarators,
	/// yet the body uses `Node->` and returns `nullptr`; this looks like an
	/// extraction artifact -- confirm against the upstream file.
	SDNode SITargetLowering::adjustWritemask(MachineSDNode &Node,
	SelectionDAG &DAG) const {
	// User of each packed result lane (at most four).
	SDNode *Users[4] = { nullptr };
	unsigned Lane = 0;
	// The dmask operand sits at index 2 or 3 depending on the difference
	// between operand and value counts.
	// NOTE(review): the meaning of the constant 9 is not visible here -- confirm.
	unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
	unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
	unsigned NewDmask = 0;
	// A second result value is treated as the chain.
	bool HasChain = Node->getNumValues() > 1;

	if (OldDmask == 0) {
	// These are folded out, but on the chance it happens don't assert.
	return Node;
	}

	// Try to figure out the used register components
	for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
	I != E; ++I) {

	// Don't look at users of the chain.
	if (I.getUse().getResNo() != 0)
	continue;

	// Abort if we can't understand the usage
	if (!I->isMachineOpcode() \|\|
	I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
	return Node;

	// Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
	// Note that subregs are packed, i.e. Lane==0 is the first bit set
	// in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
	// set, etc.
	Lane = SubIdx2Lane(I->getConstantOperandVal(1));

	// Set which texture component corresponds to the lane.
	// The loop body always runs at least once, so Comp is assigned before use.
	// NOTE(review): nothing here stops Dmask from reaching 0 if Lane is not
	// smaller than the number of bits set in OldDmask; countTrailingZeros(0)
	// would then feed the shift below -- confirm this cannot happen.
	unsigned Comp;
	for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
	Comp = countTrailingZeros(Dmask);
	Dmask &= ~(1 << Comp);
	}

	// Abort if we have more than one user per component
	if (Users[Lane])
	return Node;

	Users[Lane] = *I;
	NewDmask \|= 1 << Comp;
	}

	// Abort if there's no change
	if (NewDmask == OldDmask)
	return Node;

	unsigned BitsSet = countPopulation(NewDmask);

	const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
	int NewOpcode = AMDGPU::getMaskedMIMGOp(*TII,
	Node->getMachineOpcode(), BitsSet);
	assert(NewOpcode != -1 &&
	NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
	"failed to find equivalent MIMG op");

	// Adjust the writemask in the node
	// The operand list is copied verbatim except for the dmask operand.
	SmallVector<SDValue, 12> Ops;
	Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
	Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
	Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());

	MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();

	// One component yields a scalar; three components use a 4-element vector.
	MVT ResultVT = BitsSet == 1 ?
	SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
	SDVTList NewVTList = HasChain ?
	DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);


	MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
	NewVTList, Ops);

	if (HasChain) {
	// Update chain.
	NewNode->setMemRefs(Node->memoperands_begin(), Node->memoperands_end());
	DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
	}

	if (BitsSet == 1) {
	// With a scalar result there is no sub-register left to extract: the single
	// user (Lane still holds the lane seen last in the loop above) is replaced
	// by a plain COPY of the new result.
	assert(Node->hasNUsesOfValue(1, 0));
	SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
	SDLoc(Node), Users[Lane]->getValueType(0),
	SDValue(NewNode, 0));
	DAG.ReplaceAllUsesWith(Users[Lane], Copy);
	return nullptr;
	}

	// Update the users of the node with the new indices
	// Idx only advances for lanes that have a user, so the remaining users are
	// assigned sub0, sub1, ... in lane order.
	for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
	SDNode *User = Users[i];
	if (!User)
	continue;

	SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
	DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);

	switch (Idx) {
	default: break;
	case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
	case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
	case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
	}
	}

	DAG.RemoveDeadNode(Node);
	return nullptr;
	}

	/// Returns true if \p Op is a frame index node, looking through a single
	/// ISD::AssertZext wrapper if present.
	static bool isFrameIndexOp(SDValue Op) {
	  SDValue Underlying =
	      Op.getOpcode() == ISD::AssertZext ? Op.getOperand(0) : Op;
	  return isa<FrameIndexSDNode>(Underlying);
	}

	/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
	/// with frame index operands.
	/// LLVM assumes that inputs to these instructions are registers.
	///
	/// Two rewrites are performed:
	///   * A CopyToReg of an i1 value into a physical register is split into a
	///     copy to a fresh VReg_1 virtual register followed by a copy from that
	///     register to the original destination.
	///   * For any other node, every frame index operand is materialized with
	///     an S_MOV_B32 and the node's operands are updated accordingly.
	/// Returns the node that now represents the operation.
	///
	/// NOTE(review): as rendered here the signature has no pointer declarators,
	/// yet the body uses `Node->`; this looks like an extraction artifact --
	/// confirm against the upstream file.
	SDNode SITargetLowering::legalizeTargetIndependentNode(SDNode Node,
	SelectionDAG &DAG) const {
	if (Node->getOpcode() == ISD::CopyToReg) {
	RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
	SDValue SrcVal = Node->getOperand(2);

	// Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
	// to try understanding copies to physical registers.
	if (SrcVal.getValueType() == MVT::i1 &&
	TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
	SDLoc SL(Node);
	MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
	SDValue VReg = DAG.getRegister(
	MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

	// Keep any incoming glue: the first copy is glued to the last value of the
	// glued node, if there is one.
	SDNode *Glued = Node->getGluedNode();
	SDValue ToVReg
	= DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
	SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
	// The second copy is chained on the first and glued to its result 1.
	SDValue ToResultReg
	= DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
	VReg, ToVReg.getValue(1));
	DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
	DAG.RemoveDeadNode(Node);
	return ToResultReg.getNode();
	}
	}

	// Rebuild the operand list, wrapping each frame index (see isFrameIndexOp)
	// in an S_MOV_B32 and passing every other operand through unchanged.
	SmallVector<SDValue, 8> Ops;
	for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
	if (!isFrameIndexOp(Node->getOperand(i))) {
	Ops.push_back(Node->getOperand(i));
	continue;
	}

	SDLoc DL(Node);
	Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
	Node->getOperand(i).getValueType(),
	Node->getOperand(i)), 0));
	}

	return DAG.UpdateNodeOperands(Node, Ops);
	}

	/// \brief Fold the instructions after selecting them.
	/// Returns null if users were already updated.
	///
	/// Handles three cases:
	///  - non-store, non-gather4 MIMG instructions are passed to adjustWritemask;
	///  - INSERT_SUBREG / REG_SEQUENCE are passed to
	///    legalizeTargetIndependentNode;
	///  - V_DIV_SCALE_F32/F64 get their first three sources rewritten so that
	///    src0 matches src1 or src2 when src0 is an IMPLICIT_DEF.
	/// Any other node is returned unchanged.
	SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
	                                          SelectionDAG &DAG) const {
	  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
	  unsigned Opcode = Node->getMachineOpcode();

	  if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
	      !TII->isGather4(Opcode)) {
	    return adjustWritemask(Node, DAG);
	  }

	  if (Opcode == AMDGPU::INSERT_SUBREG ||
	      Opcode == AMDGPU::REG_SEQUENCE) {
	    // The helper's return value is intentionally not used here; the original
	    // node is handed back to the caller.
	    legalizeTargetIndependentNode(Node, DAG);
	    return Node;
	  }

	  switch (Opcode) {
	  case AMDGPU::V_DIV_SCALE_F32:
	  case AMDGPU::V_DIV_SCALE_F64: {
	    // Satisfy the operand register constraint when one of the inputs is
	    // undefined. Ordinarily each undef value will have its own implicit_def of
	    // a vreg, so force these to use a single register.
	    SDValue Src0 = Node->getOperand(0);
	    SDValue Src1 = Node->getOperand(1);
	    SDValue Src2 = Node->getOperand(2);

	    // Nothing to do when src0 is a defined machine node that already equals
	    // src1 or src2.
	    if ((Src0.isMachineOpcode() &&
	         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
	        (Src0 == Src1 || Src0 == Src2))
	      break;

	    MVT VT = Src0.getValueType().getSimpleVT();
	    const TargetRegisterClass *RC = getRegClassFor(VT);

	    MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
	    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);

	    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
	                                      UndefReg, Src0, SDValue());

	    // src0 must be the same register as src1 or src2, even if the value is
	    // undefined, so make sure we don't violate this constraint.
	    if (Src0.isMachineOpcode() &&
	        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
	      if (Src1.isMachineOpcode() &&
	          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
	        Src0 = Src1;
	      else if (Src2.isMachineOpcode() &&
	               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
	        Src0 = Src2;
	      else {
	        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
	        Src0 = UndefReg;
	        Src1 = UndefReg;
	      }
	    } else
	      break;

	    // Rebuild the node with the adjusted sources, the remaining original
	    // operands, and the glue value of the copy created above.
	    SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
	    for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
	      Ops.push_back(Node->getOperand(I));

	    Ops.push_back(ImpDef.getValue(1));
	    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
	  }
	  default:
	    break;
	  }

	  return Node;
	}

	/// \brief Post-selection fix-ups applied to a newly created machine
	/// instruction.
	///
	/// VOP3 instructions have their operands legalized. Atomics that have a
	/// no-return variant are switched to it when the returned value is unused,
	/// either directly or through a single dead EXTRACT_SUBREG user.
	void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
	                                                     SDNode *Node) const {
	  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
	  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

	  // Make sure constant bus requirements are respected.
	  if (TII->isVOP3(MI.getOpcode())) {
	    TII->legalizeOperandsVOP3(MRI, MI);
	    return;
	  }

	  // Only atomics with a no-return counterpart are of interest from here on.
	  const int NoRetOpc = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
	  if (NoRetOpc == -1)
	    return;

	  // Replace unused atomics with the no return version.
	  if (!Node->hasAnyUseOfValue(0)) {
	    MI.setDesc(TII->get(NoRetOpc));
	    MI.RemoveOperand(0);
	    return;
	  }

	  // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
	  // instruction, because the return type of these instructions is a vec2 of
	  // the memory type, so it can be tied to the input operand.
	  // This means these instructions always have a use, so we need to add a
	  // special case to check if the atomic has only one extract_subreg use,
	  // which itself has no uses.
	  if (!Node->hasNUsesOfValue(1, 0))
	    return;

	  auto OnlyUse = Node->use_begin();
	  if (!OnlyUse->isMachineOpcode() ||
	      OnlyUse->getMachineOpcode() != AMDGPU::EXTRACT_SUBREG ||
	      OnlyUse->hasAnyUseOfValue(0))
	    return;

	  const unsigned DefReg = MI.getOperand(0).getReg();

	  // Change this into a noret atomic.
	  MI.setDesc(TII->get(NoRetOpc));
	  MI.RemoveOperand(0);

	  // If we only remove the def operand from the atomic instruction, the
	  // extract_subreg will be left with a use of a vreg without a def.
	  // So we need to insert an implicit_def to avoid machine verifier
	  // errors.
	  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
	          TII->get(AMDGPU::IMPLICIT_DEF), DefReg);
	}

	/// \returns The result of an S_MOV_B32 machine node that moves \p Val,
	/// expressed as an i32 target constant.
	static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
	                              uint64_t Val) {
	  const SDValue Imm = DAG.getTargetConstant(Val, DL, MVT::i32);
	  MachineSDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Imm);
	  return SDValue(Mov, 0);
	}

	/// \brief Build a 128-bit REG_SEQUENCE whose low half (sub0_sub1) is \p Ptr
	/// and whose high half (sub2_sub3) holds zero and the upper 32 bits of the
	/// default resource data format.
	MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
	                                                const SDLoc &DL,
	                                                SDValue Ptr) const {
	  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();

	  // Build the half of the subregister with the constants before building the
	  // full 128-bit register. If we are building multiple resource descriptors,
	  // this will allow CSEing of the 2-component register.
	  const SDValue HiClass =
	      DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32);
	  const SDValue ZeroDword = buildSMovImm32(DAG, DL, 0);
	  const SDValue HiSub0 = DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
	  const SDValue FormatHiDword =
	      buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32);
	  const SDValue HiSub1 = DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
	  const SDValue Ops0[] = {HiClass, ZeroDword, HiSub0, FormatHiDword, HiSub1};

	  MachineSDNode *HiSeq =
	      DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0);
	  const SDValue SubRegHi(HiSeq, 0);

	  // Combine the constants and the pointer.
	  const SDValue FullClass =
	      DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
	  const SDValue LoIdx = DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
	  const SDValue HiIdx = DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
	  const SDValue Ops1[] = {FullClass, Ptr, LoIdx, SubRegHi, HiIdx};

	  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
	}

	/// \brief Assemble a 128-bit resource descriptor as a REG_SEQUENCE.
	///
	/// sub0 and sub1 are the low and high halves of \p Ptr; when \p RsrcDword1 is
	/// non-zero it is OR'ed into the high half with S_OR_B32. sub2 and sub3 are
	/// the low and high 32 bits of \p RsrcDword2And3.
	MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
	                                           SDValue Ptr, uint32_t RsrcDword1,
	                                           uint64_t RsrcDword2And3) const {
	  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
	  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);

	  if (RsrcDword1 != 0) {
	    const SDValue Dword1 = DAG.getConstant(RsrcDword1, DL, MVT::i32);
	    MachineSDNode *Or =
	        DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi, Dword1);
	    PtrHi = SDValue(Or, 0);
	  }

	  const uint64_t LoBits = RsrcDword2And3 & UINT64_C(0xFFFFFFFF);
	  const uint64_t HiBits = RsrcDword2And3 >> 32;
	  SDValue DataLo = buildSMovImm32(DAG, DL, LoBits);
	  SDValue DataHi = buildSMovImm32(DAG, DL, HiBits);

	  // Register class id followed by (value, subregister index) pairs.
	  const SDValue Ops[] = {
	    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
	    PtrLo,  DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
	    PtrHi,  DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
	    DataLo, DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
	    DataHi, DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
	  };

	  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
	}

	//===----------------------------------------------------------------------===//
	// SI Inline Assembly Support
	//===----------------------------------------------------------------------===//

	/// \brief Map an inline-asm constraint to a register and/or register class.
	///
	/// Single-letter 's'/'r' and 'v' constraints select a scalar or vector
	/// register class from the bit size of \p VT (a null class for unsupported
	/// sizes). Longer constraints whose second character is 'v' or 's' followed
	/// by a decimal index select that specific 32-bit register. Everything else,
	/// including illegal types, is deferred to TargetLowering.
	std::pair<unsigned, const TargetRegisterClass *>
	SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	                                               StringRef Constraint,
	                                               MVT VT) const {
	  if (!isTypeLegal(VT))
	    return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	  if (Constraint.size() == 1) {
	    const char Kind = Constraint[0];

	    if (Kind == 's' || Kind == 'r') {
	      const TargetRegisterClass *ScalarRC = nullptr;
	      switch (VT.getSizeInBits()) {
	      case 16:
	      case 32:
	        ScalarRC = &AMDGPU::SReg_32_XM0RegClass;
	        break;
	      case 64:
	        ScalarRC = &AMDGPU::SGPR_64RegClass;
	        break;
	      case 128:
	        ScalarRC = &AMDGPU::SReg_128RegClass;
	        break;
	      case 256:
	        ScalarRC = &AMDGPU::SReg_256RegClass;
	        break;
	      case 512:
	        ScalarRC = &AMDGPU::SReg_512RegClass;
	        break;
	      default:
	        break;
	      }
	      return std::make_pair(0U, ScalarRC);
	    }

	    if (Kind == 'v') {
	      const TargetRegisterClass *VectorRC = nullptr;
	      switch (VT.getSizeInBits()) {
	      case 16:
	      case 32:
	        VectorRC = &AMDGPU::VGPR_32RegClass;
	        break;
	      case 64:
	        VectorRC = &AMDGPU::VReg_64RegClass;
	        break;
	      case 96:
	        VectorRC = &AMDGPU::VReg_96RegClass;
	        break;
	      case 128:
	        VectorRC = &AMDGPU::VReg_128RegClass;
	        break;
	      case 256:
	        VectorRC = &AMDGPU::VReg_256RegClass;
	        break;
	      case 512:
	        VectorRC = &AMDGPU::VReg_512RegClass;
	        break;
	      default:
	        break;
	      }
	      return std::make_pair(0U, VectorRC);
	    }
	  }

	  if (Constraint.size() > 1) {
	    // The second character picks the 32-bit register file; the text from the
	    // third character on is parsed as a decimal register index.
	    const TargetRegisterClass *RC = nullptr;
	    switch (Constraint[1]) {
	    case 'v':
	      RC = &AMDGPU::VGPR_32RegClass;
	      break;
	    case 's':
	      RC = &AMDGPU::SGPR_32RegClass;
	      break;
	    default:
	      break;
	    }

	    uint32_t Idx;
	    if (RC && !Constraint.substr(2).getAsInteger(10, Idx) &&
	        Idx < RC->getNumRegs())
	      return std::make_pair(RC->getRegister(Idx), RC);
	  }

	  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
	}

	/// \returns C_RegisterClass for the single-letter constraints 's' and 'v';
	/// every other constraint is classified by TargetLowering.
	SITargetLowering::ConstraintType
	SITargetLowering::getConstraintType(StringRef Constraint) const {
	  const bool IsSOrV = Constraint.size() == 1 &&
	                      (Constraint[0] == 's' || Constraint[0] == 'v');
	  if (IsSOrV)
	    return C_RegisterClass;
	  return TargetLowering::getConstraintType(Constraint);
	}

	// Figure out which registers should be reserved for stack access. Only after
	// the function is legalized do we know all of the non-spill stack objects or if
	// calls are present.
	//
	// Steps, in order: reserve private-memory registers for entry functions,
	// pick and install the stack pointer offset register when one is needed,
	// replace the SP_REG / PRIVATE_RSRC_REG / FP_REG / SCRATCH_WAVE_OFFSET_REG
	// placeholders with the registers recorded in SIMachineFunctionInfo, then
	// defer to TargetLoweringBase::finalizeLowering.
	void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
	MachineRegisterInfo &MRI = MF.getRegInfo();
	SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
	const MachineFrameInfo &MFI = MF.getFrameInfo();
	const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
	const SIRegisterInfo *TRI = ST.getRegisterInfo();

	if (Info->isEntryFunction()) {
	// Callable functions have fixed registers used for stack access.
	reservePrivateMemoryRegs(getTargetMachine(), MF, TRI, Info);
	}

	// We have to assume the SP is needed in case there are calls in the function
	// during lowering. Calls are only detected after the function is
	// lowered. We're about to reserve registers, so don't bother using it if we
	// aren't really going to use it.
	bool NeedSP = !Info->isEntryFunction() \|\|
	MFI.hasVarSizedObjects() \|\|
	MFI.hasCalls();

	if (NeedSP) {
	unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
	Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);

	// The stack pointer offset register must differ from the frame offset
	// register and must not be a sub-register of the scratch resource register.
	assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
	assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
	Info->getStackPtrOffsetReg()));
	MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
	}

	// These placeholders are replaced unconditionally.
	MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
	MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
	MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
	Info->getScratchWaveOffsetReg());

	TargetLoweringBase::finalizeLowering(MF);
	}

	/// \brief Known-bits query for frame index values.
	///
	/// Starts from the generic TargetLowering result and, unless the subtarget
	/// enables huge private buffers, additionally marks the top
	/// AssumeFrameIndexHighZeroBits bits as known zero.
	void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
	                                                     KnownBits &Known,
	                                                     const APInt &DemandedElts,
	                                                     const SelectionDAG &DAG,
	                                                     unsigned Depth) const {
	  TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
	                                                DAG, Depth);

	  // Technically it may be possible to have a dispatch with a single workitem
	  // that uses the full private memory size, but that's not really useful. We
	  // can't use vaddr in MUBUF instructions if we don't know the address
	  // calculation won't overflow, so assume the sign bit is never set.
	  if (!getSubtarget()->enableHugePrivateBuffer())
	    Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
	}
	Index: head/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (revision 329409)
	+++ head/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (revision 329410)
	@@ -1,935 +1,917 @@
	//===- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information --------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "AMDGPUBaseInfo.h"
	#include "AMDGPU.h"
	#include "SIDefines.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/BinaryFormat/ELF.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Module.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/MC/MCInstrInfo.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/MC/MCSectionELF.h"
	#include "llvm/MC/MCSubtargetInfo.h"
	#include "llvm/MC/SubtargetFeature.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <cstring>
	#include <utility>

	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

	#define GET_INSTRINFO_NAMED_OPS
	#define GET_INSTRMAP_INFO
	#include "AMDGPUGenInstrInfo.inc"
	#undef GET_INSTRMAP_INFO
	#undef GET_INSTRINFO_NAMED_OPS

	namespace {

	/// \returns Bit mask for given bit \p Shift and bit \p Width.
	///
	/// The mask is computed in unsigned arithmetic so wide fields do not depend
	/// on signed shift overflow. A \p Width that covers the whole of `unsigned`
	/// (or more) selects every bit from \p Shift upwards.
	unsigned getBitMask(unsigned Shift, unsigned Width) {
	  const unsigned Ones =
	      Width >= 8 * sizeof(unsigned) ? ~0u : (1u << Width) - 1u;
	  return Ones << Shift;
	}

	/// \brief Packs \p Src into \p Dst for given bit \p Shift and bit \p Width.
	///
	/// The destination field (and the bit at \p Shift) is cleared first, then the
	/// shifted source, limited to the field, is OR'ed in.
	///
	/// \returns Packed \p Dst.
	unsigned packBits(unsigned Src, unsigned Dst, unsigned Shift, unsigned Width) {
	  const unsigned FieldMask = getBitMask(Shift, Width);
	  const unsigned Cleared = Dst & (~(1 << Shift) & ~FieldMask);
	  return Cleared | ((Src << Shift) & FieldMask);
	}

	/// \brief Unpacks bits from \p Src for given bit \p Shift and bit \p Width.
	///
	/// \returns Unpacked bits.
	unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
	  const unsigned Field = Src & getBitMask(Shift, Width);
	  return Field >> Shift;
	}

	/// \returns Vmcnt bit shift (lower bits).
	unsigned getVmcntBitShiftLo() {
	  return 0u;
	}

	/// \returns Vmcnt bit width (lower bits).
	unsigned getVmcntBitWidthLo() {
	  return 4u;
	}

	/// \returns Expcnt bit shift.
	unsigned getExpcntBitShift() {
	  return 4u;
	}

	/// \returns Expcnt bit width.
	unsigned getExpcntBitWidth() {
	  return 3u;
	}

	/// \returns Lgkmcnt bit shift.
	unsigned getLgkmcntBitShift() {
	  return 8u;
	}

	/// \returns Lgkmcnt bit width.
	unsigned getLgkmcntBitWidth() {
	  return 4u;
	}

	/// \returns Vmcnt bit shift (higher bits).
	unsigned getVmcntBitShiftHi() {
	  return 14u;
	}

	/// \returns Vmcnt bit width (higher bits).
	unsigned getVmcntBitWidthHi() {
	  return 2u;
	}

	} // end namespace anonymous

	namespace llvm {

	// Boolean command-line option "enable-packed-inlinable-literals"; it is
	// initialized to false. Its consumers are outside this part of the file.
	static cl::opt<bool> EnablePackedInlinableLiterals(
	"enable-packed-inlinable-literals",
	cl::desc("Enable packed inlinable literals (v2f16, v2i16)"),
	cl::init(false));

	namespace AMDGPU {

	/// \returns The Channels enumerator for a channel count of 1 to 4; any other
	/// count is treated as unreachable.
	LLVM_READNONE
	static inline Channels indexToChannel(unsigned Channel) {
	  if (Channel == 1)
	    return AMDGPU::Channels_1;
	  if (Channel == 2)
	    return AMDGPU::Channels_2;
	  if (Channel == 3)
	    return AMDGPU::Channels_3;
	  if (Channel == 4)
	    return AMDGPU::Channels_4;
	  llvm_unreachable("invalid MIMG channel");
	}


	/// \returns The channel count (1 to 4) for the VGPR register class \p RCID;
	/// any other class is treated as unreachable.
	// FIXME: Need to handle d16 images correctly.
	static unsigned rcToChannels(unsigned RCID) {
	  if (RCID == AMDGPU::VGPR_32RegClassID)
	    return 1;
	  if (RCID == AMDGPU::VReg_64RegClassID)
	    return 2;
	  if (RCID == AMDGPU::VReg_96RegClassID)
	    return 3;
	  if (RCID == AMDGPU::VReg_128RegClassID)
	    return 4;
	  llvm_unreachable("invalid MIMG register class");
	}

	int getMaskedMIMGOp(const MCInstrInfo &MII, unsigned Opc, unsigned NewChannels) {
	AMDGPU::Channels Channel = AMDGPU::indexToChannel(NewChannels);
	unsigned OrigChannels = rcToChannels(MII.get(Opc).OpInfo[0].RegClass);
	if (NewChannels == OrigChannels)
	return Opc;

	switch (OrigChannels) {
	case 1:
	return AMDGPU::getMaskedMIMGOp1(Opc, Channel);
	case 2:
	return AMDGPU::getMaskedMIMGOp2(Opc, Channel);
	case 3:
	return AMDGPU::getMaskedMIMGOp3(Opc, Channel);
	case 4:
	return AMDGPU::getMaskedMIMGOp4(Opc, Channel);
	default:
	llvm_unreachable("invalid MIMG channel");
	}
	}

	// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
	// header files, so we need to wrap it in a function that takes unsigned
	// instead.
	int getMCOpcode(uint16_t Opcode, unsigned Gen) {
	  const Subtarget Generation = static_cast<Subtarget>(Gen);
	  return getMCOpcodeGen(Opcode, Generation);
	}

	namespace IsaInfo {

	/// \returns The ISA version for \p Features. Explicit ISA-version feature
	/// bits are checked in table order and the first one set wins. Without any,
	/// the result is {0, 0, 0} for non-GCN or Southern Islands feature sets and
	/// {7, 0, 0} otherwise.
	IsaVersion getIsaVersion(const FeatureBitset &Features) {
	  static const struct {
	    unsigned Feature;
	    IsaVersion Version;
	  } KnownVersions[] = {
	      // GCN GFX6 (Southern Islands (SI)).
	      {FeatureISAVersion6_0_0, {6, 0, 0}},
	      {FeatureISAVersion6_0_1, {6, 0, 1}},
	      // GCN GFX7 (Sea Islands (CI)).
	      {FeatureISAVersion7_0_0, {7, 0, 0}},
	      {FeatureISAVersion7_0_1, {7, 0, 1}},
	      {FeatureISAVersion7_0_2, {7, 0, 2}},
	      {FeatureISAVersion7_0_3, {7, 0, 3}},
	      {FeatureISAVersion7_0_4, {7, 0, 4}},
	      // GCN GFX8 (Volcanic Islands (VI)).
	      {FeatureISAVersion8_0_0, {8, 0, 0}},
	      {FeatureISAVersion8_0_1, {8, 0, 1}},
	      {FeatureISAVersion8_0_2, {8, 0, 2}},
	      {FeatureISAVersion8_0_3, {8, 0, 3}},
	      {FeatureISAVersion8_1_0, {8, 1, 0}},
	      // GCN GFX9.
	      {FeatureISAVersion9_0_0, {9, 0, 0}},
	      {FeatureISAVersion9_0_2, {9, 0, 2}},
	  };

	  for (const auto &Entry : KnownVersions)
	    if (Features.test(Entry.Feature))
	      return Entry.Version;

	  if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands))
	    return {0, 0, 0};
	  return {7, 0, 0};
	}

	void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
	auto TargetTriple = STI->getTargetTriple();
	auto ISAVersion = IsaInfo::getIsaVersion(STI->getFeatureBits());

	Stream << TargetTriple.getArchName() << '-'
	<< TargetTriple.getVendorName() << '-'
	<< TargetTriple.getOSName() << '-'
	<< TargetTriple.getEnvironmentName() << '-'
	<< "gfx"
	<< ISAVersion.Major
	<< ISAVersion.Minor
	<< ISAVersion.Stepping;
	Stream.flush();
	}

	/// \returns true if the FeatureCodeObjectV3 bit is set in \p Features.
	bool hasCodeObjectV3(const FeatureBitset &Features) {
	  const bool HasV3 = Features.test(FeatureCodeObjectV3);
	  return HasV3;
	}

	/// \returns 16 or 32 when the matching wavefront-size feature is set (16 is
	/// checked first), otherwise 64.
	unsigned getWavefrontSize(const FeatureBitset &Features) {
	  return Features.test(FeatureWavefrontSize16)   ? 16
	         : Features.test(FeatureWavefrontSize32) ? 32
	                                                 : 64;
	}

	/// \returns 32768 or 65536 when the matching local-memory-size feature is
	/// set (32768 is checked first), otherwise 0.
	unsigned getLocalMemorySize(const FeatureBitset &Features) {
	  return Features.test(FeatureLocalMemorySize32768)   ? 32768
	         : Features.test(FeatureLocalMemorySize65536) ? 65536
	                                                      : 0;
	}

	/// \returns Number of execution units per compute unit; currently always 4
	/// regardless of \p Features.
	unsigned getEUsPerCU(const FeatureBitset &Features) {
	  const unsigned EUsPerCU = 4;
	  return EUsPerCU;
	}

	/// \returns Maximum number of work groups per compute unit for \p Features
	/// and \p FlatWorkGroupSize.
	///
	/// Non-GCN feature sets always yield 8. For GCN the result is 40 when the
	/// work group fits in one wave, otherwise 40 divided by the number of waves
	/// per work group, capped at 16.
	///
	/// On GCN, \p FlatWorkGroupSize must be non-zero: a zero size produces zero
	/// waves per work group and would divide by zero.
	unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
	                               unsigned FlatWorkGroupSize) {
	  if (!Features.test(FeatureGCN))
	    return 8;

	  assert(FlatWorkGroupSize != 0);
	  unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize);
	  if (N == 1)
	    return 40;
	  N = 40 / N;
	  return std::min(N, 16u);
	}

	/// \returns Maximum waves per compute unit: the per-EU maximum multiplied by
	/// the number of EUs per CU.
	unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
	  const unsigned WavesPerEU = getMaxWavesPerEU(Features);
	  return WavesPerEU * getEUsPerCU(Features);
	}

	/// \returns Waves per compute unit for \p FlatWorkGroupSize; this is the
	/// number of waves per work group.
	unsigned getMaxWavesPerCU(const FeatureBitset &Features,
	                          unsigned FlatWorkGroupSize) {
	  const unsigned Waves = getWavesPerWorkGroup(Features, FlatWorkGroupSize);
	  return Waves;
	}

	/// \returns Minimum number of waves per execution unit; always 1.
	unsigned getMinWavesPerEU(const FeatureBitset &Features) {
	  const unsigned MinWaves = 1;
	  return MinWaves;
	}

	/// \returns Maximum number of waves per execution unit: 10 for GCN feature
	/// sets, 8 otherwise.
	unsigned getMaxWavesPerEU(const FeatureBitset &Features) {
	  // FIXME: Need to take scratch memory into account.
	  return Features.test(FeatureGCN) ? 10 : 8;
	}

	/// \returns Waves per execution unit for \p FlatWorkGroupSize: the per-CU
	/// wave count divided by the number of EUs per CU, rounded up.
	unsigned getMaxWavesPerEU(const FeatureBitset &Features,
	                          unsigned FlatWorkGroupSize) {
	  const unsigned WavesPerCU = getMaxWavesPerCU(Features, FlatWorkGroupSize);
	  const unsigned EUs = getEUsPerCU(Features);
	  return alignTo(WavesPerCU, EUs) / EUs;
	}

	/// \returns Minimum flat work group size; always 1.
	unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features) {
	  const unsigned MinSize = 1;
	  return MinSize;
	}

	/// \returns Maximum flat work group size; always 2048.
	unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features) {
	  const unsigned MaxSize = 2048;
	  return MaxSize;
	}

	/// \returns Number of waves needed for a work group of \p FlatWorkGroupSize
	/// work items: the size divided by the wavefront size, rounded up.
	unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
	                              unsigned FlatWorkGroupSize) {
	  const unsigned WaveSize = getWavefrontSize(Features);
	  return alignTo(FlatWorkGroupSize, WaveSize) / WaveSize;
	}

	unsigned getSGPRAllocGranule(const FeatureBitset &Features) {
	IsaVersion Version = getIsaVersion(Features);
	if (Version.Major >= 8)
	return 16;
	return 8;
	}

	/// \returns SGPR encoding granule; always 8.
	unsigned getSGPREncodingGranule(const FeatureBitset &Features) {
	  const unsigned Granule = 8;
	  return Granule;
	}

	unsigned getTotalNumSGPRs(const FeatureBitset &Features) {
	IsaVersion Version = getIsaVersion(Features);
	if (Version.Major >= 8)
	return 800;
	return 512;
	}

	unsigned getAddressableNumSGPRs(const FeatureBitset &Features) {
	if (Features.test(FeatureSGPRInitBug))
	return FIXED_NUM_SGPRS_FOR_INIT_BUG;

	IsaVersion Version = getIsaVersion(Features);
	if (Version.Major >= 8)
	return 102;
	return 104;
	}

	/// \returns Minimum number of SGPRs for \p WavesPerEU waves: 0 when
	/// \p WavesPerEU is already at or above the maximum, otherwise one more than
	/// the granule-aligned share for WavesPerEU + 1 waves, capped at the
	/// addressable SGPR count. \p WavesPerEU must be non-zero.
	unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
	  assert(WavesPerEU != 0);

	  if (WavesPerEU >= getMaxWavesPerEU(Features))
	    return 0;

	  const unsigned ShareForOneMoreWave =
	      getTotalNumSGPRs(Features) / (WavesPerEU + 1);
	  unsigned MinNumSGPRs =
	      alignDown(ShareForOneMoreWave, getSGPRAllocGranule(Features)) + 1;
	  return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features));
	}

	unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
	bool Addressable) {
	assert(WavesPerEU != 0);

	IsaVersion Version = getIsaVersion(Features);
	unsigned MaxNumSGPRs = alignDown(getTotalNumSGPRs(Features) / WavesPerEU,
	getSGPRAllocGranule(Features));
	unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features);
	if (Version.Major >= 8 && !Addressable)
	AddressableNumSGPRs = 112;
	return std::min(MaxNumSGPRs, AddressableNumSGPRs);
	}

	/// \returns VGPR allocation granule; always 4.
	unsigned getVGPRAllocGranule(const FeatureBitset &Features) {
	  const unsigned Granule = 4;
	  return Granule;
	}

	/// \returns VGPR encoding granule; same as the VGPR allocation granule.
	unsigned getVGPREncodingGranule(const FeatureBitset &Features) {
	  const unsigned Granule = getVGPRAllocGranule(Features);
	  return Granule;
	}

	/// \returns Total number of VGPRs; always 256.
	unsigned getTotalNumVGPRs(const FeatureBitset &Features) {
	  const unsigned Total = 256;
	  return Total;
	}

	/// \returns Number of addressable VGPRs; same as the total VGPR count.
	unsigned getAddressableNumVGPRs(const FeatureBitset &Features) {
	  const unsigned Total = getTotalNumVGPRs(Features);
	  return Total;
	}

	/// \returns Minimum number of VGPRs for \p WavesPerEU waves: 0 when
	/// \p WavesPerEU is already at or above the maximum, otherwise one more than
	/// the granule-aligned share for WavesPerEU + 1 waves, capped at the
	/// addressable VGPR count. \p WavesPerEU must be non-zero.
	unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
	  assert(WavesPerEU != 0);

	  if (WavesPerEU >= getMaxWavesPerEU(Features))
	    return 0;

	  const unsigned ShareForOneMoreWave =
	      getTotalNumVGPRs(Features) / (WavesPerEU + 1);
	  unsigned MinNumVGPRs =
	      alignDown(ShareForOneMoreWave, getVGPRAllocGranule(Features)) + 1;
	  return std::min(MinNumVGPRs, getAddressableNumVGPRs(Features));
	}

	unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
	assert(WavesPerEU != 0);

	unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(Features) / WavesPerEU,
	getVGPRAllocGranule(Features));
	unsigned AddressableNumVGPRs = getAddressableNumVGPRs(Features);
	return std::min(MaxNumVGPRs, AddressableNumVGPRs);
	}

	} // end namespace IsaInfo

	/// \brief Zero \p Header and fill in its default values: code version 1.1,
	/// machine kind and ISA version from \p Features, entry offset, wavefront
	/// size, call convention and segment alignments.
	void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
	                               const FeatureBitset &Features) {
	  const IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(Features);

	  memset(&Header, 0, sizeof(Header));

	  // Code object version.
	  Header.amd_kernel_code_version_major = 1;
	  Header.amd_kernel_code_version_minor = 1;

	  // Target machine.
	  Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
	  Header.amd_machine_version_major = ISA.Major;
	  Header.amd_machine_version_minor = ISA.Minor;
	  Header.amd_machine_version_stepping = ISA.Stepping;

	  Header.kernel_code_entry_byte_offset = sizeof(Header);

	  // wavefront_size is specified as a power of 2: 2^6 = 64 threads.
	  Header.wavefront_size = 6;

	  // If the code object does not support indirect functions, then the value must
	  // be 0xffffffff.
	  Header.call_convention = -1;

	  // These alignment values are specified in powers of two, so alignment =
	  // 2^n. The minimum alignment is 2^4 = 16.
	  Header.kernarg_segment_alignment = 4;
	  Header.group_segment_alignment = 4;
	  Header.private_segment_alignment = 4;
	}

	/// \returns true if \p GV lives in the LOCAL_ADDRESS address space.
	bool isGroupSegment(const GlobalValue *GV) {
	  const unsigned AS = GV->getType()->getAddressSpace();
	  return AS == AMDGPUAS::LOCAL_ADDRESS;
	}

	/// \returns true if \p GV lives in the GLOBAL_ADDRESS address space.
	bool isGlobalSegment(const GlobalValue *GV) {
	  const unsigned AS = GV->getType()->getAddressSpace();
	  return AS == AMDGPUAS::GLOBAL_ADDRESS;
	}

	/// \returns true if \p GV lives in the CONSTANT_ADDRESS address space.
	bool isReadOnlySegment(const GlobalValue *GV) {
	  const unsigned AS = GV->getType()->getAddressSpace();
	  return AS == AMDGPUAS::CONSTANT_ADDRESS;
	}

	/// \returns true for every target OS except AMDHSA.
	bool shouldEmitConstantsToTextSection(const Triple &TT) {
	  const bool IsHSA = TT.getOS() == Triple::AMDHSA;
	  return !IsHSA;
	}

	int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
	Attribute A = F.getFnAttribute(Name);
	int Result = Default;

	if (A.isStringAttribute()) {
	StringRef Str = A.getValueAsString();
	if (Str.getAsInteger(0, Result)) {
	LLVMContext &Ctx = F.getContext();
	Ctx.emitError("can't parse integer attribute " + Name);
	}
	}

	return Result;
	}

	std::pair<int, int> getIntegerPairAttribute(const Function &F,
	StringRef Name,
	std::pair<int, int> Default,
	bool OnlyFirstRequired) {
	Attribute A = F.getFnAttribute(Name);
	if (!A.isStringAttribute())
	return Default;

	LLVMContext &Ctx = F.getContext();
	std::pair<int, int> Ints = Default;
	std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(',');
	if (Strs.first.trim().getAsInteger(0, Ints.first)) {
	Ctx.emitError("can't parse first integer attribute " + Name);
	return Default;
	}
	if (Strs.second.trim().getAsInteger(0, Ints.second)) {
	if (!OnlyFirstRequired \|\| !Strs.second.trim().empty()) {
	Ctx.emitError("can't parse second integer attribute " + Name);
	return Default;
	}
	}

	return Ints;
	}

	/// \returns Mask covering every valid Vmcnt value for \p Version: the low
	/// field only before major version 9, low and high fields combined from
	/// version 9 on.
	unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) {
	  const unsigned LoMask = (1 << getVmcntBitWidthLo()) - 1;
	  if (Version.Major >= 9) {
	    const unsigned HiMask = ((1 << getVmcntBitWidthHi()) - 1)
	                            << getVmcntBitWidthLo();
	    return LoMask | HiMask;
	  }
	  return LoMask;
	}

	/// \returns Mask covering every valid Expcnt value; \p Version is not
	/// consulted.
	unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) {
	  const unsigned Width = getExpcntBitWidth();
	  return (1 << Width) - 1;
	}

	/// \returns Mask covering every valid Lgkmcnt value; \p Version is not
	/// consulted.
	unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) {
	  const unsigned Width = getLgkmcntBitWidth();
	  return (1 << Width) - 1;
	}

	/// \returns Mask of all counter fields inside an encoded Waitcnt for
	/// \p Version; the high Vmcnt field is included from major version 9 on.
	unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) {
	  unsigned Mask = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo());
	  Mask |= getBitMask(getExpcntBitShift(), getExpcntBitWidth());
	  Mask |= getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
	  if (Version.Major >= 9)
	    Mask |= getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi());
	  return Mask;
	}

	/// \returns Vmcnt decoded from \p Waitcnt for \p Version. From major version
	/// 9 on, the high field is placed above the low field's bits.
	unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
	  const unsigned Lo =
	      unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
	  if (Version.Major < 9)
	    return Lo;

	  const unsigned Hi =
	      unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
	  return Lo | (Hi << getVmcntBitWidthLo());
	}

	/// \returns Expcnt decoded from \p Waitcnt; \p Version is not consulted.
	unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
	  const unsigned Shift = getExpcntBitShift();
	  const unsigned Width = getExpcntBitWidth();
	  return unpackBits(Waitcnt, Shift, Width);
	}

	/// \returns Lgkmcnt decoded from \p Waitcnt; \p Version is not consulted.
	unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
	  const unsigned Shift = getLgkmcntBitShift();
	  const unsigned Width = getLgkmcntBitWidth();
	  return unpackBits(Waitcnt, Shift, Width);
	}

	/// \brief Decodes \p Waitcnt for \p Version into its three counters, written
	/// to \p Vmcnt, \p Expcnt and \p Lgkmcnt in that order.
	void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
	                   unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) {
	  Vmcnt   = decodeVmcnt(Version, Waitcnt);
	  Expcnt  = decodeExpcnt(Version, Waitcnt);
	  Lgkmcnt = decodeLgkmcnt(Version, Waitcnt);
	}

	/// \returns \p Waitcnt with \p Vmcnt encoded for \p Version. The low bits
	/// always go to the low field; from major version 9 on the bits above them
	/// go to the high field.
	unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
	                     unsigned Vmcnt) {
	  const unsigned WithLo =
	      packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
	  if (Version.Major < 9)
	    return WithLo;

	  const unsigned HiPart = Vmcnt >> getVmcntBitWidthLo();
	  return packBits(HiPart, WithLo, getVmcntBitShiftHi(), getVmcntBitWidthHi());
	}

	/// \returns \p Waitcnt with \p Expcnt encoded; \p Version is not consulted.
	unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
	                      unsigned Expcnt) {
	  const unsigned Shift = getExpcntBitShift();
	  const unsigned Width = getExpcntBitWidth();
	  return packBits(Expcnt, Waitcnt, Shift, Width);
	}

	/// \returns \p Waitcnt with \p Lgkmcnt encoded; \p Version is not consulted.
	unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
	                       unsigned Lgkmcnt) {
	  const unsigned Shift = getLgkmcntBitShift();
	  const unsigned Width = getLgkmcntBitWidth();
	  return packBits(Lgkmcnt, Waitcnt, Shift, Width);
	}

	/// \returns Waitcnt for \p Version with \p Vmcnt, \p Expcnt and \p Lgkmcnt
	/// encoded, starting from the full Waitcnt bit mask.
	unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
	                       unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
	  const unsigned AllSet = getWaitcntBitMask(Version);
	  const unsigned WithVm = encodeVmcnt(Version, AllSet, Vmcnt);
	  const unsigned WithExp = encodeExpcnt(Version, WithVm, Expcnt);
	  return encodeLgkmcnt(Version, WithExp, Lgkmcnt);
	}

	/// \returns The value of the "InitialPSInputAddr" function attribute on
	/// \p F, or 0 when it is absent.
	unsigned getInitialPSInputAddr(const Function &F) {
	  const int Addr = getIntegerAttribute(F, "InitialPSInputAddr", 0);
	  return Addr;
	}

	/// \returns true if \p cc is one of the AMDGPU shader calling conventions
	/// (VS, LS, HS, ES, GS, PS, CS).
	bool isShader(CallingConv::ID cc) {
	  return cc == CallingConv::AMDGPU_VS || cc == CallingConv::AMDGPU_LS ||
	         cc == CallingConv::AMDGPU_HS || cc == CallingConv::AMDGPU_ES ||
	         cc == CallingConv::AMDGPU_GS || cc == CallingConv::AMDGPU_PS ||
	         cc == CallingConv::AMDGPU_CS;
	}

	/// \returns true if \p cc is AMDGPU_CS or is not a shader calling convention
	/// at all.
	bool isCompute(CallingConv::ID cc) {
	  if (cc == CallingConv::AMDGPU_CS)
	    return true;
	  return !isShader(cc);
	}

	/// \returns true if \p CC is a kernel calling convention (AMDGPU_KERNEL,
	/// SPIR_KERNEL) or one of the AMDGPU shader calling conventions.
	bool isEntryFunctionCC(CallingConv::ID CC) {
	  const bool IsKernel =
	      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
	  const bool IsShaderCC =
	      CC == CallingConv::AMDGPU_VS || CC == CallingConv::AMDGPU_GS ||
	      CC == CallingConv::AMDGPU_PS || CC == CallingConv::AMDGPU_CS ||
	      CC == CallingConv::AMDGPU_ES || CC == CallingConv::AMDGPU_HS ||
	      CC == CallingConv::AMDGPU_LS;
	  return IsKernel || IsShaderCC;
	}

	/// \returns true if \p STI has the FeatureSouthernIslands bit set.
	bool isSI(const MCSubtargetInfo &STI) {
	  const auto &Bits = STI.getFeatureBits();
	  return Bits[AMDGPU::FeatureSouthernIslands];
	}

	/// \returns true if \p STI has the FeatureSeaIslands bit set.
	bool isCI(const MCSubtargetInfo &STI) {
	  const auto &Bits = STI.getFeatureBits();
	  return Bits[AMDGPU::FeatureSeaIslands];
	}

	/// \returns true if \p STI has the FeatureVolcanicIslands bit set.
	bool isVI(const MCSubtargetInfo &STI) {
	  const auto &Bits = STI.getFeatureBits();
	  return Bits[AMDGPU::FeatureVolcanicIslands];
	}

	/// \returns true if \p STI has the FeatureGFX9 bit set.
	bool isGFX9(const MCSubtargetInfo &STI) {
	  const auto &Bits = STI.getFeatureBits();
	  return Bits[AMDGPU::FeatureGFX9];
	}

	/// \returns true if \p STI has the FeatureGCN3Encoding bit set.
	bool isGCN3Encoding(const MCSubtargetInfo &STI) {
	  const auto &Bits = STI.getFeatureBits();
	  return Bits[AMDGPU::FeatureGCN3Encoding];
	}

	/// \returns true if \p Reg is SCC, or if \p Reg (or its sub-register with
	/// index 1, when that exists) belongs to the SReg_32 register class.
	bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
	  const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
	  const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);

	  // Wide registers are classified by their first sub-register.
	  const unsigned Probe = FirstSubReg != 0 ? FirstSubReg : Reg;
	  if (SGPRClass.contains(Probe))
	    return true;
	  return Reg == AMDGPU::SCC;
	}

	bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI) {
	for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) {
	if (*R == Reg1) return true;
	}
	return false;
	}

	// Switch body expanded inside a function that has a variable named Reg in
	// scope. Registers not listed are returned unchanged (the default label);
	// every listed register is handled by CASE_CI_VI or CASE_VI_GFX9, which must
	// be defined before this macro is expanded.
	#define MAP_REG2REG \
	using namespace AMDGPU; \
	switch(Reg) { \
	default: return Reg; \
	CASE_CI_VI(FLAT_SCR) \
	CASE_CI_VI(FLAT_SCR_LO) \
	CASE_CI_VI(FLAT_SCR_HI) \
	CASE_VI_GFX9(TTMP0) \
	CASE_VI_GFX9(TTMP1) \
	CASE_VI_GFX9(TTMP2) \
	CASE_VI_GFX9(TTMP3) \
	CASE_VI_GFX9(TTMP4) \
	CASE_VI_GFX9(TTMP5) \
	CASE_VI_GFX9(TTMP6) \
	CASE_VI_GFX9(TTMP7) \
	CASE_VI_GFX9(TTMP8) \
	CASE_VI_GFX9(TTMP9) \
	CASE_VI_GFX9(TTMP10) \
	CASE_VI_GFX9(TTMP11) \
	CASE_VI_GFX9(TTMP12) \
	CASE_VI_GFX9(TTMP13) \
	CASE_VI_GFX9(TTMP14) \
	CASE_VI_GFX9(TTMP15) \
	CASE_VI_GFX9(TTMP0_TTMP1) \
	CASE_VI_GFX9(TTMP2_TTMP3) \
	CASE_VI_GFX9(TTMP4_TTMP5) \
	CASE_VI_GFX9(TTMP6_TTMP7) \
	CASE_VI_GFX9(TTMP8_TTMP9) \
	CASE_VI_GFX9(TTMP10_TTMP11) \
	CASE_VI_GFX9(TTMP12_TTMP13) \
	CASE_VI_GFX9(TTMP14_TTMP15) \
	CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3) \
	CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7) \
	CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11) \
	CASE_VI_GFX9(TTMP12_TTMP13_TTMP14_TTMP15) \
	CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7) \
	CASE_VI_GFX9(TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11) \
	CASE_VI_GFX9(TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
	CASE_VI_GFX9(TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TTMP12_TTMP13_TTMP14_TTMP15) \
	}

	// Forward mapping: a listed register becomes its _ci variant when isCI(STI)
	// holds and its _vi variant otherwise.
	// NOTE(review): the assert is emitted before the case label, i.e. right after
	// the preceding `return` inside the switch, so control does not appear to
	// reach it - confirm whether it was meant to follow the label.
	#define CASE_CI_VI(node) \
	assert(!isSI(STI)); \
	case node: return isCI(STI) ? node##_ci : node##_vi;

	// Forward mapping: a listed register becomes its _gfx9 variant when
	// isGFX9(STI) holds and its _vi variant otherwise.
	#define CASE_VI_GFX9(node) \
	case node: return isGFX9(STI) ? node##_gfx9 : node##_vi;

	// Maps \p Reg to its subtarget-specific variant using the two case macros
	// above; registers not listed in MAP_REG2REG are returned unchanged.
	unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
	MAP_REG2REG
	}

	// Drop the forward-mapping case macros so they can be redefined below.
	#undef CASE_CI_VI
	#undef CASE_VI_GFX9

	// Reverse mapping: both subtarget-specific variants of a register collapse
	// back to the common register name.
	#define CASE_CI_VI(node) case node##_ci: case node##_vi: return node;
	#define CASE_VI_GFX9(node) case node##_vi: case node##_gfx9: return node;

	// Maps a _ci/_vi/_gfx9 register variant back to the register it was derived
	// from; any register not listed in MAP_REG2REG is returned unchanged.
	unsigned mc2PseudoReg(unsigned Reg) {
	MAP_REG2REG
	}

	// The helper macros are local to the two functions above.
	#undef CASE_CI_VI
	#undef CASE_VI_GFX9
	#undef MAP_REG2REG

	/// \returns true if the operand type of operand \p OpNo of \p Desc lies in
	/// the inclusive range [OPERAND_SRC_FIRST, OPERAND_SRC_LAST].
	bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
	  assert(OpNo < Desc.NumOperands);
	  const unsigned Kind = Desc.OpInfo[OpNo].OperandType;
	  if (Kind < AMDGPU::OPERAND_SRC_FIRST)
	    return false;
	  return Kind <= AMDGPU::OPERAND_SRC_LAST;
	}

	/// \returns true if the operand type of operand \p OpNo of \p Desc is one of
	/// the FP16/FP32/FP64 immediate or inline-constant types, or V2FP16.
	bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
	  assert(OpNo < Desc.NumOperands);

	  bool IsFP = false;
	  switch (Desc.OpInfo[OpNo].OperandType) {
	  case AMDGPU::OPERAND_REG_IMM_FP16:
	  case AMDGPU::OPERAND_REG_IMM_FP32:
	  case AMDGPU::OPERAND_REG_IMM_FP64:
	  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
	  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
	  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
	  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
	    IsFP = true;
	    break;
	  default:
	    break;
	  }
	  return IsFP;
	}

	/// \returns true if the operand type of operand \p OpNo of \p Desc lies in
	/// the inclusive range [OPERAND_REG_INLINE_C_FIRST, OPERAND_REG_INLINE_C_LAST].
	bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
	  assert(OpNo < Desc.NumOperands);
	  const unsigned Kind = Desc.OpInfo[OpNo].OperandType;
	  if (Kind < AMDGPU::OPERAND_REG_INLINE_C_FIRST)
	    return false;
	  return Kind <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
	}

	// MCRegisterClass::getSize is slated to move from the MC* level to the
	// Target* level, so the width is looked up from the class ID instead.
	// Returns the register size in bits; an ID that is not listed is treated as
	// unreachable.
	unsigned getRegBitWidth(unsigned RCID) {
	  switch (RCID) {
	  // 512-bit classes.
	  case AMDGPU::VReg_512RegClassID:
	  case AMDGPU::SReg_512RegClassID:
	    return 512;

	  // 256-bit classes.
	  case AMDGPU::VReg_256RegClassID:
	  case AMDGPU::SReg_256RegClassID:
	    return 256;

	  // 128-bit classes.
	  case AMDGPU::VReg_128RegClassID:
	  case AMDGPU::SReg_128RegClassID:
	  case AMDGPU::SGPR_128RegClassID:
	    return 128;

	  // 96-bit class.
	  case AMDGPU::VReg_96RegClassID:
	    return 96;

	  // 64-bit classes.
	  case AMDGPU::VReg_64RegClassID:
	  case AMDGPU::SReg_64RegClassID:
	  case AMDGPU::VS_64RegClassID:
	  case AMDGPU::SGPR_64RegClassID:
	    return 64;

	  // 32-bit classes.
	  case AMDGPU::SReg_32_XM0RegClassID:
	  case AMDGPU::SReg_32RegClassID:
	  case AMDGPU::VS_32RegClassID:
	  case AMDGPU::VGPR_32RegClassID:
	  case AMDGPU::SGPR_32RegClassID:
	    return 32;

	  default:
	    llvm_unreachable("Unexpected register class");
	  }
	}

	/// Convenience overload: forwards the ID of \p RC to the ID-based overload.
	unsigned getRegBitWidth(const MCRegisterClass &RC) {
	  const unsigned ClassID = RC.getID();
	  return getRegBitWidth(ClassID);
	}

	/// \returns the size in bytes (bit width divided by 8) of the register class
	/// of operand \p OpNo of \p Desc.
	unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
	                           unsigned OpNo) {
	  assert(OpNo < Desc.NumOperands);
	  const unsigned ClassID = Desc.OpInfo[OpNo].RegClass;
	  const auto &OperandClass = MRI->getRegClass(ClassID);
	  const unsigned WidthInBits = getRegBitWidth(OperandClass);
	  return WidthInBits / 8;
	}

	/// \returns true if \p Literal is an integer in [-16, 64], or if its bit
	/// pattern equals that of the doubles 0.0, +-0.5, +-1.0, +-2.0 or +-4.0, or
	/// equals 0x3fc45f306dc9c882 while \p HasInv2Pi is set.
	bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
	  // Small integers are always inlinable.
	  if (Literal >= -16 && Literal <= 64)
	    return true;

	  const uint64_t Bits = static_cast<uint64_t>(Literal);
	  const uint64_t InlineFPBits[] = {
	      DoubleToBits(0.0),  DoubleToBits(1.0),  DoubleToBits(-1.0),
	      DoubleToBits(0.5),  DoubleToBits(-0.5), DoubleToBits(2.0),
	      DoubleToBits(-2.0), DoubleToBits(4.0),  DoubleToBits(-4.0)};
	  for (const uint64_t Candidate : InlineFPBits) {
	    if (Bits == Candidate)
	      return true;
	  }

	  // This last pattern is only accepted when the caller reports HasInv2Pi.
	  return Bits == 0x3fc45f306dc9c882 && HasInv2Pi;
	}

	/// \returns true if \p Literal is an integer in [-16, 64], or if its bit
	/// pattern equals that of the floats 0.0, +-0.5, +-1.0, +-2.0 or +-4.0, or
	/// equals 0x3e22f983 while \p HasInv2Pi is set.
	bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
	  // Small integers are always inlinable.
	  if (Literal >= -16 && Literal <= 64)
	    return true;

	  // Only the bit pattern matters, not the nominal type of the operand:
	  //
	  // -nan is encoded as 0xfffffffe, which reads as -2 in decimal, so it is a
	  // legal inline immediate.
	  //
	  // 1065353216 is encoded as 0x3f800000, which reads as 1.0f in
	  // floating-point, so it is a legal inline immediate as well.

	  const uint32_t Bits = static_cast<uint32_t>(Literal);
	  const uint32_t InlineFPBits[] = {
	      FloatToBits(0.0f),  FloatToBits(1.0f),  FloatToBits(-1.0f),
	      FloatToBits(0.5f),  FloatToBits(-0.5f), FloatToBits(2.0f),
	      FloatToBits(-2.0f), FloatToBits(4.0f),  FloatToBits(-4.0f)};
	  for (const uint32_t Candidate : InlineFPBits) {
	    if (Bits == Candidate)
	      return true;
	  }

	  // This last pattern is only accepted when the caller reports HasInv2Pi.
	  return Bits == 0x3e22f983 && HasInv2Pi;
	}

	/// \returns false whenever \p HasInv2Pi is false. Otherwise returns true if
	/// \p Literal is an integer in [-16, 64] or its 16-bit pattern is one of the
	/// half-precision encodings listed below.
	bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
	  if (!HasInv2Pi)
	    return false;

	  if (Literal >= -16 && Literal <= 64)
	    return true;

	  switch (static_cast<uint16_t>(Literal)) {
	  case 0x3C00: // 1.0
	  case 0xBC00: // -1.0
	  case 0x3800: // 0.5
	  case 0xB800: // -0.5
	  case 0x4000: // 2.0
	  case 0xC000: // -2.0
	  case 0x4400: // 4.0
	  case 0xC400: // -4.0
	  case 0x3118: // 1/2pi
	    return true;
	  default:
	    return false;
	  }
	}

	/// \returns true if EnablePackedInlinableLiterals is set, both 16-bit halves
	/// of \p Literal are equal, and that half is an inlinable 16-bit literal.
	/// Asserts that \p HasInv2Pi is true.
	bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
	  assert(HasInv2Pi);

	  if (EnablePackedInlinableLiterals) {
	    const int16_t Low = static_cast<int16_t>(Literal);
	    const int16_t High = static_cast<int16_t>(Literal >> 16);
	    if (Low == High)
	      return isInlinableLiteral16(Low, HasInv2Pi);
	  }
	  return false;
	}

	bool isArgPassedInSGPR(const Argument *A) {
	const Function *F = A->getParent();

	// Arguments to compute shaders are never a source of divergence.
	CallingConv::ID CC = F->getCallingConv();
	switch (CC) {
	case CallingConv::AMDGPU_KERNEL:
	case CallingConv::SPIR_KERNEL:
	return true;
	case CallingConv::AMDGPU_VS:
	case CallingConv::AMDGPU_LS:
	case CallingConv::AMDGPU_HS:
	case CallingConv::AMDGPU_ES:
	case CallingConv::AMDGPU_GS:
	case CallingConv::AMDGPU_PS:
	case CallingConv::AMDGPU_CS:
	// For non-compute shaders, SGPR inputs are marked with either inreg or byval.
	// Everything else is in VGPRs.
	return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) \|\|
	F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
	default:
	// TODO: Should calls support inreg for SGPR inputs?
	return false;
	}
	}

	-// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
	-bool isUniformMMO(const MachineMemOperand *MMO) {
	- const Value *Ptr = MMO->getValue();
	- // UndefValue means this is a load of a kernel input. These are uniform.
	- // Sometimes LDS instructions have constant pointers.
	- // If Ptr is null, then that means this mem operand contains a
	- // PseudoSourceValue like GOT.
	- if (!Ptr \|\| isa<UndefValue>(Ptr) \|\|
	- isa<Constant>(Ptr) \|\| isa<GlobalValue>(Ptr))
	- return true;
	-
	- if (const Argument *Arg = dyn_cast<Argument>(Ptr))
	- return isArgPassedInSGPR(Arg);
	-
	- const Instruction *I = dyn_cast<Instruction>(Ptr);
	- return I && I->getMetadata("amdgpu.uniform");
	-}
	-
	/// \returns \p ByteOffset unchanged when \p ST uses the GCN3 encoding, and
	/// \p ByteOffset shifted right by two bits otherwise.
	int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
	  const bool KeepByteOffset = isGCN3Encoding(ST);
	  return KeepByteOffset ? ByteOffset : ByteOffset >> 2;
	}

	bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
	int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset);
	return isGCN3Encoding(ST) ?
	isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
	}

	} // end namespace AMDGPU

	} // end namespace llvm

	namespace llvm {
	namespace AMDGPU {

	/// Builds the address-space numbering for triple \p T. When the environment
	/// name is "amdgiz" or "amdgizcl" the result has FLAT = 0, PRIVATE = 5 and
	/// REGION = 4; for every other environment FLAT = 4, PRIVATE = 0 and
	/// REGION = 5.
	AMDGPUAS getAMDGPUAS(Triple T) {
	  auto Env = T.getEnvironmentName();

	  bool FlatIsZero = false;
	  if (Env == "amdgiz")
	    FlatIsZero = true;
	  else if (Env == "amdgizcl")
	    FlatIsZero = true;

	  AMDGPUAS AS;
	  AS.FLAT_ADDRESS = FlatIsZero ? 0 : 4;
	  AS.PRIVATE_ADDRESS = FlatIsZero ? 5 : 0;
	  AS.REGION_ADDRESS = FlatIsZero ? 4 : 5;
	  return AS;
	}

	/// Convenience overload: uses the target triple of \p M.
	AMDGPUAS getAMDGPUAS(const TargetMachine &M) {
	  const auto &TT = M.getTargetTriple();
	  return getAMDGPUAS(TT);
	}

	/// Convenience overload: builds a Triple from the target triple of \p M.
	AMDGPUAS getAMDGPUAS(const Module &M) {
	  const Triple TT(M.getTargetTriple());
	  return getAMDGPUAS(TT);
	}
	} // namespace AMDGPU
	} // namespace llvm
	Index: head/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
	===================================================================
	--- head/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (revision 329409)
	+++ head/contrib/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (revision 329410)
	@@ -1,380 +1,379 @@
	//===- AMDGPUBaseInfo.h - Top level definitions for AMDGPU ------- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
	#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H

	#include "AMDGPU.h"
	#include "AMDKernelCodeT.h"
	#include "SIDefines.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/ErrorHandling.h"
	#include <cstdint>
	#include <string>
	#include <utility>

	namespace llvm {

	class Argument;
	class FeatureBitset;
	class Function;
	class GlobalValue;
	class MachineMemOperand;
	class MCContext;
	class MCRegisterClass;
	class MCRegisterInfo;
	class MCSection;
	class MCSubtargetInfo;
	class Triple;

	namespace AMDGPU {
	namespace IsaInfo {

	enum {
	// The closed Vulkan driver sets 96, which limits the wave count to 8 but
	// doesn't spill SGPRs as much as when 80 is set.
	FIXED_NUM_SGPRS_FOR_INIT_BUG = 96
	};

	/// \brief Instruction set architecture version.
	struct IsaVersion {
	unsigned Major;
	unsigned Minor;
	unsigned Stepping;
	};

	/// \returns Isa version for given subtarget \p Features.
	IsaVersion getIsaVersion(const FeatureBitset &Features);

	/// \brief Streams isa version string for given subtarget \p STI into \p Stream.
	void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);

	/// \returns True if given subtarget \p Features support code object version 3,
	/// false otherwise.
	bool hasCodeObjectV3(const FeatureBitset &Features);

	/// \returns Wavefront size for given subtarget \p Features.
	unsigned getWavefrontSize(const FeatureBitset &Features);

	/// \returns Local memory size in bytes for given subtarget \p Features.
	unsigned getLocalMemorySize(const FeatureBitset &Features);

	/// \returns Number of execution units per compute unit for given subtarget \p
	/// Features.
	unsigned getEUsPerCU(const FeatureBitset &Features);

	/// \returns Maximum number of work groups per compute unit for given subtarget
	/// \p Features and limited by given \p FlatWorkGroupSize.
	unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
	unsigned FlatWorkGroupSize);

	/// \returns Maximum number of waves per compute unit for given subtarget \p
	/// Features without any kind of limitation.
	unsigned getMaxWavesPerCU(const FeatureBitset &Features);

	/// \returns Maximum number of waves per compute unit for given subtarget \p
	/// Features and limited by given \p FlatWorkGroupSize.
	unsigned getMaxWavesPerCU(const FeatureBitset &Features,
	unsigned FlatWorkGroupSize);

	/// \returns Minimum number of waves per execution unit for given subtarget \p
	/// Features.
	unsigned getMinWavesPerEU(const FeatureBitset &Features);

	/// \returns Maximum number of waves per execution unit for given subtarget \p
	/// Features without any kind of limitation.
	unsigned getMaxWavesPerEU(const FeatureBitset &Features);

	/// \returns Maximum number of waves per execution unit for given subtarget \p
	/// Features and limited by given \p FlatWorkGroupSize.
	unsigned getMaxWavesPerEU(const FeatureBitset &Features,
	unsigned FlatWorkGroupSize);

	/// \returns Minimum flat work group size for given subtarget \p Features.
	unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features);

	/// \returns Maximum flat work group size for given subtarget \p Features.
	unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features);

	/// \returns Number of waves per work group for given subtarget \p Features and
	/// limited by given \p FlatWorkGroupSize.
	unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
	unsigned FlatWorkGroupSize);

	/// \returns SGPR allocation granularity for given subtarget \p Features.
	unsigned getSGPRAllocGranule(const FeatureBitset &Features);

	/// \returns SGPR encoding granularity for given subtarget \p Features.
	unsigned getSGPREncodingGranule(const FeatureBitset &Features);

	/// \returns Total number of SGPRs for given subtarget \p Features.
	unsigned getTotalNumSGPRs(const FeatureBitset &Features);

	/// \returns Addressable number of SGPRs for given subtarget \p Features.
	unsigned getAddressableNumSGPRs(const FeatureBitset &Features);

	/// \returns Minimum number of SGPRs that meets the given number of waves per
	/// execution unit requirement for given subtarget \p Features.
	unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU);

	/// \returns Maximum number of SGPRs that meets the given number of waves per
	/// execution unit requirement for given subtarget \p Features.
	unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
	bool Addressable);

	/// \returns VGPR allocation granularity for given subtarget \p Features.
	unsigned getVGPRAllocGranule(const FeatureBitset &Features);

	/// \returns VGPR encoding granularity for given subtarget \p Features.
	unsigned getVGPREncodingGranule(const FeatureBitset &Features);

	/// \returns Total number of VGPRs for given subtarget \p Features.
	unsigned getTotalNumVGPRs(const FeatureBitset &Features);

	/// \returns Addressable number of VGPRs for given subtarget \p Features.
	unsigned getAddressableNumVGPRs(const FeatureBitset &Features);

	/// \returns Minimum number of VGPRs that meets given number of waves per
	/// execution unit requirement for given subtarget \p Features.
	unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);

	/// \returns Maximum number of VGPRs that meets given number of waves per
	/// execution unit requirement for given subtarget \p Features.
	unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);

	} // end namespace IsaInfo

	LLVM_READONLY
	int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);

	LLVM_READONLY
	int getMaskedMIMGOp(const MCInstrInfo &MII,
	unsigned Opc, unsigned NewChannels);
	LLVM_READONLY
	int getMCOpcode(uint16_t Opcode, unsigned Gen);

	void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
	const FeatureBitset &Features);

	bool isGroupSegment(const GlobalValue *GV);
	bool isGlobalSegment(const GlobalValue *GV);
	bool isReadOnlySegment(const GlobalValue *GV);

	/// \returns True if constants should be emitted to .text section for given
	/// target triple \p TT, false otherwise.
	bool shouldEmitConstantsToTextSection(const Triple &TT);

	/// \returns Integer value requested using \p F's \p Name attribute.
	///
	/// \returns \p Default if attribute is not present.
	///
	/// \returns \p Default and emits error if requested value cannot be converted
	/// to integer.
	int getIntegerAttribute(const Function &F, StringRef Name, int Default);

	/// \returns A pair of integer values requested using \p F's \p Name attribute
	/// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
	/// is false).
	///
	/// \returns \p Default if attribute is not present.
	///
	/// \returns \p Default and emits error if one of the requested values cannot be
	/// converted to integer, or \p OnlyFirstRequired is false and "second" value is
	/// not present.
	std::pair<int, int> getIntegerPairAttribute(const Function &F,
	StringRef Name,
	std::pair<int, int> Default,
	bool OnlyFirstRequired = false);

	/// \returns Vmcnt bit mask for given isa \p Version.
	unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version);

	/// \returns Expcnt bit mask for given isa \p Version.
	unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version);

	/// \returns Lgkmcnt bit mask for given isa \p Version.
	unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version);

	/// \returns Waitcnt bit mask for given isa \p Version.
	unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version);

	/// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version.
	unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);

	/// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version.
	unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);

	/// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
	unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);

	/// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
	/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
	/// \p Lgkmcnt respectively.
	///
	/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
	/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9 only)
	/// \p Vmcnt = \p Waitcnt[3:0] \| \p Waitcnt[15:14] (gfx9+ only)
	/// \p Expcnt = \p Waitcnt[6:4]
	/// \p Lgkmcnt = \p Waitcnt[11:8]
	void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
	unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);

	/// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version.
	unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
	unsigned Vmcnt);

	/// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version.
	unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
	unsigned Expcnt);

	/// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version.
	unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
	unsigned Lgkmcnt);

	/// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
	/// \p Version.
	///
	/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
	/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9 only)
	/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9+ only)
	/// Waitcnt[6:4] = \p Expcnt
	/// Waitcnt[11:8] = \p Lgkmcnt
	/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9+ only)
	///
	/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
	/// isa \p Version.
	unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
	unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);

	unsigned getInitialPSInputAddr(const Function &F);

	LLVM_READNONE
	bool isShader(CallingConv::ID CC);

	LLVM_READNONE
	bool isCompute(CallingConv::ID CC);

	LLVM_READNONE
	bool isEntryFunctionCC(CallingConv::ID CC);

	// FIXME: Remove this when calling conventions cleaned up
	/// \returns true if \p CC is AMDGPU_KERNEL or SPIR_KERNEL.
	LLVM_READNONE
	inline bool isKernel(CallingConv::ID CC) {
	  if (CC == CallingConv::AMDGPU_KERNEL)
	    return true;
	  return CC == CallingConv::SPIR_KERNEL;
	}

	bool isSI(const MCSubtargetInfo &STI);
	bool isCI(const MCSubtargetInfo &STI);
	bool isVI(const MCSubtargetInfo &STI);
	bool isGFX9(const MCSubtargetInfo &STI);

	/// \brief Is \p Reg a scalar register?
	bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);

	/// \brief Is there any intersection between registers
	bool isRegIntersect(unsigned Reg0, unsigned Reg1, const MCRegisterInfo* TRI);

	/// If \p Reg is a pseudo reg, return the correct hardware register given
	/// \p STI otherwise return \p Reg.
	unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);

	/// \brief Convert hardware register \p Reg to a pseudo register
	LLVM_READNONE
	unsigned mc2PseudoReg(unsigned Reg);

	/// \brief Can this operand also contain immediate values?
	bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);

	/// \brief Is this floating-point operand?
	bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo);

	/// \brief Does this operand support only inlinable literals?
	bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);

	/// \brief Get the size in bits of a register from the register class with
	/// ID \p RCID.
	unsigned getRegBitWidth(unsigned RCID);

	/// \brief Get the size in bits of a register from the register class \p RC.
	unsigned getRegBitWidth(const MCRegisterClass &RC);

	/// \brief Get the size in bytes of register operand \p OpNo of \p Desc.
	unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
	unsigned OpNo);

	/// \returns the size in bytes of an operand described by \p OpInfo: 2 for the
	/// 16-bit and packed 16-bit types, 4 for the 32-bit types and 8 for the
	/// 64-bit types. Any other operand type is treated as unreachable.
	LLVM_READNONE
	inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
	  switch (OpInfo.OperandType) {
	  // 16-bit and packed 16-bit operands.
	  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
	  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
	  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
	  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
	  case AMDGPU::OPERAND_REG_IMM_FP16:
	  case AMDGPU::OPERAND_REG_IMM_INT16:
	    return 2;

	  // 32-bit operands.
	  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
	  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
	  case AMDGPU::OPERAND_REG_IMM_FP32:
	  case AMDGPU::OPERAND_REG_IMM_INT32:
	    return 4;

	  // 64-bit operands.
	  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
	  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
	  case AMDGPU::OPERAND_REG_IMM_FP64:
	  case AMDGPU::OPERAND_REG_IMM_INT64:
	    return 8;

	  default:
	    llvm_unreachable("unhandled operand type");
	  }
	}

	/// Convenience overload: looks up operand \p OpNo of \p Desc and forwards its
	/// operand info to the overload above.
	LLVM_READNONE
	inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
	  const auto &Info = Desc.OpInfo[OpNo];
	  return getOperandSize(Info);
	}

	/// \brief Is this literal inlinable
	LLVM_READNONE
	bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);

	LLVM_READNONE
	bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);

	LLVM_READNONE
	bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);

	LLVM_READNONE
	bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);

	bool isArgPassedInSGPR(const Argument *Arg);
	-bool isUniformMMO(const MachineMemOperand *MMO);

	/// \returns The encoding that will be used for \p ByteOffset in the SMRD
	/// offset field.
	int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);

	/// \returns true if this offset is small enough to fit in the SMRD
	/// offset field. \p ByteOffset should be the offset in bytes and
	/// not the encoded offset.
	bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);

	} // end namespace AMDGPU
	} // end namespace llvm

	#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
	Index: head/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td
	===================================================================
	--- head/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td (revision 329409)
	+++ head/contrib/llvm/lib/Target/AMDGPU/VOP2Instructions.td (revision 329410)
	@@ -1,928 +1,928 @@
	//===-- VOP2Instructions.td - Vector Instruction Definitions --------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// VOP2 Classes
	//===----------------------------------------------------------------------===//

	// 32-bit VOP2 encoding. Layout: src0 in bits 8-0, src1 in bits 16-9, vdst in
	// bits 24-17, the opcode in bits 30-25 and bit 31 fixed to 0. A field is
	// encoded as 0 when the profile P lacks it (HasSrc0 / HasSrc1 / EmitDst).
	class VOP2e <bits<6> op, VOPProfile P> : Enc32 {
	bits<8> vdst;
	bits<9> src0;
	bits<8> src1;

	let Inst{8-0} = !if(P.HasSrc0, src0, 0);
	let Inst{16-9} = !if(P.HasSrc1, src1, 0);
	let Inst{24-17} = !if(P.EmitDst, vdst, 0);
	let Inst{30-25} = op;
	let Inst{31} = 0x0; //encoding
	}

	// 64-bit VOP2 encoding carrying a 32-bit immediate. The low 32 bits use the
	// same layout as VOP2e (src0 8-0, src1 16-9, vdst 24-17, opcode 30-25,
	// bit 31 = 0); the immediate occupies bits 63-32.
	class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 {
	bits<8> vdst;
	bits<9> src0;
	bits<8> src1;
	bits<32> imm;

	let Inst{8-0} = !if(P.HasSrc0, src0, 0);
	let Inst{16-9} = !if(P.HasSrc1, src1, 0);
	let Inst{24-17} = !if(P.EmitDst, vdst, 0);
	let Inst{30-25} = op;
	let Inst{31} = 0x0; // encoding
	let Inst{63-32} = imm;
	}

	// SDWA form of the VOP2 encoding. Bits 8-0 hold the fixed value 0xf9 instead
	// of src0; src1 (bits 16-9), vdst (bits 24-17) and the opcode (bits 30-25)
	// keep their VOP2 positions. The remaining fields come from VOP_SDWAe.
	class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> {
	bits<8> vdst;
	bits<8> src1;

	let Inst{8-0} = 0xf9; // sdwa
	let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
	let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
	let Inst{30-25} = op;
	let Inst{31} = 0x0; // encoding
	}

	// SDWA9 form of the VOP2 encoding. It matches VOP2_SDWAe except that src1 is
	// 9 bits wide: its low 8 bits go to bits 16-9 and its top bit goes to bit 63.
	// The remaining fields come from VOP_SDWA9Ae.
	class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P> {
	bits<8> vdst;
	bits<9> src1;

	let Inst{8-0} = 0xf9; // sdwa
	let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
	let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
	let Inst{30-25} = op;
	let Inst{31} = 0x0; // encoding
	let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr
	}

	class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suffix = "_e32"> :
	InstSI <P.Outs32, P.Ins32, "", pattern>,
	VOP <opName>,
	SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
	MnemonicAlias<opName#suffix, opName> {

	let isPseudo = 1;
	let isCodeGenOnly = 1;
	let UseNamedOperandTable = 1;

	string Mnemonic = opName;
	string AsmOperands = P.Asm32;

	let Size = 4;
	let mayLoad = 0;
	let mayStore = 0;
	let hasSideEffects = 0;
	let SubtargetPredicate = isGCN;

	let VOP2 = 1;
	let VALU = 1;
	let Uses = [EXEC];

	let AsmVariantName = AMDGPUAsmVariants.Default;

	VOPProfile Pfl = P;
	}

	// Real (non-pseudo) counterpart of the VOP2 pseudo instruction ps for one
	// EncodingFamily. Operand lists and the asm string are taken from ps, and the
	// flags listed below are copied over from it.
	class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
	  InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
	  SIMCInstr <ps.PseudoInstr, EncodingFamily> {

	  let isPseudo = 0;
	  let isCodeGenOnly = 0;

	  // copy relevant pseudo op flags
	  // (Constraints and DisableEncoding used to be assigned twice with the same
	  // value; each is now set once.)
	  let SubtargetPredicate = ps.SubtargetPredicate;
	  let AsmMatchConverter = ps.AsmMatchConverter;
	  let AsmVariantName = ps.AsmVariantName;
	  let Constraints = ps.Constraints;
	  let DisableEncoding = ps.DisableEncoding;
	  let TSFlags = ps.TSFlags;
	  let UseNamedOperandTable = ps.UseNamedOperandTable;
	  let Uses = ps.Uses;
	}

	class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
	VOP_SDWA_Pseudo <OpName, P, pattern> {
	let AsmMatchConverter = "cvtSdwaVOP2";
	}

	class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
	list<dag> ret = !if(P.HasModifiers,
	[(set P.DstVT:$vdst,
	(node (P.Src0VT
	!if(P.HasOMod,
	(VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod),
	(VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))),
	(P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))],
	[(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]);
	}

	// Defines the three pseudo variants of a VOP2 instruction: _e32, _e64 and
	// _sdwa. Only the _e64 variant receives a selection pattern, built from node
	// by getVOP2Pat64. revOp names the reversed-operand opcode used for
	// Commutable_REV and defaults to opName itself. GFX9Renamed is forwarded to
	// renamedInGFX9 for all three definitions.
	multiclass VOP2Inst <string opName,
	VOPProfile P,
	SDPatternOperator node = null_frag,
	string revOp = opName,
	bit GFX9Renamed = 0> {

	let renamedInGFX9 = GFX9Renamed in {

	def _e32 : VOP2_Pseudo <opName, P>,
	Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;

	def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
	Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;

	def _sdwa : VOP2_SDWA_Pseudo <opName, P>;

	}
	}

	// Variant of VOP2Inst for instructions whose _e32 and _sdwa forms define VCC.
	// Those two forms also use VCC when useSGPRInput is set, which by default is
	// the case when the profile has three source arguments. The _e64 form sits
	// outside the Uses/Defs block, so it only inherits SchedRW and renamedInGFX9.
	multiclass VOP2bInst <string opName,
	VOPProfile P,
	SDPatternOperator node = null_frag,
	string revOp = opName,
	bit GFX9Renamed = 0,
	bit useSGPRInput = !eq(P.NumSrcArgs, 3)> {
	let renamedInGFX9 = GFX9Renamed in {
	let SchedRW = [Write32Bit, WriteSALU] in {
	let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
	def _e32 : VOP2_Pseudo <opName, P>,
	Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;

	def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
	let AsmMatchConverter = "cvtSdwaVOP2b";
	}
	}

	def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
	Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
	}
	}
	}

	// Variant of VOP2Inst that defines only _e32 and _e64 (no _sdwa form). The
	// _e32 form uses VCC in addition to EXEC when useSGPRInput is set, which by
	// default is the case when the profile has three source arguments. The _e64
	// form sits outside the Uses block and only inherits SchedRW.
	multiclass VOP2eInst <string opName,
	VOPProfile P,
	SDPatternOperator node = null_frag,
	string revOp = opName,
	bit useSGPRInput = !eq(P.NumSrcArgs, 3)> {

	let SchedRW = [Write32Bit] in {
	let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]) in {
	def _e32 : VOP2_Pseudo <opName, P>,
	Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
	}

	def _e64 : VOP3_Pseudo <opName, P, getVOP2Pat64<node, P>.ret>,
	Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
	}
	}

	class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
	field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
	field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
	field bit HasExt = 0;

	// Hack to stop printing _e64
	let DstRC = RegisterOperand<VGPR_32>;
	field string Asm32 = " $vdst, $src0, $src1, $imm";
	}

	def VOP_MADAK_F16 : VOP_MADAK <f16>;
	def VOP_MADAK_F32 : VOP_MADAK <f32>;

	class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
	field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
	field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
	field bit HasExt = 0;

	// Hack to stop printing _e64
	let DstRC = RegisterOperand<VGPR_32>;
	field string Asm32 = " $vdst, $src0, $imm, $src1";
	}

	def VOP_MADMK_F16 : VOP_MADMK <f16>;
	def VOP_MADMK_F32 : VOP_MADMK <f32>;

	// FIXME: Remove src2_modifiers. It isn't used, so is wasting memory
	// and processing time but it makes it easier to convert to mad.
	class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
	let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
	let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
	0, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
	let InsDPP = (ins DstRCDPP:$old,
	Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
	Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
	dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
	bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);

	let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
	Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
	VGPR_32:$src2, // stub argument
	clampmod:$clamp, omod:$omod,
	dst_sel:$dst_sel, dst_unused:$dst_unused,
	src0_sel:$src0_sel, src1_sel:$src1_sel);
	let Asm32 = getAsm32<1, 2, vt>.ret;
	let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, vt>.ret;
	let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
	let AsmSDWA = getAsmSDWA<1, 2, vt>.ret;
	let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret;
	let HasSrc2 = 0;
	let HasSrc2Mods = 0;
	let HasExt = 1;
	let HasSDWA9 = 0;
	}

	def VOP_MAC_F16 : VOP_MAC <f16> {
	// FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
	// 'not a string initializer' error.
	let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f16>.ret;
	}

	def VOP_MAC_F32 : VOP_MAC <f32> {
	// FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
	// 'not a string initializer' error.
	let Asm64 = getAsm64<1, 2, 0, HasModifiers, HasOMod, f32>.ret;
	}

	// Write out to vcc or arbitrary SGPR.
	def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> {
	let Asm32 = "$vdst, vcc, $src0, $src1";
	let Asm64 = "$vdst, $sdst, $src0, $src1";
	let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
	let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel";
	let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
	let Outs32 = (outs DstRC:$vdst);
	let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
	}

	// Write out to vcc or arbitrary SGPR and read in from vcc or
	// arbitrary SGPR.
	//
	// The profile type list carries an i1 third source. In the 32-bit, SDWA,
	// SDWA9 and DPP assembly strings both the extra output and the extra input
	// are printed as a literal "vcc"; the 64-bit form names them $sdst and $src2.
	def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
	// We use VCSrc_b32 to exclude literal constants, even though the
	// encoding normally allows them since the implicit VCC use means
	// using one would always violate the constant bus
	// restriction. SGPRs are still allowed because it should
	// technically be possible to use VCC again as src0.
	let Src0RC32 = VCSrc_b32;
	let Asm32 = "$vdst, vcc, $src0, $src1, vcc";
	let Asm64 = "$vdst, $sdst, $src0, $src1, $src2";
	let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
	let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel";
	let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
	// Only Outs64 lists $sdst (an SReg_64) as an explicit output operand.
	let Outs32 = (outs DstRC:$vdst);
	let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);

	// Suppress src2 implied by type since the 32-bit encoding uses an
	// implicit VCC use.
	let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);

	// The SDWA and DPP operand lists below likewise contain no $src2 operand.
	let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
	Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
	clampmod:$clamp,
	dst_sel:$dst_sel, dst_unused:$dst_unused,
	src0_sel:$src0_sel, src1_sel:$src1_sel);

	let InsDPP = (ins DstRCDPP:$old,
	Src0DPP:$src0,
	Src1DPP:$src1,
	dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
	bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
	// NOTE(review): presumably these re-enable the SDWA/DPP ("ext") variants
	// for this profile -- confirm against the VOPProfile defaults.
	let HasExt = 1;
	let HasSDWA9 = 1;
	}

	// Read in from vcc or arbitrary SGPR
	//
	// Single output ($vdst) in both encodings. The i1 third source is printed as
	// a literal "vcc" in the 32-bit assembly string and as $src2 in the 64-bit
	// one.
	def VOP2e_I32_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> {
	let Src0RC32 = VCSrc_b32; // See comment in def VOP2b_I32_I1_I32_I32_I1 above.
	let Asm32 = "$vdst, $src0, $src1, vcc";
	let Asm64 = "$vdst, $src0, $src1, $src2";
	let Outs32 = (outs DstRC:$vdst);
	let Outs64 = (outs DstRC:$vdst);

	// Suppress src2 implied by type since the 32-bit encoding uses an
	// implicit VCC use.
	let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1);
	}

	// Profile used by V_READLANE_B32: the result goes to an SReg_32, $src0 is a
	// VGPR_32 and $src1 an SCSrc_b32. The 64-bit operand lists and assembly
	// string are simply aliases of the 32-bit ones.
	def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
	let Outs32 = (outs SReg_32:$vdst);
	let Outs64 = Outs32;
	let Ins32 = (ins VGPR_32:$src0, SCSrc_b32:$src1);
	let Ins64 = Ins32;
	let Asm32 = " $vdst, $src0, $src1";
	let Asm64 = Asm32;
	// NOTE(review): presumably disables the SDWA/DPP ("ext") variants for this
	// profile -- confirm against VOPProfile.
	let HasExt = 0;
	let HasSDWA9 = 0;
	}

	// Profile used by V_WRITELANE_B32: the result goes to a VGPR_32 and both
	// sources are SCSrc_b32. The 64-bit operand lists and assembly string are
	// simply aliases of the 32-bit ones.
	def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
	let Outs32 = (outs VGPR_32:$vdst);
	let Outs64 = Outs32;
	let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1);
	let Ins64 = Ins32;
	let Asm32 = " $vdst, $src0, $src1";
	let Asm64 = Asm32;
	// NOTE(review): presumably disables the SDWA/DPP ("ext") variants for this
	// profile -- confirm against VOPProfile.
	let HasExt = 0;
	let HasSDWA9 = 0;
	}

	//===----------------------------------------------------------------------===//
	// VOP2 Instructions
	//===----------------------------------------------------------------------===//

	// Pseudo-instruction definitions for VOP2 operations guarded by the isGCN
	// subtarget predicate. The third argument of VOP2Inst/VOP2bInst/VOP2eInst,
	// where present, is the selection pattern operator (null_frag = none); the
	// fourth, where present, names the non-reversed counterpart of a *rev*
	// instruction.
	let SubtargetPredicate = isGCN in {

	defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
	def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">;

	let isCommutable = 1 in {
	defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
	defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>;
	defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
	defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
	defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>;
	defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>;
	defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_I32_I32_I32, AMDGPUmulhi_i24>;
	defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>;
	defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_I32_I32_I32, AMDGPUmulhi_u24>;
	defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum>;
	defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum>;
	defm V_MIN_I32 : VOP2Inst <"v_min_i32", VOP_I32_I32_I32>;
	defm V_MAX_I32 : VOP2Inst <"v_max_i32", VOP_I32_I32_I32>;
	defm V_MIN_U32 : VOP2Inst <"v_min_u32", VOP_I32_I32_I32>;
	defm V_MAX_U32 : VOP2Inst <"v_max_u32", VOP_I32_I32_I32>;
	defm V_LSHRREV_B32 : VOP2Inst <"v_lshrrev_b32", VOP_I32_I32_I32, null_frag, "v_lshr_b32">;
	defm V_ASHRREV_I32 : VOP2Inst <"v_ashrrev_i32", VOP_I32_I32_I32, null_frag, "v_ashr_i32">;
	defm V_LSHLREV_B32 : VOP2Inst <"v_lshlrev_b32", VOP_I32_I32_I32, null_frag, "v_lshl_b32">;
	defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_I32_I32_I32>;
	defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_I32_I32_I32>;
	defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_I32_I32_I32>;

	// $src2 is tied to $vdst and therefore left out of the encoding.
	let Constraints = "$vdst = $src2", DisableEncoding="$src2",
	isConvertibleToThreeAddress = 1 in {
	defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
	}

	def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">;

	// No patterns so that the scalar instructions are always selected.
	// The scalar versions will be replaced with vector when needed later.

	// V_ADD_I32, V_SUB_I32, and V_SUBREV_I32 were renamed to *_U32 in VI,
	// but the VI instructions behave the same as the SI versions.
	defm V_ADD_I32 : VOP2bInst <"v_add_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_i32", 1>;
	defm V_SUB_I32 : VOP2bInst <"v_sub_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32", 1>;
	defm V_SUBREV_I32 : VOP2bInst <"v_subrev_i32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_i32", 1>;
	defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32", 1>;
	defm V_SUBB_U32 : VOP2bInst <"v_subb_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32", 1>;
	defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_subb_u32", 1>;


	// These use the plain VOP_I32_I32_I32 profile (single output) and are
	// additionally guarded by HasAddNoCarryInsts.
	let SubtargetPredicate = HasAddNoCarryInsts in {
	defm V_ADD_U32 : VOP2Inst <"v_add_u32", VOP_I32_I32_I32, null_frag, "v_add_u32", 1>;
	defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
	defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32, null_frag, "v_sub_u32", 1>;
	}

	} // End isCommutable = 1

	// These are special and do not read the exec mask.
	let isConvergent = 1, Uses = []<Register> in {
	def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
	[(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;

	def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
	} // End isConvergent = 1

	// The remaining definitions all wrap their profile in VOP_NO_EXT.
	defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
	defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
	defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>;
	defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
	defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
	defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
	-defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
	-defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
	+defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>;
	+defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>;
	defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
	-defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>;
	-defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>;
	+defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>;
	+defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>;

	} // End SubtargetPredicate = isGCN

	// Select AMDGPUadde (i32, i32, i1 operands) to the 64-bit encoding of
	// V_ADDC_U32, forwarding the i1 operand as $src2.
	def : GCNPat<
	(AMDGPUadde i32:$src0, i32:$src1, i1:$src2),
	(V_ADDC_U32_e64 $src0, $src1, $src2)
	>;

	// Likewise select AMDGPUsube to the 64-bit encoding of V_SUBB_U32.
	def : GCNPat<
	(AMDGPUsube i32:$src0, i32:$src1, i1:$src2),
	(V_SUBB_U32_e64 $src0, $src1, $src2)
	>;

	// These instructions only exist on SI and CI
	let SubtargetPredicate = isSICI in {

	defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmin_legacy>;
	defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;

	// None of these has a selection pattern operator.
	let isCommutable = 1 in {
	defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
	defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
	defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
	defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
	} // End isCommutable = 1

	} // End SubtargetPredicate = isSICI

	// 16-bit VOP2 pseudo-instructions, guarded by Has16BitInsts. Only the
	// floating-point add/sub/mul/min/max and ldexp definitions carry a selection
	// pattern operator here; the integer definitions in this group are given
	// none.
	let SubtargetPredicate = Has16BitInsts in {

	def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
	defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
	defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
	defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
	defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;

	let isCommutable = 1 in {
	defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
	defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
	defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
	defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
	def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
	defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
	defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
	defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
	defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
	defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum>;
	defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum>;
	defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
	defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
	defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
	defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>;

	// $src2 is tied to $vdst and therefore left out of the encoding.
	let Constraints = "$vdst = $src2", DisableEncoding="$src2",
	isConvertibleToThreeAddress = 1 in {
	defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>;
	}
	} // End isCommutable = 1

	} // End SubtargetPredicate = Has16BitInsts

	// Note: 16-bit instructions produce a 0 result in the high 16-bits.
	//
	// Emits three selection patterns mapping `op` on i16 operands to `inst`:
	// the plain i16 result, the result zero-extended to i32, and the result
	// zero-extended to i64.
	multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {

	def : GCNPat<
	(op i16:$src0, i16:$src1),
	(inst $src0, $src1)
	>;

	// The zext to i32 is folded away: the instruction is emitted on its own.
	def : GCNPat<
	(i32 (zext (op i16:$src0, i16:$src1))),
	(inst $src0, $src1)
	>;

	// For i64 the instruction result becomes sub0 and a moved constant 0
	// becomes sub1 of a VReg_64.
	def : GCNPat<
	(i64 (zext (op i16:$src0, i16:$src1))),
	(REG_SEQUENCE VReg_64,
	(inst $src0, $src1), sub0,
	(V_MOV_B32_e32 (i32 0)), sub1)
	>;

	}

	// Same three patterns as Arithmetic_i16_Pats (i16 result, zext to i32,
	// zext to i64), except that the operands are passed to `inst` in swapped
	// order ($src1, $src0).
	multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> {

	def : GCNPat<
	(op i16:$src0, i16:$src1),
	(inst $src1, $src0)
	>;

	def : GCNPat<
	(i32 (zext (op i16:$src0, i16:$src1))),
	(inst $src1, $src0)
	>;


	// For i64 the instruction result becomes sub0 and a moved constant 0
	// becomes sub1 of a VReg_64.
	def : GCNPat<
	(i64 (zext (op i16:$src0, i16:$src1))),
	(REG_SEQUENCE VReg_64,
	(inst $src1, $src0), sub0,
	(V_MOV_B32_e32 (i32 0)), sub1)
	>;
	}

	// Selects an i1 -> i16 extension node `ext` to V_CNDMASK_B32_e64 with the
	// constants 0 and 1 as value operands and the i1 as the condition.
	class ZExt_i16_i1_Pat <SDNode ext> : GCNPat <
	(i16 (ext i1:$src)),
	(V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
	>;

	// Selection patterns for i16 operations, active under Has16BitInsts.
	let Predicates = [Has16BitInsts] in {

	defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
	defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
	defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>;
	defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>;
	defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
	defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
	defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;

	// i16 and/or/xor are selected to the 32-bit logical instructions.
	def : GCNPat <
	(and i16:$src0, i16:$src1),
	(V_AND_B32_e64 $src0, $src1)
	>;

	def : GCNPat <
	(or i16:$src0, i16:$src1),
	(V_OR_B32_e64 $src0, $src1)
	>;

	def : GCNPat <
	(xor i16:$src0, i16:$src1),
	(V_XOR_B32_e64 $src0, $src1)
	>;

	// Shifts map to the *REV* instructions, so the operands are swapped.
	defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>;
	defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>;
	defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>;

	def : ZExt_i16_i1_Pat<zext>;
	def : ZExt_i16_i1_Pat<anyext>;

	// Sign extension of an i1 selects between 0 and -1.
	def : GCNPat <
	(i16 (sext i1:$src)),
	(V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src)
	>;

	// Undo sub x, c -> add x, -c canonicalization since c is more likely
	// an inline immediate than -c.
	// TODO: Also do for 64-bit.
	def : GCNPat<
	(add i16:$src0, (i16 NegSubInlineConst16:$src1)),
	(V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1)
	>;

	} // End Predicates = [Has16BitInsts]

	//===----------------------------------------------------------------------===//
	// SI
	//===----------------------------------------------------------------------===//

	// Multiclasses producing the real (encoded) SI-family definitions from the
	// pseudo named NAME. All take the 6-bit VOP2 opcode as `op`.
	let AssemblerPredicates = [isSICI], DecoderNamespace = "SICI" in {

	// One `_si` real for a pseudo named exactly NAME (no _e32/_e64 suffix).
	multiclass VOP2_Real_si <bits<6> op> {
	def _si :
	VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
	VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
	}

	// As VOP2_Real_si, but with the VOP2_MADKe encoding class.
	multiclass VOP2_Real_MADK_si <bits<6> op> {
	def _si : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.SI>,
	VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
	}

	// `_e32_si` real for the NAME_e32 pseudo.
	multiclass VOP2_Real_e32_si <bits<6> op> {
	def _e32_si :
	VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.SI>,
	VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
	}

	// Adds the `_e64_si` real for the NAME_e64 pseudo; its VOP3 opcode is the
	// bit concatenation {1, 0, 0, op{5-0}}.
	multiclass VOP2_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> {
	def _e64_si :
	VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
	VOP3e_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
	}

	// Same as VOP2_Real_e32e64_si, but the 64-bit form uses VOP3be_si.
	multiclass VOP2be_Real_e32e64_si <bits<6> op> : VOP2_Real_e32_si<op> {
	def _e64_si :
	VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.SI>,
	VOP3be_si <{1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
	}

	} // End AssemblerPredicates = [isSICI], DecoderNamespace = "SICI"

	// Real SI-family definitions. The template argument is the 6-bit VOP2
	// opcode handed to the multiclass as `op`.
	defm V_CNDMASK_B32 : VOP2_Real_e32e64_si <0x0>;
	defm V_ADD_F32 : VOP2_Real_e32e64_si <0x3>;
	defm V_SUB_F32 : VOP2_Real_e32e64_si <0x4>;
	defm V_SUBREV_F32 : VOP2_Real_e32e64_si <0x5>;
	defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_si <0x7>;
	defm V_MUL_F32 : VOP2_Real_e32e64_si <0x8>;
	defm V_MUL_I32_I24 : VOP2_Real_e32e64_si <0x9>;
	defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_si <0xa>;
	defm V_MUL_U32_U24 : VOP2_Real_e32e64_si <0xb>;
	defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_si <0xc>;
	defm V_MIN_F32 : VOP2_Real_e32e64_si <0xf>;
	defm V_MAX_F32 : VOP2_Real_e32e64_si <0x10>;
	defm V_MIN_I32 : VOP2_Real_e32e64_si <0x11>;
	defm V_MAX_I32 : VOP2_Real_e32e64_si <0x12>;
	defm V_MIN_U32 : VOP2_Real_e32e64_si <0x13>;
	defm V_MAX_U32 : VOP2_Real_e32e64_si <0x14>;
	defm V_LSHRREV_B32 : VOP2_Real_e32e64_si <0x16>;
	defm V_ASHRREV_I32 : VOP2_Real_e32e64_si <0x18>;
	defm V_LSHLREV_B32 : VOP2_Real_e32e64_si <0x1a>;
	defm V_AND_B32 : VOP2_Real_e32e64_si <0x1b>;
	defm V_OR_B32 : VOP2_Real_e32e64_si <0x1c>;
	defm V_XOR_B32 : VOP2_Real_e32e64_si <0x1d>;
	defm V_MAC_F32 : VOP2_Real_e32e64_si <0x1f>;
	defm V_MADMK_F32 : VOP2_Real_MADK_si <0x20>;
	defm V_MADAK_F32 : VOP2_Real_MADK_si <0x21>;
	// These use the VOP3be_si encoding for their 64-bit form.
	defm V_ADD_I32 : VOP2be_Real_e32e64_si <0x25>;
	defm V_SUB_I32 : VOP2be_Real_e32e64_si <0x26>;
	defm V_SUBREV_I32 : VOP2be_Real_e32e64_si <0x27>;
	defm V_ADDC_U32 : VOP2be_Real_e32e64_si <0x28>;
	defm V_SUBB_U32 : VOP2be_Real_e32e64_si <0x29>;
	defm V_SUBBREV_U32 : VOP2be_Real_e32e64_si <0x2a>;

	defm V_READLANE_B32 : VOP2_Real_si <0x01>;

	// The SI real overrides the operand list so that $src0 is an SSrc_b32
	// rather than the SCSrc_b32 given by the VOP_WRITELANE profile.
	let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in {
	defm V_WRITELANE_B32 : VOP2_Real_si <0x02>;
	}

	defm V_MAC_LEGACY_F32 : VOP2_Real_e32e64_si <0x6>;
	defm V_MIN_LEGACY_F32 : VOP2_Real_e32e64_si <0xd>;
	defm V_MAX_LEGACY_F32 : VOP2_Real_e32e64_si <0xe>;
	defm V_LSHR_B32 : VOP2_Real_e32e64_si <0x15>;
	defm V_ASHR_I32 : VOP2_Real_e32e64_si <0x17>;
	defm V_LSHL_B32 : VOP2_Real_e32e64_si <0x19>;

	defm V_BFM_B32 : VOP2_Real_e32e64_si <0x1e>;
	defm V_BCNT_U32_B32 : VOP2_Real_e32e64_si <0x22>;
	defm V_MBCNT_LO_U32_B32 : VOP2_Real_e32e64_si <0x23>;
	defm V_MBCNT_HI_U32_B32 : VOP2_Real_e32e64_si <0x24>;
	defm V_LDEXP_F32 : VOP2_Real_e32e64_si <0x2b>;
	defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e32e64_si <0x2c>;
	defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e32e64_si <0x2d>;
	defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e32e64_si <0x2e>;
	defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e32e64_si <0x2f>;
	defm V_CVT_PK_U16_U32 : VOP2_Real_e32e64_si <0x30>;
	defm V_CVT_PK_I16_I32 : VOP2_Real_e32e64_si <0x31>;


	//===----------------------------------------------------------------------===//
	// VI
	//===----------------------------------------------------------------------===//

	// DPP form of a VOP2 instruction, built from the pseudo `ps`. OpName and
	// the profile P default to those of the pseudo; Defs, Uses, SchedRW and
	// hasSideEffects are copied from it.
	//
	// Encoding fields set here:
	//   Inst{8-0}   = 0xfa (marks the DPP form)
	//   Inst{16-9}  = src1, or 0 when the profile has no src1
	//   Inst{24-17} = vdst, or 0 when the profile does not emit a destination
	//   Inst{30-25} = op
	//   Inst{31}    = 0
	class VOP2_DPP <bits<6> op, VOP2_Pseudo ps, string OpName = ps.OpName, VOPProfile P = ps.Pfl> :
	VOP_DPP <OpName, P> {
	let Defs = ps.Defs;
	let Uses = ps.Uses;
	let SchedRW = ps.SchedRW;
	let hasSideEffects = ps.hasSideEffects;

	bits<8> vdst;
	bits<8> src1;
	let Inst{8-0} = 0xfa; //dpp
	let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0);
	let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
	let Inst{30-25} = op;
	let Inst{31} = 0x0; //encoding
	}

	// Multiclasses producing the real (encoded) VI-family definitions from the
	// pseudo named NAME.
	let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {

	// `_vi` real for a VOP2 pseudo named exactly NAME, encoded with VOP3e_vi
	// and therefore taking a 10-bit opcode.
	multiclass VOP32_Real_vi <bits<10> op> {
	def _vi :
	VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
	VOP3e_vi<op, !cast<VOP2_Pseudo>(NAME).Pfl>;
	}

	// `_vi` real using the VOP2_MADKe encoding class.
	multiclass VOP2_Real_MADK_vi <bits<6> op> {
	def _vi : VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.VI>,
	VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
	}

	// `_e32_vi` real for the NAME_e32 pseudo.
	multiclass VOP2_Real_e32_vi <bits<6> op> {
	def _e32_vi :
	VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.VI>,
	VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
	}

	// `_e64_vi` real for the NAME_e64 pseudo, with a 10-bit VOP3 opcode.
	multiclass VOP2_Real_e64_vi <bits<10> op> {
	def _e64_vi :
	VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
	VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
	}

	// Like VOP2_Real_e64_vi, but overrides the output operand list and builds
	// the assembly string directly from the pseudo's mnemonic and operands.
	multiclass VOP2_Real_e64only_vi <bits<10> op> {
	def _e64_vi :
	VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
	VOP3e_vi <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
	// Hack to stop printing _e64
	VOP3_Pseudo ps = !cast<VOP3_Pseudo>(NAME#"_e64");
	let OutOperandList = (outs VGPR_32:$vdst);
	let AsmString = ps.Mnemonic # " " # ps.AsmOperands;
	}
	}

	// Both the e32 and the e64 real; the 10-bit e64 opcode is the bit
	// concatenation {0, 1, 0, 0, op{5-0}}.
	multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
	VOP2_Real_e32_vi<op>,
	VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>;

	} // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"

	// `_sdwa_vi` real for the NAME_sdwa pseudo, encoded with VOP2_SDWAe.
	multiclass VOP2_SDWA_Real <bits<6> op> {
	def _sdwa_vi :
	VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
	VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
	}

	// `_sdwa_gfx9` real for the same NAME_sdwa pseudo, encoded with
	// VOP2_SDWA9Ae.
	multiclass VOP2_SDWA9_Real <bits<6> op> {
	def _sdwa_gfx9 :
	VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
	VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
	}

	let AssemblerPredicates = [isVIOnly] in {

	// Real e32, e64, SDWA and DPP definitions built from the pseudos named by
	// OpName (not by the defm NAME), while printing the mnemonic AsmName. This
	// lets one pseudo be emitted under a different assembly name.
	multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName> {
	def _e32_vi :
	VOP2_Real<!cast<VOP2_Pseudo>(OpName#"_e32"), SIEncodingFamily.VI>,
	VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
	VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
	let AsmString = AsmName # ps.AsmOperands;
	let DecoderNamespace = "VI";
	}
	// The 64-bit form uses VOP3be_vi with opcode {0, 1, 0, 0, op{5-0}}.
	def _e64_vi :
	VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.VI>,
	VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
	VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
	let AsmString = AsmName # ps.AsmOperands;
	let DecoderNamespace = "VI";
	}
	def _sdwa_vi :
	VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
	VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
	VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
	let AsmString = AsmName # ps.AsmOperands;
	}
	// The DPP form is derived from the e32 pseudo, with AsmName as its OpName.
	def _dpp :
	VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName>;
	}
	} // End AssemblerPredicates = [isVIOnly]

	let AssemblerPredicates = [isGFX9] in {

	// GFX9 counterpart of VOP2be_Real_e32e64_vi_only: real e32, e64, SDWA9 and
	// DPP definitions built from the pseudos named by OpName and printed with
	// the mnemonic AsmName.
	multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
	def _e32_gfx9 :
	VOP2_Real<!cast<VOP2_Pseudo>(OpName#"_e32"), SIEncodingFamily.GFX9>,
	VOP2e<op{5-0}, !cast<VOP2_Pseudo>(OpName#"_e32").Pfl> {
	VOP2_Pseudo ps = !cast<VOP2_Pseudo>(OpName#"_e32");
	let AsmString = AsmName # ps.AsmOperands;
	let DecoderNamespace = "GFX9";
	}
	def _e64_gfx9 :
	VOP3_Real<!cast<VOP3_Pseudo>(OpName#"_e64"), SIEncodingFamily.GFX9>,
	VOP3be_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(OpName#"_e64").Pfl> {
	VOP3_Pseudo ps = !cast<VOP3_Pseudo>(OpName#"_e64");
	let AsmString = AsmName # ps.AsmOperands;
	let DecoderNamespace = "GFX9";
	}
	def _sdwa_gfx9 :
	VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
	VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
	VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
	let AsmString = AsmName # ps.AsmOperands;
	}
	// NOTE(review): the DPP real is placed in the "SDWA9" decoder namespace,
	// not "GFX9" -- confirm this is intentional.
	def _dpp_gfx9 :
	VOP2_DPP<op, !cast<VOP2_Pseudo>(OpName#"_e32"), AsmName> {
	let DecoderNamespace = "SDWA9";
	}
	}

	// Real e32, e64, SDWA9 and DPP definitions for pseudos named after the
	// defm NAME itself; no assembly-name override.
	multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
	def _e32_gfx9 :
	VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX9>,
	VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>{
	let DecoderNamespace = "GFX9";
	}
	def _e64_gfx9 :
	VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX9>,
	VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
	let DecoderNamespace = "GFX9";
	}
	def _sdwa_gfx9 :
	VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
	VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
	}
	// NOTE(review): as above, the DPP real uses the "SDWA9" decoder namespace.
	def _dpp_gfx9 :
	VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
	let DecoderNamespace = "SDWA9";
	}
	}

	} // End AssemblerPredicates = [isGFX9]

	// Full set of VI-family reals for one opcode: the e32 and e64 forms, the
	// VI and GFX9 SDWA forms, and a DPP form derived from the e32 pseudo.
	multiclass VOP2_Real_e32e64_vi <bits<6> op> :
	Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
	// For now left dpp only for asm/dasm
	// TODO: add corresponding pseudo
	def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
	}

	// Real VI-family definitions. The template argument is the opcode handed
	// to the multiclass as `op`.
	//
	// V_CNDMASK_B32 gets only the e32/e64 reals (no SDWA or DPP forms).
	defm V_CNDMASK_B32 : Base_VOP2_Real_e32e64_vi <0x0>;
	defm V_ADD_F32 : VOP2_Real_e32e64_vi <0x1>;
	defm V_SUB_F32 : VOP2_Real_e32e64_vi <0x2>;
	defm V_SUBREV_F32 : VOP2_Real_e32e64_vi <0x3>;
	defm V_MUL_LEGACY_F32 : VOP2_Real_e32e64_vi <0x4>;
	defm V_MUL_F32 : VOP2_Real_e32e64_vi <0x5>;
	defm V_MUL_I32_I24 : VOP2_Real_e32e64_vi <0x6>;
	defm V_MUL_HI_I32_I24 : VOP2_Real_e32e64_vi <0x7>;
	defm V_MUL_U32_U24 : VOP2_Real_e32e64_vi <0x8>;
	defm V_MUL_HI_U32_U24 : VOP2_Real_e32e64_vi <0x9>;
	defm V_MIN_F32 : VOP2_Real_e32e64_vi <0xa>;
	defm V_MAX_F32 : VOP2_Real_e32e64_vi <0xb>;
	defm V_MIN_I32 : VOP2_Real_e32e64_vi <0xc>;
	defm V_MAX_I32 : VOP2_Real_e32e64_vi <0xd>;
	defm V_MIN_U32 : VOP2_Real_e32e64_vi <0xe>;
	defm V_MAX_U32 : VOP2_Real_e32e64_vi <0xf>;
	defm V_LSHRREV_B32 : VOP2_Real_e32e64_vi <0x10>;
	defm V_ASHRREV_I32 : VOP2_Real_e32e64_vi <0x11>;
	defm V_LSHLREV_B32 : VOP2_Real_e32e64_vi <0x12>;
	defm V_AND_B32 : VOP2_Real_e32e64_vi <0x13>;
	defm V_OR_B32 : VOP2_Real_e32e64_vi <0x14>;
	defm V_XOR_B32 : VOP2_Real_e32e64_vi <0x15>;
	defm V_MAC_F32 : VOP2_Real_e32e64_vi <0x16>;
	defm V_MADMK_F32 : VOP2_Real_MADK_vi <0x17>;
	defm V_MADAK_F32 : VOP2_Real_MADK_vi <0x18>;

	// VI only: opcodes 0x19-0x1e are built from the pseudos named in the second
	// argument and printed with the mnemonic in the third, so the V_*_I32
	// pseudos are emitted here as v_*_u32.
	defm V_ADD_U32 : VOP2be_Real_e32e64_vi_only <0x19, "V_ADD_I32", "v_add_u32">;
	defm V_SUB_U32 : VOP2be_Real_e32e64_vi_only <0x1a, "V_SUB_I32", "v_sub_u32">;
	defm V_SUBREV_U32 : VOP2be_Real_e32e64_vi_only <0x1b, "V_SUBREV_I32", "v_subrev_u32">;
	defm V_ADDC_U32 : VOP2be_Real_e32e64_vi_only <0x1c, "V_ADDC_U32", "v_addc_u32">;
	defm V_SUBB_U32 : VOP2be_Real_e32e64_vi_only <0x1d, "V_SUBB_U32", "v_subb_u32">;
	defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi_only <0x1e, "V_SUBBREV_U32", "v_subbrev_u32">;

	// GFX9: the same opcodes and pseudos, printed with *_co_u32 mnemonics.
	defm V_ADD_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x19, "V_ADD_I32", "v_add_co_u32">;
	defm V_SUB_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1a, "V_SUB_I32", "v_sub_co_u32">;
	defm V_SUBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1b, "V_SUBREV_I32", "v_subrev_co_u32">;
	defm V_ADDC_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1c, "V_ADDC_U32", "v_addc_co_u32">;
	defm V_SUBB_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1d, "V_SUBB_U32", "v_subb_co_u32">;
	defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_subbrev_co_u32">;

	// GFX9: reals for the V_ADD_U32/V_SUB_U32/V_SUBREV_U32 pseudos themselves,
	// at opcodes 0x34-0x36.
	defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>;
	defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>;
	defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>;

	// Encoded with VOP3e_vi, hence the 10-bit opcodes.
	defm V_READLANE_B32 : VOP32_Real_vi <0x289>;
	defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>;

	// These get only an e64 real on VI, with 10-bit opcodes.
	defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>;
	defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>;
	defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64only_vi <0x28c>;
	defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64only_vi <0x28d>;
	defm V_LDEXP_F32 : VOP2_Real_e64only_vi <0x288>;
	defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64only_vi <0x1f0>;
	defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64only_vi <0x294>;
	defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64only_vi <0x295>;
	defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64only_vi <0x296>;
	defm V_CVT_PK_U16_U32 : VOP2_Real_e64only_vi <0x297>;
	defm V_CVT_PK_I16_I32 : VOP2_Real_e64only_vi <0x298>;

	// 16-bit instructions.
	defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>;
	defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>;
	defm V_SUBREV_F16 : VOP2_Real_e32e64_vi <0x21>;
	defm V_MUL_F16 : VOP2_Real_e32e64_vi <0x22>;
	defm V_MAC_F16 : VOP2_Real_e32e64_vi <0x23>;
	defm V_MADMK_F16 : VOP2_Real_MADK_vi <0x24>;
	defm V_MADAK_F16 : VOP2_Real_MADK_vi <0x25>;
	defm V_ADD_U16 : VOP2_Real_e32e64_vi <0x26>;
	defm V_SUB_U16 : VOP2_Real_e32e64_vi <0x27>;
	defm V_SUBREV_U16 : VOP2_Real_e32e64_vi <0x28>;
	defm V_MUL_LO_U16 : VOP2_Real_e32e64_vi <0x29>;
	defm V_LSHLREV_B16 : VOP2_Real_e32e64_vi <0x2a>;
	defm V_LSHRREV_B16 : VOP2_Real_e32e64_vi <0x2b>;
	defm V_ASHRREV_I16 : VOP2_Real_e32e64_vi <0x2c>;
	defm V_MAX_F16 : VOP2_Real_e32e64_vi <0x2d>;
	defm V_MIN_F16 : VOP2_Real_e32e64_vi <0x2e>;
	defm V_MAX_U16 : VOP2_Real_e32e64_vi <0x2f>;
	defm V_MAX_I16 : VOP2_Real_e32e64_vi <0x30>;
	defm V_MIN_U16 : VOP2_Real_e32e64_vi <0x31>;
	defm V_MIN_I16 : VOP2_Real_e32e64_vi <0x32>;
	defm V_LDEXP_F16 : VOP2_Real_e32e64_vi <0x33>;

	let SubtargetPredicate = isVI in {

	// Aliases to simplify matching of floating-point instructions that
	// are VOP2 on SI and VOP3 on VI.
	//
	// The alias accepts the three-operand spelling "name $dst, $src0, $src1"
	// and expands to `inst` with constant 0 for every other operand. When the
	// instruction's profile has HasOMod set, one additional trailing 0 operand
	// is supplied.
	class SI2_VI3Alias <string name, VOP3_Real inst> : InstAlias <
	name#" $dst, $src0, $src1",
	!if(inst.Pfl.HasOMod,
	(inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0),
	(inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0))
	>, PredicateControl {
	let UseInstAsmMatchConverter = 0;
	let AsmVariantName = AMDGPUAsmVariants.VOP3;
	}

	def : SI2_VI3Alias <"v_ldexp_f32", V_LDEXP_F32_e64_vi>;
	def : SI2_VI3Alias <"v_cvt_pkaccum_u8_f32", V_CVT_PKACCUM_U8_F32_e64_vi>;
	def : SI2_VI3Alias <"v_cvt_pknorm_i16_f32", V_CVT_PKNORM_I16_F32_e64_vi>;
	def : SI2_VI3Alias <"v_cvt_pknorm_u16_f32", V_CVT_PKNORM_U16_F32_e64_vi>;
	def : SI2_VI3Alias <"v_cvt_pkrtz_f16_f32", V_CVT_PKRTZ_F16_F32_e64_vi>;

	} // End SubtargetPredicate = isVI
	Index: head/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp (revision 329409)
	+++ head/contrib/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp (revision 329410)
	@@ -1,1540 +1,1544 @@
	//===-- X86MCCodeEmitter.cpp - Convert X86 code to machine code -----------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the X86MCCodeEmitter class.
	//
	//===----------------------------------------------------------------------===//

	#include "MCTargetDesc/X86BaseInfo.h"
	#include "MCTargetDesc/X86FixupKinds.h"
	#include "MCTargetDesc/X86MCTargetDesc.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/MC/MCCodeEmitter.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCFixup.h"
	#include "llvm/MC/MCInst.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/MC/MCInstrInfo.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/MC/MCSubtargetInfo.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include <cassert>
	#include <cstdint>
	#include <cstdlib>

	using namespace llvm;

	#define DEBUG_TYPE "mccodeemitter"

	namespace {

	/// X86MCCodeEmitter - MCCodeEmitter implementation for X86: encodes an
	/// MCInst into bytes written to a raw_ostream, recording fixups on the way.
	class X86MCCodeEmitter : public MCCodeEmitter {
	  const MCInstrInfo &MCII;
	  MCContext &Ctx;

	public:
	  X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
	    : MCII(mcii), Ctx(ctx) {
	  }
	  X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
	  X86MCCodeEmitter &operator=(const X86MCCodeEmitter &) = delete;
	  ~X86MCCodeEmitter() override = default;

	  /// Return true if the subtarget has the Mode64Bit feature bit set.
	  bool is64BitMode(const MCSubtargetInfo &STI) const {
	    return STI.getFeatureBits()[X86::Mode64Bit];
	  }

	  /// Return true if the subtarget has the Mode32Bit feature bit set.
	  bool is32BitMode(const MCSubtargetInfo &STI) const {
	    return STI.getFeatureBits()[X86::Mode32Bit];
	  }

	  /// Return true if the subtarget has the Mode16Bit feature bit set.
	  bool is16BitMode(const MCSubtargetInfo &STI) const {
	    return STI.getFeatureBits()[X86::Mode16Bit];
	  }

	  /// Is16BitMemOperand - Return true if the specified instruction has
	  /// a 16-bit memory operand. Op specifies the operand # of the memoperand.
	  bool Is16BitMemOperand(const MCInst &MI, unsigned Op,
	                         const MCSubtargetInfo &STI) const {
	    const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
	    const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
	    const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);

	    // In 16-bit mode, an operand with no base register and an immediate
	    // displacement below 0x10000 counts as 16-bit.
	    if (is16BitMode(STI) && BaseReg.getReg() == 0 &&
	        Disp.isImm() && Disp.getImm() < 0x10000)
	      return true;
	    // Otherwise a base or index register from GR16 makes it 16-bit.
	    if ((BaseReg.getReg() != 0 &&
	         X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
	        (IndexReg.getReg() != 0 &&
	         X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
	      return true;
	    return false;
	  }

	  /// Return the low three bits of the register's encoding value.
	  unsigned GetX86RegNum(const MCOperand &MO) const {
	    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
	  }

	  /// Return the full encoding value of the register in operand OpNum.
	  unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const {
	    return Ctx.getRegisterInfo()->getEncodingValue(
	        MI.getOperand(OpNum).getReg());
	  }

	  // Does this register require a bit to be set in REX prefix.
	  // This is bit 3 of the register's encoding value.
	  bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const {
	    return (getX86RegEncoding(MI, OpNum) >> 3) & 1;
	  }

	  /// Write one byte to OS and advance the running byte count.
	  void EmitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const {
	    OS << (char)C;
	    ++CurByte;
	  }

	  /// Write the low Size bytes of Val to OS, least significant byte first.
	  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
	                    raw_ostream &OS) const {
	    // Output the constant in little endian byte order.
	    for (unsigned i = 0; i != Size; ++i) {
	      EmitByte(Val & 255, CurByte, OS);
	      Val >>= 8;
	    }
	  }

	  void EmitImmediate(const MCOperand &Disp, SMLoc Loc,
	                     unsigned ImmSize, MCFixupKind FixupKind,
	                     unsigned &CurByte, raw_ostream &OS,
	                     SmallVectorImpl<MCFixup> &Fixups,
	                     int ImmOffset = 0) const;

	  /// Pack a ModRM byte: Mod in bits 7-6, RegOpcode in bits 5-3, RM in
	  /// bits 2-0.
	  static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
	    assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
	    return RM | (RegOpcode << 3) | (Mod << 6);
	  }

	  /// Emit a ModRM byte with Mod = 3 whose RM field is the register number
	  /// of ModRMReg.
	  void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
	                        unsigned &CurByte, raw_ostream &OS) const {
	    EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
	  }

	  /// Emit a SIB byte built from scale SS, Index and Base.
	  void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
	                   unsigned &CurByte, raw_ostream &OS) const {
	    // SIB byte is in the same format as the ModRMByte.
	    EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
	  }

	  void emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField,
	                        uint64_t TSFlags, bool Rex, unsigned &CurByte,
	                        raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
	                        const MCSubtargetInfo &STI) const;

	  void encodeInstruction(const MCInst &MI, raw_ostream &OS,
	                         SmallVectorImpl<MCFixup> &Fixups,
	                         const MCSubtargetInfo &STI) const override;

	  void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
	                           const MCInst &MI, const MCInstrDesc &Desc,
	                           raw_ostream &OS) const;

	  void EmitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand,
	                                 const MCInst &MI, raw_ostream &OS) const;

	  bool emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
	                        const MCInst &MI, const MCInstrDesc &Desc,
	                        const MCSubtargetInfo &STI, raw_ostream &OS) const;

	  uint8_t DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
	                             int MemOperand, const MCInstrDesc &Desc) const;
	};

	} // end anonymous namespace

	/// isDisp8 - Report whether a signed displacement is representable in a
	/// sign-extended 8-bit field, i.e. lies within [INT8_MIN, INT8_MAX].
	static bool isDisp8(int Value) {
	  return INT8_MIN <= Value && Value <= INT8_MAX;
	}

	/// isCDisp8 - Report whether a signed displacement can be encoded in the
	/// 8-bit compressed displacement field of an EVEX instruction. On success
	/// CValue receives the value to encode (the displacement divided by the
	/// CD8 scale); on failure with a non-zero scale CValue is left untouched.
	static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
	  assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
	         "Compressed 8-bit displacement is only valid for EVEX inst.");

	  const unsigned Scale =
	      (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift;

	  // A zero scale means no compression: behave like a plain disp8 check,
	  // storing Value unconditionally.
	  if (Scale == 0) {
	    CValue = Value;
	    return isDisp8(Value);
	  }

	  const unsigned LowBits = Scale - 1;
	  assert((Scale & LowBits) == 0 && "Invalid memory object size.");

	  // The displacement must be a multiple of the scale.
	  if ((Value & LowBits) != 0)
	    return false;

	  const int Scaled = Value / (int)Scale;
	  if (Scaled != (int8_t)Scaled)
	    return false;

	  CValue = Scaled;
	  return true;
	}

	/// getImmFixupKind - Pick the fixup kind for the immediate of an
	/// instruction described by TSFlags. Unsigned immediates use the generic
	/// kind for their size and PC-relativity; signed immediates are only
	/// supported at 4 bytes.
	static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
	  const unsigned ImmBytes = X86II::getSizeOfImm(TSFlags);
	  const bool PCRelative = X86II::isImmPCRel(TSFlags);

	  if (!X86II::isImmSigned(TSFlags))
	    return MCFixup::getKindForSize(ImmBytes, PCRelative);

	  if (ImmBytes == 4)
	    return MCFixupKind(X86::reloc_signed_4byte);
	  llvm_unreachable("Unsupported signed fixup size!");
	}

	/// Is32BitMemOperand - Tell whether the memory operand that begins at operand
	/// index Op of MI is addressed with 32-bit registers: a base or index
	/// register in the GR32 class, or EIP as the base.
	static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
	  const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
	  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
	  const auto &GR32 = X86MCRegisterClasses[X86::GR32RegClassID];

	  // A 32-bit general purpose base register settles it.
	  if (BaseReg.getReg() != 0 && GR32.contains(BaseReg.getReg()))
	    return true;

	  // So does a 32-bit general purpose index register.
	  if (IndexReg.getReg() != 0 && GR32.contains(IndexReg.getReg()))
	    return true;

	  // The only remaining 32-bit form is an EIP-based address.
	  if (BaseReg.getReg() != X86::EIP)
	    return false;

	  assert(IndexReg.getReg() == 0 && "Invalid eip-based address.");
	  return true;
	}

	/// Is64BitMemOperand - Tell whether the memory operand that begins at operand
	/// index Op of MI uses a base or index register from the GR64 class.
	/// Only compiled when assertions are enabled.
	#ifndef NDEBUG
	static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) {
	  const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
	  const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
	  const auto &GR64 = X86MCRegisterClasses[X86::GR64RegClassID];

	  if (BaseReg.getReg() != 0 && GR64.contains(BaseReg.getReg()))
	    return true;

	  if (IndexReg.getReg() != 0 && GR64.contains(IndexReg.getReg()))
	    return true;

	  return false;
	}
	#endif

	/// StartsWithGlobalOffsetTable - Check if this expression starts with
	/// _GLOBAL_OFFSET_TABLE_ and if it is of the form
	/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF
	/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only the simple cases
	/// that are known to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start
	/// of a binary expression.
	///
	/// Result:
	///   GOT_None    - the (left-most) expression is not a reference to
	///                 _GLOBAL_OFFSET_TABLE_.
	///   GOT_SymDiff - _GLOBAL_OFFSET_TABLE_ is the left side of a binary
	///                 expression whose right side is a symbol reference.
	///   GOT_Normal  - any other expression starting with _GLOBAL_OFFSET_TABLE_.
	enum GlobalOffsetTableExprKind {
	  GOT_None,
	  GOT_Normal,
	  GOT_SymDiff
	};
	static GlobalOffsetTableExprKind
	StartsWithGlobalOffsetTable(const MCExpr *Expr) {
	  const MCExpr *RHS = nullptr;

	  // For a binary expression, inspect its left operand and remember the right
	  // one so that the symbol-difference form can be recognised below.
	  if (Expr->getKind() == MCExpr::Binary) {
	    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
	    Expr = BE->getLHS();
	    RHS = BE->getRHS();
	  }

	  if (Expr->getKind() != MCExpr::SymbolRef)
	    return GOT_None;

	  const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
	  const MCSymbol &S = Ref->getSymbol();
	  if (S.getName() != "_GLOBAL_OFFSET_TABLE_")
	    return GOT_None;
	  if (RHS && RHS->getKind() == MCExpr::SymbolRef)
	    return GOT_SymDiff;
	  return GOT_Normal;
	}

	/// HasSecRelSymbolRef - Return true if Expr is itself a symbol reference
	/// whose variant kind is VK_SECREL. Sub-expressions are not searched.
	static bool HasSecRelSymbolRef(const MCExpr *Expr) {
	  if (Expr->getKind() == MCExpr::SymbolRef) {
	    const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
	    return Ref->getKind() == MCSymbolRefExpr::VK_SECREL;
	  }
	  return false;
	}

	void X86MCCodeEmitter::
	EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
	MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
	SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
	const MCExpr *Expr = nullptr;
	if (DispOp.isImm()) {
	// If this is a simple integer displacement that doesn't require a
	// relocation, emit it now.
	if (FixupKind != FK_PCRel_1 &&
	FixupKind != FK_PCRel_2 &&
	FixupKind != FK_PCRel_4) {
	EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
	return;
	}
	Expr = MCConstantExpr::create(DispOp.getImm(), Ctx);
	} else {
	Expr = DispOp.getExpr();
	}

	// If we have an immoffset, add it to the expression.
	if ((FixupKind == FK_Data_4 \|\|
	FixupKind == FK_Data_8 \|\|
	FixupKind == MCFixupKind(X86::reloc_signed_4byte))) {
	GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr);
	if (Kind != GOT_None) {
	assert(ImmOffset == 0);

	if (Size == 8) {
	FixupKind = MCFixupKind(X86::reloc_global_offset_table8);
	} else {
	assert(Size == 4);
	FixupKind = MCFixupKind(X86::reloc_global_offset_table);
	}

	if (Kind == GOT_Normal)
	ImmOffset = CurByte;
	} else if (Expr->getKind() == MCExpr::SymbolRef) {
	if (HasSecRelSymbolRef(Expr)) {
	FixupKind = MCFixupKind(FK_SecRel_4);
	}
	} else if (Expr->getKind() == MCExpr::Binary) {
	const MCBinaryExpr Bin = static_cast<const MCBinaryExpr>(Expr);
	if (HasSecRelSymbolRef(Bin->getLHS())
	\|\| HasSecRelSymbolRef(Bin->getRHS())) {
	FixupKind = MCFixupKind(FK_SecRel_4);
	}
	}
	}

	// If the fixup is pc-relative, we need to bias the value to be relative to
	// the start of the field, not the end of the field.
	if (FixupKind == FK_PCRel_4 \|\|
	FixupKind == MCFixupKind(X86::reloc_riprel_4byte) \|\|
	FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load) \|\|
	FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax) \|\|
	FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex))
	ImmOffset -= 4;
	if (FixupKind == FK_PCRel_2)
	ImmOffset -= 2;
	if (FixupKind == FK_PCRel_1)
	ImmOffset -= 1;

	if (ImmOffset)
	Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(ImmOffset, Ctx),
	Ctx);

	// Emit a symbolic constant as a fixup and 4 zeros.
	Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc));
	EmitConstant(0, Size, CurByte, OS);
	}

	/// emitMemModRMByte - Emit the ModR/M byte for the memory operand that starts
	/// at operand index Op of MI, followed by a SIB byte and/or displacement when
	/// the selected addressing form needs them.
	///
	/// RegOpcodeField is forwarded unchanged to every ModRMByte() call. TSFlags
	/// is used to detect EVEX encoding (which switches to compressed disp8) and
	/// to obtain the immediate size for rip-relative biasing. Rex only influences
	/// which rip-relative relocation kind is chosen. CurByte, OS and Fixups are
	/// passed on to the byte/immediate emission helpers.
	///
	/// The cases are handled in this order: rip/eip-relative, 16-bit addressing,
	/// forms that need no SIB byte, and finally forms that need a SIB byte.
	void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
	unsigned RegOpcodeField,
	uint64_t TSFlags, bool Rex,
	unsigned &CurByte, raw_ostream &OS,
	SmallVectorImpl<MCFixup> &Fixups,
	const MCSubtargetInfo &STI) const {
	const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);
	const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg);
	const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt);
	const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
	unsigned BaseReg = Base.getReg();
	bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;

	// Handle %rip relative addressing.
	if (BaseReg == X86::RIP \|\|
	BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode
	assert(is64BitMode(STI) && "Rip-relative addressing requires 64-bit mode");
	assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
	EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);

	unsigned Opcode = MI.getOpcode();
	// movq loads are handled with a special relocation form which allows the
	// linker to eliminate some loads for GOT references which end up in the
	// same linkage unit.
	unsigned FixupKind = [=]() {
	switch (Opcode) {
	default:
	return X86::reloc_riprel_4byte;
	case X86::MOV64rm:
	assert(Rex);
	return X86::reloc_riprel_4byte_movq_load;
	case X86::CALL64m:
	case X86::JMP64m:
	case X86::TEST64mr:
	case X86::ADC64rm:
	case X86::ADD64rm:
	case X86::AND64rm:
	case X86::CMP64rm:
	case X86::OR64rm:
	case X86::SBB64rm:
	case X86::SUB64rm:
	case X86::XOR64rm:
	return Rex ? X86::reloc_riprel_4byte_relax_rex
	: X86::reloc_riprel_4byte_relax;
	}
	}();

	// rip-relative addressing is actually relative to the next instruction.
	// Since an immediate can follow the mod/rm byte for an instruction, this
	- // means that we need to bias the immediate field of the instruction with
	- // the size of the immediate field. If we have this case, add it into the
	+ // means that we need to bias the displacement field of the instruction with
	+ // the size of the immediate field. If we have this case, add it into the
	// expression to emit.
	- int ImmSize = X86II::hasImm(TSFlags) ? X86II::getSizeOfImm(TSFlags) : 0;
	+ // Note: rip-relative addressing using immediate displacement values should
	+ // not be adjusted, assuming it was the user's intent.
	+ int ImmSize = !Disp.isImm() && X86II::hasImm(TSFlags)
	+ ? X86II::getSizeOfImm(TSFlags)
	+ : 0;

	EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
	CurByte, OS, Fixups, -ImmSize);
	return;
	}

	// -1U marks "no base register" so that it never matches a real register
	// number in the comparisons below.
	unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U;

	// 16-bit addressing forms of the ModR/M byte have a different encoding for
	// the R/M field and are far more limited in which registers can be used.
	if (Is16BitMemOperand(MI, Op, STI)) {
	if (BaseReg) {
	// For 32-bit addressing, the row and column values in Table 2-2 are
	// basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with
	// some special cases. And GetX86RegNum reflects that numbering.
	// For 16-bit addressing it's more fun, as shown in the SDM Vol 2A,
	// Table 2-1 "16-Bit Addressing Forms with the ModR/M byte". We can only
	// use SI/DI/BP/BX, which have "row" values 4-7 in no particular order,
	// while values 0-3 indicate the allowed combinations (base+index) of
	// those: 0 for BX+SI, 1 for BX+DI, 2 for BP+SI, 3 for BP+DI.
	//
	// R16Table[] is a lookup from the normal RegNo, to the row values from
	// Table 2-1 for 16-bit addressing modes. Where zero means disallowed.
	static const unsigned R16Table[] = { 0, 0, 0, 7, 0, 6, 4, 5 };
	unsigned RMfield = R16Table[BaseRegNo];

	assert(RMfield && "invalid 16-bit base register");

	if (IndexReg.getReg()) {
	unsigned IndexReg16 = R16Table[GetX86RegNum(IndexReg)];

	assert(IndexReg16 && "invalid 16-bit index register");
	// We must have one of SI/DI (4,5), and one of BP/BX (6,7).
	assert(((IndexReg16 ^ RMfield) & 2) &&
	"invalid 16-bit base/index register combination");
	assert(Scale.getImm() == 1 &&
	"invalid scale for 16-bit memory reference");

	// Allow base/index to appear in either order (although GAS doesn't).
	if (IndexReg16 & 2)
	RMfield = (RMfield & 1) \| ((7 - IndexReg16) << 1);
	else
	RMfield = (IndexReg16 & 1) \| ((7 - RMfield) << 1);
	}

	if (Disp.isImm() && isDisp8(Disp.getImm())) {
	if (Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
	// There is no displacement; just the register.
	EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
	return;
	}
	// Use the [REG]+disp8 form, including for [BP] which cannot be encoded.
	EmitByte(ModRMByte(1, RegOpcodeField, RMfield), CurByte, OS);
	EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
	return;
	}
	// This is the [REG]+disp16 case.
	EmitByte(ModRMByte(2, RegOpcodeField, RMfield), CurByte, OS);
	} else {
	// There is no BaseReg; this is the plain [disp16] case.
	EmitByte(ModRMByte(0, RegOpcodeField, 6), CurByte, OS);
	}

	// Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases.
	EmitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups);
	return;
	}

	// Determine whether a SIB byte is needed.
	// If no BaseReg, issue a RIP relative instruction only if the MCE can
	// resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
	// 2-7) and absolute references.

	if (// The SIB byte must be used if there is an index register.
	IndexReg.getReg() == 0 &&
	// The SIB byte must be used if the base is ESP/RSP/R12, all of which
	// encode to an R/M value of 4, which indicates that a SIB byte is
	// present.
	BaseRegNo != N86::ESP &&
	// If there is no base register and we're in 64-bit mode, we need a SIB
	// byte to emit an addr that is just 'disp32' (the non-RIP relative form).
	(!is64BitMode(STI) \|\| BaseReg != 0)) {

	if (BaseReg == 0) { // [disp32] in X86-32 mode
	EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
	EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
	return;
	}

	// If the base is not EBP/ESP and there is no displacement, use simple
	// indirect register encoding, this handles addresses like [EAX]. The
	// encoding for [EBP] with no displacement means [disp32] so we handle it
	// by emitting a displacement of 0 below.
	if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
	EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
	return;
	}

	// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
	if (Disp.isImm()) {
	if (!HasEVEX && isDisp8(Disp.getImm())) {
	EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
	EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
	return;
	}
	// Try EVEX compressed 8-bit displacement first; if failed, fall back to
	// 32-bit displacement.
	int CDisp8 = 0;
	if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
	EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
	// The offset argument turns the original displacement into the
	// compressed value when EmitImmediate adds it back.
	EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
	CDisp8 - Disp.getImm());
	return;
	}
	}

	// Otherwise, emit the most general non-SIB encoding: [REG+disp32]
	EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
	unsigned Opcode = MI.getOpcode();
	unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax
	: X86::reloc_signed_4byte;
	EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
	Fixups);
	return;
	}

	// We need a SIB byte, so start by outputting the ModR/M byte first
	assert(IndexReg.getReg() != X86::ESP &&
	IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");

	// ForceDisp32 / ForceDisp8 record which displacement width the ModR/M byte
	// chosen below commits to; ImmOffset carries the compressed-disp8 adjustment.
	bool ForceDisp32 = false;
	bool ForceDisp8 = false;
	int CDisp8 = 0;
	int ImmOffset = 0;
	if (BaseReg == 0) {
	// If there is no base register, we emit the special case SIB byte with
	// MOD=0, BASE=5, to JUST get the index, scale, and displacement.
	EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
	ForceDisp32 = true;
	} else if (!Disp.isImm()) {
	// Emit the normal disp32 encoding.
	EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
	ForceDisp32 = true;
	} else if (Disp.getImm() == 0 &&
	// Base reg can't be anything that ends up with '5' as the base
	// reg, it is the magic [*] nomenclature that indicates no base.
	BaseRegNo != N86::EBP) {
	// Emit no displacement ModR/M byte
	EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
	} else if (!HasEVEX && isDisp8(Disp.getImm())) {
	// Emit the disp8 encoding.
	EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
	ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
	} else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
	// Emit the disp8 encoding.
	EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
	ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
	ImmOffset = CDisp8 - Disp.getImm();
	} else {
	// Emit the normal disp32 encoding.
	EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
	}

	// Calculate what the SS field value should be...
	// Indexed by the scale amount; only scales 1, 2, 4 and 8 have an entry that
	// is not ~0U.
	static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
	unsigned SS = SSTable[Scale.getImm()];

	if (BaseReg == 0) {
	// Handle the SIB byte for the case where there is no base, see Intel
	// Manual 2A, table 2-7. The displacement has already been output.
	unsigned IndexRegNo;
	if (IndexReg.getReg())
	IndexRegNo = GetX86RegNum(IndexReg);
	else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
	IndexRegNo = 4;
	EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
	} else {
	unsigned IndexRegNo;
	if (IndexReg.getReg())
	IndexRegNo = GetX86RegNum(IndexReg);
	else
	IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
	EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
	}

	// Do we need to output a displacement?
	if (ForceDisp8)
	EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset);
	else if (ForceDisp32 \|\| Disp.getImm() != 0)
	EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
	CurByte, OS, Fixups);
	}

	/// EmitVEXOpcodePrefix - AVX instructions are encoded using an opcode prefix
	/// called VEX.
	///
	/// This routine computes the individual prefix fields from TSFlags and from
	/// the register operands of MI (walked according to the instruction form),
	/// then emits one of:
	///   - the 2-byte VEX prefix (C5h) when the encoding allows it,
	///   - the 3-byte VEX (C4h) or XOP (8Fh) prefix,
	///   - the 4-byte EVEX prefix (62h).
	///
	/// MemOperand is the operand index at which the memory operand starts; it is
	/// only read for the memory forms. Desc supplies the operand count and the
	/// operand bias. Instructions carrying the LOCK flag are rejected by an
	/// assertion.
	void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
	int MemOperand, const MCInst &MI,
	const MCInstrDesc &Desc,
	raw_ostream &OS) const {
	assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX.");

	uint64_t Encoding = TSFlags & X86II::EncodingMask;
	bool HasEVEX_K = TSFlags & X86II::EVEX_K;
	bool HasVEX_4V = TSFlags & X86II::VEX_4V;
	bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;

	// VEX_R: opcode extension equivalent to REX.R in
	// 1's complement (inverted) form
	//
	// 1: Same as REX_R=0 (must be 1 in 32-bit mode)
	// 0: Same as REX_R=1 (64 bit mode only)
	//
	uint8_t VEX_R = 0x1;
	uint8_t EVEX_R2 = 0x1;

	// VEX_X: equivalent to REX.X, only used when a
	// register is used for index in SIB Byte.
	//
	// 1: Same as REX.X=0 (must be 1 in 32-bit mode)
	// 0: Same as REX.X=1 (64-bit mode only)
	uint8_t VEX_X = 0x1;

	// VEX_B:
	//
	// 1: Same as REX_B=0 (ignored in 32-bit mode)
	// 0: Same as REX_B=1 (64 bit mode only)
	//
	uint8_t VEX_B = 0x1;

	// VEX_W: opcode specific (use like REX.W, or used for
	// opcode extension, or ignored, depending on the opcode byte)
	uint8_t VEX_W = (TSFlags & X86II::VEX_W) ? 1 : 0;

	// VEX_5M (VEX m-mmmmm field):
	//
	// 0b00000: Reserved for future use
	// 0b00001: implied 0F leading opcode
	// 0b00010: implied 0F 38 leading opcode bytes
	// 0b00011: implied 0F 3A leading opcode bytes
	// 0b00100-0b11111: Reserved for future use
	// 0b01000: XOP map select - 08h instructions with imm byte
	// 0b01001: XOP map select - 09h instructions with no imm byte
	// 0b01010: XOP map select - 0Ah instructions with imm dword
	uint8_t VEX_5M;
	switch (TSFlags & X86II::OpMapMask) {
	default: llvm_unreachable("Invalid prefix!");
	case X86II::TB: VEX_5M = 0x1; break; // 0F
	case X86II::T8: VEX_5M = 0x2; break; // 0F 38
	case X86II::TA: VEX_5M = 0x3; break; // 0F 3A
	case X86II::XOP8: VEX_5M = 0x8; break;
	case X86II::XOP9: VEX_5M = 0x9; break;
	case X86II::XOPA: VEX_5M = 0xA; break;
	}

	// VEX_4V (VEX vvvv field): a register specifier
	// (in 1's complement form) or 1111 if unused.
	uint8_t VEX_4V = 0xf;
	uint8_t EVEX_V2 = 0x1;

	// EVEX_L2/VEX_L (Vector Length):
	//
	// L2 L
	// 0 0: scalar or 128-bit vector
	// 0 1: 256-bit vector
	// 1 0: 512-bit vector
	//
	uint8_t VEX_L = (TSFlags & X86II::VEX_L) ? 1 : 0;
	uint8_t EVEX_L2 = (TSFlags & X86II::EVEX_L2) ? 1 : 0;

	// VEX_PP: opcode extension providing equivalent
	// functionality of a SIMD prefix
	//
	// 0b00: None
	// 0b01: 66
	// 0b10: F3
	// 0b11: F2
	//
	uint8_t VEX_PP;
	switch (TSFlags & X86II::OpPrefixMask) {
	default: llvm_unreachable("Invalid op prefix!");
	case X86II::PS: VEX_PP = 0x0; break; // none
	case X86II::PD: VEX_PP = 0x1; break; // 66
	case X86II::XS: VEX_PP = 0x2; break; // F3
	case X86II::XD: VEX_PP = 0x3; break; // F2
	}

	// EVEX_U
	uint8_t EVEX_U = 1; // Always '1' so far

	// EVEX_z
	uint8_t EVEX_z = (HasEVEX_K && (TSFlags & X86II::EVEX_Z)) ? 1 : 0;

	// EVEX_b
	uint8_t EVEX_b = (TSFlags & X86II::EVEX_B) ? 1 : 0;

	// EVEX_rc
	uint8_t EVEX_rc = 0;

	// EVEX_aaa
	uint8_t EVEX_aaa = 0;

	// When set, the last EVEX byte carries EVEX_rc in place of the L'L bits.
	bool EncodeRC = false;

	// Classify VEX_B, VEX_4V, VEX_R, VEX_X
	unsigned NumOps = Desc.getNumOperands();
	unsigned CurOp = X86II::getOperandBias(Desc);

	switch (TSFlags & X86II::FormMask) {
	default: llvm_unreachable("Unexpected form in EmitVEXOpcodePrefix!");
	case X86II::RawFrm:
	break;
	case X86II::MRMDestMem: {
	// MRMDestMem instructions forms:
	// MemAddr, src1(ModR/M)
	// MemAddr, src1(VEX_4V), src2(ModR/M)
	// MemAddr, src1(ModR/M), imm8
	//
	unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
	VEX_B = ~(BaseRegEnc >> 3) & 1;
	unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
	VEX_X = ~(IndexRegEnc >> 3) & 1;
	if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV.
	EVEX_V2 = ~(IndexRegEnc >> 4) & 1;

	CurOp += X86::AddrNumOperands;

	if (HasEVEX_K)
	EVEX_aaa = getX86RegEncoding(MI, CurOp++);

	if (HasVEX_4V) {
	unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_4V = ~VRegEnc & 0xf;
	EVEX_V2 = ~(VRegEnc >> 4) & 1;
	}

	unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_R = ~(RegEnc >> 3) & 1;
	EVEX_R2 = ~(RegEnc >> 4) & 1;
	break;
	}
	case X86II::MRMSrcMem: {
	// MRMSrcMem instructions forms:
	// src1(ModR/M), MemAddr
	// src1(ModR/M), src2(VEX_4V), MemAddr
	// src1(ModR/M), MemAddr, imm8
	// src1(ModR/M), MemAddr, src2(Imm[7:4])
	//
	// FMA4:
	// dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
	unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_R = ~(RegEnc >> 3) & 1;
	EVEX_R2 = ~(RegEnc >> 4) & 1;

	if (HasEVEX_K)
	EVEX_aaa = getX86RegEncoding(MI, CurOp++);

	if (HasVEX_4V) {
	unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_4V = ~VRegEnc & 0xf;
	EVEX_V2 = ~(VRegEnc >> 4) & 1;
	}

	unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
	VEX_B = ~(BaseRegEnc >> 3) & 1;
	unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
	VEX_X = ~(IndexRegEnc >> 3) & 1;
	if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV.
	EVEX_V2 = ~(IndexRegEnc >> 4) & 1;

	break;
	}
	case X86II::MRMSrcMem4VOp3: {
	// Instruction format for 4VOp3:
	// src1(ModR/M), MemAddr, src3(VEX_4V)
	unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_R = ~(RegEnc >> 3) & 1;

	unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
	VEX_B = ~(BaseRegEnc >> 3) & 1;
	unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
	VEX_X = ~(IndexRegEnc >> 3) & 1;

	// The VEX_4V register follows the memory operand in the operand list.
	VEX_4V = ~getX86RegEncoding(MI, CurOp + X86::AddrNumOperands) & 0xf;
	break;
	}
	case X86II::MRMSrcMemOp4: {
	// dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
	unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_R = ~(RegEnc >> 3) & 1;

	unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_4V = ~VRegEnc & 0xf;

	unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
	VEX_B = ~(BaseRegEnc >> 3) & 1;
	unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
	VEX_X = ~(IndexRegEnc >> 3) & 1;
	break;
	}
	case X86II::MRM0m: case X86II::MRM1m:
	case X86II::MRM2m: case X86II::MRM3m:
	case X86II::MRM4m: case X86II::MRM5m:
	case X86II::MRM6m: case X86II::MRM7m: {
	// MRM[0-7]m instructions forms:
	// MemAddr
	// src1(VEX_4V), MemAddr
	if (HasVEX_4V) {
	unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_4V = ~VRegEnc & 0xf;
	EVEX_V2 = ~(VRegEnc >> 4) & 1;
	}

	if (HasEVEX_K)
	EVEX_aaa = getX86RegEncoding(MI, CurOp++);

	unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
	VEX_B = ~(BaseRegEnc >> 3) & 1;
	unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
	VEX_X = ~(IndexRegEnc >> 3) & 1;
	break;
	}
	case X86II::MRMSrcReg: {
	// MRMSrcReg instructions forms:
	// dst(ModR/M), src1(VEX_4V), src2(ModR/M), src3(Imm[7:4])
	// dst(ModR/M), src1(ModR/M)
	// dst(ModR/M), src1(ModR/M), imm8
	//
	// FMA4:
	// dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
	unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_R = ~(RegEnc >> 3) & 1;
	EVEX_R2 = ~(RegEnc >> 4) & 1;

	if (HasEVEX_K)
	EVEX_aaa = getX86RegEncoding(MI, CurOp++);

	if (HasVEX_4V) {
	unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_4V = ~VRegEnc & 0xf;
	EVEX_V2 = ~(VRegEnc >> 4) & 1;
	}

	// For a register r/m operand, bit 3 of the encoding goes to VEX_B and
	// bit 4 to VEX_X.
	RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_B = ~(RegEnc >> 3) & 1;
	VEX_X = ~(RegEnc >> 4) & 1;

	if (EVEX_b) {
	if (HasEVEX_RC) {
	// The rounding-control value is read from the last operand.
	unsigned RcOperand = NumOps-1;
	assert(RcOperand >= CurOp);
	EVEX_rc = MI.getOperand(RcOperand).getImm() & 0x3;
	}
	EncodeRC = true;
	}
	break;
	}
	case X86II::MRMSrcReg4VOp3: {
	// Instruction format for 4VOp3:
	// src1(ModR/M), src2(ModR/M), src3(VEX_4V)
	unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_R = ~(RegEnc >> 3) & 1;

	RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_B = ~(RegEnc >> 3) & 1;

	VEX_4V = ~getX86RegEncoding(MI, CurOp++) & 0xf;
	break;
	}
	case X86II::MRMSrcRegOp4: {
	// dst(ModR/M.reg), src1(VEX_4V), src2(Imm[7:4]), src3(ModR/M),
	unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_R = ~(RegEnc >> 3) & 1;

	unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_4V = ~VRegEnc & 0xf;

	// Skip second register source (encoded in Imm[7:4])
	++CurOp;

	RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_B = ~(RegEnc >> 3) & 1;
	VEX_X = ~(RegEnc >> 4) & 1;
	break;
	}
	case X86II::MRMDestReg: {
	// MRMDestReg instructions forms:
	// dst(ModR/M), src(ModR/M)
	// dst(ModR/M), src(ModR/M), imm8
	// dst(ModR/M), src1(VEX_4V), src2(ModR/M)
	unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_B = ~(RegEnc >> 3) & 1;
	VEX_X = ~(RegEnc >> 4) & 1;

	if (HasEVEX_K)
	EVEX_aaa = getX86RegEncoding(MI, CurOp++);

	if (HasVEX_4V) {
	unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_4V = ~VRegEnc & 0xf;
	EVEX_V2 = ~(VRegEnc >> 4) & 1;
	}

	RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_R = ~(RegEnc >> 3) & 1;
	EVEX_R2 = ~(RegEnc >> 4) & 1;
	if (EVEX_b)
	EncodeRC = true;
	break;
	}
	case X86II::MRM0r: case X86II::MRM1r:
	case X86II::MRM2r: case X86II::MRM3r:
	case X86II::MRM4r: case X86II::MRM5r:
	case X86II::MRM6r: case X86II::MRM7r: {
	// MRM0r-MRM7r instructions forms:
	// dst(VEX_4V), src(ModR/M), imm8
	if (HasVEX_4V) {
	unsigned VRegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_4V = ~VRegEnc & 0xf;
	EVEX_V2 = ~(VRegEnc >> 4) & 1;
	}
	if (HasEVEX_K)
	EVEX_aaa = getX86RegEncoding(MI, CurOp++);

	unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
	VEX_B = ~(RegEnc >> 3) & 1;
	VEX_X = ~(RegEnc >> 4) & 1;
	break;
	}
	}

	if (Encoding == X86II::VEX \|\| Encoding == X86II::XOP) {
	// VEX opcode prefix can have 2 or 3 bytes
	//
	// 3 bytes:
	// +-----+ +--------------+ +-------------------+
	// \| C4h \| \| RXB \| m-mmmm \| \| W \| vvvv \| L \| pp \|
	// +-----+ +--------------+ +-------------------+
	// 2 bytes:
	// +-----+ +-------------------+
	// \| C5h \| \| R \| vvvv \| L \| pp \|
	// +-----+ +-------------------+
	//
	// XOP uses a similar prefix:
	// +-----+ +--------------+ +-------------------+
	// \| 8Fh \| \| RXB \| m-mmmm \| \| W \| vvvv \| L \| pp \|
	// +-----+ +--------------+ +-------------------+
	uint8_t LastByte = VEX_PP \| (VEX_L << 2) \| (VEX_4V << 3);

	// Can we use the 2 byte VEX prefix?
	if (Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
	EmitByte(0xC5, CurByte, OS);
	EmitByte(LastByte \| (VEX_R << 7), CurByte, OS);
	return;
	}

	// 3 byte VEX prefix
	EmitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS);
	EmitByte(VEX_R << 7 \| VEX_X << 6 \| VEX_B << 5 \| VEX_5M, CurByte, OS);
	EmitByte(LastByte \| (VEX_W << 7), CurByte, OS);
	} else {
	assert(Encoding == X86II::EVEX && "unknown encoding!");
	// EVEX opcode prefix can have 4 bytes
	//
	// +-----+ +--------------+ +-------------------+ +------------------------+
	// \| 62h \| \| RXBR' \| 00mm \| \| W \| vvvv \| U \| pp \| \| z \| L'L \| b \| v' \| aaa \|
	// +-----+ +--------------+ +-------------------+ +------------------------+
	assert((VEX_5M & 0x3) == VEX_5M
	&& "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");

	EmitByte(0x62, CurByte, OS);
	EmitByte((VEX_R << 7) \|
	(VEX_X << 6) \|
	(VEX_B << 5) \|
	(EVEX_R2 << 4) \|
	VEX_5M, CurByte, OS);
	EmitByte((VEX_W << 7) \|
	(VEX_4V << 3) \|
	(EVEX_U << 2) \|
	VEX_PP, CurByte, OS);
	// The final byte holds either the rounding control or the vector length
	// in bits 6:5, depending on EncodeRC.
	if (EncodeRC)
	EmitByte((EVEX_z << 7) \|
	(EVEX_rc << 5) \|
	(EVEX_b << 4) \|
	(EVEX_V2 << 3) \|
	EVEX_aaa, CurByte, OS);
	else
	EmitByte((EVEX_z << 7) \|
	(EVEX_L2 << 6) \|
	(VEX_L << 5) \|
	(EVEX_b << 4) \|
	(EVEX_V2 << 3) \|
	EVEX_aaa, CurByte, OS);
	}
	}

	/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
	/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
	/// size, and 3) use of X86-64 extended registers.
	///
	/// Returns the REX bits to emit, or 0 when no prefix is required. Bit 3 is
	/// REX.W, bit 2 REX.R, bit 1 REX.X and bit 0 REX.B; 0x40 is additionally set
	/// when a register for which isX86_64NonExtLowByteReg() holds is used.
	/// MemOperand is the operand index of the start of the memory operand and is
	/// only read for the memory forms. A fatal error is reported if a non-zero
	/// prefix is needed while AH/BH/CH/DH is among the operands.
	uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
	int MemOperand,
	const MCInstrDesc &Desc) const {
	uint8_t REX = 0;
	bool UsesHighByteReg = false;

	if (TSFlags & X86II::REX_W)
	REX \|= 1 << 3; // set REX.W

	if (MI.getNumOperands() == 0) return REX;

	unsigned NumOps = MI.getNumOperands();
	unsigned CurOp = X86II::getOperandBias(Desc);

	// If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
	// The same scan records whether a high-byte register is used.
	for (unsigned i = CurOp; i != NumOps; ++i) {
	const MCOperand &MO = MI.getOperand(i);
	if (!MO.isReg()) continue;
	unsigned Reg = MO.getReg();
	if (Reg == X86::AH \|\| Reg == X86::BH \|\| Reg == X86::CH \|\| Reg == X86::DH)
	UsesHighByteReg = true;
	if (X86II::isX86_64NonExtLowByteReg(Reg))
	// FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
	// that returns non-zero.
	REX \|= 0x40; // REX fixed encoding prefix
	}

	// Set REX.R/X/B from the operands that feed the ModR/M reg field, the SIB
	// index and the ModR/M r/m (or base) field; which operand is which depends
	// on the instruction form. Forms not listed contribute no bits.
	switch (TSFlags & X86II::FormMask) {
	case X86II::AddRegFrm:
	REX \|= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
	break;
	case X86II::MRMSrcReg:
	REX \|= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
	REX \|= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
	break;
	case X86II::MRMSrcMem: {
	REX \|= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
	REX \|= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
	REX \|= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
	CurOp += X86::AddrNumOperands;
	break;
	}
	case X86II::MRMDestReg:
	REX \|= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
	REX \|= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
	break;
	case X86II::MRMDestMem:
	REX \|= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
	REX \|= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
	CurOp += X86::AddrNumOperands;
	REX \|= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
	break;
	case X86II::MRMXm:
	case X86II::MRM0m: case X86II::MRM1m:
	case X86II::MRM2m: case X86II::MRM3m:
	case X86II::MRM4m: case X86II::MRM5m:
	case X86II::MRM6m: case X86II::MRM7m:
	REX \|= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
	REX \|= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
	break;
	case X86II::MRMXr:
	case X86II::MRM0r: case X86II::MRM1r:
	case X86II::MRM2r: case X86II::MRM3r:
	case X86II::MRM4r: case X86II::MRM5r:
	case X86II::MRM6r: case X86II::MRM7r:
	REX \|= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
	break;
	}
	// A high-byte register cannot be combined with any REX prefix.
	if (REX && UsesHighByteReg)
	report_fatal_error("Cannot encode high byte register in REX-prefixed instruction");

	return REX;
	}

	/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
	void X86MCCodeEmitter::EmitSegmentOverridePrefix(unsigned &CurByte,
	unsigned SegOperand,
	const MCInst &MI,
	raw_ostream &OS) const {
	// Check for explicit segment override on memory operand.
	switch (MI.getOperand(SegOperand).getReg()) {
	default: llvm_unreachable("Unknown segment register!");
	case 0: break;
	case X86::CS: EmitByte(0x2E, CurByte, OS); break;
	case X86::SS: EmitByte(0x36, CurByte, OS); break;
	case X86::DS: EmitByte(0x3E, CurByte, OS); break;
	case X86::ES: EmitByte(0x26, CurByte, OS); break;
	case X86::FS: EmitByte(0x64, CurByte, OS); break;
	case X86::GS: EmitByte(0x65, CurByte, OS); break;
	}
	}

	/// Emit all instruction prefixes prior to the opcode.
	///
	/// The bytes are written in this order: operand-size override (0x66), LOCK
	/// (0xF0), mandatory SIMD prefix (0x66 / 0xF3 / 0xF2), REX (64-bit mode
	/// only), then the opcode-map escape bytes (0x0F, optionally followed by
	/// 0x38 or 0x3A).
	///
	/// MemOperand is the operand # of the start of a memory operand if present. If
	/// not present, it is -1.
	///
	/// Returns true if a REX prefix was used.
	bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
	                                        int MemOperand, const MCInst &MI,
	                                        const MCInstrDesc &Desc,
	                                        const MCSubtargetInfo &STI,
	                                        raw_ostream &OS) const {
	  bool Ret = false;
	  // Emit the operand size opcode prefix as needed: in 16-bit mode the prefix
	  // selects 32-bit operands, otherwise it selects 16-bit operands.
	  if ((TSFlags & X86II::OpSizeMask) == (is16BitMode(STI) ? X86II::OpSize32
	                                                          : X86II::OpSize16))
	    EmitByte(0x66, CurByte, OS);

	  // Emit the LOCK opcode prefix, requested either by the instruction
	  // description or by a flag on this particular MCInst.
	  if ((TSFlags & X86II::LOCK) || (MI.getFlags() & X86::IP_HAS_LOCK))
	    EmitByte(0xF0, CurByte, OS);

	  switch (TSFlags & X86II::OpPrefixMask) {
	  case X86II::PD:   // 66
	    EmitByte(0x66, CurByte, OS);
	    break;
	  case X86II::XS:   // F3
	    EmitByte(0xF3, CurByte, OS);
	    break;
	  case X86II::XD:   // F2
	    EmitByte(0xF2, CurByte, OS);
	    break;
	  }

	  // Handle REX prefix.
	  // FIXME: Can this come before F2 etc to simplify emission?
	  if (is64BitMode(STI)) {
	    if (uint8_t REX = DetermineREXPrefix(MI, TSFlags, MemOperand, Desc)) {
	      EmitByte(0x40 | REX, CurByte, OS);
	      Ret = true;
	    }
	  } else {
	    assert(!(TSFlags & X86II::REX_W) && "REX.W requires 64bit mode.");
	  }

	  // 0x0F escape code must be emitted just before the opcode.
	  switch (TSFlags & X86II::OpMapMask) {
	  case X86II::TB:  // Two-byte opcode map
	  case X86II::T8:  // 0F 38
	  case X86II::TA:  // 0F 3A
	    EmitByte(0x0F, CurByte, OS);
	    break;
	  }

	  // Second escape byte for the three-byte opcode maps.
	  switch (TSFlags & X86II::OpMapMask) {
	  case X86II::T8:    // 0F 38
	    EmitByte(0x38, CurByte, OS);
	    break;
	  case X86II::TA:    // 0F 3A
	    EmitByte(0x3A, CurByte, OS);
	    break;
	  }
	  return Ret;
	}

	void X86MCCodeEmitter::
	encodeInstruction(const MCInst &MI, raw_ostream &OS,
	SmallVectorImpl<MCFixup> &Fixups,
	const MCSubtargetInfo &STI) const {
	unsigned Opcode = MI.getOpcode();
	const MCInstrDesc &Desc = MCII.get(Opcode);
	uint64_t TSFlags = Desc.TSFlags;
	unsigned Flags = MI.getFlags();

	// Pseudo instructions don't get encoded.
	if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
	return;

	unsigned NumOps = Desc.getNumOperands();
	unsigned CurOp = X86II::getOperandBias(Desc);

	// Keep track of the current byte being emitted.
	unsigned CurByte = 0;

	// Encoding type for this instruction.
	uint64_t Encoding = TSFlags & X86II::EncodingMask;

	// It uses the VEX.VVVV field?
	bool HasVEX_4V = TSFlags & X86II::VEX_4V;
	bool HasVEX_I8Reg = (TSFlags & X86II::ImmMask) == X86II::Imm8Reg;

	// It uses the EVEX.aaa field?
	bool HasEVEX_K = TSFlags & X86II::EVEX_K;
	bool HasEVEX_RC = TSFlags & X86II::EVEX_RC;

	// Used if a register is encoded in 7:4 of immediate.
	unsigned I8RegNum = 0;

	// Determine where the memory operand starts, if present.
	int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
	if (MemoryOperand != -1) MemoryOperand += CurOp;

	// Emit segment override opcode prefix as needed.
	if (MemoryOperand >= 0)
	EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg,
	MI, OS);

	// Emit the repeat opcode prefix as needed.
	if (TSFlags & X86II::REP \|\| Flags & X86::IP_HAS_REPEAT)
	EmitByte(0xF3, CurByte, OS);
	if (Flags & X86::IP_HAS_REPEAT_NE)
	EmitByte(0xF2, CurByte, OS);

	// Emit the address size opcode prefix as needed.
	bool need_address_override;
	uint64_t AdSize = TSFlags & X86II::AdSizeMask;
	if ((is16BitMode(STI) && AdSize == X86II::AdSize32) \|\|
	(is32BitMode(STI) && AdSize == X86II::AdSize16) \|\|
	(is64BitMode(STI) && AdSize == X86II::AdSize32)) {
	need_address_override = true;
	} else if (MemoryOperand < 0) {
	need_address_override = false;
	} else if (is64BitMode(STI)) {
	assert(!Is16BitMemOperand(MI, MemoryOperand, STI));
	need_address_override = Is32BitMemOperand(MI, MemoryOperand);
	} else if (is32BitMode(STI)) {
	assert(!Is64BitMemOperand(MI, MemoryOperand));
	need_address_override = Is16BitMemOperand(MI, MemoryOperand, STI);
	} else {
	assert(is16BitMode(STI));
	assert(!Is64BitMemOperand(MI, MemoryOperand));
	need_address_override = !Is16BitMemOperand(MI, MemoryOperand, STI);
	}

	if (need_address_override)
	EmitByte(0x67, CurByte, OS);

	bool Rex = false;
	if (Encoding == 0)
	Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
	else
	EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);

	uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);

	if (TSFlags & X86II::Has3DNow0F0FOpcode)
	BaseOpcode = 0x0F; // Weird 3DNow! encoding.

	uint64_t Form = TSFlags & X86II::FormMask;
	switch (Form) {
	default: errs() << "FORM: " << Form << "\n";
	llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
	case X86II::Pseudo:
	llvm_unreachable("Pseudo instruction shouldn't be emitted");
	case X86II::RawFrmDstSrc: {
	unsigned siReg = MI.getOperand(1).getReg();
	assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) \|\|
	(siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) \|\|
	(siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) &&
	"SI and DI register sizes do not match");
	// Emit segment override opcode prefix as needed (not for %ds).
	if (MI.getOperand(2).getReg() != X86::DS)
	EmitSegmentOverridePrefix(CurByte, 2, MI, OS);
	// Emit AdSize prefix as needed.
	if ((!is32BitMode(STI) && siReg == X86::ESI) \|\|
	(is32BitMode(STI) && siReg == X86::SI))
	EmitByte(0x67, CurByte, OS);
	CurOp += 3; // Consume operands.
	EmitByte(BaseOpcode, CurByte, OS);
	break;
	}
	case X86II::RawFrmSrc: {
	unsigned siReg = MI.getOperand(0).getReg();
	// Emit segment override opcode prefix as needed (not for %ds).
	if (MI.getOperand(1).getReg() != X86::DS)
	EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
	// Emit AdSize prefix as needed.
	if ((!is32BitMode(STI) && siReg == X86::ESI) \|\|
	(is32BitMode(STI) && siReg == X86::SI))
	EmitByte(0x67, CurByte, OS);
	CurOp += 2; // Consume operands.
	EmitByte(BaseOpcode, CurByte, OS);
	break;
	}
	case X86II::RawFrmDst: {
	unsigned siReg = MI.getOperand(0).getReg();
	// Emit AdSize prefix as needed.
	if ((!is32BitMode(STI) && siReg == X86::EDI) \|\|
	(is32BitMode(STI) && siReg == X86::DI))
	EmitByte(0x67, CurByte, OS);
	++CurOp; // Consume operand.
	EmitByte(BaseOpcode, CurByte, OS);
	break;
	}
	case X86II::RawFrm:
	EmitByte(BaseOpcode, CurByte, OS);
	break;
	case X86II::RawFrmMemOffs:
	// Emit segment override opcode prefix as needed.
	EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
	EmitByte(BaseOpcode, CurByte, OS);
	EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
	X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
	CurByte, OS, Fixups);
	++CurOp; // skip segment operand
	break;
	case X86II::RawFrmImm8:
	EmitByte(BaseOpcode, CurByte, OS);
	EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
	X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
	CurByte, OS, Fixups);
	EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
	OS, Fixups);
	break;
	case X86II::RawFrmImm16:
	EmitByte(BaseOpcode, CurByte, OS);
	EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
	X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
	CurByte, OS, Fixups);
	EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
	OS, Fixups);
	break;

	case X86II::AddRegFrm:
	EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
	break;

	case X86II::MRMDestReg: {
	EmitByte(BaseOpcode, CurByte, OS);
	unsigned SrcRegNum = CurOp + 1;

	if (HasEVEX_K) // Skip writemask
	++SrcRegNum;

	if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
	++SrcRegNum;

	EmitRegModRMByte(MI.getOperand(CurOp),
	GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
	CurOp = SrcRegNum + 1;
	break;
	}
	case X86II::MRMDestMem: {
	EmitByte(BaseOpcode, CurByte, OS);
	unsigned SrcRegNum = CurOp + X86::AddrNumOperands;

	if (HasEVEX_K) // Skip writemask
	++SrcRegNum;

	if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
	++SrcRegNum;

	emitMemModRMByte(MI, CurOp, GetX86RegNum(MI.getOperand(SrcRegNum)), TSFlags,
	Rex, CurByte, OS, Fixups, STI);
	CurOp = SrcRegNum + 1;
	break;
	}
	case X86II::MRMSrcReg: {
	EmitByte(BaseOpcode, CurByte, OS);
	unsigned SrcRegNum = CurOp + 1;

	if (HasEVEX_K) // Skip writemask
	++SrcRegNum;

	if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
	++SrcRegNum;

	EmitRegModRMByte(MI.getOperand(SrcRegNum),
	GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
	CurOp = SrcRegNum + 1;
	if (HasVEX_I8Reg)
	I8RegNum = getX86RegEncoding(MI, CurOp++);
	// do not count the rounding control operand
	if (HasEVEX_RC)
	--NumOps;
	break;
	}
	case X86II::MRMSrcReg4VOp3: {
	EmitByte(BaseOpcode, CurByte, OS);
	unsigned SrcRegNum = CurOp + 1;

	EmitRegModRMByte(MI.getOperand(SrcRegNum),
	GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
	CurOp = SrcRegNum + 1;
	++CurOp; // Encoded in VEX.VVVV
	break;
	}
	case X86II::MRMSrcRegOp4: {
	EmitByte(BaseOpcode, CurByte, OS);
	unsigned SrcRegNum = CurOp + 1;

	// Skip 1st src (which is encoded in VEX_VVVV)
	++SrcRegNum;

	// Capture 2nd src (which is encoded in Imm[7:4])
	assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg");
	I8RegNum = getX86RegEncoding(MI, SrcRegNum++);

	EmitRegModRMByte(MI.getOperand(SrcRegNum),
	GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
	CurOp = SrcRegNum + 1;
	break;
	}
	case X86II::MRMSrcMem: {
	unsigned FirstMemOp = CurOp+1;

	if (HasEVEX_K) // Skip writemask
	++FirstMemOp;

	if (HasVEX_4V)
	++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).

	EmitByte(BaseOpcode, CurByte, OS);

	emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
	TSFlags, Rex, CurByte, OS, Fixups, STI);
	CurOp = FirstMemOp + X86::AddrNumOperands;
	if (HasVEX_I8Reg)
	I8RegNum = getX86RegEncoding(MI, CurOp++);
	break;
	}
	case X86II::MRMSrcMem4VOp3: {
	unsigned FirstMemOp = CurOp+1;

	EmitByte(BaseOpcode, CurByte, OS);

	emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
	TSFlags, Rex, CurByte, OS, Fixups, STI);
	CurOp = FirstMemOp + X86::AddrNumOperands;
	++CurOp; // Encoded in VEX.VVVV.
	break;
	}
	case X86II::MRMSrcMemOp4: {
	unsigned FirstMemOp = CurOp+1;

	++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).

	// Capture second register source (encoded in Imm[7:4])
	assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg");
	I8RegNum = getX86RegEncoding(MI, FirstMemOp++);

	EmitByte(BaseOpcode, CurByte, OS);

	emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
	TSFlags, Rex, CurByte, OS, Fixups, STI);
	CurOp = FirstMemOp + X86::AddrNumOperands;
	break;
	}

	case X86II::MRMXr:
	case X86II::MRM0r: case X86II::MRM1r:
	case X86II::MRM2r: case X86II::MRM3r:
	case X86II::MRM4r: case X86II::MRM5r:
	case X86II::MRM6r: case X86II::MRM7r:
	if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
	++CurOp;
	if (HasEVEX_K) // Skip writemask
	++CurOp;
	EmitByte(BaseOpcode, CurByte, OS);
	EmitRegModRMByte(MI.getOperand(CurOp++),
	(Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r,
	CurByte, OS);
	break;

	case X86II::MRMXm:
	case X86II::MRM0m: case X86II::MRM1m:
	case X86II::MRM2m: case X86II::MRM3m:
	case X86II::MRM4m: case X86II::MRM5m:
	case X86II::MRM6m: case X86II::MRM7m:
	if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
	++CurOp;
	if (HasEVEX_K) // Skip writemask
	++CurOp;
	EmitByte(BaseOpcode, CurByte, OS);
	emitMemModRMByte(MI, CurOp,
	(Form == X86II::MRMXm) ? 0 : Form - X86II::MRM0m, TSFlags,
	Rex, CurByte, OS, Fixups, STI);
	CurOp += X86::AddrNumOperands;
	break;

	case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
	case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
	case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
	case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
	case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
	case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
	case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
	case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
	case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
	case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
	case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
	case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
	case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
	case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
	case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
	case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
	case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
	case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
	case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
	case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
	case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
	case X86II::MRM_FF:
	EmitByte(BaseOpcode, CurByte, OS);
	EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS);
	break;
	}

	if (HasVEX_I8Reg) {
	// The last source register of a 4 operand instruction in AVX is encoded
	// in bits[7:4] of a immediate byte.
	assert(I8RegNum < 16 && "Register encoding out of range");
	I8RegNum <<= 4;
	if (CurOp != NumOps) {
	unsigned Val = MI.getOperand(CurOp++).getImm();
	assert(Val < 16 && "Immediate operand value out of range");
	I8RegNum \|= Val;
	}
	EmitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1,
	CurByte, OS, Fixups);
	} else {
	// If there is a remaining operand, it must be a trailing immediate. Emit it
	// according to the right size for the instruction. Some instructions
	// (SSE4a extrq and insertq) have two trailing immediates.
	while (CurOp != NumOps && NumOps - CurOp <= 2) {
	EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
	X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
	CurByte, OS, Fixups);
	}
	}

	if (TSFlags & X86II::Has3DNow0F0FOpcode)
	EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);

	#ifndef NDEBUG
	// FIXME: Verify.
	if (/!Desc.isVariadic() &&/ CurOp != NumOps) {
	errs() << "Cannot encode all operands of: ";
	MI.dump();
	errs() << '\n';
	abort();
	}
	#endif
	}

	/// Factory for the X86 machine-code emitter.
	///
	/// \param MCII  Instruction info handed to the emitter.
	/// \param MRI   Part of the factory signature; not used by this function.
	/// \param Ctx   MC context handed to the emitter.
	/// \returns a heap-allocated X86MCCodeEmitter.
	MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
	                                            const MCRegisterInfo &MRI,
	                                            MCContext &Ctx) {
	  MCCodeEmitter *Emitter = new X86MCCodeEmitter(MCII, Ctx);
	  return Emitter;
	}
	Index: head/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp (revision 329409)
	+++ head/contrib/llvm/lib/Target/X86/X86AsmPrinter.cpp (revision 329410)
	@@ -1,687 +1,696 @@
	//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains a printer that converts from our internal representation
	// of machine-dependent LLVM code to X86 machine code.
	//
	//===----------------------------------------------------------------------===//

	#include "X86AsmPrinter.h"
	#include "InstPrinter/X86ATTInstPrinter.h"
	#include "MCTargetDesc/X86BaseInfo.h"
	#include "MCTargetDesc/X86TargetStreamer.h"
	#include "X86InstrInfo.h"
	#include "X86MachineFunctionInfo.h"
	#include "llvm/BinaryFormat/COFF.h"
	#include "llvm/CodeGen/MachineConstantPool.h"
	#include "llvm/CodeGen/MachineModuleInfoImpls.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Mangler.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Type.h"
	#include "llvm/MC/MCCodeEmitter.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCSectionCOFF.h"
	#include "llvm/MC/MCSectionMachO.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/TargetRegistry.h"
	using namespace llvm;

	/// Construct the X86 assembly printer: forwards the target machine and the
	/// streamer to the AsmPrinter base class and passes this printer to the SM
	/// and FM members.
	X86AsmPrinter::X86AsmPrinter(TargetMachine &TM,
	std::unique_ptr<MCStreamer> Streamer)
	: AsmPrinter(TM, std::move(Streamer)), SM(this), FM(this) {}

	//===----------------------------------------------------------------------===//
	// Primitive Helper Functions.
	//===----------------------------------------------------------------------===//

	/// runOnMachineFunction - Emit the function body.
	///
	/// Caches the function's X86Subtarget, creates a fresh MCCodeEmitter for it,
	/// emits a COFF symbol definition when the subtarget targets COFF, then
	/// emits the function body followed by its XRay table.
	///
	/// \returns false always, since the machine function is not modified.
	bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
	  Subtarget = &MF.getSubtarget<X86Subtarget>();

	  SMShadowTracker.startFunction(MF);
	  // The subtarget accessors return pointers while createMCCodeEmitter takes
	  // its instruction and register info by reference, hence the dereferences.
	  CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
	      *Subtarget->getInstrInfo(), *Subtarget->getRegisterInfo(),
	      MF.getContext()));

	  // EmitFPOData is consulted by EmitFunctionBodyStart/End while the body is
	  // being emitted and is cleared again below.
	  EmitFPOData =
	      Subtarget->isTargetWin32() && MF.getMMI().getModule()->getCodeViewFlag();

	  SetupMachineFunction(MF);

	  if (Subtarget->isTargetCOFF()) {
	    bool Local = MF.getFunction().hasLocalLinkage();
	    OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
	    OutStreamer->EmitCOFFSymbolStorageClass(
	        Local ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL);
	    OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
	                                    << COFF::SCT_COMPLEX_TYPE_SHIFT);
	    OutStreamer->EndCOFFSymbolDef();
	  }

	  // Emit the rest of the function body.
	  EmitFunctionBody();

	  // Emit the XRay table for this function.
	  emitXRayTable();

	  EmitFPOData = false;

	  // We didn't modify anything.
	  return false;
	}

	void X86AsmPrinter::EmitFunctionBodyStart() {
	if (EmitFPOData) {
	X86TargetStreamer *XTS =
	static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer());
	unsigned ParamsSize =
	MF->getInfo<X86MachineFunctionInfo>()->getArgumentStackSize();
	XTS->emitFPOProc(CurrentFnSym, ParamsSize);
	}
	}

	void X86AsmPrinter::EmitFunctionBodyEnd() {
	if (EmitFPOData) {
	X86TargetStreamer *XTS =
	static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer());
	XTS->emitFPOEndProc();
	}
	}

	/// printSymbolOperand - Print a raw symbol reference operand. This handles
	/// constant pool and global address operands, which print to a label with
	/// various suffixes for relocation types etc.
	///
	/// \param P   Printer used for symbol lookup and offset printing.
	/// \param MO  Operand to print; must be a constant-pool index or a global
	///            address, anything else hits llvm_unreachable.
	/// \param O   Output stream.
	static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
	                               raw_ostream &O) {
	  switch (MO.getType()) {
	  default: llvm_unreachable("unknown symbol type!");
	  case MachineOperand::MO_ConstantPoolIndex:
	    P.GetCPISymbol(MO.getIndex())->print(O, P.MAI);
	    P.printOffset(MO.getOffset(), O);
	    break;
	  case MachineOperand::MO_GlobalAddress: {
	    const GlobalValue *GV = MO.getGlobal();

	    // Darwin non-lazy references go through a "$non_lazy_ptr" stub symbol.
	    const bool IsDarwinNonLazy =
	        MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
	        MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE;

	    MCSymbol *GVSym;
	    if (IsDarwinNonLazy)
	      GVSym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
	    else
	      GVSym = P.getSymbol(GV);

	    // Handle dllimport linkage.
	    if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
	      GVSym =
	          P.OutContext.getOrCreateSymbol(Twine("__imp_") + GVSym->getName());

	    if (IsDarwinNonLazy) {
	      // Register the stub, if not already present, in the Mach-O GV stub
	      // table.
	      MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
	      MachineModuleInfoImpl::StubValueTy &StubSym =
	          P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
	      if (!StubSym.getPointer())
	        StubSym = MachineModuleInfoImpl::
	            StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
	    }

	    // If the name begins with a dollar-sign, enclose it in parens. We do this
	    // to avoid having it look like an integer immediate to the assembler.
	    if (GVSym->getName()[0] != '$')
	      GVSym->print(O, P.MAI);
	    else {
	      O << '(';
	      GVSym->print(O, P.MAI);
	      O << ')';
	    }
	    P.printOffset(MO.getOffset(), O);
	    break;
	  }
	  }

	  // Append the relocation suffix implied by the operand's target flag.
	  switch (MO.getTargetFlags()) {
	  default:
	    llvm_unreachable("Unknown target flag on GV operand");
	  case X86II::MO_NO_FLAG: // No flag.
	    break;
	  case X86II::MO_DARWIN_NONLAZY:
	  case X86II::MO_DLLIMPORT:
	    // These affect the name of the symbol, not any suffix.
	    break;
	  case X86II::MO_GOT_ABSOLUTE_ADDRESS:
	    O << " + [.-";
	    P.MF->getPICBaseSymbol()->print(O, P.MAI);
	    O << ']';
	    break;
	  case X86II::MO_PIC_BASE_OFFSET:
	  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
	    O << '-';
	    P.MF->getPICBaseSymbol()->print(O, P.MAI);
	    break;
	  case X86II::MO_TLSGD: O << "@TLSGD"; break;
	  case X86II::MO_TLSLD: O << "@TLSLD"; break;
	  case X86II::MO_TLSLDM: O << "@TLSLDM"; break;
	  case X86II::MO_GOTTPOFF: O << "@GOTTPOFF"; break;
	  case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
	  case X86II::MO_TPOFF: O << "@TPOFF"; break;
	  case X86II::MO_DTPOFF: O << "@DTPOFF"; break;
	  case X86II::MO_NTPOFF: O << "@NTPOFF"; break;
	  case X86II::MO_GOTNTPOFF: O << "@GOTNTPOFF"; break;
	  case X86II::MO_GOTPCREL: O << "@GOTPCREL"; break;
	  case X86II::MO_GOT: O << "@GOT"; break;
	  case X86II::MO_GOTOFF: O << "@GOTOFF"; break;
	  case X86II::MO_PLT: O << "@PLT"; break;
	  case X86II::MO_TLVP: O << "@TLVP"; break;
	  case X86II::MO_TLVP_PIC_BASE:
	    O << "@TLVP" << '-';
	    P.MF->getPICBaseSymbol()->print(O, P.MAI);
	    break;
	  case X86II::MO_SECREL: O << "@SECREL32"; break;
	  }
	}

	// Forward declaration: printPCRelImm below calls printOperand before its
	// definition. The default arguments are supplied here.
	static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
	unsigned OpNo, raw_ostream &O,
	const char *Modifier = nullptr, unsigned AsmVariant = 0);

	/// printPCRelImm - Print an operand whose value ends up encoded
	/// pc-relatively. Unlike a regular immediate, no '$' is written in front of
	/// it.
	///
	/// Registers go through printOperand, immediates are streamed as-is, and
	/// global addresses go through printSymbolOperand. Any other operand kind is
	/// unreachable.
	static void printPCRelImm(X86AsmPrinter &P, const MachineInstr *MI,
	                          unsigned OpNo, raw_ostream &O) {
	  const MachineOperand &Operand = MI->getOperand(OpNo);
	  const auto Kind = Operand.getType();

	  if (Kind == MachineOperand::MO_Register) {
	    // pc-relativeness was already taken care of when the register value was
	    // computed.
	    printOperand(P, MI, OpNo, O);
	    return;
	  }
	  if (Kind == MachineOperand::MO_Immediate) {
	    O << Operand.getImm();
	    return;
	  }
	  if (Kind == MachineOperand::MO_GlobalAddress) {
	    printSymbolOperand(P, Operand, O);
	    return;
	  }
	  llvm_unreachable("Unknown pcrel immediate operand");
	}

	/// Print operand \p OpNo of \p MI as a register, immediate or global
	/// address.
	///
	/// \param Modifier    Optional. A value starting with "subreg" resizes a
	///                    register operand to the width named by the text after
	///                    it ("64", "32", "16"; anything else means 8).
	/// \param AsmVariant  When 0, registers get a '%' prefix and immediates and
	///                    globals a '$' prefix; otherwise no prefix is written.
	static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
	                         unsigned OpNo, raw_ostream &O, const char *Modifier,
	                         unsigned AsmVariant) {
	  const MachineOperand &Operand = MI->getOperand(OpNo);
	  // FIXME: Enumerating AsmVariant, so we can remove magic number.
	  const bool WantSigil = (AsmVariant == 0);

	  switch (Operand.getType()) {
	  case MachineOperand::MO_Register: {
	    if (WantSigil)
	      O << '%';
	    unsigned RegNo = Operand.getReg();
	    const bool IsSubreg =
	        Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0;
	    if (IsSubreg) {
	      // The requested width follows the six-character "subreg" prefix.
	      const char *WidthText = Modifier + 6;
	      unsigned Width = 8;
	      if (strcmp(WidthText, "64") == 0)
	        Width = 64;
	      else if (strcmp(WidthText, "32") == 0)
	        Width = 32;
	      else if (strcmp(WidthText, "16") == 0)
	        Width = 16;
	      RegNo = getX86SubSuperRegister(RegNo, Width);
	    }
	    O << X86ATTInstPrinter::getRegisterName(RegNo);
	    return;
	  }

	  case MachineOperand::MO_Immediate:
	    if (WantSigil)
	      O << '$';
	    O << Operand.getImm();
	    return;

	  case MachineOperand::MO_GlobalAddress:
	    if (WantSigil)
	      O << '$';
	    printSymbolOperand(P, Operand, O);
	    return;

	  default:
	    llvm_unreachable("unknown operand type!");
	  }
	}

	/// Print the displacement(base,index,scale) part of a memory reference,
	/// without any segment prefix.
	///
	/// \param Op        Index of the first operand of the memory reference.
	/// \param Modifier  Optional. "no-rip" suppresses a RIP base register and
	///                  "H" appends "+8" after the displacement; the modifier is
	///                  also forwarded to printOperand for the base and index
	///                  registers.
	static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
	                                 unsigned Op, raw_ostream &O,
	                                 const char *Modifier = nullptr) {
	  const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
	  const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
	  const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);

	  // If we really don't want to print out (rip), don't.
	  bool HasBaseReg = BaseReg.getReg() != 0;
	  if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
	      BaseReg.getReg() == X86::RIP)
	    HasBaseReg = false;

	  // HasParenPart - True if we will print out the () part of the mem ref.
	  bool HasParenPart = IndexReg.getReg() || HasBaseReg;

	  switch (DispSpec.getType()) {
	  default:
	    llvm_unreachable("unknown operand type!");
	  case MachineOperand::MO_Immediate: {
	    // A zero displacement is only printed when nothing else would be.
	    int DispVal = DispSpec.getImm();
	    if (DispVal || !HasParenPart)
	      O << DispVal;
	    break;
	  }
	  case MachineOperand::MO_GlobalAddress:
	  case MachineOperand::MO_ConstantPoolIndex:
	    printSymbolOperand(P, DispSpec, O);
	  }

	  if (Modifier && strcmp(Modifier, "H") == 0)
	    O << "+8";

	  if (HasParenPart) {
	    assert(IndexReg.getReg() != X86::ESP &&
	           "X86 doesn't allow scaling by ESP");

	    O << '(';
	    if (HasBaseReg)
	      printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier);

	    if (IndexReg.getReg()) {
	      O << ',';
	      printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier);
	      // A scale of 1 is left implicit.
	      unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
	      if (ScaleVal != 1)
	        O << ',' << ScaleVal;
	    }
	    O << ')';
	  }
	}

	/// Print a full memory reference: an optional "segment:" prefix followed by
	/// whatever printLeaMemReference writes for the same operands.
	///
	/// \param Op        Index of the first operand of the memory reference.
	/// \param Modifier  Forwarded unchanged to printOperand and
	///                  printLeaMemReference.
	static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
	                              unsigned Op, raw_ostream &O,
	                              const char *Modifier = nullptr) {
	  assert(isMem(*MI, Op) && "Invalid memory reference!");

	  const unsigned SegOpIdx = Op + X86::AddrSegmentReg;
	  const bool HasSegment = MI->getOperand(SegOpIdx).getReg() != 0;
	  if (HasSegment) {
	    printOperand(P, MI, SegOpIdx, O, Modifier);
	    O << ':';
	  }

	  printLeaMemReference(P, MI, Op, O, Modifier);
	}

	/// Print a memory reference in the bracketed form
	/// "seg:[base + scale*index + disp]".
	///
	/// Absent components are skipped. A zero immediate displacement is printed
	/// only when there is neither a base nor an index register. A negative
	/// displacement that follows a register is printed as " - <abs>".
	///
	/// \param Op          Index of the first operand of the memory reference.
	/// \param Modifier    Forwarded to printOperand.
	/// \param AsmVariant  Forwarded to printOperand; defaults to 1.
	static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
	                                   unsigned Op, raw_ostream &O,
	                                   const char *Modifier = nullptr,
	                                   unsigned AsmVariant = 1) {
	  const MachineOperand &BaseReg = MI->getOperand(Op+X86::AddrBaseReg);
	  unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
	  const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
	  const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
	  const MachineOperand &SegReg = MI->getOperand(Op+X86::AddrSegmentReg);

	  // If this has a segment register, print it.
	  if (SegReg.getReg()) {
	    printOperand(P, MI, Op+X86::AddrSegmentReg, O, Modifier, AsmVariant);
	    O << ':';
	  }

	  O << '[';

	  // NeedPlus is set once something has been printed inside the brackets, so
	  // that later components are joined with " + " (or " - ").
	  bool NeedPlus = false;
	  if (BaseReg.getReg()) {
	    printOperand(P, MI, Op+X86::AddrBaseReg, O, Modifier, AsmVariant);
	    NeedPlus = true;
	  }

	  if (IndexReg.getReg()) {
	    if (NeedPlus) O << " + ";
	    if (ScaleVal != 1)
	      O << ScaleVal << '*';
	    printOperand(P, MI, Op+X86::AddrIndexReg, O, Modifier, AsmVariant);
	    NeedPlus = true;
	  }

	  if (!DispSpec.isImm()) {
	    if (NeedPlus) O << " + ";
	    printOperand(P, MI, Op+X86::AddrDisp, O, Modifier, AsmVariant);
	  } else {
	    int64_t DispVal = DispSpec.getImm();
	    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
	      if (NeedPlus) {
	        if (DispVal > 0)
	          O << " + ";
	        else {
	          O << " - ";
	          DispVal = -DispVal;
	        }
	      }
	      O << DispVal;
	    }
	  }
	  O << ']';
	}

	/// Print the register held by \p MO, resized as requested by the inline-asm
	/// operand modifier \p Mode.
	///
	/// 'b' selects the 8-bit register, 'h' the 8-bit high register, 'w' the
	/// 16-bit register and 'k' the 32-bit register. 'q' and 'V' select the
	/// 64-bit register when the subtarget is 64-bit and the 32-bit register
	/// otherwise; 'V' additionally omits the leading '%'.
	///
	/// \returns true if \p Mode is not recognised (nothing is printed), false
	/// after printing the register name.
	static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
	char Mode, raw_ostream &O) {
	unsigned Reg = MO.getReg();
	+ bool EmitPercent = true;
	+
	switch (Mode) {
	default: return true; // Unknown mode.
	case 'b': // Print QImode register
	Reg = getX86SubSuperRegister(Reg, 8);
	break;
	case 'h': // Print QImode high register
	Reg = getX86SubSuperRegister(Reg, 8, true);
	break;
	case 'w': // Print HImode register
	Reg = getX86SubSuperRegister(Reg, 16);
	break;
	case 'k': // Print SImode register
	Reg = getX86SubSuperRegister(Reg, 32);
	break;
	+ case 'V':
	+ EmitPercent = false;
	+ LLVM_FALLTHROUGH;
	case 'q':
	// Print 64-bit register names if 64-bit integer registers are available.
	// Otherwise, print 32-bit register names.
	Reg = getX86SubSuperRegister(Reg, P.getSubtarget().is64Bit() ? 64 : 32);
	break;
	}

	- O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
	+ if (EmitPercent)
	+ O << '%';
	+
	+ O << X86ATTInstPrinter::getRegisterName(Reg);
	return false;
	}

	/// PrintAsmOperand - Print out an operand for an inline asm expression.
	///
	/// \param MI          Inline-asm instruction that owns the operand.
	/// \param OpNo        Index of the operand to print.
	/// \param AsmVariant  Forwarded to printOperand for operands without a
	///                    modifier (and for 'n' on a non-immediate).
	/// \param ExtraCode   Optional single-letter operand modifier; null or empty
	///                    means "no modifier".
	/// \param O           Output stream.
	///
	/// \returns true if the modifier is unknown or does not fit the operand,
	/// false once the operand has been printed. Modifiers not listed in the
	/// switch are delegated to AsmPrinter::PrintAsmOperand.
	bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
	unsigned AsmVariant,
	const char *ExtraCode, raw_ostream &O) {
	// Does this asm operand have a single letter operand modifier?
	if (ExtraCode && ExtraCode[0]) {
	if (ExtraCode[1] != 0) return true; // Unknown modifier.

	const MachineOperand &MO = MI->getOperand(OpNo);

	switch (ExtraCode[0]) {
	default:
	// See if this is a generic print operand
	return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
	case 'a': // This is an address. Currently only 'i' and 'r' are expected.
	switch (MO.getType()) {
	default:
	return true;
	case MachineOperand::MO_Immediate:
	O << MO.getImm();
	return false;
	case MachineOperand::MO_ConstantPoolIndex:
	case MachineOperand::MO_JumpTableIndex:
	case MachineOperand::MO_ExternalSymbol:
	llvm_unreachable("unexpected operand type!");
	case MachineOperand::MO_GlobalAddress:
	printSymbolOperand(*this, MO, O);
	if (Subtarget->isPICStyleRIPRel())
	O << "(%rip)";
	return false;
	case MachineOperand::MO_Register:
	O << '(';
	printOperand(*this, MI, OpNo, O);
	O << ')';
	return false;
	}

	case 'c': // Don't print "$" before a global var name or constant.
	switch (MO.getType()) {
	default:
	printOperand(*this, MI, OpNo, O);
	break;
	case MachineOperand::MO_Immediate:
	O << MO.getImm();
	break;
	case MachineOperand::MO_ConstantPoolIndex:
	case MachineOperand::MO_JumpTableIndex:
	case MachineOperand::MO_ExternalSymbol:
	llvm_unreachable("unexpected operand type!");
	case MachineOperand::MO_GlobalAddress:
	printSymbolOperand(*this, MO, O);
	break;
	}
	return false;

	case 'A': // Print '*' before a register (it must be a register)
	if (MO.isReg()) {
	O << '*';
	printOperand(*this, MI, OpNo, O);
	return false;
	}
	return true;

	case 'b': // Print QImode register
	case 'h': // Print QImode high register
	case 'w': // Print HImode register
	case 'k': // Print SImode register
	case 'q': // Print DImode register
	+ case 'V': // Print native register without '%'
	if (MO.isReg())
	return printAsmMRegister(*this, MO, ExtraCode[0], O);
	printOperand(*this, MI, OpNo, O);
	return false;

	case 'P': // This is the operand of a call, treat specially.
	printPCRelImm(*this, MI, OpNo, O);
	return false;

	case 'n': // Negate the immediate or print a '-' before the operand.
	// Note: this is a temporary solution. It should be handled target
	// independently as part of the 'MC' work.
	if (MO.isImm()) {
	O << -MO.getImm();
	return false;
	}
	// Not an immediate: write the sign here and let the generic printOperand
	// call after the switch print the operand itself.
	O << '-';
	}
	}

	printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant);
	return false;
	}

	/// Print a memory operand of an inline asm expression.
	///
	/// A non-zero \p AsmVariant always uses printIntelMemReference and ignores
	/// \p ExtraCode. Otherwise the operand goes through printMemReference,
	/// optionally adjusted by a single-letter modifier: 'H' and 'P' select the
	/// "H" and "no-rip" printing modes, while the register-size modifiers
	/// 'b', 'h', 'w', 'k' and 'q' are accepted and ignored.
	///
	/// \returns true for an unknown or multi-letter modifier, false once the
	/// operand has been printed.
	bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
	                                          unsigned OpNo, unsigned AsmVariant,
	                                          const char *ExtraCode,
	                                          raw_ostream &O) {
	  if (AsmVariant) {
	    printIntelMemReference(*this, MI, OpNo, O);
	    return false;
	  }

	  const bool HasModifier = ExtraCode && ExtraCode[0];
	  if (!HasModifier) {
	    printMemReference(*this, MI, OpNo, O);
	    return false;
	  }

	  // Only single-letter modifiers are understood.
	  if (ExtraCode[1] != 0)
	    return true;

	  // Printing mode handed to printMemReference; null means the plain form.
	  const char *MemModifier = nullptr;
	  switch (ExtraCode[0]) {
	  case 'b': // QImode register
	  case 'h': // QImode high register
	  case 'w': // HImode register
	  case 'k': // SImode register
	  case 'q': // DImode register
	    // Register-size modifiers have no meaning for memory; print as plain.
	    break;
	  case 'H':
	    MemModifier = "H";
	    break;
	  case 'P': // Don't print @PLT, but do print as memory.
	    MemModifier = "no-rip";
	    break;
	  default:
	    return true; // Unknown modifier.
	  }

	  printMemReference(*this, MI, OpNo, O, MemModifier);
	  return false;
	}

	/// Emit the per-file preamble: switch to the text section on Mach-O, define
	/// the absolute "@feat.00" symbol for 32-bit x86 COFF, emit the syntax
	/// directive, and emit the Code16 assembler flag for CODE16 environments
	/// when the module has no module-level inline asm.
	void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
	  const Triple &TargetTriple = TM.getTargetTriple();

	  if (TargetTriple.isOSBinFormatMachO())
	    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());

	  if (TargetTriple.isOSBinFormatCOFF() &&
	      TargetTriple.getArch() == Triple::x86) {
	    // Emit an absolute @feat.00 symbol. This appears to be some kind of
	    // compiler features bitfield read by link.exe.
	    MCSymbol *FeatSym =
	        MMI->getContext().getOrCreateSymbol(StringRef("@feat.00"));
	    OutStreamer->BeginCOFFSymbolDef(FeatSym);
	    OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
	    OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
	    OutStreamer->EndCOFFSymbolDef();
	    // Per the PE-COFF spec, the low bit of this value marks the object for
	    // "registered SEH": every SEH handler entry point must be registered in
	    // .sxdata, and using an unregistered handler terminates the process
	    // immediately. LLVM does not know how to register any SEH handlers, so
	    // its object files should be safe.
	    OutStreamer->EmitSymbolAttribute(FeatSym, MCSA_Global);
	    OutStreamer->EmitAssignment(
	        FeatSym, MCConstantExpr::create(int64_t(1), MMI->getContext()));
	  }
	  OutStreamer->EmitSyntaxDirective();

	  // In 16-bit mode, and only when there is no module-level inline asm,
	  // prefix the assembly with .code16.
	  const bool IsCode16 = TargetTriple.getEnvironment() == Triple::CODE16;
	  if (M.getModuleInlineAsm().empty() && IsCode16)
	    OutStreamer->EmitAssemblerFlag(MCAF_Code16);
	}

	/// Emit one non-lazy symbol pointer entry: the stub label, an
	/// .indirect_symbol attribute for the target symbol, and a 4-byte value.
	///
	/// \param OutStreamer  Streamer the entry is written to.
	/// \param StubLabel    Label that names the entry.
	/// \param MCSym        Stub value; its pointer is the target symbol and its
	///                     integer flag selects a zero placeholder (flag set)
	///                     or the symbol's own address (flag clear).
	static void
	emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
	                         MachineModuleInfoImpl::StubValueTy &MCSym) {
	  // L_foo$stub:
	  OutStreamer.EmitLabel(StubLabel);
	  // .indirect_symbol _foo
	  OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);

	  if (MCSym.getInt())
	    // External to current translation unit.
	    OutStreamer.EmitIntValue(0, 4/*size*/);
	  else
	    // Internal to current translation unit.
	    //
	    // When we place the LSDA into the TEXT section, the type info
	    // pointers need to be indirect and pc-rel. We accomplish this by
	    // using NLPs; however, sometimes the types are local to the file.
	    // We need to fill in the value for the NLP in those cases.
	    OutStreamer.EmitValue(
	        MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()),
	        4 /*size*/);
	}

	/// Return the symbol used to reference constant pool entry \p CPID.
	///
	/// On known-Windows-MSVC targets, an ordinary (non machine-specific) entry
	/// whose section is a COFF section with a COMDAT symbol is referenced through
	/// that COMDAT symbol, which is marked global if it is still undefined.
	/// Every other case defers to AsmPrinter::GetCPISymbol.
	MCSymbol *X86AsmPrinter::GetCPISymbol(unsigned CPID) const {
	  if (!Subtarget->isTargetKnownWindowsMSVC())
	    return AsmPrinter::GetCPISymbol(CPID);

	  const MachineConstantPoolEntry &Entry =
	      MF->getConstantPool()->getConstants()[CPID];
	  if (Entry.isMachineConstantPoolEntry())
	    return AsmPrinter::GetCPISymbol(CPID);

	  const DataLayout &Layout = MF->getDataLayout();
	  SectionKind Kind = Entry.getSectionKind(&Layout);
	  const Constant *Value = Entry.Val.ConstVal;
	  unsigned Align = Entry.Alignment;

	  const MCSectionCOFF *CoffSection = dyn_cast<MCSectionCOFF>(
	      getObjFileLowering().getSectionForConstant(Layout, Kind, Value, Align));
	  if (!CoffSection)
	    return AsmPrinter::GetCPISymbol(CPID);

	  MCSymbol *ComdatSym = CoffSection->getCOMDATSymbol();
	  if (!ComdatSym)
	    return AsmPrinter::GetCPISymbol(CPID);

	  if (ComdatSym->isUndefined())
	    OutStreamer->EmitSymbolAttribute(ComdatSym, MCSA_Global);
	  return ComdatSym;
	}

	/// Emit the per-file epilogue for the module's object format.
	///
	/// Mach-O: non-lazy symbol pointer stubs, the stack map and fault map
	/// sections, and the SubsectionsViaSymbols flag. Known-Windows-MSVC modules
	/// that use a vararg float argument: a global _fltused / __fltused
	/// reference. COFF: collected linker flags in the .drectve section plus the
	/// stack map section. ELF: the stack map and fault map sections.
	void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
	  const Triple &TargetTriple = TM.getTargetTriple();

	  if (TargetTriple.isOSBinFormatMachO()) {
	    // All darwin targets use mach-o.
	    MachineModuleInfoMachO &MachOInfo =
	        MMI->getObjFileInfo<MachineModuleInfoMachO>();

	    // Stubs for external and common global variables.
	    MachineModuleInfoMachO::SymbolListTy GVStubs;
	    GVStubs = MachOInfo.GetGVStubList();
	    if (!GVStubs.empty()) {
	      MCSection *PointerSection = OutContext.getMachOSection(
	          "__IMPORT", "__pointers", MachO::S_NON_LAZY_SYMBOL_POINTERS,
	          SectionKind::getMetadata());
	      OutStreamer->SwitchSection(PointerSection);

	      for (auto &Entry : GVStubs)
	        emitNonLazySymbolPointer(*OutStreamer, Entry.first, Entry.second);

	      GVStubs.clear();
	      OutStreamer->AddBlankLine();
	    }

	    SM.serializeToStackMapSection();
	    FM.serializeToFaultMapSection();

	    // Funny Darwin hack: this flag tells the linker that no global symbol
	    // contains code that falls through into another global symbol (e.g. the
	    // obvious implementation of multiple entry points), which lets it
	    // perform dead code stripping safely. LLVM never generates such code,
	    // so setting the flag is always safe.
	    OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
	  }

	  if (TargetTriple.isKnownWindowsMSVCEnvironment() &&
	      MMI->usesVAFloatArgument()) {
	    StringRef FltUsedName = "__fltused";
	    if (TargetTriple.getArch() == Triple::x86_64)
	      FltUsedName = "_fltused";
	    MCSymbol *FltUsedSym = MMI->getContext().getOrCreateSymbol(FltUsedName);
	    OutStreamer->EmitSymbolAttribute(FltUsedSym, MCSA_Global);
	  }

	  if (TargetTriple.isOSBinFormatCOFF()) {
	    const TargetLoweringObjectFileCOFF &CoffLowering =
	        static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());

	    std::string LinkerFlags;
	    raw_string_ostream LinkerFlagsOS(LinkerFlags);

	    // Collect flags for functions, then globals, then aliases.
	    for (const auto &Fn : M)
	      CoffLowering.emitLinkerFlagsForGlobal(LinkerFlagsOS, &Fn);
	    for (const auto &GlobalVar : M.globals())
	      CoffLowering.emitLinkerFlagsForGlobal(LinkerFlagsOS, &GlobalVar);
	    for (const auto &GlobalAlias : M.aliases())
	      CoffLowering.emitLinkerFlagsForGlobal(LinkerFlagsOS, &GlobalAlias);

	    LinkerFlagsOS.flush();

	    // Output collected flags.
	    if (!LinkerFlags.empty()) {
	      OutStreamer->SwitchSection(CoffLowering.getDrectveSection());
	      OutStreamer->EmitBytes(LinkerFlags);
	    }

	    SM.serializeToStackMapSection();
	  }

	  if (TargetTriple.isOSBinFormatELF()) {
	    SM.serializeToStackMapSection();
	    FM.serializeToFaultMapSection();
	  }
	}

	//===----------------------------------------------------------------------===//
	// Target Registry Stuff
	//===----------------------------------------------------------------------===//

	// Force static initialization.
	extern "C" void LLVMInitializeX86AsmPrinter() {
	RegisterAsmPrinter<X86AsmPrinter> X(getTheX86_32Target());
	RegisterAsmPrinter<X86AsmPrinter> Y(getTheX86_64Target());
	}
	Index: head/contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp (revision 329409)
	+++ head/contrib/llvm/lib/Target/X86/X86DomainReassignment.cpp (revision 329410)
	@@ -1,764 +1,768 @@
	//===--- X86DomainReassignment.cpp - Selectively switch register classes---===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass attempts to find instruction chains (closures) in one domain,
	// and convert them to equivalent instructions in a different domain,
	// if profitable.
	//
	//===----------------------------------------------------------------------===//

	#include "X86.h"
	#include "X86InstrInfo.h"
	#include "X86Subtarget.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseMapInfo.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/TargetRegisterInfo.h"
	#include "llvm/Support/Debug.h"
	#include <bitset>

	using namespace llvm;

	// Forward declaration of the pass initializer; it is called from the
	// X86DomainReassignment constructor below.
	namespace llvm {
	void initializeX86DomainReassignmentPass(PassRegistry &);
	}

	// Debug category name used by the DEBUG() output of this file.
	#define DEBUG_TYPE "x86-domain-reassignment"

	STATISTIC(NumClosuresConverted, "Number of closures converted by the pass");

	// Hidden command-line switch (default: false) that makes
	// runOnMachineFunction return without changing anything.
	static cl::opt<bool> DisableX86DomainReassignment(
	"disable-x86-domain-reassignment", cl::Hidden,
	cl::desc("X86: Disable Virtual Register Reassignment."), cl::init(false));

	namespace {
	/// Register domains known to this pass. NoDomain (-1) means "not decided
	/// yet"; the remaining enumerators start at 0 so they can index a
	/// std::bitset<NumDomains>, with NumDomains being the count of real domains.
	enum RegDomain { NoDomain = -1, GPRDomain, MaskDomain, OtherDomain, NumDomains };

	/// \returns true if \p RC is equal to, or a sub-class of, one of the 64-, 32-,
	/// 16- or 8-bit general purpose register classes.
	static bool isGPR(const TargetRegisterClass *RC) {
	  return X86::GR64RegClass.hasSubClassEq(RC) ||
	         X86::GR32RegClass.hasSubClassEq(RC) ||
	         X86::GR16RegClass.hasSubClassEq(RC) ||
	         X86::GR8RegClass.hasSubClassEq(RC);
	}

	/// \returns true if \p RC is equal to, or a sub-class of, the VK16 register
	/// class. \p TRI is accepted but not consulted.
	static bool isMask(const TargetRegisterClass *RC,
	                   const TargetRegisterInfo *TRI) {
	  const bool IsMaskClass = X86::VK16RegClass.hasSubClassEq(RC);
	  return IsMaskClass;
	}

	/// Classify \p RC: GPRDomain if isGPR() accepts it, otherwise MaskDomain if
	/// isMask() accepts it, otherwise OtherDomain.
	static RegDomain getDomain(const TargetRegisterClass *RC,
	                           const TargetRegisterInfo *TRI) {
	  return isGPR(RC) ? GPRDomain
	                   : isMask(RC, TRI) ? MaskDomain : OtherDomain;
	}

	/// Return a register class equivalent to \p SrcRC, in \p Domain.
	///
	/// Only MaskDomain is supported: a GR8/GR16/GR32/GR64 (sub-)class maps to
	/// VK8/VK16/VK32/VK64 respectively. Any other source class is unreachable.
	static const TargetRegisterClass *getDstRC(const TargetRegisterClass *SrcRC,
	                                           RegDomain Domain) {
	  assert(Domain == MaskDomain && "add domain");
	  if (X86::GR8RegClass.hasSubClassEq(SrcRC))
	    return &X86::VK8RegClass;
	  if (X86::GR16RegClass.hasSubClassEq(SrcRC))
	    return &X86::VK16RegClass;
	  if (X86::GR32RegClass.hasSubClassEq(SrcRC))
	    return &X86::VK32RegClass;
	  if (X86::GR64RegClass.hasSubClassEq(SrcRC))
	    return &X86::VK64RegClass;
	  llvm_unreachable("add register class");
	  return nullptr;
	}

	/// Abstract Instruction Converter class.
	///
	/// A converter is bound to one source opcode and knows how to rewrite an
	/// instruction with that opcode for a destination domain.
	class InstrConverterBase {
	protected:
	  /// Opcode of the instructions this converter accepts.
	  unsigned SrcOpcode;

	public:
	  InstrConverterBase(unsigned SrcOpcode) : SrcOpcode(SrcOpcode) {}

	  virtual ~InstrConverterBase() {}

	  /// \returns true if \p MI is legal to convert.
	  ///
	  /// The base implementation only asserts that \p MI has the expected opcode.
	  virtual bool isLegal(const MachineInstr *MI,
	                       const TargetInstrInfo *TII) const {
	    assert(MI->getOpcode() == SrcOpcode &&
	           "Wrong instruction passed to converter");
	    return true;
	  }

	  /// Applies conversion to \p MI.
	  ///
	  /// \returns true if \p MI is no longer needed, and can be deleted.
	  virtual bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
	                            MachineRegisterInfo *MRI) const = 0;

	  /// \returns the cost increment incurred by converting \p MI.
	  virtual double getExtraCost(const MachineInstr *MI,
	                              MachineRegisterInfo *MRI) const = 0;
	};

	/// An Instruction Converter which ignores the given instruction.
	/// For example, PHI instructions can be safely ignored since only the registers
	/// need to change.
	class InstrIgnore : public InstrConverterBase {
	public:
	InstrIgnore(unsigned SrcOpcode) : InstrConverterBase(SrcOpcode) {}

	bool convertInstr(MachineInstr MI, const TargetInstrInfo TII,
	MachineRegisterInfo *MRI) const override {
	assert(isLegal(MI, TII) && "Cannot convert instruction");
	return false;
	}

	double getExtraCost(const MachineInstr *MI,
	MachineRegisterInfo *MRI) const override {
	return 0;
	}
	};

	/// An Instruction Converter which replaces an instruction with another.
	class InstrReplacer : public InstrConverterBase {
	public:
	/// Opcode of the destination instruction.
	unsigned DstOpcode;

	InstrReplacer(unsigned SrcOpcode, unsigned DstOpcode)
	: InstrConverterBase(SrcOpcode), DstOpcode(DstOpcode) {}

	bool isLegal(const MachineInstr *MI,
	const TargetInstrInfo *TII) const override {
	if (!InstrConverterBase::isLegal(MI, TII))
	return false;
	// It's illegal to replace an instruction that implicitly defines a register
	// with an instruction that doesn't, unless that register dead.
	for (auto &MO : MI->implicit_operands())
	if (MO.isReg() && MO.isDef() && !MO.isDead() &&
	!TII->get(DstOpcode).hasImplicitDefOfPhysReg(MO.getReg()))
	return false;
	return true;
	}

	bool convertInstr(MachineInstr MI, const TargetInstrInfo TII,
	MachineRegisterInfo *MRI) const override {
	assert(isLegal(MI, TII) && "Cannot convert instruction");
	MachineInstrBuilder Bld =
	BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(DstOpcode));
	// Transfer explicit operands from original instruction. Implicit operands
	// are handled by BuildMI.
	for (auto &Op : MI->explicit_operands())
	Bld.add(Op);
	return true;
	}

	double getExtraCost(const MachineInstr *MI,
	MachineRegisterInfo *MRI) const override {
	// Assuming instructions have the same cost.
	return 0;
	}
	};

	/// An Instruction Converter which replaces an instruction with another, and
	/// adds a COPY from the new instruction's destination to the old one's.
	class InstrReplacerDstCOPY : public InstrConverterBase {
	public:
	unsigned DstOpcode;

	InstrReplacerDstCOPY(unsigned SrcOpcode, unsigned DstOpcode)
	: InstrConverterBase(SrcOpcode), DstOpcode(DstOpcode) {}

	bool convertInstr(MachineInstr MI, const TargetInstrInfo TII,
	MachineRegisterInfo *MRI) const override {
	assert(isLegal(MI, TII) && "Cannot convert instruction");
	MachineBasicBlock *MBB = MI->getParent();
	auto &DL = MI->getDebugLoc();

	unsigned Reg = MRI->createVirtualRegister(
	TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(),
	*MBB->getParent()));
	MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg);
	for (unsigned Idx = 1, End = MI->getNumOperands(); Idx < End; ++Idx)
	Bld.add(MI->getOperand(Idx));

	BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY))
	.add(MI->getOperand(0))
	.addReg(Reg);

	return true;
	}

	double getExtraCost(const MachineInstr *MI,
	MachineRegisterInfo *MRI) const override {
	// Assuming instructions have the same cost, and that COPY is in the same
	// domain so it will be eliminated.
	return 0;
	}
	};

	/// An Instruction Converter for replacing COPY instructions.
	class InstrCOPYReplacer : public InstrReplacer {
	public:
	RegDomain DstDomain;

	InstrCOPYReplacer(unsigned SrcOpcode, RegDomain DstDomain, unsigned DstOpcode)
	: InstrReplacer(SrcOpcode, DstOpcode), DstDomain(DstDomain) {}

	double getExtraCost(const MachineInstr *MI,
	MachineRegisterInfo *MRI) const override {
	assert(MI->getOpcode() == TargetOpcode::COPY && "Expected a COPY");

	for (auto &MO : MI->operands()) {
	// Physical registers will not be converted. Assume that converting the
	// COPY to the destination domain will eventually result in a actual
	// instruction.
	if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
	return 1;

	RegDomain OpDomain = getDomain(MRI->getRegClass(MO.getReg()),
	MRI->getTargetRegisterInfo());
	// Converting a cross domain COPY to a same domain COPY should eliminate
	// an insturction
	if (OpDomain == DstDomain)
	return -1;
	}
	return 0;
	}
	};

	/// An Instruction Converter which replaces an instruction with a COPY.
	class InstrReplaceWithCopy : public InstrConverterBase {
	public:
	// Source instruction operand Index, to be used as the COPY source.
	unsigned SrcOpIdx;

	InstrReplaceWithCopy(unsigned SrcOpcode, unsigned SrcOpIdx)
	: InstrConverterBase(SrcOpcode), SrcOpIdx(SrcOpIdx) {}

	bool convertInstr(MachineInstr MI, const TargetInstrInfo TII,
	MachineRegisterInfo *MRI) const override {
	assert(isLegal(MI, TII) && "Cannot convert instruction");
	BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
	TII->get(TargetOpcode::COPY))
	.add({MI->getOperand(0), MI->getOperand(SrcOpIdx)});
	return true;
	}

	double getExtraCost(const MachineInstr *MI,
	MachineRegisterInfo *MRI) const override {
	return 0;
	}
	};

	/// An Instruction Converter which completely deletes an instruction.
	/// For example, IMPLICIT_DEF instructions can be deleted when converting from
	/// GPR to mask.
	class InstrDeleter : public InstrConverterBase {
	public:
	InstrDeleter(unsigned SrcOpcode) : InstrConverterBase(SrcOpcode) {}

	bool convertInstr(MachineInstr MI, const TargetInstrInfo TII,
	MachineRegisterInfo *MRI) const override {
	assert(isLegal(MI, TII) && "Cannot convert instruction");
	return true;
	}

	double getExtraCost(const MachineInstr *MI,
	MachineRegisterInfo *MRI) const override {
	return 0;
	}
	};

	// Key type to be used by the Instruction Converters map.
	// A converter is identified by <destination domain, source opcode>
	using InstrConverterBaseKeyTy = std::pair<int, unsigned>;

	// Lookup table from converter key to the converter object.
	using InstrConverterBaseMap =
	    DenseMap<InstrConverterBaseKeyTy, InstrConverterBase *>;

	/// A closure is a set of virtual registers representing all of the edges in
	/// the closure, as well as all of the instructions connected by those edges.
	///
	/// A closure may encompass virtual registers in the same register bank that
	/// have different widths. For example, it may contain 32-bit GPRs as well as
	/// 64-bit GPRs.
	///
	/// A closure that computes an address (i.e. defines a virtual register that is
	/// used in a memory operand) excludes the instructions that contain memory
	/// operands using the address. Such an instruction will be included in a
	/// different closure that manipulates the loaded or stored value.
	class Closure {
	private:
	  /// Virtual registers in the closure.
	  DenseSet<unsigned> Edges;

	  /// Instructions in the closure.
	  SmallVector<MachineInstr *, 8> Instrs;

	  /// Domains which this closure can legally be reassigned to.
	  std::bitset<NumDomains> LegalDstDomains;

	public:
	  /// Creates an empty closure for which exactly the listed domains are legal.
	  Closure(std::initializer_list<RegDomain> LegalDstDomainList) {
	    for (auto It = LegalDstDomainList.begin(), End = LegalDstDomainList.end();
	         It != End; ++It)
	      LegalDstDomains.set(*It);
	  }

	  /// Mark this closure as illegal for reassignment to all domains.
	  void setAllIllegal() { LegalDstDomains.reset(); }

	  /// \returns true if this closure has domains which are legal to reassign to.
	  bool hasLegalDstDomain() const { return LegalDstDomains.any(); }

	  /// \returns true if is legal to reassign this closure to domain \p RD.
	  bool isLegal(RegDomain RD) const { return LegalDstDomains[RD]; }

	  /// Mark this closure as illegal for reassignment to domain \p RD.
	  void setIllegal(RegDomain RD) { LegalDstDomains[RD] = false; }

	  /// \returns true if the closure holds no edges.
	  bool empty() const { return Edges.empty(); }

	  /// Adds \p Reg as an edge. \returns true if it was not already present.
	  bool insertEdge(unsigned Reg) {
	    auto Inserted = Edges.insert(Reg);
	    return Inserted.second;
	  }

	  using const_edge_iterator = DenseSet<unsigned>::const_iterator;

	  /// \returns a read-only range over the edges of the closure.
	  iterator_range<const_edge_iterator> edges() const {
	    const_edge_iterator First = Edges.begin();
	    const_edge_iterator Last = Edges.end();
	    return iterator_range<const_edge_iterator>(First, Last);
	  }

	  /// Appends \p I to the instructions of the closure.
	  void addInstruction(MachineInstr *I) { Instrs.push_back(I); }

	  /// \returns the instructions of the closure, in insertion order.
	  ArrayRef<MachineInstr *> instructions() const { return Instrs; }
	};

	/// Machine function pass that groups virtual registers into closures and
	/// reassigns profitable closures to another register domain.
	class X86DomainReassignment : public MachineFunctionPass {
	  // Per-function state, set at the start of runOnMachineFunction.
	  const X86Subtarget *STI;
	  MachineRegisterInfo *MRI;
	  const X86InstrInfo *TII;

	  /// All edges that are included in some closure
	  DenseSet<unsigned> EnclosedEdges;

	  /// All instructions that are included in some closure.
	  DenseMap<MachineInstr *, Closure *> EnclosedInstrs;

	public:
	  static char ID;

	  X86DomainReassignment() : MachineFunctionPass(ID) {
	    initializeX86DomainReassignmentPass(*PassRegistry::getPassRegistry());
	  }

	  bool runOnMachineFunction(MachineFunction &MF) override;

	  void getAnalysisUsage(AnalysisUsage &AU) const override {
	    AU.setPreservesCFG();
	    MachineFunctionPass::getAnalysisUsage(AU);
	  }

	  StringRef getPassName() const override {
	    return "X86 Domain Reassignment Pass";
	  }

	private:
	  /// A map of available Instruction Converters.
	  InstrConverterBaseMap Converters;

	  /// Initialize Converters map.
	  void initConverters();

	  /// Starting from \p Reg, expand the closure as much as possible.
	  void buildClosure(Closure &, unsigned Reg);

	  /// Enqueue \p Reg to be considered for addition to the closure.
	  void visitRegister(Closure &, unsigned Reg, RegDomain &Domain,
	                     SmallVectorImpl<unsigned> &Worklist);

	  /// Reassign the closure to \p Domain.
	  void reassign(const Closure &C, RegDomain Domain) const;

	  /// Add \p MI to the closure.
	  void encloseInstr(Closure &C, MachineInstr *MI);

	  /// \returns true if it is profitable to reassign the closure to \p Domain.
	  bool isReassignmentProfitable(const Closure &C, RegDomain Domain) const;

	  /// Calculate the total cost of reassigning the closure to \p Domain.
	  double calculateCost(const Closure &C, RegDomain Domain) const;
	};

	char X86DomainReassignment::ID = 0;

	} // End anonymous namespace.

	/// Push \p Reg onto \p Worklist if it qualifies as a closure edge.
	///
	/// A register qualifies when it is not yet in EnclosedEdges, is virtual, has
	/// exactly one definition, and lies in \p Domain. If \p Domain is still
	/// NoDomain it is first set to the domain of \p Reg.
	void X86DomainReassignment::visitRegister(Closure &C, unsigned Reg,
	                                          RegDomain &Domain,
	                                          SmallVectorImpl<unsigned> &Worklist) {
	  // The checks run left to right, so hasOneDef only sees virtual registers.
	  const bool Rejected = EnclosedEdges.count(Reg) ||
	                        !TargetRegisterInfo::isVirtualRegister(Reg) ||
	                        !MRI->hasOneDef(Reg);
	  if (Rejected)
	    return;

	  const RegDomain RegDom =
	      getDomain(MRI->getRegClass(Reg), MRI->getTargetRegisterInfo());

	  // First edge in closure sets the domain.
	  if (Domain == NoDomain)
	    Domain = RegDom;

	  if (Domain == RegDom)
	    Worklist.push_back(Reg);
	}

	/// Record \p MI as a member of closure \p C and narrow the legal destination
	/// domains of \p C to those that can convert \p MI.
	///
	/// If \p MI is already recorded, nothing is added; when the recorded owner
	/// differs from \p C, \p C is marked illegal for every domain.
	void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) {
	  auto I = EnclosedInstrs.find(MI);
	  if (I != EnclosedInstrs.end()) {
	    if (I->second != &C)
	      // Instruction already belongs to another closure, avoid conflicts between
	      // closure and mark this closure as illegal.
	      C.setAllIllegal();
	    return;
	  }

	  EnclosedInstrs[MI] = &C;
	  C.addInstruction(MI);

	  // Mark closure as illegal for reassignment to domains, if there is no
	  // converter for the instruction or if the converter cannot convert the
	  // instruction.
	  for (int i = 0; i != NumDomains; ++i) {
	    if (C.isLegal((RegDomain)i)) {
	      InstrConverterBase *IC = Converters.lookup({i, MI->getOpcode()});
	      if (!IC || !IC->isLegal(MI, TII))
	        C.setIllegal((RegDomain)i);
	    }
	  }
	}

	double X86DomainReassignment::calculateCost(const Closure &C,
	RegDomain DstDomain) const {
	assert(C.isLegal(DstDomain) && "Cannot calculate cost for illegal closure");

	double Cost = 0.0;
	for (auto *MI : C.instructions())
	Cost +=
	Converters.lookup({DstDomain, MI->getOpcode()})->getExtraCost(MI, MRI);
	return Cost;
	}

	/// \returns true if the total cost of reassigning \p C to \p Domain is
	/// strictly negative.
	bool X86DomainReassignment::isReassignmentProfitable(const Closure &C,
	                                                     RegDomain Domain) const {
	  const double Cost = calculateCost(C, Domain);
	  return Cost < 0.0;
	}

	void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const {
	assert(C.isLegal(Domain) && "Cannot convert illegal closure");

	// Iterate all instructions in the closure, convert each one using the
	// appropriate converter.
	SmallVector<MachineInstr *, 8> ToErase;
	for (auto *MI : C.instructions())
	if (Converters.lookup({Domain, MI->getOpcode()})
	->convertInstr(MI, TII, MRI))
	ToErase.push_back(MI);

	// Iterate all registers in the closure, replace them with registers in the
	// destination domain.
	for (unsigned Reg : C.edges()) {
	MRI->setRegClass(Reg, getDstRC(MRI->getRegClass(Reg), Domain));
	for (auto &MO : MRI->use_operands(Reg)) {
	if (MO.isReg())
	// Remove all subregister references as they are not valid in the
	// destination domain.
	MO.setSubReg(0);
	}
	}

	for (auto MI : ToErase)
	MI->eraseFromParent();
	}

	/// \returns true when \p Reg is used as part of an address calculation in \p
	/// MI.
	static bool usedAsAddr(const MachineInstr &MI, unsigned Reg,
	const TargetInstrInfo *TII) {
	if (!MI.mayLoadOrStore())
	return false;

	const MCInstrDesc &Desc = TII->get(MI.getOpcode());
	int MemOpStart = X86II::getMemoryOperandNo(Desc.TSFlags);
	if (MemOpStart == -1)
	return false;

	MemOpStart += X86II::getOperandBias(Desc);
	for (unsigned MemOpIdx = MemOpStart,
	MemOpEnd = MemOpStart + X86::AddrNumOperands;
	MemOpIdx < MemOpEnd; ++MemOpIdx) {
	auto &Op = MI.getOperand(MemOpIdx);
	if (Op.isReg() && Op.getReg() == Reg)
	return true;
	}
	return false;
	}

	/// Grow closure \p C from \p Reg with a worklist.
	///
	/// For each register taken from the worklist, the defining instruction and
	/// every non-debug user are enclosed. Registers used by the definition
	/// (outside its memory operands) and registers defined by the users are
	/// offered to visitRegister. The domain of the first accepted register fixes
	/// the domain of the closure.
	void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {
	  SmallVector<unsigned, 4> Worklist;
	  RegDomain Domain = NoDomain;
	  visitRegister(C, Reg, Domain, Worklist);
	  while (!Worklist.empty()) {
	    unsigned CurReg = Worklist.pop_back_val();

	    // Register already in this closure.
	    if (!C.insertEdge(CurReg))
	      continue;

	    MachineInstr *DefMI = MRI->getVRegDef(CurReg);
	    encloseInstr(C, DefMI);

	    // Add register used by the defining MI to the worklist.
	    // Do not add registers which are used in address calculation, they will be
	    // added to a different closure.
	    int OpEnd = DefMI->getNumOperands();
	    const MCInstrDesc &Desc = DefMI->getDesc();
	    int MemOp = X86II::getMemoryOperandNo(Desc.TSFlags);
	    if (MemOp != -1)
	      MemOp += X86II::getOperandBias(Desc);
	    for (int OpIdx = 0; OpIdx < OpEnd; ++OpIdx) {
	      if (OpIdx == MemOp) {
	        // skip address calculation.
	        // The loop increment steps over the last address operand.
	        OpIdx += (X86::AddrNumOperands - 1);
	        continue;
	      }
	      auto &Op = DefMI->getOperand(OpIdx);
	      if (!Op.isReg() || !Op.isUse())
	        continue;
	      visitRegister(C, Op.getReg(), Domain, Worklist);
	    }

	    // Expand closure through register uses.
	    for (auto &UseMI : MRI->use_nodbg_instructions(CurReg)) {
	      // We would like to avoid converting closures which calculate addresses,
	      // as this should remain in GPRs.
	      if (usedAsAddr(UseMI, CurReg, TII)) {
	        C.setAllIllegal();
	        continue;
	      }
	      encloseInstr(C, &UseMI);

	      for (auto &DefOp : UseMI.defs()) {
	        if (!DefOp.isReg())
	          continue;

	        unsigned DefReg = DefOp.getReg();
	        // A user that defines a non-virtual register blocks the whole closure.
	        if (!TargetRegisterInfo::isVirtualRegister(DefReg)) {
	          C.setAllIllegal();
	          continue;
	        }
	        visitRegister(C, DefReg, Domain, Worklist);
	      }
	    }
	  }
	}

	/// Populate Converters with one heap-allocated converter per supported
	/// <MaskDomain, source opcode> pair.
	///
	/// The generic opcodes, the 16-bit MOVZX sources and the 16-bit
	/// move/shift/logic opcodes are always registered. The remaining entries
	/// depend on the subtarget: STI->hasBWI() gates the 32- and 64-bit
	/// replacements, STI->hasDQI() gates the 8-bit ones (plus ADD16rr and the
	/// 8-bit MOVZX sources). The map holds raw pointers; runOnMachineFunction
	/// deletes them.
	void X86DomainReassignment::initConverters() {
	Converters[{MaskDomain, TargetOpcode::PHI}] =
	new InstrIgnore(TargetOpcode::PHI);

	Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] =
	new InstrDeleter(TargetOpcode::IMPLICIT_DEF);

	// Operand 2 of INSERT_SUBREG becomes the source of the replacement COPY.
	Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] =
	new InstrReplaceWithCopy(TargetOpcode::INSERT_SUBREG, 2);

	Converters[{MaskDomain, TargetOpcode::COPY}] =
	new InstrCOPYReplacer(TargetOpcode::COPY, MaskDomain, TargetOpcode::COPY);

	// Helper: register a converter that emits `To` into a new register and
	// copies the result to the original destination.
	auto createReplacerDstCOPY = [&](unsigned From, unsigned To) {
	Converters[{MaskDomain, From}] = new InstrReplacerDstCOPY(From, To);
	};

	createReplacerDstCOPY(X86::MOVZX32rm16, X86::KMOVWkm);
	createReplacerDstCOPY(X86::MOVZX64rm16, X86::KMOVWkm);

	createReplacerDstCOPY(X86::MOVZX32rr16, X86::KMOVWkk);
	createReplacerDstCOPY(X86::MOVZX64rr16, X86::KMOVWkk);

	if (STI->hasDQI()) {
	createReplacerDstCOPY(X86::MOVZX16rm8, X86::KMOVBkm);
	createReplacerDstCOPY(X86::MOVZX32rm8, X86::KMOVBkm);
	createReplacerDstCOPY(X86::MOVZX64rm8, X86::KMOVBkm);

	createReplacerDstCOPY(X86::MOVZX16rr8, X86::KMOVBkk);
	createReplacerDstCOPY(X86::MOVZX32rr8, X86::KMOVBkk);
	createReplacerDstCOPY(X86::MOVZX64rr8, X86::KMOVBkk);
	}

	// Helper: register a plain opcode-for-opcode replacement.
	auto createReplacer = [&](unsigned From, unsigned To) {
	Converters[{MaskDomain, From}] = new InstrReplacer(From, To);
	};

	createReplacer(X86::MOV16rm, X86::KMOVWkm);
	createReplacer(X86::MOV16mr, X86::KMOVWmk);
	createReplacer(X86::MOV16rr, X86::KMOVWkk);
	createReplacer(X86::SHR16ri, X86::KSHIFTRWri);
	createReplacer(X86::SHL16ri, X86::KSHIFTLWri);
	createReplacer(X86::NOT16r, X86::KNOTWrr);
	createReplacer(X86::OR16rr, X86::KORWrr);
	createReplacer(X86::AND16rr, X86::KANDWrr);
	createReplacer(X86::XOR16rr, X86::KXORWrr);

	if (STI->hasBWI()) {
	createReplacer(X86::MOV32rm, X86::KMOVDkm);
	createReplacer(X86::MOV64rm, X86::KMOVQkm);

	createReplacer(X86::MOV32mr, X86::KMOVDmk);
	createReplacer(X86::MOV64mr, X86::KMOVQmk);

	createReplacer(X86::MOV32rr, X86::KMOVDkk);
	createReplacer(X86::MOV64rr, X86::KMOVQkk);

	createReplacer(X86::SHR32ri, X86::KSHIFTRDri);
	createReplacer(X86::SHR64ri, X86::KSHIFTRQri);

	createReplacer(X86::SHL32ri, X86::KSHIFTLDri);
	createReplacer(X86::SHL64ri, X86::KSHIFTLQri);

	createReplacer(X86::ADD32rr, X86::KADDDrr);
	createReplacer(X86::ADD64rr, X86::KADDQrr);

	createReplacer(X86::NOT32r, X86::KNOTDrr);
	createReplacer(X86::NOT64r, X86::KNOTQrr);

	createReplacer(X86::OR32rr, X86::KORDrr);
	createReplacer(X86::OR64rr, X86::KORQrr);

	createReplacer(X86::AND32rr, X86::KANDDrr);
	createReplacer(X86::AND64rr, X86::KANDQrr);

	createReplacer(X86::ANDN32rr, X86::KANDNDrr);
	createReplacer(X86::ANDN64rr, X86::KANDNQrr);

	createReplacer(X86::XOR32rr, X86::KXORDrr);
	createReplacer(X86::XOR64rr, X86::KXORQrr);

	- createReplacer(X86::TEST32rr, X86::KTESTDrr);
	- createReplacer(X86::TEST64rr, X86::KTESTQrr);
	+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
	+ // to prove only Z flag is used.
	+ //createReplacer(X86::TEST32rr, X86::KTESTDrr);
	+ //createReplacer(X86::TEST64rr, X86::KTESTQrr);
	}

	if (STI->hasDQI()) {
	createReplacer(X86::ADD8rr, X86::KADDBrr);
	createReplacer(X86::ADD16rr, X86::KADDWrr);

	createReplacer(X86::AND8rr, X86::KANDBrr);

	createReplacer(X86::MOV8rm, X86::KMOVBkm);
	createReplacer(X86::MOV8mr, X86::KMOVBmk);
	createReplacer(X86::MOV8rr, X86::KMOVBkk);

	createReplacer(X86::NOT8r, X86::KNOTBrr);

	createReplacer(X86::OR8rr, X86::KORBrr);

	createReplacer(X86::SHR8ri, X86::KSHIFTRBri);
	createReplacer(X86::SHL8ri, X86::KSHIFTLBri);

	- createReplacer(X86::TEST8rr, X86::KTESTBrr);
	- createReplacer(X86::TEST16rr, X86::KTESTWrr);
	+ // TODO: KTEST is not a replacement for TEST due to flag differences. Need
	+ // to prove only Z flag is used.
	+ //createReplacer(X86::TEST8rr, X86::KTESTBrr);
	+ //createReplacer(X86::TEST16rr, X86::KTESTWrr);

	createReplacer(X86::XOR8rr, X86::KXORBrr);
	}
	}

	/// Run domain reassignment on \p MF.
	///
	/// Builds a closure starting from every GPR virtual register not yet enclosed,
	/// keeps those that are non-empty and legal for MaskDomain, and reassigns the
	/// profitable ones.
	///
	/// \returns true if at least one closure was reassigned.
	bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
	  if (skipFunction(MF.getFunction()))
	    return false;
	  if (DisableX86DomainReassignment)
	    return false;

	  DEBUG(dbgs() << "***** Machine Function before Domain Reassignment *****\n");
	  DEBUG(MF.print(dbgs()));

	  STI = &MF.getSubtarget<X86Subtarget>();
	  // GPR->K is the only transformation currently supported, bail out early if no
	  // AVX512.
	  if (!STI->hasAVX512())
	    return false;

	  MRI = &MF.getRegInfo();
	  assert(MRI->isSSA() && "Expected MIR to be in SSA form");

	  TII = STI->getInstrInfo();
	  initConverters();
	  bool Changed = false;

	  EnclosedEdges.clear();
	  EnclosedInstrs.clear();

	  std::vector<Closure> Closures;

	  // Go over all virtual registers and calculate a closure.
	  for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) {
	    unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx);

	    // GPR only current source domain supported.
	    if (!isGPR(MRI->getRegClass(Reg)))
	      continue;

	    // Register already in closure.
	    if (EnclosedEdges.count(Reg))
	      continue;

	    // Calculate closure starting with Reg.
	    // NOTE(review): encloseInstr stores the address of this loop-local object
	    // in EnclosedInstrs and compares against it later. If successive
	    // iterations reuse the same stack slot, instructions owned by an earlier
	    // closure may look like they belong to the current one - confirm.
	    Closure C({MaskDomain});
	    buildClosure(C, Reg);

	    // Collect all closures that can potentially be converted.
	    if (!C.empty() && C.isLegal(MaskDomain))
	      Closures.push_back(std::move(C));
	  }

	  for (Closure &C : Closures)
	    if (isReassignmentProfitable(C, MaskDomain)) {
	      reassign(C, MaskDomain);
	      ++NumClosuresConverted;
	      Changed = true;
	    }

	  for (auto I : Converters)
	    delete I.second;
	  // Drop the now-dangling pointers. initConverters only overwrites the keys
	  // enabled for the next function's subtarget, so a stale entry could
	  // otherwise be looked up or deleted a second time.
	  Converters.clear();

	  DEBUG(dbgs() << "***** Machine Function after Domain Reassignment *****\n");
	  DEBUG(MF.print(dbgs()));

	  return Changed;
	}

	// Pass registration: argument string "x86-domain-reassignment" and
	// human-readable name "X86 Domain Reassignment Pass".
	// NOTE(review): the two trailing `false` arguments are presumably the
	// CFG-only and is-analysis flags of INITIALIZE_PASS - confirm.
	INITIALIZE_PASS(X86DomainReassignment, "x86-domain-reassignment",
	"X86 Domain Reassignment Pass", false, false)

	/// Factory for the Domain Reassignment pass. The returned object is
	/// heap-allocated with `new`.
	FunctionPass *llvm::createX86DomainReassignmentPass() {
	  FunctionPass *Pass = new X86DomainReassignment();
	  return Pass;
	}
	Index: head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 329409)
	+++ head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 329410)
	@@ -1,38860 +1,38847 @@
	//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that X86 uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "X86ISelLowering.h"
	#include "Utils/X86ShuffleDecode.h"
	#include "X86CallingConv.h"
	#include "X86FrameLowering.h"
	#include "X86InstrBuilder.h"
	#include "X86IntrinsicsInfo.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86ShuffleDecodeConstantPool.h"
	#include "X86TargetMachine.h"
	#include "X86TargetObjectFile.h"
	#include "llvm/ADT/SmallBitVector.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Analysis/EHPersonalities.h"
	#include "llvm/CodeGen/IntrinsicLowering.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/TargetLowering.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <bitset>
	#include <cctype>
	#include <numeric>
	using namespace llvm;

	#define DEBUG_TYPE "x86-isel"

	STATISTIC(NumTailCalls, "Number of tail calls");

	// Hidden option, off by default.
	static cl::opt<bool> ExperimentalVectorWideningLegalization(
	"x86-experimental-vector-widening-legalization", cl::init(false),
	cl::desc("Enable an experimental vector type legalization through widening "
	"rather than promotion."),
	cl::Hidden);

	// Hidden option, default 4. Per its description the value is a number of
	// low address bits, not a byte count.
	static cl::opt<int> ExperimentalPrefLoopAlignment(
	"x86-experimental-pref-loop-alignment", cl::init(4),
	cl::desc("Sets the preferable loop alignment for experiments "
	"(the last x86-experimental-pref-loop-alignment bits"
	" of the loop header PC will be 0)."),
	cl::Hidden);

	// Hidden option, on by default.
	static cl::opt<bool> MulConstantOptimization(
	"mul-constant-optimization", cl::init(true),
	cl::desc("Replace 'mul x, Const' with more effective instructions like "
	"SHIFT, LEA, etc."),
	cl::Hidden);

	/// Report use of an unsupported feature (for example, returning a double
	/// without SSE2 enabled on x86_64) as a diagnostic on the LLVM context.
	/// Unlike report_fatal_error this is not fatal, so the caller should attempt
	/// to recover without crashing.
	///
	/// \param DAG selection DAG whose function and context receive the report.
	/// \param dl  location attached to the diagnostic.
	/// \param Msg message text of the diagnostic.
	static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
	                             const char *Msg) {
	  MachineFunction &MF = DAG.getMachineFunction();
	  DiagnosticInfoUnsupported Diag(MF.getFunction(), Msg, dl.getDebugLoc());
	  DAG.getContext()->diagnose(Diag);
	}

	X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
	const X86Subtarget &STI)
	: TargetLowering(TM), Subtarget(STI) {
	bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
	X86ScalarSSEf64 = Subtarget.hasSSE2();
	X86ScalarSSEf32 = Subtarget.hasSSE1();
	MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

	// Set up the TargetLowering object.

	// X86 is weird. It always uses i8 for shift amounts and setcc results.
	setBooleanContents(ZeroOrOneBooleanContent);
	// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	// For 64-bit, since we have so many registers, use the ILP scheduler.
	// For 32-bit, use the register pressure specific scheduling.
	// For Atom, always use ILP scheduling.
	if (Subtarget.isAtom())
	setSchedulingPreference(Sched::ILP);
	else if (Subtarget.is64Bit())
	setSchedulingPreference(Sched::ILP);
	else
	setSchedulingPreference(Sched::RegPressure);
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

	// Bypass expensive divides and use cheaper ones.
	if (TM.getOptLevel() >= CodeGenOpt::Default) {
	if (Subtarget.hasSlowDivide32())
	addBypassSlowDiv(32, 8);
	if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
	addBypassSlowDiv(64, 32);
	}

	if (Subtarget.isTargetKnownWindowsMSVC() \|\|
	Subtarget.isTargetWindowsItanium()) {
	// Setup Windows compiler runtime calls.
	setLibcallName(RTLIB::SDIV_I64, "_alldiv");
	setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
	setLibcallName(RTLIB::SREM_I64, "_allrem");
	setLibcallName(RTLIB::UREM_I64, "_aullrem");
	setLibcallName(RTLIB::MUL_I64, "_allmul");
	setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
	}

	if (Subtarget.isTargetDarwin()) {
	// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
	setUseUnderscoreSetJmp(false);
	setUseUnderscoreLongJmp(false);
	} else if (Subtarget.isTargetWindowsGNU()) {
	// MS runtime is weird: it exports _setjmp, but longjmp!
	setUseUnderscoreSetJmp(true);
	setUseUnderscoreLongJmp(false);
	} else {
	setUseUnderscoreSetJmp(true);
	setUseUnderscoreLongJmp(true);
	}

	// Set up the register classes.
	addRegisterClass(MVT::i8, &X86::GR8RegClass);
	addRegisterClass(MVT::i16, &X86::GR16RegClass);
	addRegisterClass(MVT::i32, &X86::GR32RegClass);
	if (Subtarget.is64Bit())
	addRegisterClass(MVT::i64, &X86::GR64RegClass);

	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

	// We don't accept any truncstore of integer registers.
	setTruncStoreAction(MVT::i64, MVT::i32, Expand);
	setTruncStoreAction(MVT::i64, MVT::i16, Expand);
	setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i32, MVT::i16, Expand);
	setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i16, MVT::i8, Expand);

	setTruncStoreAction(MVT::f64, MVT::f32, Expand);

	// SETOEQ and SETUNE require checking two conditions.
	setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
	setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
	setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

	// Integer absolute.
	if (Subtarget.hasCMov()) {
	setOperationAction(ISD::ABS , MVT::i16 , Custom);
	setOperationAction(ISD::ABS , MVT::i32 , Custom);
	if (Subtarget.is64Bit())
	setOperationAction(ISD::ABS , MVT::i64 , Custom);
	}

	// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
	// operation.
	setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

	if (Subtarget.is64Bit()) {
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
	// f32/f64 are legal, f80 is custom.
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
	else
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
	} else if (!Subtarget.useSoftFloat()) {
	// We have an algorithm for SSE2->double, and we turn this into a
	// 64-bit FILD followed by conditional FADD for other targets.
	setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
	// We have an algorithm for SSE2, and we turn this into a 64-bit
	// FILD or VCVTUSI2SS/SD for other targets.
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
	}

	// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
	setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

	if (!Subtarget.useSoftFloat()) {
	// SSE has no i16 to fp conversion, only i32.
	if (X86ScalarSSEf32) {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
	// f32 and f64 cases are Legal, f80 case is not
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
	} else {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
	}
	} else {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
	}

	// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
	setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

	if (!Subtarget.useSoftFloat()) {
	// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
	// are Legal, f80 is custom lowered.
	setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
	setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

	if (X86ScalarSSEf32) {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
	// f32 and f64 cases are Legal, f80 case is not
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
	} else {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
	}
	} else {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
	setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
	}

	// Handle FP_TO_UINT by promoting the destination to a larger signed
	// conversion.
	setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

	if (Subtarget.is64Bit()) {
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
	} else {
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
	}
	} else if (!Subtarget.useSoftFloat()) {
	// Since AVX is a superset of SSE3, only check for SSE here.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
	// Expand FP_TO_UINT into a select.
	// FIXME: We would like to use a Custom expander here eventually to do
	// the optimal thing for SSE vs. the default expansion in the legalizer.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
	else
	// With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
	// With SSE3 we can use fisttpll to convert to a signed i64; without
	// SSE, we're stuck with a fistpll.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);

	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
	}

	// TODO: when we have SSE, these could be more efficient, by using movd/movq.
	if (!X86ScalarSSEf64) {
	setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
	setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
	// Without SSE, i64->f64 goes through memory.
	setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
	}
	} else if (!Subtarget.is64Bit())
	setOperationAction(ISD::BITCAST , MVT::i64 , Custom);

	// Scalar integer divide and remainder are lowered to use operations that
	// produce two results, to match the available instructions. This exposes
	// the two-result form to trivial CSE, which is able to combine x/y and x%y
	// into a single instruction.
	//
	// Scalar integer multiply-high is also lowered to use two-result
	// operations, to match the available instructions. However, plain multiply
	// (low) operations are left as Legal, as there are single-result
	// instructions for this in x86. Using the two-result multiply instructions
	// when both high and low results are needed must be arranged by dagcombine.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	}

	setOperationAction(ISD::BR_JT , MVT::Other, Expand);
	setOperationAction(ISD::BRCOND , MVT::Other, Custom);
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
	MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::BR_CC, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	}
	if (Subtarget.is64Bit())
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
	setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);

	setOperationAction(ISD::FREM , MVT::f32 , Expand);
	setOperationAction(ISD::FREM , MVT::f64 , Expand);
	setOperationAction(ISD::FREM , MVT::f80 , Expand);
	setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

	// Promote the i8 variants and force them on up to i32 which has a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	if (!Subtarget.hasBMI()) {
	setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
	setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
	}
	}

	if (Subtarget.hasLZCNT()) {
	// When promoting the i8 variants, force them to i32 for a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	} else {
	setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
	setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
	setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
	}
	}

	// Special handling for half-precision floating point conversions.
	// If we don't have F16C support, then lower half float conversions
	// into library calls.
	if (Subtarget.useSoftFloat() \|\| !Subtarget.hasF16C()) {
	setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
	}

	// There's never any support for operations beyond MVT::f32.
	setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
	setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

	setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f16, Expand);
	setTruncStoreAction(MVT::f80, MVT::f16, Expand);

	if (Subtarget.hasPOPCNT()) {
	setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
	} else {
	setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
	if (Subtarget.is64Bit())
	setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
	}

	setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

	if (!Subtarget.hasMOVBE())
	setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

	// These should be promoted to a larger select which is supported.
	setOperationAction(ISD::SELECT , MVT::i1 , Promote);
	// X86 wants to expand cmov itself.
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	}
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	}

	// Custom action for SELECT MMX and expand action for SELECT_CC MMX
	setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

	setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
	// NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
	// LLVM/Clang supports zero-cost DWARF and SEH exception handling.
	setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
	setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
	setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
	if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
	setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

	// Darwin ABI issue.
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::ConstantPool , VT, Custom);
	setOperationAction(ISD::JumpTable , VT, Custom);
	setOperationAction(ISD::GlobalAddress , VT, Custom);
	setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
	setOperationAction(ISD::ExternalSymbol , VT, Custom);
	setOperationAction(ISD::BlockAddress , VT, Custom);
	}

	// 64-bit shl, sra, srl (iff 32-bit x86)
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SHL_PARTS, VT, Custom);
	setOperationAction(ISD::SRA_PARTS, VT, Custom);
	setOperationAction(ISD::SRL_PARTS, VT, Custom);
	}

	if (Subtarget.hasSSEPrefetch() \|\| Subtarget.has3DNow())
	setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

	setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);

	// Expand certain atomics
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
	setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
	}

	if (Subtarget.hasCmpxchg16b()) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
	}

	// FIXME - use subtarget debug flags
	if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
	!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
	TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
	setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
	}

	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

	setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
	setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

	setOperationAction(ISD::TRAP, MVT::Other, Legal);
	setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

	// VASTART needs to be custom lowered to use the VarArgsFrameIndex
	setOperationAction(ISD::VASTART , MVT::Other, Custom);
	setOperationAction(ISD::VAEND , MVT::Other, Expand);
	bool Is64Bit = Subtarget.is64Bit();
	setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
	setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

	setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

	// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
	setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
	setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

	if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
	// f32 and f64 use SSE.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
	: &X86::FR32RegClass);
	addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
	: &X86::FR64RegClass);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	// Use ANDPD to simulate FABS.
	setOperationAction(ISD::FABS, VT, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG, VT, Custom);

	// Use ANDPD and ORPD to simulate FCOPYSIGN.
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}

	// Lower this to MOVMSK plus an AND.
	setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
	setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

	// Expand FP immediates into loads from the stack, except for the special
	// cases we handle.
	addLegalFPImmediate(APFloat(+0.0)); // xorpd
	addLegalFPImmediate(APFloat(+0.0f)); // xorps
	} else if (UseX87 && X86ScalarSSEf32) {
	// Use SSE for f32, x87 for f64.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, &X86::FR32RegClass);
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);

	// Use ANDPS to simulate FABS.
	setOperationAction(ISD::FABS , MVT::f32, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG , MVT::f32, Custom);

	setOperationAction(ISD::UNDEF, MVT::f64, Expand);

	// Use ANDPS and ORPS to simulate FCOPYSIGN.
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , MVT::f32, Expand);
	setOperationAction(ISD::FCOS , MVT::f32, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

	// Special cases we handle for FP constants.
	addLegalFPImmediate(APFloat(+0.0f)); // xorps
	addLegalFPImmediate(APFloat(+0.0)); // FLD0
	addLegalFPImmediate(APFloat(+1.0)); // FLD1
	addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , MVT::f64, Expand);
	setOperationAction(ISD::FCOS , MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	} else if (UseX87) {
	// f32 and f64 in x87.
	// Set up the FP register classes.
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);
	addRegisterClass(MVT::f32, &X86::RFP32RegClass);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	setOperationAction(ISD::UNDEF, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}
	addLegalFPImmediate(APFloat(+0.0)); // FLD0
	addLegalFPImmediate(APFloat(+1.0)); // FLD1
	addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
	addLegalFPImmediate(APFloat(+0.0f)); // FLD0
	addLegalFPImmediate(APFloat(+1.0f)); // FLD1
	addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
	}

	// We don't support FMA.
	setOperationAction(ISD::FMA, MVT::f64, Expand);
	setOperationAction(ISD::FMA, MVT::f32, Expand);

	// Long double always uses X87, except f128 in MMX.
	if (UseX87) {
	if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
	addRegisterClass(MVT::f128, &X86::FR128RegClass);
	ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
	setOperationAction(ISD::FABS , MVT::f128, Custom);
	setOperationAction(ISD::FNEG , MVT::f128, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
	}

	addRegisterClass(MVT::f80, &X86::RFP80RegClass);
	setOperationAction(ISD::UNDEF, MVT::f80, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
	{
	APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
	addLegalFPImmediate(TmpFlt); // FLD0
	TmpFlt.changeSign();
	addLegalFPImmediate(TmpFlt); // FLD0/FCHS

	bool ignored;
	APFloat TmpFlt2(+1.0);
	TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
	&ignored);
	addLegalFPImmediate(TmpFlt2); // FLD1
	TmpFlt2.changeSign();
	addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
	}

	// Always expand sin/cos functions even though x87 has an instruction.
	setOperationAction(ISD::FSIN , MVT::f80, Expand);
	setOperationAction(ISD::FCOS , MVT::f80, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

	setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
	setOperationAction(ISD::FCEIL, MVT::f80, Expand);
	setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
	setOperationAction(ISD::FRINT, MVT::f80, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
	setOperationAction(ISD::FMA, MVT::f80, Expand);
	}

	// Always use a library call for pow.
	setOperationAction(ISD::FPOW , MVT::f32 , Expand);
	setOperationAction(ISD::FPOW , MVT::f64 , Expand);
	setOperationAction(ISD::FPOW , MVT::f80 , Expand);

	setOperationAction(ISD::FLOG, MVT::f80, Expand);
	setOperationAction(ISD::FLOG2, MVT::f80, Expand);
	setOperationAction(ISD::FLOG10, MVT::f80, Expand);
	setOperationAction(ISD::FEXP, MVT::f80, Expand);
	setOperationAction(ISD::FEXP2, MVT::f80, Expand);
	setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
	setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

	// Some FP actions are always expanded for vector types.
	for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
	MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
	setOperationAction(ISD::FSIN, VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	setOperationAction(ISD::FCOS, VT, Expand);
	setOperationAction(ISD::FREM, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);
	setOperationAction(ISD::FPOW, VT, Expand);
	setOperationAction(ISD::FLOG, VT, Expand);
	setOperationAction(ISD::FLOG2, VT, Expand);
	setOperationAction(ISD::FLOG10, VT, Expand);
	setOperationAction(ISD::FEXP, VT, Expand);
	setOperationAction(ISD::FEXP2, VT, Expand);
	}

	// First set operation action for all vector types to either promote
	// (for widening) or expand (for scalarization). Then we will selectively
	// turn on ones that can be effectively codegen'd.
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::FMA, VT, Expand);
	setOperationAction(ISD::FFLOOR, VT, Expand);
	setOperationAction(ISD::FCEIL, VT, Expand);
	setOperationAction(ISD::FTRUNC, VT, Expand);
	setOperationAction(ISD::FRINT, VT, Expand);
	setOperationAction(ISD::FNEARBYINT, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	setOperationAction(ISD::CTPOP, VT, Expand);
	setOperationAction(ISD::CTTZ, VT, Expand);
	setOperationAction(ISD::CTLZ, VT, Expand);
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	setOperationAction(ISD::BSWAP, VT, Expand);
	setOperationAction(ISD::SETCC, VT, Expand);
	setOperationAction(ISD::FP_TO_UINT, VT, Expand);
	setOperationAction(ISD::FP_TO_SINT, VT, Expand);
	setOperationAction(ISD::UINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
	setOperationAction(ISD::TRUNCATE, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
	setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
	setOperationAction(ISD::ANY_EXTEND, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	for (MVT InnerVT : MVT::vector_valuetypes()) {
	setTruncStoreAction(InnerVT, VT, Expand);

	setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

	// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
	// types, we have to deal with them whether we ask for Expansion or not.
	// Setting Expand causes its own optimisation problems though, so leave
	// them legal.
	if (VT.getVectorElementType() == MVT::i1)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

	// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
	// split/scalarized right now.
	if (VT.getVectorElementType() == MVT::f16)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
	}
	}

	// FIXME: In order to prevent SSE instructions being expanded to MMX ones
	// with -msoft-float, disable use of MMX as well.
	if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
	addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
	// No operations on x86mmx supported, everything uses intrinsics.
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
	addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
	setOperationAction(ISD::FABS, MVT::v4f32, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
	setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
	setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
	addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
	// registers cannot be used even for integer operations.
	addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	setOperationAction(ISD::MUL, MVT::v16i8, Custom);
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);
	setOperationAction(ISD::MUL, MVT::v2i64, Custom);
	setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
	setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
	setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
	setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
	setOperationAction(ISD::MUL, MVT::v8i16, Legal);
	setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
	setOperationAction(ISD::FABS, MVT::v2f64, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

	setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
	setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
	setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
	setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	}

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	// We support custom legalizing of sext and anyext loads for specific
	// memory vector types which we can load as a scalar (or sequence of
	// scalars) and extend in-register to a legal 128-bit vector type. For sext
	// loads these must work with a single scalar load.
	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
	}

	for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);

	if (VT == MVT::v2i64 && !Subtarget.is64Bit())
	continue;

	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
	setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
	}

	// Custom lower v2i64 and v2f64 selects.
	setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

	setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

	setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

	// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
	setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

	setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

	setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
	setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

	// In the customized shift lowering, the legal v4i32/v2i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
	setOperationAction(ISD::ABS, MVT::v16i8, Legal);
	setOperationAction(ISD::ABS, MVT::v8i16, Legal);
	setOperationAction(ISD::ABS, MVT::v4i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
	setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
	setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
	for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
	setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
	setOperationAction(ISD::FCEIL, RoundedTy, Legal);
	setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
	setOperationAction(ISD::FRINT, RoundedTy, Legal);
	setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
	}

	setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
	setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
	setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
	setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
	setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
	setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

	// FIXME: Do we need to handle scalar-to-vector here?
	setOperationAction(ISD::MUL, MVT::v4i32, Legal);

	// We directly match byte blends in the backend as they match the VSELECT
	// condition form.
	setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

	// SSE41 brings specific instructions for doing vector sign extend even in
	// cases where we don't have SRA.
	for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
	}

	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
	}

	// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
	}

	// i8 vectors are custom because the source register and source
	// memory operand types are not the same width.
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::ROTL, VT, Custom);

	// XOP can efficiently perform BITREVERSE with VPPERM.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
	bool HasInt256 = Subtarget.hasInt256();

	addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);

	for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
	// even though v8i16 is a legal type.
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

	setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

	// In the customized shift lowering, the legal v8i32/v4i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}

	setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

	for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
	setOperationAction(ISD::ANY_EXTEND, VT, Custom);
	}

	setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	setOperationAction(ISD::CTLZ, VT, Custom);
	}

	if (Subtarget.hasAnyFMA()) {
	for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
	MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::FMA, VT, Legal);
	}

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
	}

	setOperationAction(ISD::MUL, MVT::v4i64, Custom);
	setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v32i8, Custom);

	setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
	setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

	setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
	setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
	}

	if (HasInt256) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

	// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
	// when we have a 256bit-wide blend with immediate.
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

	// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
	}
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	}

	// Extract subvector is special because the value type
	// (result) is 128-bit but the source is 256-bit wide.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v4f32, MVT::v2f64 }) {
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
	}

	// Custom lower several nodes for 256-bit types.
	for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
	}

	if (HasInt256)
	setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

	// Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
	setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
	}

	if (HasInt256) {
	// Custom legalize 2x32 to get a little better code.
	setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
	setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::MGATHER, VT, Custom);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
	addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
	addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
	addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

	addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
	addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
	addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

	setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i1, MVT::v4i32);
	setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i1, MVT::v4i32);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);

	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
	if (Subtarget.hasVLX()) {
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
	}

	// Extends of v16i1/v8i1 to 128-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom);

	for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::TRUNCATE, VT, Custom);

	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);
	}

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
	for (auto VT : { MVT::v1i1, MVT::v8i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

	for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
	}

	for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
	MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
	MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
	setTruncStoreAction(VT, MaskVT, Custom);
	}

	for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FMA, VT, Legal);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
	setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
	setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
	setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

	setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);

	if (!Subtarget.hasVLX()) {
	// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
	// to 512-bit rather than use the AVX2 instructions so that we can use
	// k-masks.
	for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
	setOperationAction(ISD::MLOAD, VT, Custom);
	setOperationAction(ISD::MSTORE, VT, Custom);
	}
	}

	setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

	for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	}

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

	// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);

	setOperationAction(ISD::MUL, MVT::v8i64, Custom);
	setOperationAction(ISD::MUL, MVT::v16i32, Legal);

	setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
	setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);

	setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

	for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	setOperationAction(ISD::ROTL, VT, Custom);
	setOperationAction(ISD::ROTR, VT, Custom);
	}

	// Need to promote to 64-bit even though we have 32-bit masked instructions
	// because the IR optimizers rearrange bitcasts around logic ops leaving
	// too many variations to handle if we don't promote them.
	setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
	setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
	setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);

	if (Subtarget.hasDQI()) {
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
	}

	if (Subtarget.hasCDI()) {
	// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
	for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
	setOperationAction(ISD::CTLZ, VT, Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
	}
	} // Subtarget.hasCDI()

	if (Subtarget.hasVPOPCNTDQ()) {
	for (auto VT : { MVT::v16i32, MVT::v8i64 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}

	// Extract subvector is special because the value type
	// (result) is 256-bit but the source is 512-bit wide.
	// 128-bit was made Legal under AVX1.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

	for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	setOperationAction(ISD::MGATHER, VT, Custom);
	setOperationAction(ISD::MSCATTER, VT, Custom);
	}
	for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
	}
	}// has AVX-512

	if (!Subtarget.useSoftFloat() &&
	(Subtarget.hasAVX512() \|\| Subtarget.hasVLX())) {
	// These operations are handled on non-VLX by artificially widening in
	// isel patterns.
	// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

	setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);

	for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::ROTL, VT, Custom);
	setOperationAction(ISD::ROTR, VT, Custom);
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::MSCATTER, VT, Custom);

	if (Subtarget.hasDQI()) {
	for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::SINT_TO_FP, VT, Legal);
	setOperationAction(ISD::UINT_TO_FP, VT, Legal);
	setOperationAction(ISD::FP_TO_SINT, VT, Legal);
	setOperationAction(ISD::FP_TO_UINT, VT, Legal);
	}
	}

	if (Subtarget.hasCDI()) {
	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::CTLZ, VT, Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
	}
	} // Subtarget.hasCDI()

	if (Subtarget.hasVPOPCNTDQ()) {
	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
	addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
	addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

	addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
	addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

	for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);

	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	}

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
	for (auto VT : { MVT::v16i1, MVT::v32i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	// Extends from v32i1 masks to 256-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
	// Extends from v64i1 masks to 512-bit vectors.
	setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);

	setOperationAction(ISD::MUL, MVT::v32i16, Legal);
	setOperationAction(ISD::MUL, MVT::v64i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
	setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
	setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
	setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

	setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);

	for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::ABS, VT, Legal);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	setOperationAction(ISD::CTLZ, VT, Custom);
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);

	setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
	}

	for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
	setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
	}

	if (Subtarget.hasBITALG()) {
	for (auto VT : { MVT::v64i8, MVT::v32i16 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
	(Subtarget.hasAVX512() \|\| Subtarget.hasVLX())) {
	for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
	setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
	setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
	}

	// These operations are handled on non-VLX by artificially widening in
	// isel patterns.
	// TODO: Custom widen in lowering on non-VLX and drop the isel patterns?

	if (Subtarget.hasBITALG()) {
	for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
	setOperationAction(ISD::CTPOP, VT, Legal);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
	addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
	addRegisterClass(MVT::v2i1, &X86::VK2RegClass);

	for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);

	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	}

	// TODO: v8i1 concat should be legal without VLX to support concats of
	// v1i1, but we won't legalize it correctly currently without introducing
	// a v4i1 concat in the middle.
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
	for (auto VT : { MVT::v2i1, MVT::v4i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

	// Extends from v2i1/v4i1 masks to 128-bit vectors.
	setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom);

	setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

	setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);

	if (Subtarget.hasDQI()) {
	// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
	// v2f32 UINT_TO_FP is already custom under SSE2.
	setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
	assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
	"Unexpected operation action!");
	// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
	setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
	}

	if (Subtarget.hasBWI()) {
	setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
	}
	}

	// We want to custom lower some of our intrinsics.
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
	if (!Subtarget.is64Bit()) {
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
	}

	// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
	// handle type legalization for these operations here.
	//
	// FIXME: We really should do custom legalization for addition and
	// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
	// than generic legalization for 64-bit multiplication-with-overflow, though.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	// Add/Sub/Mul with overflow operations are custom lowered.
	setOperationAction(ISD::SADDO, VT, Custom);
	setOperationAction(ISD::UADDO, VT, Custom);
	setOperationAction(ISD::SSUBO, VT, Custom);
	setOperationAction(ISD::USUBO, VT, Custom);
	setOperationAction(ISD::SMULO, VT, Custom);
	setOperationAction(ISD::UMULO, VT, Custom);

	// Support carry in as value rather than glue.
	setOperationAction(ISD::ADDCARRY, VT, Custom);
	setOperationAction(ISD::SUBCARRY, VT, Custom);
	setOperationAction(ISD::SETCCCARRY, VT, Custom);
	}

	if (!Subtarget.is64Bit()) {
	// These libcalls are not available in 32-bit.
	setLibcallName(RTLIB::SHL_I128, nullptr);
	setLibcallName(RTLIB::SRL_I128, nullptr);
	setLibcallName(RTLIB::SRA_I128, nullptr);
	setLibcallName(RTLIB::MUL_I128, nullptr);
	}

	// Combine sin / cos into _sincos_stret if it is available.
	if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
	getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
	setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
	setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
	}

	if (Subtarget.isTargetWin64()) {
	setOperationAction(ISD::SDIV, MVT::i128, Custom);
	setOperationAction(ISD::UDIV, MVT::i128, Custom);
	setOperationAction(ISD::SREM, MVT::i128, Custom);
	setOperationAction(ISD::UREM, MVT::i128, Custom);
	setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
	setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
	}

	// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
	// is. We should promote the value to 64-bits to solve this.
	// This is what the CRT headers do - `fmodf` is an inline header
	// function casting to f64 and calling `fmod`.
	if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() \|\|
	Subtarget.isTargetWindowsItanium()))
	for (ISD::NodeType Op :
	{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
	ISD::FLOG10, ISD::FPOW, ISD::FSIN})
	if (isOperationExpand(Op, MVT::f32))
	setOperationAction(Op, MVT::f32, Promote);

	// We have target-specific dag combine patterns for the following nodes:
	setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
	setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
	setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
	setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
	setTargetDAGCombine(ISD::BITCAST);
	setTargetDAGCombine(ISD::VSELECT);
	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::SHL);
	setTargetDAGCombine(ISD::SRA);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::OR);
	setTargetDAGCombine(ISD::AND);
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::FADD);
	setTargetDAGCombine(ISD::FSUB);
	setTargetDAGCombine(ISD::FNEG);
	setTargetDAGCombine(ISD::FMA);
	setTargetDAGCombine(ISD::FMINNUM);
	setTargetDAGCombine(ISD::FMAXNUM);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::LOAD);
	setTargetDAGCombine(ISD::MLOAD);
	setTargetDAGCombine(ISD::STORE);
	setTargetDAGCombine(ISD::MSTORE);
	setTargetDAGCombine(ISD::TRUNCATE);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::ANY_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
	setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::UINT_TO_FP);
	setTargetDAGCombine(ISD::SETCC);
	setTargetDAGCombine(ISD::MUL);
	setTargetDAGCombine(ISD::XOR);
	setTargetDAGCombine(ISD::MSCATTER);
	setTargetDAGCombine(ISD::MGATHER);

	computeRegisterProperties(Subtarget.getRegisterInfo());

	MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
	MaxStoresPerMemsetOptSize = 8;
	MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
	MaxStoresPerMemcpyOptSize = 4;
	MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
	MaxStoresPerMemmoveOptSize = 4;

	// TODO: These control memcmp expansion in CGP and could be raised higher, but
	// that needs to be benchmarked and balanced with the potential use of vector
	// load/store types (PR33329, PR33914).
	MaxLoadsPerMemcmp = 2;
	MaxLoadsPerMemcmpOptSize = 2;

	// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
	setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

	// An out-of-order CPU can speculatively execute past a predictable branch,
	// but a conditional move could be stalled by an expensive earlier operation.
	PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
	EnableExtLdPromotion = true;
	setPrefFunctionAlignment(4); // 2^4 bytes.

	verifyIntrinsicTables();
	}

	// The load-stack-guard node is used only on 64-bit MachO targets; it has not
	// been implemented for any other configuration so far.
	bool X86TargetLowering::useLoadStackGuardNode() const {
	  if (!Subtarget.isTargetMachO())
	    return false;
	  return Subtarget.is64Bit();
	}

	bool X86TargetLowering::useStackGuardXorFP() const {
	// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
	return Subtarget.getTargetTriple().isOSMSVCRT();
	}

	/// Create the XOR*_FP machine node that consumes \p Val and return its first
	/// result. The 64-bit opcode is chosen on 64-bit subtargets, the 32-bit one
	/// otherwise; the result has the pointer type of the current data layout.
	SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
	                                               const SDLoc &DL) const {
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  unsigned Opcode = X86::XOR32_FP;
	  if (Subtarget.is64Bit())
	    Opcode = X86::XOR64_FP;

	  return SDValue(DAG.getMachineNode(Opcode, DL, PtrVT, Val), 0);
	}

	/// Choose how an illegal vector type should be legalized. When the
	/// experimental widening-legalization option is on, every vector that has
	/// more than one element and whose element type is not i1 is widened; all
	/// other cases defer to the target-independent default.
	TargetLoweringBase::LegalizeTypeAction
	X86TargetLowering::getPreferredVectorAction(EVT VT) const {
	  if (!ExperimentalVectorWideningLegalization)
	    return TargetLoweringBase::getPreferredVectorAction(VT);

	  // Single-element vectors and i1 vectors keep the default behaviour.
	  if (VT.getVectorNumElements() == 1 ||
	      VT.getVectorElementType().getSimpleVT() == MVT::i1)
	    return TargetLoweringBase::getPreferredVectorAction(VT);

	  return TypeWidenVector;
	}

	/// Return the value type produced by a SETCC whose operands have type \p VT.
	///
	/// Scalar compares produce i8. With AVX-512, a vector compare produces a vXi1
	/// mask (same element count as \p VT) when the operand type legalizes to a
	/// 512-bit vector, or - with VLX - to a narrower vector whose elements are at
	/// least 32 bits wide (any width when BWI is available). All remaining vector
	/// compares produce \p VT with its elements changed to integers.
	EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
	                                          LLVMContext& Context,
	                                          EVT VT) const {
	  if (!VT.isVector())
	    return MVT::i8;

	  if (Subtarget.hasAVX512()) {
	    const unsigned NumElts = VT.getVectorNumElements();

	    // Figure out what this type will be legalized to.
	    EVT LegalVT = VT;
	    while (getTypeAction(Context, LegalVT) != TypeLegal)
	      LegalVT = getTypeToTransformTo(Context, LegalVT);

	    // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
	    if (LegalVT.getSimpleVT().is512BitVector())
	      return EVT::getVectorVT(Context, MVT::i1, NumElts);

	    if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
	      // If we legalized to less than a 512-bit vector, then we will use a vXi1
	      // compare for vXi32/vXi64 for sure. If we have BWI we will also support
	      // vXi16/vXi8.
	      MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
	      if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
	        return EVT::getVectorVT(Context, MVT::i1, NumElts);
	    }
	  }

	  return VT.changeVectorElementTypeToInteger();
	}

	/// Helper for getByValTypeAlignment: walk \p Ty and raise \p MaxAlign to 16
	/// as soon as a 128-bit vector is found, looking through array element types
	/// and struct members. \p MaxAlign is never lowered.
	static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
	  // 16 is the ceiling; there is nothing left to discover once it is reached.
	  if (MaxAlign == 16)
	    return;

	  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
	    if (VecTy->getBitWidth() == 128)
	      MaxAlign = 16;
	    return;
	  }

	  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
	    // An array is as aligned as its element type requires.
	    unsigned ElemAlign = 0;
	    getMaxByValAlign(ArrTy->getElementType(), ElemAlign);
	    if (MaxAlign < ElemAlign)
	      MaxAlign = ElemAlign;
	    return;
	  }

	  StructType *StructTy = dyn_cast<StructType>(Ty);
	  if (!StructTy)
	    return;

	  // A struct takes the largest requirement among its members; stop scanning
	  // once the ceiling has been hit.
	  for (Type *MemberTy : StructTy->elements()) {
	    unsigned MemberAlign = 0;
	    getMaxByValAlign(MemberTy, MemberAlign);
	    if (MaxAlign < MemberAlign)
	      MaxAlign = MemberAlign;
	    if (MaxAlign == 16)
	      return;
	  }
	}

	/// Compute the alignment wanted for a ByVal aggregate argument in the
	/// caller's parameter area. On 64-bit targets this is the larger of 8 and the
	/// ABI alignment of \p Ty. On 32-bit targets it is 4, raised to 16 when SSE1
	/// is available and the aggregate contains a 128-bit vector.
	unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
	                                                  const DataLayout &DL) const {
	  if (!Subtarget.is64Bit()) {
	    unsigned Result = 4;
	    if (Subtarget.hasSSE1())
	      getMaxByValAlign(Ty, Result);
	    return Result;
	  }

	  // 64-bit: never go below 8 bytes.
	  const unsigned ABIAlign = DL.getABITypeAlignment(Ty);
	  return ABIAlign > 8 ? ABIAlign : 8;
	}

	/// Returns the target specific optimal type for load
	/// and store operations as a result of memset, memcpy, and memmove
	/// lowering. If DstAlign is zero, the destination alignment can satisfy any
	/// constraint. Similarly, if SrcAlign is zero there is no need to check it
	/// against an alignment requirement, probably because the source does not
	/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
	/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
	/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
	/// not need to be loaded.
	/// When neither a vector type nor f64 is selected, this implementation falls
	/// back to i64 (64-bit targets with Size >= 8) or i32; it never returns
	/// MVT::Other.
	EVT
	X86TargetLowering::getOptimalMemOpType(uint64_t Size,
	                                       unsigned DstAlign, unsigned SrcAlign,
	                                       bool IsMemset, bool ZeroMemset,
	                                       bool MemcpyStrSrc,
	                                       MachineFunction &MF) const {
	  const Function &F = MF.getFunction();
	  // Vector and FP types are only considered when the function allows implicit
	  // floating-point use.
	  if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
	    if (Size >= 16 &&
	        (!Subtarget.isUnalignedMem16Slow() ||
	         ((DstAlign == 0 || DstAlign >= 16) &&
	          (SrcAlign == 0 || SrcAlign >= 16)))) {
	      // FIXME: Check if unaligned 32-byte accesses are slow.
	      if (Size >= 32 && Subtarget.hasAVX()) {
	        // Although this isn't a well-supported type for AVX1, we'll let
	        // legalization and shuffle lowering produce the optimal codegen. If we
	        // choose an optimal type with a vector element larger than a byte,
	        // getMemsetStores() may create an intermediate splat (using an integer
	        // multiply) before we splat as a vector.
	        return MVT::v32i8;
	      }
	      if (Subtarget.hasSSE2())
	        return MVT::v16i8;
	      // TODO: Can SSE1 handle a byte vector?
	      if (Subtarget.hasSSE1())
	        return MVT::v4f32;
	    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
	               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
	      // Do not use f64 to lower memcpy if source is string constant. It's
	      // better to use i32 to avoid the loads.
	      // Also, do not use f64 to lower memset unless this is a memset of zeros.
	      // The gymnastics of splatting a byte value into an XMM register and then
	      // only using 8-byte stores (because this is a CPU with slow unaligned
	      // 16-byte accesses) makes that a loser.
	      return MVT::f64;
	    }
	  }
	  // This is a compromise. If we reach here, unaligned accesses may be slow on
	  // this target. However, creating smaller, aligned accesses could be even
	  // slower and would certainly be a lot more code.
	  if (Subtarget.is64Bit() && Size >= 8)
	    return MVT::i64;
	  return MVT::i32;
	}

	/// Report whether \p VT is safe to use for memory-operation lowering. f32 and
	/// f64 are safe only when the matching scalar-SSE flag is set; every other
	/// type is accepted.
	bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
	  switch (VT.SimpleTy) {
	  case MVT::f32:
	    return X86ScalarSSEf32;
	  case MVT::f64:
	    return X86ScalarSSEf64;
	  default:
	    return true;
	  }
	}

	/// Misaligned accesses are always permitted, so this always returns true.
	/// When \p Fast is non-null it additionally receives a speed estimate:
	/// 128-bit and 256-bit accesses consult the subtarget's "unaligned access is
	/// slow" flags, and every other width is reported as fast.
	bool
	X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
	                                                  unsigned,
	                                                  unsigned,
	                                                  bool *Fast) const {
	  // Misaligned accesses of any size are always allowed.
	  if (!Fast)
	    return true;

	  const unsigned Bits = VT.getSizeInBits();
	  if (Bits == 128) {
	    *Fast = !Subtarget.isUnalignedMem16Slow();
	  } else if (Bits == 256) {
	    *Fast = !Subtarget.isUnalignedMem32Slow();
	  } else {
	    // 8-byte and under are always assumed to be fast.
	    // TODO: What about AVX-512 (512-bit) accesses?
	    *Fast = true;
	  }
	  return true;
	}

	/// Select the jump-table entry encoding for the current function. The result
	/// is a MachineJumpTableInfo::JTEntryKind value.
	unsigned X86TargetLowering::getJumpTableEncoding() const {
	  // Position-independent code using the GOT PIC style emits every entry as a
	  // @GOTOFF symbol, which needs the custom 32-bit encoding.
	  const bool UseGOTOFF = isPositionIndependent() && Subtarget.isPICStyleGOT();
	  if (!UseGOTOFF)
	    return TargetLowering::getJumpTableEncoding(); // Generic heuristics.

	  return MachineJumpTableInfo::EK_Custom32;
	}

	/// Forward the subtarget's soft-float setting.
	bool X86TargetLowering::useSoftFloat() const {
	  const bool SoftFloat = Subtarget.useSoftFloat();
	  return SoftFloat;
	}

	/// Flag libcall arguments that should travel in registers.
	///
	/// This only applies to 32-bit x86 with the C or X86_StdCall calling
	/// convention. The register budget comes from the module's
	/// number-of-register-parameters setting. Pointer and integer arguments of
	/// at most 8 bytes consume one register (two when larger than 4 bytes) and
	/// get IsInReg set. Marking stops at the first such argument that no longer
	/// fits in the remaining budget; other arguments are skipped.
	void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
	                                              ArgListTy &Args) const {

	  // Only relabel X86-32 for C / Stdcall CCs.
	  if (Subtarget.is64Bit())
	    return;
	  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
	    return;
	  unsigned ParamRegs = 0;
	  if (auto *M = MF->getFunction().getParent())
	    ParamRegs = M->getNumberRegisterParameters();

	  // Mark the first N int arguments as having reg
	  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
	    Type *T = Args[Idx].Ty;
	    if (T->isPointerTy() || T->isIntegerTy())
	      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
	        unsigned numRegs = 1;
	        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
	          numRegs = 2;
	        // Out of registers: this and all later arguments stay unmarked.
	        if (ParamRegs < numRegs)
	          return;
	        ParamRegs -= numRegs;
	        Args[Idx].IsInReg = true;
	      }
	  }
	}

	/// Produce the expression for one custom-encoded jump table entry: a @GOTOFF
	/// reference to the symbol of \p MBB. Only valid for position-independent
	/// code using the GOT PIC style (32-bit ELF).
	const MCExpr *
	X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
	                                             const MachineBasicBlock *MBB,
	                                             unsigned uid,
	                                             MCContext &Ctx) const {
	  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
	  auto *BlockSym = MBB->getSymbol();
	  return MCSymbolRefExpr::create(BlockSym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
	}

	/// Return the value PIC jump table entries are relative to: the table itself
	/// on 64-bit subtargets, and a GlobalBaseReg node otherwise.
	SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
	                                                    SelectionDAG &DAG) const {
	  if (Subtarget.is64Bit())
	    return Table;

	  // This doesn't have SDLoc associated with it, but is not really the
	  // same as a Register.
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	}

	/// MCExpr counterpart of getPICJumpTableRelocBase: the relocation base for
	/// the given PIC jump table expressed as an MCExpr.
	const MCExpr *X86TargetLowering::
	getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
	                             MCContext &Ctx) const {
	  if (!Subtarget.isPICStyleRIPRel()) {
	    // Without RIP-relative addressing the reference is relative to the PIC
	    // base symbol of the function.
	    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
	  }

	  // X86-64 uses RIP relative addressing based on the jump table label.
	  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
	}

	std::pair<const TargetRegisterClass *, uint8_t>
	X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
	MVT VT) const {
	const TargetRegisterClass *RRC = nullptr;
	uint8_t Cost = 1;
	switch (VT.SimpleTy) {
	default:
	return TargetLowering::findRepresentativeClass(TRI, VT);
	case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
	RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
	break;
	case MVT::x86mmx:
	RRC = &X86::VR64RegClass;
	break;
	case MVT::f32: case MVT::f64:
	case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
	case MVT::v4f32: case MVT::v2f64:
	case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
	case MVT::v8f32: case MVT::v4f64:
	case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
	case MVT::v16f32: case MVT::v8f64:
	RRC = &X86::VR128XRegClass;
	break;
	}
	return std::make_pair(RRC, Cost);
	}

	/// Return the address space number used for the segment-relative slots in
	/// this file: 257 on 64-bit subtargets unless the Kernel code model is in
	/// effect, and 256 in every other case.
	unsigned X86TargetLowering::getAddressSpace() const {
	  if (!Subtarget.is64Bit())
	    return 256;

	  const bool KernelModel =
	      getTargetMachine().getCodeModel() == CodeModel::Kernel;
	  return KernelModel ? 256 : 257;
	}

	/// Return true when the target OS is one for which this file uses a
	/// dedicated stack guard slot: glibc, Fuchsia, or Android at version 17 or
	/// later.
	static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
	  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
	         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
	}

	/// Build the constant pointer for the slot at \p Offset in address space
	/// \p AddressSpace: the i32 constant \p Offset cast (inttoptr) to a pointer,
	/// in that address space, to an i8* value.
	static Constant* SegmentOffset(IRBuilder<> &IRB,
	                               unsigned Offset, unsigned AddressSpace) {
	  LLVMContext &Ctx = IRB.getContext();
	  Constant *OffsetVal = ConstantInt::get(Type::getInt32Ty(Ctx), Offset);
	  Type *SlotPtrTy = Type::getInt8PtrTy(Ctx)->getPointerTo(AddressSpace);
	  return ConstantExpr::getIntToPtr(OffsetVal, SlotPtrTy);
	}

	/// Return the IR value that addresses the stack guard. Targets with a
	/// dedicated guard slot get a segment-relative constant address; all others
	/// use the generic implementation.
	Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
	  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
	  // tcbhead_t; use it instead of the usual global variable (see
	  // sysdeps/{i386,x86_64}/nptl/tls.h)
	  if (!hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
	    return TargetLowering::getIRStackGuard(IRB);

	  // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
	  if (Subtarget.isTargetFuchsia())
	    return SegmentOffset(IRB, 0x10, getAddressSpace());

	  // %fs:0x28, unless we're using a Kernel code model, in which case
	  // it's %gs:0x28. gs:0x14 on i386.
	  unsigned GuardOffset = 0x14;
	  if (Subtarget.is64Bit())
	    GuardOffset = 0x28;
	  return SegmentOffset(IRB, GuardOffset, getAddressSpace());
	}

	/// Insert the module-level declarations that stack protection needs.
	/// MSVCRT targets get the __security_cookie global and the
	/// __security_check_cookie function; targets with a dedicated guard slot
	/// need nothing; everything else uses the generic declarations.
	void X86TargetLowering::insertSSPDeclarations(Module &M) const {
	  const Triple &TT = Subtarget.getTargetTriple();

	  // MSVC CRT provides functionalities for stack protection.
	  if (TT.isOSMSVCRT()) {
	    LLVMContext &Ctx = M.getContext();

	    // MSVC CRT has a global variable holding security cookie.
	    M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(Ctx));

	    // MSVC CRT has a function to validate security cookie. It uses the
	    // fastcall convention and takes its argument in a register.
	    auto *CheckFn = cast<Function>(
	        M.getOrInsertFunction("__security_check_cookie",
	                              Type::getVoidTy(Ctx),
	                              Type::getInt8PtrTy(Ctx)));
	    CheckFn->setCallingConv(CallingConv::X86_FastCall);
	    CheckFn->addAttribute(1, Attribute::AttrKind::InReg);
	    return;
	  }

	  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
	  if (!hasStackGuardSlotTLS(TT))
	    TargetLowering::insertSSPDeclarations(M);
	}

	/// Return the stack guard value used by SelectionDAG: the __security_cookie
	/// global on MSVCRT targets, the generic choice otherwise.
	Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
	  const Triple &TT = Subtarget.getTargetTriple();
	  if (!TT.isOSMSVCRT())
	    return TargetLowering::getSDagStackGuard(M);

	  // MSVC CRT has a global variable holding security cookie.
	  return M.getGlobalVariable("__security_cookie");
	}

	/// Return the function that validates the stack guard: the
	/// __security_check_cookie function on MSVCRT targets, the generic choice
	/// otherwise.
	Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
	  const Triple &TT = Subtarget.getTargetTriple();
	  if (!TT.isOSMSVCRT())
	    return TargetLowering::getSSPStackGuardCheck(M);

	  // MSVC CRT has a function to validate security cookie.
	  return M.getFunction("__security_check_cookie");
	}

	/// Return the location of the SafeStack pointer. Contiki uses the default
	/// location, Android and Fuchsia use fixed segment-relative slots, and every
	/// other target uses the generic implementation.
	Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
	  const Triple &TT = Subtarget.getTargetTriple();
	  if (TT.isOSContiki())
	    return getDefaultSafeStackPointerLocation(IRB, false);

	  // Android provides a fixed TLS slot for the SafeStack pointer. See the
	  // definition of TLS_SLOT_SAFESTACK in
	  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	  if (Subtarget.isTargetAndroid()) {
	    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
	    // %gs:0x24 on i386
	    unsigned SlotOffset = 0x24;
	    if (Subtarget.is64Bit())
	      SlotOffset = 0x48;
	    return SegmentOffset(IRB, SlotOffset, getAddressSpace());
	  }

	  // Fuchsia is similar.
	  // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
	  if (Subtarget.isTargetFuchsia())
	    return SegmentOffset(IRB, 0x18, getAddressSpace());

	  return TargetLowering::getSafeStackPointerLocation(IRB);
	}

	/// A cast between two distinct address spaces is a no-op exactly when both
	/// of them are numbered below 256.
	bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
	                                            unsigned DestAS) const {
	  assert(SrcAS != DestAS && "Expected different address spaces!");

	  if (SrcAS >= 256)
	    return false;
	  return DestAS < 256;
	}

	//===----------------------------------------------------------------------===//
	// Return Value Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	#include "X86GenCallingConv.inc"

	/// Check whether every value in \p Outs can be assigned a return location
	/// by the RetCC_X86 calling-convention function.
	bool X86TargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	  SmallVector<CCValAssign, 16> ReturnLocs;
	  CCState State(CallConv, isVarArg, MF, ReturnLocs, Context);
	  const bool AllAssigned = State.CheckReturn(Outs, RetCC_X86);
	  return AllAssigned;
	}

	/// Return the zero-terminated scratch register list. The calling convention
	/// argument is ignored; R11 is the only register reported.
	const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
	  static const MCPhysReg ScratchList[] = {
	    X86::R11,
	    0 // Terminator.
	  };
	  return ScratchList;
	}

	/// Lowers mask values (v*i1) to the local register values.
	///
	/// v1i1 is extracted as a scalar; v8i1/v16i1 are bitcast to i8/i16 and, when
	/// the location is i32, any-extended; v32i1/v64i1 are bitcast straight to
	/// i32/i64. Any other combination is sign-extended to \p ValLoc.
	/// \returns DAG node after lowering to register type
	static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
	                               const SDLoc &Dl, SelectionDAG &DAG) {
	  EVT ValVT = ValArg.getValueType();

	  if (ValVT == MVT::v1i1)
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
	                       DAG.getIntPtrConstant(0, Dl));

	  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
	      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
	    // Two stage lowering might be required
	    // bitcast: v8i1 -> i8 / v16i1 -> i16
	    // anyextend: i8 -> i32 / i16 -> i32
	    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
	    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
	    if (ValLoc == MVT::i32)
	      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
	    return ValToCopy;
	  } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
	             (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
	    // One stage lowering is required
	    // bitcast: v32i1 -> i32 / v64i1 -> i64
	    return DAG.getBitcast(ValLoc, ValArg);
	  } else
	    return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
	}

	/// Breaks v64i1 value into two registers and adds the new node to the DAG
	static void Passv64i1ArgInRegs(
	const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
	SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
	CCValAssign &NextVA, const X86Subtarget &Subtarget) {
	assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
	assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
	assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	"The value should reside in two registers");

	// Before splitting the value we cast it to i64
	Arg = DAG.getBitcast(MVT::i64, Arg);

	// Splitting the value into two i32 types
	SDValue Lo, Hi;
	Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(0, Dl, MVT::i32));
	Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(1, Dl, MVT::i32));

	// Attach the two i32 types into corresponding registers
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
	RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
	}

	/// Lower an outgoing return.
	///
	/// Each value in \p OutVals is assigned a location by RetCC_X86, extended or
	/// bitcast as that location requires, and copied into its return register.
	/// Values returned in FP0/FP1 are not copied; they are appended directly as
	/// operands of the return node. When an sret return register has been
	/// recorded for the function, its value is additionally copied into RAX/EAX.
	/// The result is an X86ISD::RET_FLAG node, or X86ISD::IRET for the X86_INTR
	/// calling convention (which must not return a value).
	SDValue
	X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	                               bool isVarArg,
	                               const SmallVectorImpl<ISD::OutputArg> &Outs,
	                               const SmallVectorImpl<SDValue> &OutVals,
	                               const SDLoc &dl, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	  // In some cases we need to disable registers from the default CSR list.
	  // For example, when they are used for argument passing.
	  bool ShouldDisableCalleeSavedRegister =
	      CallConv == CallingConv::X86_RegCall ||
	      MF.getFunction().hasFnAttribute("no_caller_saved_registers");

	  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
	    report_fatal_error("X86 interrupts may not return any value");

	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
	  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

	  SDValue Flag;
	  SmallVector<SDValue, 6> RetOps;
	  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
	  // Operand #1 = Bytes To Pop
	  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
	                                         MVT::i32));

	  // Copy the result values into the output registers.
	  // I indexes locations and OutsIndex indexes values; a custom (split) value
	  // consumes two locations but only one value.
	  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
	       ++I, ++OutsIndex) {
	    CCValAssign &VA = RVLocs[I];
	    assert(VA.isRegLoc() && "Can only return in registers!");

	    // Add the register to the CalleeSaveDisableRegs list.
	    if (ShouldDisableCalleeSavedRegister)
	      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

	    SDValue ValToCopy = OutVals[OutsIndex];
	    EVT ValVT = ValToCopy.getValueType();

	    // Promote values to the appropriate types.
	    if (VA.getLocInfo() == CCValAssign::SExt)
	      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    else if (VA.getLocInfo() == CCValAssign::ZExt)
	      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    else if (VA.getLocInfo() == CCValAssign::AExt) {
	      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
	        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
	      else
	        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
	    }
	    else if (VA.getLocInfo() == CCValAssign::BCvt)
	      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

	    assert(VA.getLocInfo() != CCValAssign::FPExt &&
	           "Unexpected FP-extend for return value.");

	    // If this is x86-64, and we disabled SSE, we can't return FP values,
	    // or SSE or MMX vectors.
	    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
	         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
	        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
	      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    } else if (ValVT == MVT::f64 &&
	               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
	      // Likewise we can't return F64 values with SSE1 only. gcc does so, but
	      // llvm-gcc has never done it right and no one has noticed, so this
	      // should be OK for now.
	      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    }

	    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
	    // the RET instruction and handled by the FP Stackifier.
	    if (VA.getLocReg() == X86::FP0 ||
	        VA.getLocReg() == X86::FP1) {
	      // If this is a copy from an xmm register to ST(0), use an FPExtend to
	      // change the value to the FP stack register class.
	      if (isScalarFPTypeInSSEReg(VA.getValVT()))
	        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
	      RetOps.push_back(ValToCopy);
	      // Don't emit a copytoreg.
	      continue;
	    }

	    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
	    // which is returned in RAX / RDX.
	    if (Subtarget.is64Bit()) {
	      if (ValVT == MVT::x86mmx) {
	        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
	          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
	          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	                                  ValToCopy);
	          // If we don't have SSE2 available, convert to v4f32 so the generated
	          // register is legal.
	          if (!Subtarget.hasSSE2())
	            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
	        }
	      }
	    }

	    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

	    if (VA.needsCustom()) {
	      assert(VA.getValVT() == MVT::v64i1 &&
	             "Currently the only custom case is when we split v64i1 to 2 regs");

	      // The split value also consumes the next location, hence the ++I.
	      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
	                         Subtarget);

	      assert(2 == RegsToPass.size() &&
	             "Expecting two registers after Pass64BitArgInRegs");

	      // Add the second register to the CalleeSaveDisableRegs list.
	      if (ShouldDisableCalleeSavedRegister)
	        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
	    } else {
	      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
	    }

	    // Add nodes to the DAG and add the values into the RetOps list
	    for (auto &Reg : RegsToPass) {
	      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
	      Flag = Chain.getValue(1);
	      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
	    }
	  }

	  // Swift calling convention does not require we copy the sret argument
	  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

	  // All x86 ABIs require that for returning structs by value we copy
	  // the sret argument into %rax/%eax (depending on ABI) for the return.
	  // We saved the argument into a virtual register in the entry block,
	  // so now we copy the value out and into %rax/%eax.
	  //
	  // Checking Function.hasStructRetAttr() here is insufficient because the IR
	  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
	  // false, then an sret argument may be implicitly inserted in the SelDAG. In
	  // either case FuncInfo->setSRetReturnReg() will have been called.
	  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
	    // When we have both sret and another return value, we should use the
	    // original Chain stored in RetOps[0], instead of the current Chain updated
	    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

	    // For the case of sret and another return value, we have
	    // Chain_0 at the function entry
	    // Chain_1 = getCopyToReg(Chain_0) in the above loop
	    // If we use Chain_1 in getCopyFromReg, we will have
	    // Val = getCopyFromReg(Chain_1)
	    // Chain_2 = getCopyToReg(Chain_1, Val) from below

	    // getCopyToReg(Chain_0) will be glued together with
	    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
	    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
	    // Data dependency from Unit B to Unit A due to usage of Val in
	    // getCopyToReg(Chain_1, Val)
	    // Chain dependency from Unit A to Unit B

	    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
	    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
	                                     getPointerTy(MF.getDataLayout()));

	    unsigned RetValReg
	        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
	          X86::RAX : X86::EAX;
	    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
	    Flag = Chain.getValue(1);

	    // RAX/EAX now acts like a return value.
	    RetOps.push_back(
	        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

	    // Add the returned register to the CalleeSaveDisableRegs list.
	    if (ShouldDisableCalleeSavedRegister)
	      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
	  }

	  // Registers preserved via explicit copies are listed as operands of the
	  // return node; only GR64 registers are expected in that list.
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const MCPhysReg *I =
	      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
	  if (I) {
	    for (; *I; ++I) {
	      if (X86::GR64RegClass.contains(*I))
	        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
	      else
	        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	    }
	  }

	  RetOps[0] = Chain; // Update chain.

	  // Add the flag if we have it.
	  if (Flag.getNode())
	    RetOps.push_back(Flag);

	  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
	  if (CallConv == CallingConv::X86_INTR)
	    opcode = X86ISD::IRET;
	  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
	}

	/// Return true when the single result of \p N is used only by a return,
	/// either through an unglued CopyToReg or through an FP_EXTEND, and every
	/// user of that node is an X86ISD::RET_FLAG returning a single value. On
	/// success \p Chain is updated to the chain to use for a tail call; on
	/// failure it is left untouched.
	bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
	  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
	    return false;

	  SDValue TCChain = Chain;
	  // N has exactly one use (checked above); look at that user.
	  SDNode *Copy = *N->use_begin();
	  if (Copy->getOpcode() == ISD::CopyToReg) {
	    // If the copy has a glue operand, we conservatively assume it isn't safe to
	    // perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
	    return false;

	  bool HasRet = false;
	  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
	       UI != UE; ++UI) {
	    if (UI->getOpcode() != X86ISD::RET_FLAG)
	      return false;
	    // If we are returning more than one value, we can definitely
	    // not make a tail call see PR19530
	    if (UI->getNumOperands() > 4)
	      return false;
	    if (UI->getNumOperands() == 4 &&
	        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
	      return false;
	    HasRet = true;
	  }

	  if (!HasRet)
	    return false;

	  Chain = TCChain;
	  return true;
	}

	/// Return the type a return value of type \p VT is extended to: \p VT itself
	/// unless it is narrower than the register type of the minimum return type,
	/// in which case that register type is used. The minimum is i32, lowered to
	/// i8 for i1 everywhere and for i8/i16 on non-Darwin targets.
	EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
	                                           ISD::NodeType ExtendKind) const {
	  MVT ReturnMVT = MVT::i32;

	  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
	  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
	    // The ABI does not require i1, i8 or i16 to be extended.
	    //
	    // On Darwin, there is code in the wild relying on Clang's old behaviour of
	    // always extending i8/i16 return values, so keep doing that for now.
	    // (PR26665).
	    ReturnMVT = MVT::i8;
	  }

	  EVT MinVT = getRegisterType(Context, ReturnMVT);
	  return VT.bitsLT(MinVT) ? MinVT : VT;
	}

	/// Rebuild a v64i1 mask value from the two 32-bit registers carrying it.
	/// \param VA Location of the first 32-bit half (becomes the low elements).
	/// \param NextVA Location of the second 32-bit half (becomes the high
	///               elements).
	/// \param Root The chain the register reads hang off.
	/// \param [in,out] InFlag Optional glue. When non-null the physical
	///                 registers are read directly and the reads are glued
	///                 together through this value, which is updated after each
	///                 read. When null, the registers are added as function
	///                 live-ins and read through the resulting virtual
	///                 registers.
	/// \return the v64i1 concatenation of both halves.
	static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
	                                SDValue &Root, SelectionDAG &DAG,
	                                const SDLoc &Dl, const X86Subtarget &Subtarget,
	                                SDValue *InFlag = nullptr) {
	  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
	  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	  assert(VA.getValVT() == MVT::v64i1 &&
	         "Expecting first location of 64 bit width type");
	  assert(NextVA.getValVT() == VA.getValVT() &&
	         "The locations should have the same type");
	  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	         "The values should reside in two registers");

	  SDValue LoBits, HiBits;
	  if (InFlag) {
	    // A physical register is available: read from it directly and glue the
	    // two reads together.
	    LoBits = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = LoBits.getValue(2);
	    HiBits =
	        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = HiBits.getValue(2);
	  } else {
	    // No physical register is present: go through intermediate virtual
	    // registers created as live-ins.
	    MachineFunction &MF = DAG.getMachineFunction();
	    const TargetRegisterClass *GR32 = &X86::GR32RegClass;

	    unsigned LoVReg = MF.addLiveIn(VA.getLocReg(), GR32);
	    LoBits = DAG.getCopyFromReg(Root, Dl, LoVReg, MVT::i32);
	    unsigned HiVReg = MF.addLiveIn(NextVA.getLocReg(), GR32);
	    HiBits = DAG.getCopyFromReg(Root, Dl, HiVReg, MVT::i32);
	  }

	  // Reinterpret each i32 as v32i1, then glue the halves into one v64i1.
	  SDValue LoMask = DAG.getBitcast(MVT::v32i1, LoBits);
	  SDValue HiMask = DAG.getBitcast(MVT::v32i1, HiBits);
	  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, LoMask, HiMask);
	}

	/// Turn a register value into the mask type \p ValVT
	/// (v1i1/v8i1/v16i1/v32i1/v64i1). v1i1 is built with SCALAR_TO_VECTOR; the
	/// 8/16/32-element masks are truncated to the matching integer width and
	/// bitcast; v64i1 is bitcast directly from an i64 location.
	/// \returns a DAG node containing the operand after lowering to mask type.
	static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
	                               const EVT &ValLoc, const SDLoc &Dl,
	                               SelectionDAG &DAG) {
	  if (ValVT == MVT::v1i1)
	    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValArg);

	  SDValue MaskBits = ValArg;
	  if (ValVT != MVT::v64i1) {
	    // Pick the integer type with as many bits as the mask has elements.
	    MVT NarrowVT;
	    switch (ValVT.getSimpleVT().SimpleTy) {
	    case MVT::v8i1:
	      NarrowVT = MVT::i8;
	      break;
	    case MVT::v16i1:
	      NarrowVT = MVT::i16;
	      break;
	    case MVT::v32i1:
	      NarrowVT = MVT::i32;
	      break;
	    default:
	      llvm_unreachable("Expecting a vector of i1 types");
	    }

	    MaskBits = DAG.getNode(ISD::TRUNCATE, Dl, NarrowVT, MaskBits);
	  } else {
	    // In 32 bit machine, this case is handled by getv64i1Argument
	    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
	    // In 64 bit machine, There is no need to truncate the value only bitcast
	  }

	  return DAG.getBitcast(ValVT, MaskBits);
	}

	/// Lower the result values of a call into the
	/// appropriate copies out of appropriate physical registers.
	///
	/// Locations for the results are computed with RetCC_X86. One value per
	/// result is appended to \p InVals, and the updated chain is returned.
	/// \p InFlag is the glue threaded through the successive register copies.
	/// If \p RegMask is non-null, the bits of every register used for a return
	/// value (including its sub-registers) are cleared in it.
	SDValue X86TargetLowering::LowerCallResult(
	SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
	uint32_t *RegMask) const {

	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	// Assign locations to each value returned by this call.
	SmallVector<CCValAssign, 16> RVLocs;
	bool Is64Bit = Subtarget.is64Bit();
	CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	*DAG.getContext());
	CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

	// Copy all of the result registers out of their specified physreg.
	// I indexes RVLocs and InsIndex indexes Ins; they diverge when a custom
	// (split v64i1) value consumes two locations below.
	for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
	++I, ++InsIndex) {
	CCValAssign &VA = RVLocs[I];
	EVT CopyVT = VA.getLocVT();

	// In some calling conventions we need to remove the used registers
	// from the register mask.
	if (RegMask) {
	// NOTE(review): "/IncludeSelf=/" below looks like a block comment whose
	// asterisks were lost when this text was extracted -- confirm.
	for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /IncludeSelf=/true);
	SubRegs.isValid(); ++SubRegs)
	// The mask is an array of 32-bit words, one bit per register.
	RegMask[SubRegs / 32] &= ~(1u << (SubRegs % 32));
	}

	// If this is x86-64, and we disabled SSE, we can't return FP values
	if ((CopyVT == MVT::f32 \|\| CopyVT == MVT::f64 \|\| CopyVT == MVT::f128) &&
	((Is64Bit \|\| Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
	errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	}

	// If we prefer to use the value in xmm registers, copy it out as f80 and
	// use a truncate to move it from fp stack reg to xmm reg.
	bool RoundAfterCopy = false;
	if ((VA.getLocReg() == X86::FP0 \|\| VA.getLocReg() == X86::FP1) &&
	isScalarFPTypeInSSEReg(VA.getValVT())) {
	if (!Subtarget.hasX87())
	report_fatal_error("X87 register return with X87 disabled");
	CopyVT = MVT::f80;
	// Only round when the location type was not already f80.
	RoundAfterCopy = (CopyVT != VA.getLocVT());
	}

	SDValue Val;
	if (VA.needsCustom()) {
	assert(VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");
	// The value spans this location and the next one, so I is advanced an
	// extra step here; the helper updates InFlag through the pointer.
	Val =
	getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
	} else {
	// Result 0 of the copy is the value, result 1 the chain and result 2 the
	// glue for the next copy.
	Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
	.getValue(1);
	Val = Chain.getValue(0);
	InFlag = Chain.getValue(2);
	}

	if (RoundAfterCopy)
	Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
	// This truncation won't change the value.
	DAG.getIntPtrConstant(1, dl));

	// i1-based values that were extended into their location must be brought
	// back to the value type.
	if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
	if (VA.getValVT().isVector() &&
	((VA.getLocVT() == MVT::i64) \|\| (VA.getLocVT() == MVT::i32) \|\|
	(VA.getLocVT() == MVT::i16) \|\| (VA.getLocVT() == MVT::i8))) {
	// promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
	} else
	Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
	}

	InVals.push_back(Val);
	}

	return Chain;
	}

	//===----------------------------------------------------------------------===//
	// C & StdCall & Fast Calling Convention implementation
	//===----------------------------------------------------------------------===//
	// The StdCall calling convention is the standard for many Windows API
	// routines. It differs from the C calling convention only slightly: the
	// callee, not the caller, cleans up the stack. Symbols are also decorated
	// in a special way. It doesn't support any vector arguments.
	// For info on the fast calling convention, see the Fast Calling Convention
	// (tail call) implementation, LowerX86_32FastCCCallTo.

	/// The ways a struct return can be classified by the helpers below.
	enum StructReturnType {
	  NotStructReturn,
	  RegStructReturn,
	  StackStructReturn
	};

	/// Determines whether a call uses struct return semantics, judging by the
	/// flags of its first outgoing argument.
	/// \returns NotStructReturn when there are no arguments or the first one is
	/// not sret; RegStructReturn when it is also inreg or \p IsMCU is set;
	/// StackStructReturn otherwise.
	static StructReturnType
	callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
	  // Only the first outgoing argument is inspected for the sret flag.
	  const bool HasSRet = !Outs.empty() && Outs[0].Flags.isSRet();
	  if (!HasSRet)
	    return NotStructReturn;

	  const bool ViaRegister = Outs[0].Flags.isInReg() || IsMCU;
	  return ViaRegister ? RegStructReturn : StackStructReturn;
	}

	/// Determines whether a function uses struct return semantics, judging by
	/// the flags of its first incoming argument.
	/// \returns NotStructReturn when there are no arguments or the first one is
	/// not sret; RegStructReturn when it is also inreg or \p IsMCU is set;
	/// StackStructReturn otherwise.
	static StructReturnType
	argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
	  // Only the first incoming argument is inspected for the sret flag.
	  const bool HasSRet = !Ins.empty() && Ins[0].Flags.isSRet();
	  if (!HasSRet)
	    return NotStructReturn;

	  const bool ViaRegister = Ins[0].Flags.isInReg() || IsMCU;
	  return ViaRegister ? RegStructReturn : StackStructReturn;
	}

	/// Emit an always-inlined, non-volatile memcpy that copies the aggregate at
	/// "Src" to "Dst". The byte count and alignment are taken from the byval
	/// information carried in \p Flags. The copy will be passed as a byval
	/// function parameter.
	/// \returns the memcpy node produced by the DAG.
	static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
	                                         SDValue Chain, ISD::ArgFlagsTy Flags,
	                                         SelectionDAG &DAG, const SDLoc &dl) {
	  // The number of bytes to copy, materialized as an i32 constant.
	  SDValue ByteCount = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
	  const unsigned Alignment = Flags.getByValAlign();

	  return DAG.getMemcpy(Chain, dl, Dst, Src, ByteCount, Alignment,
	                       /*isVolatile=*/false,
	                       /*AlwaysInline=*/true,
	                       /*isTailCall=*/false,
	                       MachinePointerInfo(), MachinePointerInfo());
	}

	/// Return true if the calling convention is one that we can guarantee TCO
	/// for: Fast, GHC, X86_RegCall, HiPE or HHVM.
	static bool canGuaranteeTCO(CallingConv::ID CC) {
	  switch (CC) {
	  case CallingConv::Fast:
	  case CallingConv::GHC:
	  case CallingConv::X86_RegCall:
	  case CallingConv::HiPE:
	  case CallingConv::HHVM:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return true if we might ever do TCO for calls with this calling
	/// convention: the C-style conventions, the callee-pop conventions, and
	/// every convention for which TCO can be guaranteed.
	static bool mayTailCallThisCC(CallingConv::ID CC) {
	  // C calling conventions.
	  const bool IsCStyle = CC == CallingConv::C ||
	                        CC == CallingConv::Win64 ||
	                        CC == CallingConv::X86_64_SysV;
	  // Callee pop conventions.
	  const bool IsCalleePop = CC == CallingConv::X86_ThisCall ||
	                           CC == CallingConv::X86_StdCall ||
	                           CC == CallingConv::X86_VectorCall ||
	                           CC == CallingConv::X86_FastCall;

	  return IsCStyle || IsCalleePop || canGuaranteeTCO(CC);
	}

	/// Return true if the function is being made into a tailcall target by
	/// changing its ABI, i.e. guaranteed tail call optimization is requested
	/// and the calling convention is one for which TCO can be guaranteed.
	static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
	  if (!GuaranteedTailCallOpt)
	    return false;
	  return canGuaranteeTCO(CC);
	}

	bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	auto Attr =
	CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
	if (!CI->isTailCall() \|\| Attr.getValueAsString() == "true")
	return false;

	ImmutableCallSite CS(CI);
	CallingConv::ID CalleeCC = CS.getCallingConv();
	if (!mayTailCallThisCC(CalleeCC))
	return false;

	return true;
	}

	/// Lower one incoming argument that was assigned a memory location.
	///
	/// \p i indexes \p Ins and \p VA is the location assigned to that argument.
	/// For byval arguments a frame index for the fixed stack object is returned;
	/// otherwise a load from the argument's fixed stack slot is returned
	/// (truncated or converted to a vector when the i1-based value was extended
	/// in memory).
	SDValue
	X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
	const SmallVectorImpl<ISD::InputArg> &Ins,
	const SDLoc &dl, SelectionDAG &DAG,
	const CCValAssign &VA,
	MachineFrameInfo &MFI, unsigned i) const {
	// Create the nodes corresponding to a load from this parameter slot.
	ISD::ArgFlagsTy Flags = Ins[i].Flags;
	bool AlwaysUseMutable = shouldGuaranteeTCO(
	CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
	bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
	EVT ValVT;
	MVT PtrVT = getPointerTy(DAG.getDataLayout());

	// If value is passed by pointer we have address passed instead of the value
	// itself. No need to extend if the mask value and location share the same
	// absolute size.
	bool ExtendedInMem =
	VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
	VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

	// Load with the location type when the slot holds a pointer or an extended
	// value; otherwise load the value type directly.
	if (VA.getLocInfo() == CCValAssign::Indirect \|\| ExtendedInMem)
	ValVT = VA.getLocVT();
	else
	ValVT = VA.getValVT();

	// Calculate SP offset of interrupt parameter, re-arrange the slot normally
	// taken by a return address.
	int Offset = 0;
	if (CallConv == CallingConv::X86_INTR) {
	// X86 interrupts may take one or two arguments.
	// On the stack there will be no return address as in regular call.
	// Offset of last argument need to be set to -4/-8 bytes.
	// Where offset of the first argument out of two, should be set to 0 bytes.
	// NOTE(review): for the last argument (i + 1) % Ins.size() is 0, so the
	// "- 1" appears to rely on unsigned wraparound followed by conversion to
	// int to yield the negative offset -- confirm.
	Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
	if (Subtarget.is64Bit() && Ins.size() == 2) {
	// The stack pointer needs to be realigned for 64 bit handlers with error
	// code, so the argument offset changes by 8 bytes.
	Offset += 8;
	}
	}

	// FIXME: For now, all byval parameter objects are marked mutable. This can be
	// changed with more analysis.
	// In case of tail call optimization mark all arguments mutable. Since they
	// could be overwritten by lowering of arguments in case of a tail call.
	if (Flags.isByVal()) {
	unsigned Bytes = Flags.getByValSize();
	if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
	int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
	// Adjust SP offset of interrupt parameter.
	if (CallConv == CallingConv::X86_INTR) {
	MFI.setObjectOffset(FI, Offset);
	}
	// A byval argument is represented by the address of its stack object; no
	// load is emitted here.
	return DAG.getFrameIndex(FI, PtrVT);
	}

	// This is an argument in memory. We might be able to perform copy elision.
	if (Flags.isCopyElisionCandidate()) {
	EVT ArgVT = Ins[i].ArgVT;
	SDValue PartAddr;
	if (Ins[i].PartOffset == 0) {
	// If this is a one-part value or the first part of a multi-part value,
	// create a stack object for the entire argument value type and return a
	// load from our portion of it. This assumes that if the first part of an
	// argument is in memory, the rest will also be in memory.
	// NOTE(review): "/Immutable=/" below looks like a block comment whose
	// asterisks were lost when this text was extracted -- confirm.
	int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
	/Immutable=/false);
	PartAddr = DAG.getFrameIndex(FI, PtrVT);
	return DAG.getLoad(
	ValVT, dl, Chain, PartAddr,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	} else {
	// This is not the first piece of an argument in memory. See if there is
	// already a fixed stack object including this offset. If so, assume it
	// was created by the PartOffset == 0 branch above and create a load from
	// the appropriate offset into it.
	int64_t PartBegin = VA.getLocMemOffset();
	int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
	int FI = MFI.getObjectIndexBegin();
	// Scan the fixed objects for one whose byte range [ObjBegin, ObjEnd)
	// fully contains this part.
	for (; MFI.isFixedObjectIndex(FI); ++FI) {
	int64_t ObjBegin = MFI.getObjectOffset(FI);
	int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
	if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
	break;
	}
	// FI is still a fixed-object index only if the scan found a match.
	if (MFI.isFixedObjectIndex(FI)) {
	SDValue Addr =
	DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
	DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
	return DAG.getLoad(
	ValVT, dl, Chain, Addr,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
	Ins[i].PartOffset));
	}
	// No enclosing object was found: fall through to the ordinary path.
	}
	}

	// Ordinary case: create a fixed stack object for just this value.
	int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
	VA.getLocMemOffset(), isImmutable);

	// Set SExt or ZExt flag.
	if (VA.getLocInfo() == CCValAssign::ZExt) {
	MFI.setObjectZExt(FI, true);
	} else if (VA.getLocInfo() == CCValAssign::SExt) {
	MFI.setObjectSExt(FI, true);
	}

	// Adjust SP offset of interrupt parameter.
	if (CallConv == CallingConv::X86_INTR) {
	MFI.setObjectOffset(FI, Offset);
	}

	SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	SDValue Val = DAG.getLoad(
	ValVT, dl, Chain, FIN,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	// A value that was extended in memory was loaded with the location type
	// and must be converted back to the value type.
	return ExtendedInMem
	? (VA.getValVT().isVector()
	? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
	: DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
	: Val;
	}

	// FIXME: Get this from tablegen.
	/// Return the general purpose registers used for passing integer arguments
	/// under the given 64-bit calling convention: RCX, RDX, R8, R9 for Win64
	/// conventions and RDI, RSI, RDX, RCX, R8, R9 otherwise.
	static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());

	  static const MCPhysReg GPR64ArgRegsWin64[] = {
	    X86::RCX, X86::RDX, X86::R8, X86::R9
	  };
	  static const MCPhysReg GPR64ArgRegs64Bit[] = {
	    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
	  };

	  const bool UseWin64Regs = Subtarget.isCallingConvWin64(CallConv);
	  if (UseWin64Regs)
	    return makeArrayRef(std::begin(GPR64ArgRegsWin64),
	                        std::end(GPR64ArgRegsWin64));
	  return makeArrayRef(std::begin(GPR64ArgRegs64Bit),
	                      std::end(GPR64ArgRegs64Bit));
	}

	// FIXME: Get this from tablegen.
	/// Return the XMM registers that may carry arguments under the given 64-bit
	/// calling convention. The list is empty for Win64 conventions and whenever
	/// soft-float, the NoImplicitFloat attribute or a missing SSE1 rule out XMM
	/// use; otherwise it is XMM0 through XMM7.
	static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
	                                                CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());

	  // The XMM registers which might contain var arg parameters are shadowed
	  // in their paired GPR. So we only need to save the GPR to their home
	  // slots.
	  // TODO: __vectorcall will change this.
	  if (Subtarget.isCallingConvWin64(CallConv))
	    return None;

	  const Function &Fn = MF.getFunction();
	  const bool NoImplicitFP = Fn.hasFnAttribute(Attribute::NoImplicitFloat);
	  const bool SoftFP = Subtarget.useSoftFloat();
	  assert(!(SoftFP && NoImplicitFP) &&
	         "SSE register cannot be used when SSE is disabled!");

	  // Kernel mode asks for SSE to be disabled, so there are no XMM argument
	  // registers.
	  const bool XMMUnavailable =
	      SoftFP || NoImplicitFP || !Subtarget.hasSSE1();
	  if (XMMUnavailable)
	    return None;

	  static const MCPhysReg XMMArgRegs64Bit[] = {
	    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	  };
	  return makeArrayRef(std::begin(XMMArgRegs64Bit),
	                      std::end(XMMArgRegs64Bit));
	}

	#ifndef NDEBUG
	/// Return true if the locations in \p ArgLocs appear in non-decreasing
	/// order of their value number. Only compiled when assertions are enabled.
	static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
	  auto Prev = ArgLocs.begin();
	  auto End = ArgLocs.end();
	  if (Prev == End)
	    return true;

	  // The list is unsorted as soon as an entry's value number is smaller
	  // than its predecessor's.
	  for (auto Cur = Prev + 1; Cur != End; ++Cur, ++Prev)
	    if (Cur->getValNo() < Prev->getValNo())
	      return false;
	  return true;
	}
	#endif

	/// Lower the incoming (formal) arguments of the current function.
	///
	/// Locations are assigned with CC_X86. Each argument is then materialized
	/// either as a copy from its live-in register or through LowerMemArgument,
	/// and one value per argument is appended to \p InVals. The function also
	/// records per-function state in X86MachineFunctionInfo: the sret return
	/// register, vararg frame indices and register save area, musttail
	/// forwarded registers, bytes to pop on return and the argument stack size.
	/// \returns the updated chain.
	SDValue X86TargetLowering::LowerFormalArguments(
	SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	MachineFunction &MF = DAG.getMachineFunction();
	X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

	// An externally visible "main" on Cygwin/MinGW targets always gets a frame
	// pointer.
	const Function &F = MF.getFunction();
	if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
	F.getName() == "main")
	FuncInfo->setForceFramePointer(true);

	MachineFrameInfo &MFI = MF.getFrameInfo();
	bool Is64Bit = Subtarget.is64Bit();
	bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

	assert(
	!(isVarArg && canGuaranteeTCO(CallConv)) &&
	"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

	// An interrupt handler takes one argument, or two where the second one is
	// a pointer-sized integer (i64 on 64-bit, i32 otherwise).
	if (CallConv == CallingConv::X86_INTR) {
	bool isLegal = Ins.size() == 1 \|\|
	(Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) \|\|
	(!Is64Bit && Ins[1].VT == MVT::i32)));
	if (!isLegal)
	report_fatal_error("X86 interrupts may take one or two arguments");
	}

	// Assign locations to all of the incoming arguments.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	// Allocate shadow area for Win64.
	if (IsWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeArguments(Ins, CC_X86);

	// In vectorcall calling convention a second pass is required for the HVA
	// types.
	if (CallingConv::X86_VectorCall == CallConv) {
	CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
	}

	// The next loop assumes that the locations are in the same order of the
	// input arguments.
	assert(isSortedByValueNo(ArgLocs) &&
	"Argument Location list must be sorted before lowering");

	// I indexes ArgLocs and InsIndex indexes Ins; they diverge when a custom
	// (split v64i1) argument consumes two locations below.
	SDValue ArgValue;
	for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++InsIndex) {
	assert(InsIndex < Ins.size() && "Invalid Ins index");
	CCValAssign &VA = ArgLocs[I];

	if (VA.isRegLoc()) {
	EVT RegVT = VA.getLocVT();
	if (VA.needsCustom()) {
	assert(
	VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");

	// v64i1 values, in regcall calling convention, that are
	// compiled to 32 bit arch, are split up into two registers.
	ArgValue =
	getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
	} else {
	// Pick the register class that matches the location type.
	const TargetRegisterClass *RC;
	if (RegVT == MVT::i32)
	RC = &X86::GR32RegClass;
	else if (Is64Bit && RegVT == MVT::i64)
	RC = &X86::GR64RegClass;
	else if (RegVT == MVT::f32)
	RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
	else if (RegVT == MVT::f64)
	RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
	else if (RegVT == MVT::f80)
	RC = &X86::RFP80RegClass;
	else if (RegVT == MVT::f128)
	RC = &X86::FR128RegClass;
	else if (RegVT.is512BitVector())
	RC = &X86::VR512RegClass;
	else if (RegVT.is256BitVector())
	RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
	else if (RegVT.is128BitVector())
	RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
	else if (RegVT == MVT::x86mmx)
	RC = &X86::VR64RegClass;
	else if (RegVT == MVT::v1i1)
	RC = &X86::VK1RegClass;
	else if (RegVT == MVT::v8i1)
	RC = &X86::VK8RegClass;
	else if (RegVT == MVT::v16i1)
	RC = &X86::VK16RegClass;
	else if (RegVT == MVT::v32i1)
	RC = &X86::VK32RegClass;
	else if (RegVT == MVT::v64i1)
	RC = &X86::VK64RegClass;
	else
	llvm_unreachable("Unknown argument type!");

	// Mark the physical register live-in and read the argument from the
	// register returned by addLiveIn.
	unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
	}

	// If this is an 8 or 16-bit value, it is really passed promoted to 32
	// bits. Insert an assert[sz]ext to capture this, then truncate to the
	// right size.
	if (VA.getLocInfo() == CCValAssign::SExt)
	ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
	DAG.getValueType(VA.getValVT()));
	else if (VA.getLocInfo() == CCValAssign::ZExt)
	ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
	DAG.getValueType(VA.getValVT()));
	else if (VA.getLocInfo() == CCValAssign::BCvt)
	ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

	if (VA.isExtInLoc()) {
	// Handle MMX values passed in XMM regs.
	if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
	ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
	else if (VA.getValVT().isVector() &&
	VA.getValVT().getScalarType() == MVT::i1 &&
	((VA.getLocVT() == MVT::i64) \|\| (VA.getLocVT() == MVT::i32) \|\|
	(VA.getLocVT() == MVT::i16) \|\| (VA.getLocVT() == MVT::i8))) {
	// Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
	} else
	ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
	}
	} else {
	assert(VA.isMemLoc());
	// Note that the Ins index, not the ArgLocs index, is passed here.
	ArgValue =
	LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
	}

	// If value is passed via pointer - do a load.
	if (VA.getLocInfo() == CCValAssign::Indirect)
	ArgValue =
	DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

	InVals.push_back(ArgValue);
	}

	for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
	// Swift calling convention does not require we copy the sret argument
	// into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
	if (CallConv == CallingConv::Swift)
	continue;

	// All x86 ABIs require that for returning structs by value we copy the
	// sret argument into %rax/%eax (depending on ABI) for the return. Save
	// the argument into a virtual register so that we can access it from the
	// return points.
	if (Ins[I].Flags.isSRet()) {
	unsigned Reg = FuncInfo->getSRetReturnReg();
	if (!Reg) {
	MVT PtrTy = getPointerTy(DAG.getDataLayout());
	Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
	FuncInfo->setSRetReturnReg(Reg);
	}
	SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
	// Only the first sret argument is handled.
	break;
	}
	}

	unsigned StackSize = CCInfo.getNextStackOffset();
	// Align stack specially for tail calls.
	if (shouldGuaranteeTCO(CallConv,
	MF.getTarget().Options.GuaranteedTailCallOpt))
	StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

	// If the function takes variable number of arguments, make a frame index for
	// the start of the first vararg value... for expansion of llvm.va_start. We
	// can skip this if there are no va_start calls.
	if (MFI.hasVAStart() &&
	(Is64Bit \|\| (CallConv != CallingConv::X86_FastCall &&
	CallConv != CallingConv::X86_ThisCall))) {
	FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
	}

	// Figure out if XMM registers are in use.
	assert(!(Subtarget.useSoftFloat() &&
	F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
	"SSE register cannot be used when SSE is disabled!");

	// 64-bit calling conventions support varargs and register parameters, so we
	// have to do extra work to spill them in the prologue.
	if (Is64Bit && isVarArg && MFI.hasVAStart()) {
	// Find the first unallocated argument registers.
	ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
	ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
	unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
	assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
	"SSE register cannot be used when SSE is disabled!");

	// Gather all the live in physical registers.
	SmallVector<SDValue, 6> LiveGPRs;
	SmallVector<SDValue, 8> LiveXMMRegs;
	SDValue ALVal;
	// Only the argument registers not already taken by named arguments are
	// collected.
	for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
	unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
	LiveGPRs.push_back(
	DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
	}
	if (!ArgXMMs.empty()) {
	// AL is read as well; its value is handed to the XMM save node below.
	unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
	ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
	for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
	unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
	LiveXMMRegs.push_back(
	DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
	}
	}

	if (IsWin64) {
	// Get to the caller-allocated home save location. Add 8 to account
	// for the return address.
	int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
	FuncInfo->setRegSaveFrameIndex(
	MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
	// Fixup to set vararg frame on shadow area (4 x i64).
	if (NumIntRegs < 4)
	FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
	} else {
	// For X86-64, if there are vararg parameters that are passed via
	// registers, then we must store them to their spots on the stack so
	// they may be loaded by dereferencing the result of va_next.
	FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
	FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
	// The save area holds 8 bytes per GPR followed by 16 bytes per XMM
	// register.
	FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
	ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
	}

	// Store the integer parameter registers.
	SmallVector<SDValue, 8> MemOps;
	SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
	getPointerTy(DAG.getDataLayout()));
	unsigned Offset = FuncInfo->getVarArgsGPOffset();
	for (SDValue Val : LiveGPRs) {
	SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
	RSFIN, DAG.getIntPtrConstant(Offset, dl));
	// Each store is chained on the register copy it stores (result 1 of Val).
	SDValue Store =
	DAG.getStore(Val.getValue(1), dl, Val, FIN,
	MachinePointerInfo::getFixedStack(
	DAG.getMachineFunction(),
	FuncInfo->getRegSaveFrameIndex(), Offset));
	MemOps.push_back(Store);
	Offset += 8;
	}

	if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
	// Now store the XMM (fp + vector) parameter registers.
	// Operands: chain, AL value, register save frame index, FP offset, then
	// the live XMM register values.
	SmallVector<SDValue, 12> SaveXMMOps;
	SaveXMMOps.push_back(Chain);
	SaveXMMOps.push_back(ALVal);
	SaveXMMOps.push_back(DAG.getIntPtrConstant(
	FuncInfo->getRegSaveFrameIndex(), dl));
	SaveXMMOps.push_back(DAG.getIntPtrConstant(
	FuncInfo->getVarArgsFPOffset(), dl));
	SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
	LiveXMMRegs.end());
	MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
	MVT::Other, SaveXMMOps));
	}

	// Join all the register saves into a single chain.
	if (!MemOps.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
	}

	if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
	// Find the largest legal vector type.
	MVT VecVT = MVT::Other;
	// FIXME: Only some x86_32 calling conventions support AVX512.
	if (Subtarget.hasAVX512() &&
	(Is64Bit \|\| (CallConv == CallingConv::X86_VectorCall \|\|
	CallConv == CallingConv::Intel_OCL_BI)))
	VecVT = MVT::v16f32;
	else if (Subtarget.hasAVX())
	VecVT = MVT::v8f32;
	else if (Subtarget.hasSSE2())
	VecVT = MVT::v4f32;

	// We forward some GPRs and some vector types.
	SmallVector<MVT, 2> RegParmTypes;
	MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
	RegParmTypes.push_back(IntVT);
	if (VecVT != MVT::Other)
	RegParmTypes.push_back(VecVT);

	// Compute the set of forwarded registers. The rest are scratch.
	SmallVectorImpl<ForwardedRegister> &Forwards =
	FuncInfo->getForwardedMustTailRegParms();
	CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

	// Conservatively forward AL on x86_64, since it might be used for varargs.
	if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
	unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
	Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
	}

	// Copy all forwards from physical to virtual registers.
	// Note: this loop variable F shadows the Function reference F above.
	for (ForwardedRegister &F : Forwards) {
	// FIXME: Can we use a less constrained schedule?
	SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
	// Replace the recorded register with a fresh virtual register holding
	// the copied value.
	F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
	Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
	}
	}

	// Some CCs need callee pop.
	if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
	MF.getTarget().Options.GuaranteedTailCallOpt)) {
	FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
	} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
	// X86 interrupts must pop the error code (and the alignment padding) if
	// present.
	FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
	} else {
	FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
	// If this is an sret function, the return should pop the hidden pointer.
	if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	!Subtarget.getTargetTriple().isOSMSVCRT() &&
	argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
	FuncInfo->setBytesToPopOnReturn(4);
	}

	if (!Is64Bit) {
	// RegSaveFrameIndex is X86-64 only.
	// NOTE(review): 0xAAAAAAA looks like a deliberately bogus placeholder
	// value -- confirm.
	FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
	if (CallConv == CallingConv::X86_FastCall \|\|
	CallConv == CallingConv::X86_ThisCall)
	// fastcc functions can't have varargs.
	FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
	}

	FuncInfo->setArgumentStackSize(StackSize);

	if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
	EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
	if (Personality == EHPersonality::CoreCLR) {
	assert(Is64Bit);
	// TODO: Add a mechanism to frame lowering that will allow us to indicate
	// that we'd prefer this slot be allocated towards the bottom of the frame
	// (i.e. near the stack pointer after allocating the frame). Every
	// funclet needs a copy of this slot in its (mostly empty) frame, and the
	// offset from the bottom of this and each funclet's frame must be the
	// same, so the size of funclets' (mostly empty) frames is dictated by
	// how far this slot is from the bottom (since they allocate just enough
	// space to accommodate holding this slot at the correct offset).
	// NOTE(review): "/isSS=/" below looks like a block comment whose
	// asterisks were lost when this text was extracted -- confirm.
	int PSPSymFI = MFI.CreateStackObject(8, 8, /isSS=/false);
	EHInfo->PSPSymFrameIdx = PSPSymFI;
	}
	}

	// For regcall functions and functions marked "no_caller_saved_registers",
	// every live-in register is removed from the callee-saved set.
	if (CallConv == CallingConv::X86_RegCall \|\|
	F.hasFnAttribute("no_caller_saved_registers")) {
	MachineRegisterInfo &MRI = MF.getRegInfo();
	for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
	MRI.disableCalleeSavedRegister(Pair.first);
	}

	return Chain;
	}

	/// Emit the node that places outgoing argument \p Arg into its stack slot.
	/// The slot address is \p StackPtr plus the memory offset recorded in
	/// \p VA. A byval argument is copied with CreateCopyOfByValArgument; any
	/// other argument is written with a plain store.
	SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
	                                            SDValue Arg, const SDLoc &dl,
	                                            SelectionDAG &DAG,
	                                            const CCValAssign &VA,
	                                            ISD::ArgFlagsTy Flags) const {
	  const unsigned SlotOffset = VA.getLocMemOffset();

	  // Address of the argument slot relative to the stack pointer.
	  SDValue OffsetNode = DAG.getIntPtrConstant(SlotOffset, dl);
	  SDValue SlotAddr = DAG.getNode(
	      ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, OffsetNode);

	  if (!Flags.isByVal())
	    return DAG.getStore(
	        Chain, dl, Arg, SlotAddr,
	        MachinePointerInfo::getStack(DAG.getMachineFunction(), SlotOffset));

	  return CreateCopyOfByValArgument(Arg, SlotAddr, Chain, Flags, DAG, dl);
	}

	/// Emit a load of the return address from its frame slot, for use when
	/// tail call optimization is performed and the load is required.
	/// \param OutRetAddr receives the load node of the "old" return address.
	/// \returns the chain result (result 1) of that load.
	SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
	    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
	    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
	  // Locate the return address stack slot.
	  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // Load the "old" return address and hand back the load's chain.
	  OutRetAddr =
	      DAG.getLoad(PtrVT, dl, Chain, RetAddrSlot, MachinePointerInfo());
	  return OutRetAddr.getValue(1);
	}

	/// Emit a store of the return address if tail call optimization is
	/// performed and it is required (FPDiff != 0).
	/// \returns \p Chain unchanged when \p FPDiff is zero, otherwise the chain
	/// of the store that writes \p RetAddrFrIdx into a new fixed stack slot at
	/// offset FPDiff - SlotSize.
	static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
	                                        SDValue Chain, SDValue RetAddrFrIdx,
	                                        EVT PtrVT, unsigned SlotSize,
	                                        int FPDiff, const SDLoc &dl) {
	  // Nothing moves when the frame pointer delta is zero.
	  if (FPDiff == 0)
	    return Chain;

	  // Create the new stack slot for the return address.
	  const int64_t SlotOffset = static_cast<int64_t>(FPDiff) - SlotSize;
	  int RelocatedFI =
	      MF.getFrameInfo().CreateFixedObject(SlotSize, SlotOffset, false);
	  SDValue RelocatedSlot = DAG.getFrameIndex(RelocatedFI, PtrVT);

	  // Store the return address into that slot.
	  return DAG.getStore(Chain, dl, RetAddrFrIdx, RelocatedSlot,
	                      MachinePointerInfo::getFixedStack(
	                          DAG.getMachineFunction(), RelocatedFI));
	}

	/// Returns a vector_shuffle for a movs{s|d}, movd operation of the
	/// specified width. The mask is <NumElems, 1, 2, ..., NumElems - 1>: its
	/// first entry is the index NumElems and the remaining entries are their
	/// own positions.
	static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
	                       SDValue V2) {
	  const unsigned Count = VT.getVectorNumElements();

	  SmallVector<int, 8> ShuffleMask;
	  ShuffleMask.push_back(Count);
	  unsigned Idx = 1;
	  while (Idx != Count) {
	    ShuffleMask.push_back(Idx);
	    ++Idx;
	  }

	  return DAG.getVectorShuffle(VT, dl, V1, V2, ShuffleMask);
	}

	SDValue
	X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &dl = CLI.DL;
	SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	CallingConv::ID CallConv = CLI.CallConv;
	bool &isTailCall = CLI.IsTailCall;
	bool isVarArg = CLI.IsVarArg;

	MachineFunction &MF = DAG.getMachineFunction();
	bool Is64Bit = Subtarget.is64Bit();
	bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
	StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
	bool IsSibcall = false;
	X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
	auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
	const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
	const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
	bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) \|\|
	(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));

	if (CallConv == CallingConv::X86_INTR)
	report_fatal_error("X86 interrupts may not be called directly");

	if (Attr.getValueAsString() == "true")
	isTailCall = false;

	if (Subtarget.isPICStyleGOT() &&
	!MF.getTarget().Options.GuaranteedTailCallOpt) {
	// If we are using a GOT, disable tail calls to external symbols with
	// default visibility. Tail calling such a symbol requires using a GOT
	// relocation, which forces early binding of the symbol. This breaks code
	// that require lazy function symbol resolution. Using musttail or
	// GuaranteedTailCallOpt will override this.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (!G \|\| (!G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility()))
	isTailCall = false;
	}

	bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
	if (IsMustTail) {
	// Force this to be a tail call. The verifier rules are enough to ensure
	// that we can lower this successfully without moving the return address
	// around.
	isTailCall = true;
	} else if (isTailCall) {
	// Check if it's really possible to do a tail call.
	isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
	isVarArg, SR != NotStructReturn,
	MF.getFunction().hasStructRetAttr(), CLI.RetTy,
	Outs, OutVals, Ins, DAG);

	// Sibcalls are automatically detected tailcalls which do not require
	// ABI changes.
	if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
	IsSibcall = true;

	if (isTailCall)
	++NumTailCalls;
	}

	assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
	"Var args not supported with calling convention fastcc, ghc or hipe");

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	// Allocate shadow area for Win64.
	if (IsWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeArguments(Outs, CC_X86);

	// In vectorcall calling convention a second pass is required for the HVA
	// types.
	if (CallingConv::X86_VectorCall == CallConv) {
	CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
	}

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
	if (IsSibcall)
	// This is a sibcall. The memory operands are available in caller's
	// own caller's stack.
	NumBytes = 0;
	else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
	canGuaranteeTCO(CallConv))
	NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

	int FPDiff = 0;
	if (isTailCall && !IsSibcall && !IsMustTail) {
	// Lower arguments at fp - stackoffset + fpdiff.
	unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

	FPDiff = NumBytesCallerPushed - NumBytes;

	// Set the delta of movement of the returnaddr stackslot.
	// But only set if delta is greater than previous delta.
	if (FPDiff < X86Info->getTCReturnAddrDelta())
	X86Info->setTCReturnAddrDelta(FPDiff);
	}

	unsigned NumBytesToPush = NumBytes;
	unsigned NumBytesToPop = NumBytes;

	// If we have an inalloca argument, all stack space has already been allocated
	// for us and be right at the top of the stack. We don't support multiple
	// arguments passed in memory when using inalloca.
	if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
	NumBytesToPush = 0;
	if (!ArgLocs.back().isMemLoc())
	report_fatal_error("cannot use inalloca attribute on a register "
	"parameter");
	if (ArgLocs.back().getLocMemOffset() != 0)
	report_fatal_error("any parameter with the inalloca attribute must be "
	"the only memory argument");
	}

	if (!IsSibcall)
	Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
	NumBytes - NumBytesToPush, dl);

	SDValue RetAddrFrIdx;
	// Load return address for tail calls.
	if (isTailCall && FPDiff)
	Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
	Is64Bit, FPDiff, dl);

	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	SmallVector<SDValue, 8> MemOpChains;
	SDValue StackPtr;

	// The next loop assumes that the locations are in the same order of the
	// input arguments.
	assert(isSortedByValueNo(ArgLocs) &&
	"Argument Location list must be sorted before lowering");

	// Walk the register/memloc assignments, inserting copies/loads. In the case
	// of tail call optimization arguments are handle later.
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutIndex) {
	assert(OutIndex < Outs.size() && "Invalid Out index");
	// Skip inalloca arguments, they have already been written.
	ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
	if (Flags.isInAlloca())
	continue;

	CCValAssign &VA = ArgLocs[I];
	EVT RegVT = VA.getLocVT();
	SDValue Arg = OutVals[OutIndex];
	bool isByVal = Flags.isByVal();

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	default: llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full: break;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::AExt:
	if (Arg.getValueType().isVector() &&
	Arg.getValueType().getVectorElementType() == MVT::i1)
	Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
	else if (RegVT.is128BitVector()) {
	// Special case: passing MMX values in XMM registers.
	Arg = DAG.getBitcast(MVT::i64, Arg);
	Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
	Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
	} else
	Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getBitcast(RegVT, Arg);
	break;
	case CCValAssign::Indirect: {
	// Store the argument.
	SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
	int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
	Chain = DAG.getStore(
	Chain, dl, Arg, SpillSlot,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	Arg = SpillSlot;
	break;
	}
	}

	if (VA.needsCustom()) {
	assert(VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");
	// Split v64i1 value into two registers
	Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
	Subtarget);
	} else if (VA.isRegLoc()) {
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	if (isVarArg && IsWin64) {
	// Win64 ABI requires argument XMM reg to be copied to the corresponding
	// shadow reg if callee is a varargs function.
	unsigned ShadowReg = 0;
	switch (VA.getLocReg()) {
	case X86::XMM0: ShadowReg = X86::RCX; break;
	case X86::XMM1: ShadowReg = X86::RDX; break;
	case X86::XMM2: ShadowReg = X86::R8; break;
	case X86::XMM3: ShadowReg = X86::R9; break;
	}
	if (ShadowReg)
	RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
	}
	} else if (!IsSibcall && (!isTailCall \|\| isByVal)) {
	assert(VA.isMemLoc());
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
	dl, DAG, VA, Flags));
	}
	}

	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

	if (Subtarget.isPICStyleGOT()) {
	// ELF / PIC requires GOT in the EBX register before function calls via PLT
	// GOT pointer.
	if (!isTailCall) {
	RegsToPass.push_back(std::make_pair(
	unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
	getPointerTy(DAG.getDataLayout()))));
	} else {
	// If we are tail calling and generating PIC/GOT style code load the
	// address of the callee into ECX. The value in ecx is used as target of
	// the tail jump. This is done to circumvent the ebx/callee-saved problem
	// for tail calls on PIC/GOT architectures. Normally we would just put the
	// address of GOT into ebx and then call target@PLT. But for tail calls
	// ebx would be restored (since ebx is callee saved) before jumping to the
	// target@PLT.

	// Note: The actual moving to ECX is done further down.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (G && !G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility())
	Callee = LowerGlobalAddress(Callee, DAG);
	else if (isa<ExternalSymbolSDNode>(Callee))
	Callee = LowerExternalSymbol(Callee, DAG);
	}
	}

	if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
	// From AMD64 ABI document:
	// For calls that may call functions that use varargs or stdargs
	// (prototype-less calls or calls to functions containing ellipsis (...) in
	// the declaration) %al is used as hidden argument to specify the number
	// of SSE registers used. The contents of %al do not need to match exactly
	// the number of registers, but must be an ubound on the number of SSE
	// registers used and is in the range 0 - 8 inclusive.

	// Count the number of XMM registers allocated.
	static const MCPhysReg XMMArgRegs[] = {
	X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	};
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
	assert((Subtarget.hasSSE1() \|\| !NumXMMRegs)
	&& "SSE registers cannot be used when SSE is disabled");

	RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
	DAG.getConstant(NumXMMRegs, dl,
	MVT::i8)));
	}

	if (isVarArg && IsMustTail) {
	const auto &Forwards = X86Info->getForwardedMustTailRegParms();
	for (const auto &F : Forwards) {
	SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
	RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
	}
	}

	// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
	// don't need this because the eligibility check rejects calls that require
	// shuffling arguments passed in memory.
	if (!IsSibcall && isTailCall) {
	// Force all the incoming stack arguments to be loaded from the stack
	// before any new outgoing arguments are stored to the stack, because the
	// outgoing stack slots may alias the incoming argument stack slots, and
	// the alias isn't otherwise explicit. This is slightly more conservative
	// than necessary, because it means that each store effectively depends
	// on every argument instead of just those arguments it would clobber.
	SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

	SmallVector<SDValue, 8> MemOpChains2;
	SDValue FIN;
	int FI = 0;
	for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutsIndex) {
	CCValAssign &VA = ArgLocs[I];

	if (VA.isRegLoc()) {
	if (VA.needsCustom()) {
	assert((CallConv == CallingConv::X86_RegCall) &&
	"Expecting custom case only in regcall calling convention");
	// This means that we are in special case where one argument was
	// passed through two register locations - Skip the next location
	++I;
	}

	continue;
	}

	assert(VA.isMemLoc());
	SDValue Arg = OutVals[OutsIndex];
	ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
	// Skip inalloca arguments. They don't require any work.
	if (Flags.isInAlloca())
	continue;
	// Create frame index.
	int32_t Offset = VA.getLocMemOffset()+FPDiff;
	uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
	FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
	FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

	if (Flags.isByVal()) {
	// Copy relative to framepointer.
	SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
	StackPtr, Source);

	MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
	ArgChain,
	Flags, DAG, dl));
	} else {
	// Store relative to framepointer.
	MemOpChains2.push_back(DAG.getStore(
	ArgChain, dl, Arg, FIN,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
	}
	}

	if (!MemOpChains2.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

	// Store the return address to the appropriate stack slot.
	Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
	getPointerTy(DAG.getDataLayout()),
	RegInfo->getSlotSize(), FPDiff, dl);
	}

	// Build a sequence of copy-to-reg nodes chained together with token chain
	// and flag operands which copy the outgoing args into registers.
	SDValue InFlag;
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
	RegsToPass[i].second, InFlag);
	InFlag = Chain.getValue(1);
	}

	if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
	assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
	// In the 64-bit large code model, we have to make all calls
	// through a register, since the call instruction's 32-bit
	// pc-relative offset may not be large enough to hold the whole
	// address.
	} else if (Callee->getOpcode() == ISD::GlobalAddress) {
	// If the callee is a GlobalAddress node (quite common, every direct call
	// is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
	// it.
	GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);

	// We should use extra load for direct calls to dllimported functions in
	// non-JIT mode.
	const GlobalValue *GV = G->getGlobal();
	if (!GV->hasDLLImportStorageClass()) {
	unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);

	Callee = DAG.getTargetGlobalAddress(
	GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);

	if (OpFlags == X86II::MO_GOTPCREL) {
	// Add a wrapper.
	Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
	getPointerTy(DAG.getDataLayout()), Callee);
	// Add extra indirection
	Callee = DAG.getLoad(
	getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
	MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	}
	}
	} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
	unsigned char OpFlags =
	Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);

	Callee = DAG.getTargetExternalSymbol(
	S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
	} else if (Subtarget.isTarget64BitILP32() &&
	Callee->getValueType(0) == MVT::i32) {
	// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
	Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
	}

	// Returns a chain & a flag for retval copy to use.
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	SmallVector<SDValue, 8> Ops;

	if (!IsSibcall && isTailCall) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	Ops.push_back(Chain);
	Ops.push_back(Callee);

	if (isTailCall)
	Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));

	// Add argument registers to the end of the list so that they are known live
	// into the call.
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
	Ops.push_back(DAG.getRegister(RegsToPass[i].first,
	RegsToPass[i].second.getValueType()));

	// Add a register mask operand representing the call-preserved registers.
	// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
	// set X86_INTR calling convention because it has the same CSR mask
	// (same preserved registers).
	const uint32_t *Mask = RegInfo->getCallPreservedMask(
	MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
	assert(Mask && "Missing call preserved mask for calling convention");

	// If this is an invoke in a 32-bit function using a funclet-based
	// personality, assume the function clobbers all registers. If an exception
	// is thrown, the runtime will not restore CSRs.
	// FIXME: Model this more precisely so that we can register allocate across
	// the normal edge and spill and fill across the exceptional edge.
	if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
	const Function &CallerFn = MF.getFunction();
	EHPersonality Pers =
	CallerFn.hasPersonalityFn()
	? classifyEHPersonality(CallerFn.getPersonalityFn())
	: EHPersonality::Unknown;
	if (isFuncletEHPersonality(Pers))
	Mask = RegInfo->getNoPreservedMask();
	}

	// Define a new register mask from the existing mask.
	uint32_t *RegMask = nullptr;

	// In some calling conventions we need to remove the used physical registers
	// from the reg mask.
	if (CallConv == CallingConv::X86_RegCall \|\| HasNCSR) {
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	// Allocate a new Reg Mask and copy Mask.
	RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
	unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
	memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);

	// Make sure all sub registers of the argument registers are reset
	// in the RegMask.
	for (auto const &RegPair : RegsToPass)
	for (MCSubRegIterator SubRegs(RegPair.first, TRI, /IncludeSelf=/true);
	SubRegs.isValid(); ++SubRegs)
	RegMask[SubRegs / 32] &= ~(1u << (SubRegs % 32));

	// Create the RegMask Operand according to our updated mask.
	Ops.push_back(DAG.getRegisterMask(RegMask));
	} else {
	// Create the RegMask Operand according to the static mask.
	Ops.push_back(DAG.getRegisterMask(Mask));
	}

	if (InFlag.getNode())
	Ops.push_back(InFlag);

	if (isTailCall) {
	// We used to do:
	//// If this is the first return lowered for this function, add the regs
	//// to the liveout set for the function.
	// This isn't right, although it's probably harmless on x86; liveouts
	// should be computed from returns not tail calls. Consider a void
	// function making a tail call to a function returning int.
	MF.getFrameInfo().setHasTailCall();
	return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
	}

	Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
	InFlag = Chain.getValue(1);

	// Create the CALLSEQ_END node.
	unsigned NumBytesForCalleeToPop;
	if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
	DAG.getTarget().Options.GuaranteedTailCallOpt))
	NumBytesForCalleeToPop = NumBytes; // Callee pops everything
	else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	!Subtarget.getTargetTriple().isOSMSVCRT() &&
	SR == StackStructReturn)
	// If this is a call to a struct-return function, the callee
	// pops the hidden struct pointer, so we have to push it back.
	// This is common for Darwin/X86, Linux & Mingw32 targets.
	// For MSVC Win32 targets, the caller pops the hidden struct pointer.
	NumBytesForCalleeToPop = 4;
	else
	NumBytesForCalleeToPop = 0; // Callee pops nothing.

	if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
	// No need to reset the stack after the call if the call doesn't return. To
	// make the MI verify, we'll pretend the callee does it for us.
	NumBytesForCalleeToPop = NumBytes;
	}

	// Returns a flag for retval copy to use.
	if (!IsSibcall) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
	true),
	InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	// Handle result values, copying them out of physregs into vregs that we
	// return.
	return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
	InVals, RegMask);
	}

	//===----------------------------------------------------------------------===//
	// Fast Calling Convention (tail call) implementation
	//===----------------------------------------------------------------------===//

	// Like stdcall, the callee cleans up its arguments, except that ECX is
	// reserved for storing the tail called function address. Only 2 registers are
	// free for argument passing (inreg). Tail call optimization is performed
	// provided:
	// * tailcallopt is enabled
	// * caller/callee are fastcc
	// On X86_64 architecture with GOT-style position independent code only local
	// (within module) calls are supported at the moment.
	// To keep the stack aligned according to platform abi the function
	// GetAlignedArgumentStackSize ensures that argument delta is always multiples
	// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
	// If a tail called function callee has more arguments than the caller the
	// caller needs to make sure that there is room to move the RETADDR to. This is
	// achieved by reserving an area the size of the argument delta right after the
	// original RETADDR, but before the saved framepointer or the spilled registers
	// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
	// stack layout:
	// arg1
	// arg2
	// RETADDR
	// [ new RETADDR
	// move area ]
	// (possible EBP)
	// ESI
	// EDI
	// local1 ..

	/// Pad \p StackSize so that it is congruent to (StackAlignment - SlotSize)
	/// modulo the stack alignment, e.g. of the form 16n + 12 for a 16-byte
	/// alignment requirement with 4-byte slots. The result is the smallest such
	/// value that is not below \p StackSize.
	unsigned
	X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
	                                               SelectionDAG& DAG) const {
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	  unsigned StackAlignment = TFI.getStackAlignment();
	  unsigned SlotSize = RegInfo->getSlotSize();

	  uint64_t AlignMask = StackAlignment - 1;
	  int64_t Offset = StackSize;

	  // Residue we want the final size to have within one alignment unit, and
	  // the residue the incoming size currently has.
	  const unsigned WantedResidue = StackAlignment - SlotSize;
	  const uint64_t CurrentResidue = Offset & AlignMask;

	  if (CurrentResidue > WantedResidue) {
	    // Already past the wanted residue: drop the low bits, step to the next
	    // alignment unit and add the wanted residue there.
	    Offset = ((~AlignMask) & Offset) + StackAlignment + WantedResidue;
	  } else {
	    // Still at or below the wanted residue: just add the difference.
	    Offset += WantedResidue - CurrentResidue;
	  }
	  return Offset;
	}

	/// Return true if the given stack call argument is already available in the
	/// same position (relatively) of the caller's incoming argument stack.
	///
	/// \p Arg is the outgoing value, \p Offset the stack offset assigned to it for
	/// the call, \p Flags its argument flags and \p VA its assigned location.
	/// The function succeeds only when \p Arg can be traced back to a fixed frame
	/// object whose offset equals \p Offset and whose size equals the size of the
	/// argument (the byval size for byval arguments).
	static
	bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
	MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
	const X86InstrInfo *TII, const CCValAssign &VA) {
	// Size of the outgoing value in bytes, taken before any nodes are peeled
	// off below. Replaced by the byval size on the byval paths.
	unsigned Bytes = Arg.getValueSizeInBits() / 8;

	for (;;) {
	// Look through nodes that don't alter the bits of the incoming value.
	unsigned Op = Arg.getOpcode();
	if (Op == ISD::ZERO_EXTEND \|\| Op == ISD::ANY_EXTEND \|\| Op == ISD::BITCAST) {
	Arg = Arg.getOperand(0);
	continue;
	}
	// A TRUNCATE is only looked through when its input is an AssertZext whose
	// asserted type is exactly the truncated type.
	if (Op == ISD::TRUNCATE) {
	const SDValue &TruncInput = Arg.getOperand(0);
	if (TruncInput.getOpcode() == ISD::AssertZext &&
	cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
	Arg.getValueType()) {
	Arg = TruncInput.getOperand(0);
	continue;
	}
	}
	break;
	}

	// INT_MAX is a "not found" sentinel; every path below either assigns FI or
	// returns false.
	int FI = INT_MAX;
	if (Arg.getOpcode() == ISD::CopyFromReg) {
	// The value comes from a register: only virtual registers with a known
	// defining instruction can be traced to a frame index.
	unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
	if (!TargetRegisterInfo::isVirtualRegister(VR))
	return false;
	MachineInstr *Def = MRI->getVRegDef(VR);
	if (!Def)
	return false;
	if (!Flags.isByVal()) {
	// Non-byval: the definition must be a load from a stack slot; FI is
	// passed so the callee can report which slot (presumably set on success).
	if (!TII->isLoadFromStackSlot(*Def, FI))
	return false;
	} else {
	// Byval: the register holds an address, so the definition must be an LEA
	// whose operand 1 is a frame index.
	unsigned Opcode = Def->getOpcode();
	if ((Opcode == X86::LEA32r \|\| Opcode == X86::LEA64r \|\|
	Opcode == X86::LEA64_32r) &&
	Def->getOperand(1).isFI()) {
	FI = Def->getOperand(1).getIndex();
	Bytes = Flags.getByValSize();
	} else
	return false;
	}
	} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
	if (Flags.isByVal())
	// ByVal argument is passed in as a pointer but it's now being
	// dereferenced. e.g.
	// define @foo(%struct.X* %A) {
	// tail call @bar(%struct.X* byval %A)
	// }
	return false;
	// The load must read directly from a frame index.
	SDValue Ptr = Ld->getBasePtr();
	FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
	if (!FINode)
	return false;
	FI = FINode->getIndex();
	} else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
	// Byval argument given directly as a frame index.
	FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
	FI = FINode->getIndex();
	Bytes = Flags.getByValSize();
	} else
	return false;

	assert(FI != INT_MAX);
	// Only fixed frame objects qualify.
	if (!MFI.isFixedObjectIndex(FI))
	return false;

	// The object must sit at exactly the offset the call assigns to the argument.
	if (Offset != MFI.getObjectOffset(FI))
	return false;

	// If this is not byval, check that the argument stack object is immutable.
	// inalloca and argument copy elision can create mutable argument stack
	// objects. Byval objects can be mutated, but a byval call intends to pass the
	// mutated memory.
	if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
	return false;

	if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
	// If the argument location is wider than the argument type, check that any
	// extension flags match.
	if (Flags.isZExt() != MFI.isObjectZExt(FI) \|\|
	Flags.isSExt() != MFI.isObjectSExt(FI)) {
	return false;
	}
	}

	// Finally the sizes must agree.
	return Bytes == MFI.getObjectSize(FI);
	}

	/// Check whether the call is eligible for tail call optimization. Targets
	/// that want to do tail call optimization should implement this function.
	///
	/// With GuaranteedTailCallOpt enabled, the answer depends only on the calling
	/// conventions (after the early convention, x86_fp80 and Win64 checks).
	/// Otherwise the call is accepted only if it passes the sibcall checks below:
	/// stack realignment, struct return, varargs, x87 results, result and
	/// callee-saved register compatibility, stack argument layout and
	/// callee-pop agreement.
	bool X86TargetLowering::IsEligibleForTailCallOptimization(
	SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
	bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
	if (!mayTailCallThisCC(CalleeCC))
	return false;

	MachineFunction &MF = DAG.getMachineFunction();
	const Function &CallerF = MF.getFunction();

	// If the function return type is x86_fp80 and the callee return type is not,
	// then the FP_EXTEND of the call result is not a nop. It's not safe to
	// perform a tailcall optimization here.
	if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
	return false;

	CallingConv::ID CallerCC = CallerF.getCallingConv();
	bool CCMatch = CallerCC == CalleeCC;
	bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
	bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);

	// Win64 functions have extra shadow space for argument homing. Don't do the
	// sibcall if the caller and callee have mismatched expectations for this
	// space.
	if (IsCalleeWin64 != IsCallerWin64)
	return false;

	// If -tailcallopt is specified (GuaranteedTailCallOpt), the call is
	// tail-callable exactly when the callee's convention can guarantee TCO and
	// matches the caller's; none of the sibcall checks below apply.
	if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
	if (canGuaranteeTCO(CalleeCC) && CCMatch)
	return true;
	return false;
	}

	// Look for obvious safe cases to perform tail call optimization that do not
	// require ABI changes. This is what gcc calls sibcall.

	// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
	// emit a special epilogue.
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	if (RegInfo->needsStackRealignment(MF))
	return false;

	// Also avoid sibcall optimization if either caller or callee uses struct
	// return semantics.
	if (isCalleeStructRet \|\| isCallerStructRet)
	return false;

	// Do not sibcall optimize vararg calls unless all arguments are passed via
	// registers.
	LLVMContext &C = *DAG.getContext();
	if (isVarArg && !Outs.empty()) {
	// Optimizing for varargs on Win64 is unlikely to be safe without
	// additional testing.
	if (IsCalleeWin64 \|\| IsCallerWin64)
	return false;

	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
	if (!ArgLocs[i].isRegLoc())
	return false;
	}

	// If the call result is in ST0 / ST1, it needs to be popped off the x87
	// stack. Therefore, if it's not used by the call it is not safe to optimize
	// this into a sibcall.
	bool Unused = false;
	for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
	if (!Ins[i].Used) {
	Unused = true;
	break;
	}
	}
	if (Unused) {
	// At least one result is unused: reject if any result lives in FP0 / FP1.
	SmallVector<CCValAssign, 16> RVLocs;
	CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
	CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
	for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
	CCValAssign &VA = RVLocs[i];
	if (VA.getLocReg() == X86::FP0 \|\| VA.getLocReg() == X86::FP1)
	return false;
	}
	}

	// Check that the call results are passed in the same way.
	if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
	RetCC_X86, RetCC_X86))
	return false;
	// The callee has to preserve all registers the caller needs to preserve.
	// When the conventions are identical the masks are the same, so the subset
	// check is only done on a mismatch.
	const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	if (!CCMatch) {
	const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	return false;
	}

	// Bytes of stack the callee's arguments occupy; stays 0 when there are no
	// outgoing arguments. Used for the callee-pop check at the end.
	unsigned StackArgsSize = 0;

	// If the callee takes no arguments then go on to check the results of the
	// call.
	if (!Outs.empty()) {
	// Check if stack adjustment is needed. Arguments passed on the stack are
	// only accepted when they already sit in the matching caller stack slots
	// (see the MatchingStackOffset loop below).
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	// Allocate shadow area for Win64
	if (IsCalleeWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	StackArgsSize = CCInfo.getNextStackOffset();

	if (CCInfo.getNextStackOffset()) {
	// Check if the arguments are already laid out in the right way as
	// the caller's fixed stack objects.
	MachineFrameInfo &MFI = MF.getFrameInfo();
	const MachineRegisterInfo *MRI = &MF.getRegInfo();
	const X86InstrInfo *TII = Subtarget.getInstrInfo();
	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign &VA = ArgLocs[i];
	SDValue Arg = OutVals[i];
	ISD::ArgFlagsTy Flags = Outs[i].Flags;
	// Indirectly passed arguments are never accepted here.
	if (VA.getLocInfo() == CCValAssign::Indirect)
	return false;
	if (!VA.isRegLoc()) {
	if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
	MFI, MRI, TII, VA))
	return false;
	}
	}
	}

	bool PositionIndependent = isPositionIndependent();
	// If the tailcall address may be in a register, then make sure it's
	// possible to register allocate for it. In 32-bit, the call address can
	// only target EAX, EDX, or ECX since the tail call must be scheduled after
	// callee-saved registers are restored. These happen to be the same
	// registers used to pass 'inreg' arguments so watch out for those.
	if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
	!isa<ExternalSymbolSDNode>(Callee)) \|\|
	PositionIndependent)) {
	unsigned NumInRegs = 0;
	// In PIC we need an extra register to formulate the address computation
	// for the callee.
	unsigned MaxInRegs = PositionIndependent ? 2 : 3;

	// Reject as soon as the number of arguments assigned to EAX/EDX/ECX
	// reaches MaxInRegs.
	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign &VA = ArgLocs[i];
	if (!VA.isRegLoc())
	continue;
	unsigned Reg = VA.getLocReg();
	switch (Reg) {
	default: break;
	case X86::EAX: case X86::EDX: case X86::ECX:
	if (++NumInRegs == MaxInRegs)
	return false;
	break;
	}
	}
	}

	const MachineRegisterInfo &MRI = MF.getRegInfo();
	if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
	return false;
	}

	bool CalleeWillPop =
	X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
	MF.getTarget().Options.GuaranteedTailCallOpt);

	if (unsigned BytesToPop =
	MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
	// If we have bytes to pop, the callee must pop them.
	bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
	if (!CalleePopMatches)
	return false;
	} else if (CalleeWillPop && StackArgsSize > 0) {
	// If we don't have bytes to pop, make sure the callee doesn't pop any.
	return false;
	}

	return true;
	}

	/// Create the FastISel object for this target by forwarding \p funcInfo and
	/// \p libInfo unchanged to X86::createFastISel.
	FastISel *
	X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
	const TargetLibraryInfo *libInfo) const {
	return X86::createFastISel(funcInfo, libInfo);
	}

	//===----------------------------------------------------------------------===//
	// Other Lowering Hooks
	//===----------------------------------------------------------------------===//

	/// Return true if \p Op has exactly one use and its node is accepted by
	/// ISD::isNormalLoad.
	static bool MayFoldLoad(SDValue Op) {
	  // A value with several users is never reported as foldable.
	  if (!Op.hasOneUse())
	    return false;
	  return ISD::isNormalLoad(Op.getNode());
	}

	/// Return true if \p Op has exactly one use and that single user is accepted
	/// by ISD::isNormalStore.
	static bool MayFoldIntoStore(SDValue Op) {
	  // Only inspect the first user once we know it is the only one.
	  if (!Op.hasOneUse())
	    return false;
	  return ISD::isNormalStore(*Op.getNode()->use_begin());
	}

	/// Return true if \p Op has exactly one use and that single user is an
	/// ISD::ZERO_EXTEND node.
	static bool MayFoldIntoZeroExtend(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  // The sole user decides the answer.
	  return Op.getNode()->use_begin()->getOpcode() == ISD::ZERO_EXTEND;
	}

	/// Return true if \p Opcode is one of the X86ISD opcodes listed below, i.e.
	/// the opcodes this file treats as target shuffles.
	static bool isTargetShuffle(unsigned Opcode) {
	  switch (Opcode) {
	  case X86ISD::BLENDI:     case X86ISD::PSHUFB:     case X86ISD::PSHUFD:
	  case X86ISD::PSHUFHW:    case X86ISD::PSHUFLW:    case X86ISD::SHUFP:
	  case X86ISD::INSERTPS:   case X86ISD::EXTRQI:     case X86ISD::INSERTQI:
	  case X86ISD::PALIGNR:    case X86ISD::VSHLDQ:     case X86ISD::VSRLDQ:
	  case X86ISD::MOVLHPS:    case X86ISD::MOVHLPS:    case X86ISD::MOVLPS:
	  case X86ISD::MOVLPD:     case X86ISD::MOVSHDUP:   case X86ISD::MOVSLDUP:
	  case X86ISD::MOVDDUP:    case X86ISD::MOVSS:      case X86ISD::MOVSD:
	  case X86ISD::UNPCKL:     case X86ISD::UNPCKH:     case X86ISD::VBROADCAST:
	  case X86ISD::VPERMILPI:  case X86ISD::VPERMILPV:  case X86ISD::VPERM2X128:
	  case X86ISD::VPERMIL2:   case X86ISD::VPERMI:     case X86ISD::VPPERM:
	  case X86ISD::VPERMV:     case X86ISD::VPERMV3:    case X86ISD::VPERMIV3:
	  case X86ISD::VZEXT_MOVL:
	    return true;
	  default:
	    // Anything not listed above is not a target shuffle.
	    return false;
	  }
	}

	/// Return true if \p Opcode is one of the variable-mask shuffle opcodes listed
	/// below, including the 'faux' shuffles ISD::AND and X86ISD::ANDNP.
	static bool isTargetShuffleVariableMask(unsigned Opcode) {
	  switch (Opcode) {
	  // Target Shuffles.
	  case X86ISD::PSHUFB:
	  case X86ISD::VPERMILPV:
	  case X86ISD::VPERMIL2:
	  case X86ISD::VPPERM:
	  case X86ISD::VPERMV:
	  case X86ISD::VPERMV3:
	  case X86ISD::VPERMIV3:
	  // 'Faux' Target Shuffles.
	  case ISD::AND:
	  case X86ISD::ANDNP:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return a frame-index node for the return address slot. The index is cached
	/// in X86MachineFunctionInfo; when the cached value is 0 a fixed object of one
	/// slot at offset -SlotSize is created first and its index is stored.
	SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();

	  int RAIndex = X86FI->getRAIndex();
	  if (RAIndex == 0) {
	    // No frame object recorded yet: create one for the return address and
	    // remember it for later queries.
	    const unsigned SlotSize = RegInfo->getSlotSize();
	    RAIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
	                                                  -(int64_t)SlotSize,
	                                                  false);
	    X86FI->setRAIndex(RAIndex);
	  }

	  return DAG.getFrameIndex(RAIndex, getPointerTy(DAG.getDataLayout()));
	}

	/// Return true if \p Offset may be used as a displacement under code model
	/// \p M.
	///
	/// \param Offset the displacement to check; it must fit in a signed 32-bit
	///        immediate in every case.
	/// \param M the code model in effect.
	/// \param hasSymbolicDisplacement true if the displacement is combined with a
	///        symbol; only then do the code-model specific limits apply.
	bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
	                                       bool hasSymbolicDisplacement) {
	  // Offset should fit into 32 bit immediate field.
	  if (!isInt<32>(Offset))
	    return false;

	  // If we don't have a symbolic displacement - we don't have any extra
	  // restrictions.
	  if (!hasSymbolicDisplacement)
	    return true;

	  // FIXME: Some tweaks might be needed for medium code model.
	  if (M != CodeModel::Small && M != CodeModel::Kernel)
	    return false;

	  // For small code model we assume that latest object is 16MB before end of 31
	  // bits boundary. We may also accept pretty large negative constants knowing
	  // that all objects are in the positive half of address space.
	  // The bound is spelled as a product so it visibly matches the 16MB above.
	  if (M == CodeModel::Small && Offset < 16 * 1024 * 1024)
	    return true;

	  // For kernel code model we know that all objects reside in the negative half
	  // of 32bits address space. We may not accept negative offsets, since they may
	  // be just off and we may accept pretty large positive ones.
	  if (M == CodeModel::Kernel && Offset >= 0)
	    return true;

	  return false;
	}

	/// Determines whether the callee is required to pop its own arguments.
	/// Callee pop is necessary to support tail calls.
	///
	/// Returns true for non-vararg calls when shouldGuaranteeTCO() holds for the
	/// convention, and otherwise only for the stdcall / fastcall / thiscall /
	/// vectorcall conventions when not compiling for 64-bit.
	bool X86::isCalleePop(CallingConv::ID CallingConv,
	                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
	  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
	  // can guarantee TCO.
	  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
	    return true;

	  bool PopsOn32Bit;
	  switch (CallingConv) {
	  case CallingConv::X86_StdCall:
	  case CallingConv::X86_FastCall:
	  case CallingConv::X86_ThisCall:
	  case CallingConv::X86_VectorCall:
	    PopsOn32Bit = true;
	    break;
	  default:
	    PopsOn32Bit = false;
	    break;
	  }
	  return PopsOn32Bit && !is64Bit;
	}

	/// \brief Return true if the condition is an unsigned comparison operation.
	///
	/// E/NE are classified together with B/A/BE/AE as unsigned; G/GE/L/LE are
	/// signed. Any other condition code is a programming error.
	static bool isX86CCUnsigned(unsigned X86CC) {
	  switch (X86CC) {
	  case X86::COND_G:
	  case X86::COND_GE:
	  case X86::COND_L:
	  case X86::COND_LE:
	    return false;
	  case X86::COND_E:
	  case X86::COND_NE:
	  case X86::COND_B:
	  case X86::COND_A:
	  case X86::COND_BE:
	  case X86::COND_AE:
	    return true;
	  default:
	    break;
	  }
	  llvm_unreachable("Invalid integer condition!");
	}

	/// Maps an integer ISD::CondCode onto the corresponding X86 condition code.
	/// Condition codes outside the ten handled below are a programming error.
	static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
	  switch (SetCCOpcode) {
	  // Equality.
	  case ISD::SETEQ:
	    return X86::COND_E;
	  case ISD::SETNE:
	    return X86::COND_NE;
	  // Signed orderings.
	  case ISD::SETGT:
	    return X86::COND_G;
	  case ISD::SETGE:
	    return X86::COND_GE;
	  case ISD::SETLT:
	    return X86::COND_L;
	  case ISD::SETLE:
	    return X86::COND_LE;
	  // Unsigned orderings.
	  case ISD::SETULT:
	    return X86::COND_B;
	  case ISD::SETUGT:
	    return X86::COND_A;
	  case ISD::SETULE:
	    return X86::COND_BE;
	  case ISD::SETUGE:
	    return X86::COND_AE;
	  default:
	    break;
	  }
	  llvm_unreachable("Invalid integer condition!");
	}

	/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
	/// condition code, returning the condition code and the LHS/RHS of the
	/// comparison to make.
	///
	/// \p LHS and \p RHS are in/out parameters: on the integer path RHS may be
	/// replaced by a zero constant, and on the floating-point path the two may
	/// be swapped (possibly twice). Callers must use the updated operands.
	///
	/// \returns the X86 condition code, or X86::COND_INVALID for the FP
	/// conditions SETOEQ and SETUNE, for which no single code is returned here.
	static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
	bool isFP, SDValue &LHS, SDValue &RHS,
	SelectionDAG &DAG) {
	if (!isFP) {
	// Integer compares against a few special constants can be answered from
	// the sign flag or a cheaper condition.
	if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
	if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
	// X > -1  ->  compare X with 0, jump on !sign.
	RHS = DAG.getConstant(0, DL, RHS.getValueType());
	return X86::COND_NS;
	}
	if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
	// X < 0  ->  compare X with 0 (RHS is already zero), jump on sign.
	return X86::COND_S;
	}
	if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
	// X < 1 -> X <= 0
	RHS = DAG.getConstant(0, DL, RHS.getValueType());
	return X86::COND_LE;
	}
	}

	return TranslateIntegerX86CC(SetCCOpcode);
	}

	// First determine if it is required or is profitable to flip the operands.

	// If LHS is a foldable load, but RHS is not, flip the condition.
	// The condition code is rewritten to match the swapped operands.
	if (ISD::isNON_EXTLoad(LHS.getNode()) &&
	!ISD::isNON_EXTLoad(RHS.getNode())) {
	SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
	std::swap(LHS, RHS);
	}

	// For these four conditions only the operands are swapped; SetCCOpcode is
	// left alone and the "flipped" cases in the switch below account for it.
	switch (SetCCOpcode) {
	default: break;
	case ISD::SETOLT:
	case ISD::SETOLE:
	case ISD::SETUGT:
	case ISD::SETUGE:
	std::swap(LHS, RHS);
	break;
	}

	// On a floating point condition, the flags are set as follows:
	// ZF | PF | CF | op
	//  0 |  0 |  0 | X > Y
	//  0 |  0 |  1 | X < Y
	//  1 |  0 |  0 | X == Y
	//  1 |  1 |  1 | unordered
	switch (SetCCOpcode) {
	default: llvm_unreachable("Condcode should be pre-legalized away");
	case ISD::SETUEQ:
	case ISD::SETEQ: return X86::COND_E;
	case ISD::SETOLT: // flipped
	case ISD::SETOGT:
	case ISD::SETGT: return X86::COND_A;
	case ISD::SETOLE: // flipped
	case ISD::SETOGE:
	case ISD::SETGE: return X86::COND_AE;
	case ISD::SETUGT: // flipped
	case ISD::SETULT:
	case ISD::SETLT: return X86::COND_B;
	case ISD::SETUGE: // flipped
	case ISD::SETULE:
	case ISD::SETLE: return X86::COND_BE;
	case ISD::SETONE:
	case ISD::SETNE: return X86::COND_NE;
	case ISD::SETUO: return X86::COND_P;
	case ISD::SETO: return X86::COND_NP;
	// No single condition code is produced for these two.
	case ISD::SETOEQ:
	case ISD::SETUNE: return X86::COND_INVALID;
	}
	}

	/// Is there a floating point cmov for the specific X86 condition code?
	/// Current x86 isa includes the following FP cmov instructions:
	/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
	static bool hasFPCMov(unsigned X86CC) {
	  bool HasCMov = false;
	  switch (X86CC) {
	  case X86::COND_B:
	  case X86::COND_BE:
	  case X86::COND_E:
	  case X86::COND_P:
	  case X86::COND_A:
	  case X86::COND_AE:
	  case X86::COND_NE:
	  case X86::COND_NP:
	    HasCMov = true;
	    break;
	  default:
	    break;
	  }
	  return HasCMov;
	}


	/// Describes the memory access performed by a chained target intrinsic.
	///
	/// Looks \p Intrinsic up with getIntrinsicWithChain() and, for the
	/// expand-from-memory, compress-to-memory and truncate-to-memory kinds,
	/// fills \p Info with the pointer operand (call argument 0), the memory
	/// value type, an alignment of 1 and the load or store flag.
	///
	/// \returns true if \p Info describes the access; false if the intrinsic is
	/// not found or is of a kind not handled here. In the latter case the opc,
	/// flags and offset fields of \p Info have already been overwritten.
	bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                           const CallInst &I,
	                                           MachineFunction &MF,
	                                           unsigned Intrinsic) const {

	  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
	  if (!IntrData)
	    return false;

	  Info.opc = ISD::INTRINSIC_W_CHAIN;
	  Info.flags = MachineMemOperand::MONone;
	  Info.offset = 0;

	  switch (IntrData->Type) {
	  case EXPAND_FROM_MEM: {
	    // The loaded type is the call's result type.
	    Info.ptrVal = I.getArgOperand(0);
	    Info.memVT = MVT::getVT(I.getType());
	    Info.align = 1;
	    Info.flags |= MachineMemOperand::MOLoad;
	    break;
	  }
	  case COMPRESS_TO_MEM: {
	    // The stored type is the type of call argument 1.
	    Info.ptrVal = I.getArgOperand(0);
	    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
	    Info.align = 1;
	    Info.flags |= MachineMemOperand::MOStore;
	    break;
	  }
	  case TRUNCATE_TO_MEM_VI8:
	  case TRUNCATE_TO_MEM_VI16:
	  case TRUNCATE_TO_MEM_VI32: {
	    // The stored vector keeps the element count of argument 1 but uses the
	    // narrower element type selected by the intrinsic kind.
	    Info.ptrVal = I.getArgOperand(0);
	    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
	    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
	      ScalarVT = MVT::i8;
	    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
	      ScalarVT = MVT::i16;
	    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
	      ScalarVT = MVT::i32;

	    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
	    Info.align = 1;
	    Info.flags |= MachineMemOperand::MOStore;
	    break;
	  }
	  default:
	    return false;
	  }

	  return true;
	}

	/// Returns true if the target can instruction select the
	/// specified FP immediate natively. If false, the legalizer will
	/// materialize the FP immediate as a load from a constant pool.
	///
	/// The check is a bitwise comparison of \p Imm against each entry of
	/// LegalFPImmediates; \p VT is not consulted.
	bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
	  for (const auto &LegalImm : LegalFPImmediates)
	    if (Imm.bitwiseIsEqual(LegalImm))
	      return true;
	  return false;
	}

	/// Returns true if the load \p Load may be narrowed.
	///
	/// "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
	/// relocations target a movq or addq instruction, so a load whose address is
	/// a RIP-relative global carrying the MO_GOTTPOFF flag must not shrink.
	/// \p ExtTy and \p NewVT are not consulted.
	bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
	                                              ISD::LoadExtType ExtTy,
	                                              EVT NewVT) const {
	  SDValue Addr = cast<LoadSDNode>(Load)->getBasePtr();
	  if (Addr.getOpcode() != X86ISD::WrapperRIP)
	    return true;

	  const auto *GA = dyn_cast<GlobalAddressSDNode>(Addr.getOperand(0));
	  if (!GA)
	    return true;

	  return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
	}

	/// \brief Returns true if it is beneficial to convert a load of a constant
	/// to just the constant itself.
	///
	/// \p Ty must be an integer type. The answer is true exactly when its
	/// primitive size is between 1 and 64 bits inclusive; \p Imm is not
	/// consulted.
	bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                                          Type *Ty) const {
	  assert(Ty->isIntegerTy());

	  unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  return BitSize != 0 && BitSize <= 64;
	}

	/// Returns false for vector types when the subtarget has AVX512, and true
	/// in every other case.
	bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
	  // TODO: It might be a win to ease or lift this restriction, but the generic
	  // folds in DAGCombiner conflict with vector folds for an AVX512 target.
	  const bool IsAVX512Vector = VT.isVector() && Subtarget.hasAVX512();
	  return !IsAVX512Vector;
	}

	/// Returns true if extracting a \p ResVT subvector from \p SrcVT at element
	/// \p Index is considered cheap.
	///
	/// The extract must be legal or custom for \p ResVT. For i1 (mask) vectors
	/// index 0 is always cheap; for all other vectors the index must be a
	/// multiple of the result's element count.
	bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
	                                                unsigned Index) const {
	  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
	    return false;

	  // Mask vectors support all subregister combinations and operations that
	  // extract half of vector.
	  // NOTE(review): the size test below requires the *result* to be twice as
	  // wide as the source, which looks inverted relative to the comment above
	  // ("extract half of vector"). Behaviour is left unchanged; confirm intent.
	  if (ResVT.getVectorElementType() == MVT::i1)
	    return Index == 0 ||
	           ((ResVT.getSizeInBits() == SrcVT.getSizeInBits() * 2) &&
	            (Index == ResVT.getVectorNumElements()));

	  return (Index % ResVT.getVectorNumElements()) == 0;
	}

	/// Speculating cttz is reported as cheap exactly when the subtarget has BMI,
	/// i.e. when TZCNT can be used directly.
	bool X86TargetLowering::isCheapToSpeculateCttz() const {
	  const bool HasBMI = Subtarget.hasBMI();
	  return HasBMI;
	}

	/// Speculating ctlz is reported as cheap exactly when the subtarget has
	/// LZCNT, i.e. when the instruction can be used directly.
	bool X86TargetLowering::isCheapToSpeculateCtlz() const {
	  const bool HasLZCNT = Subtarget.hasLZCNT();
	  return HasLZCNT;
	}

	/// Rejects bitcasting a load to v8i1 when the subtarget lacks DQI; every
	/// other case is delegated to the generic TargetLowering implementation.
	bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
	                                                EVT BitcastVT) const {
	  const bool IsV8i1WithoutDQI =
	      BitcastVT == MVT::v8i1 && !Subtarget.hasDQI();
	  if (IsV8i1WithoutDQI)
	    return false;

	  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
	}

	/// Returns true if stores may be merged into a single store of \p MemVT.
	///
	/// When the function carries the NoImplicitFloat attribute, merged stores
	/// are limited to 64 bits on 64-bit subtargets and 32 bits otherwise.
	/// Without the attribute there is no restriction. \p AddressSpace is not
	/// consulted.
	bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
	                                         const SelectionDAG &DAG) const {
	  if (!DAG.getMachineFunction().getFunction().hasFnAttribute(
	          Attribute::NoImplicitFloat))
	    return true;

	  // Do not merge beyond the integer width when implicit float is disabled.
	  const unsigned MaxIntBits = Subtarget.is64Bit() ? 64 : 32;
	  return MemVT.getSizeInBits() <= MaxIntBits;
	}

	/// Reports the subtarget's fast-LZCNT property.
	bool X86TargetLowering::isCtlzFast() const {
	  const bool FastLZCNT = Subtarget.hasFastLZCNT();
	  return FastLZCNT;
	}

	/// Always reports the mask-and-compare-with-zero fold as beneficial.
	bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
	    const Instruction &AndI) const {
	  // The instruction itself is not inspected.
	  (void)AndI;
	  return true;
	}

	/// Returns true if the subtarget has BMI and the type of \p Y is i32 or i64.
	bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
	  if (!Subtarget.hasBMI())
	    return false;

	  // There are only 32-bit and 64-bit forms for 'andn'.
	  const EVT VT = Y.getValueType();
	  return VT == MVT::i32 || VT == MVT::i64;
	}

	/// Picks a type for comparing \p NumBits bits for equality.
	///
	/// Prefers the integer type of that width when it is legal; otherwise falls
	/// back to v16i8 for 128 bits or v32i8 for 256 bits when those are legal.
	/// Returns MVT::INVALID_SIMPLE_VALUE_TYPE when nothing fits.
	MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
	  const MVT IntVT = MVT::getIntegerVT(NumBits);
	  if (isTypeLegal(IntVT))
	    return IntVT;

	  switch (NumBits) {
	  case 128:
	    // PMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v16i8))
	      return MVT::v16i8;
	    break;
	  case 256:
	    // VPMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v32i8))
	      return MVT::v32i8;
	    break;
	  default:
	    break;
	  }

	  // TODO: Allow 64-bit type for 32-bit target.
	  // TODO: 512-bit types should be allowed, but make sure that those
	  // cases are handled in combineVectorSizedSetCCEquality().

	  return MVT::INVALID_SIMPLE_VALUE_TYPE;
	}

	/// Val is the undef sentinel value or equal to the specified value.
	static bool isUndefOrEqual(int Val, int CmpVal) {
	  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
	}

	/// Val is either the undef or zero sentinel value.
	static bool isUndefOrZero(int Val) {
	  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
	}

	/// Return true if each of the \p Size mask elements starting at position
	/// \p Pos is the undef sentinel value. An empty range yields true.
	static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
	  for (unsigned Offset = 0; Offset != Size; ++Offset) {
	    if (Mask[Pos + Offset] != SM_SentinelUndef)
	      return false;
	  }
	  return true;
	}

	/// Return true if Val is undef or if its value falls within the
	/// half-open range [Low, Hi).
	static bool isUndefOrInRange(int Val, int Low, int Hi) {
	  return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
	}

	/// Return true if every element of \p Mask passes the scalar
	/// isUndefOrInRange() test against \p Low and \p Hi. An empty mask yields
	/// true.
	static bool isUndefOrInRange(ArrayRef<int> Mask,
	                             int Low, int Hi) {
	  for (unsigned Idx = 0, NumElts = Mask.size(); Idx != NumElts; ++Idx) {
	    if (!isUndefOrInRange(Mask[Idx], Low, Hi))
	      return false;
	  }
	  return true;
	}

	/// Return true if Val is the undef or zero sentinel, or if its value falls
	/// within the half-open range [Low, Hi).
	static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
	  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
	}

	/// Return true if every element of \p Mask passes the scalar
	/// isUndefOrZeroOrInRange() test against \p Low and \p Hi. An empty mask
	/// yields true.
	static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
	  for (unsigned Idx = 0, NumElts = Mask.size(); Idx != NumElts; ++Idx) {
	    if (!isUndefOrZeroOrInRange(Mask[Idx], Low, Hi))
	      return false;
	  }
	  return true;
	}

	/// Return true if, for the \p Size mask elements starting at position
	/// \p Pos, each element is either undef or equal to the next value of the
	/// sequence Low, Low+1, ..., Low+Size-1.
	static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
	                                       unsigned Pos, unsigned Size, int Low) {
	  int Expected = Low;
	  unsigned Idx = Pos;
	  const unsigned End = Pos + Size;
	  while (Idx != End) {
	    if (!isUndefOrEqual(Mask[Idx], Expected))
	      return false;
	    ++Idx;
	    ++Expected;
	  }
	  return true;
	}

	/// Return true if, for the \p Size mask elements starting at position
	/// \p Pos, each element is undef, zero, or equal to the next value of the
	/// sequence Low, Low+1, ..., Low+Size-1.
	static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                             unsigned Size, int Low) {
	  int Expected = Low;
	  for (unsigned Offset = 0; Offset != Size; ++Offset, ++Expected) {
	    const int M = Mask[Pos + Offset];
	    if (M != Expected && !isUndefOrZero(M))
	      return false;
	  }
	  return true;
	}

	/// Return true if each of the \p Size mask elements starting at position
	/// \p Pos is the undef or zero sentinel. An empty range yields true.
	static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                 unsigned Size) {
	  unsigned Idx = Pos;
	  const unsigned End = Pos + Size;
	  while (Idx != End) {
	    if (!isUndefOrZero(Mask[Idx]))
	      return false;
	    ++Idx;
	  }
	  return true;
	}

	/// \brief Helper function to test whether a shuffle mask could be
	/// simplified by widening the elements being shuffled.
	///
	/// On success \p WidenedMask is overwritten with the half-length mask for
	/// the wider elements. On failure it is left in an unspecified state.
	///
	/// \p Mask must have an even number of elements, since they are consumed
	/// in adjacent pairs.
	///
	/// NOTE: This must handle normal vector shuffle masks and target vector
	/// shuffle masks. The latter have the special property of a '-2' representing
	/// a zero-ed lane of a vector.
	///
	/// \returns true if every pair of mask elements can be expressed as a
	/// single wider element.
	static bool canWidenShuffleElements(ArrayRef<int> Mask,
	                                    SmallVectorImpl<int> &WidenedMask) {
	  assert((Mask.size() % 2) == 0 &&
	         "Cannot widen a mask with an odd number of elements!");
	  WidenedMask.assign(Mask.size() / 2, 0);
	  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
	    int M0 = Mask[i];
	    int M1 = Mask[i + 1];

	    // If both elements are undef, it's trivial.
	    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
	      WidenedMask[i / 2] = SM_SentinelUndef;
	      continue;
	    }

	    // Check for an undef mask and a mask value properly aligned to fit with
	    // a pair of values. If we find such a case, use the non-undef mask's value.
	    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
	      WidenedMask[i / 2] = M1 / 2;
	      continue;
	    }
	    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
	      WidenedMask[i / 2] = M0 / 2;
	      continue;
	    }

	    // When zeroing, we need to spread the zeroing across both lanes to widen.
	    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
	      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
	          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
	        WidenedMask[i / 2] = SM_SentinelZero;
	        continue;
	      }
	      return false;
	    }

	    // Finally check if the two mask values are adjacent and aligned with
	    // a pair.
	    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
	      WidenedMask[i / 2] = M0 / 2;
	      continue;
	    }

	    // Otherwise we can't safely widen the elements used in this shuffle.
	    return false;
	  }
	  assert(WidenedMask.size() == Mask.size() / 2 &&
	         "Incorrect size of mask after widening the elements!");

	  return true;
	}

	/// Returns true if Elt is a constant zero or a floating point constant +0.0.
	bool X86::isZeroNode(SDValue Elt) {
	  return isNullConstant(Elt) || isNullFPConstant(Elt);
	}

	// Build a vector of constants from the first VT.getVectorNumElements()
	// entries of Values.
	// When IsMask is set, negative entries become UNDEF elements.
	// When i64 is not a legal type and VT has i64 elements, each element is
	// emitted as a pair of i32 values (value, then 0) and the result is
	// bitcast back to VT.
	static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
	                              const SDLoc &dl, bool IsMask = false) {
	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool HasLegalI64 = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	  const bool SplitI64 = !HasLegalI64 && VT.getVectorElementType() == MVT::i64;
	  const MVT BuildVT =
	      SplitI64 ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
	  const MVT BuildEltVT = BuildVT.getVectorElementType();

	  SmallVector<SDValue, 32> Elts;
	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    const int Value = Values[Idx];
	    if (IsMask && Value < 0) {
	      // Undefined element; when splitting, both halves are undef.
	      Elts.push_back(DAG.getUNDEF(BuildEltVT));
	      if (SplitI64)
	        Elts.push_back(DAG.getUNDEF(BuildEltVT));
	      continue;
	    }
	    Elts.push_back(DAG.getConstant(Value, dl, BuildEltVT));
	    if (SplitI64)
	      Elts.push_back(DAG.getConstant(0, dl, BuildEltVT));
	  }

	  SDValue Built = DAG.getBuildVector(BuildVT, dl, Elts);
	  if (!SplitI64)
	    return Built;
	  return DAG.getBitcast(VT, Built);
	}

	// Build a vector of constants from raw element bits.
	// Bit i of Undefs marks element i as UNDEF; Bits and Undefs must have the
	// same length. f32/f64 elements are emitted as FP constants built from the
	// bits. When i64 is not legal and VT has i64 elements, each element is split
	// into low and high i32 halves. The result is always bitcast to VT.
	static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
	                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Bits.size() == Undefs.getBitWidth() &&
	         "Unequal constant and undef arrays");

	  const unsigned NumElts = VT.getVectorNumElements();
	  const bool HasLegalI64 = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	  const bool SplitI64 = !HasLegalI64 && VT.getVectorElementType() == MVT::i64;
	  const MVT BuildVT =
	      SplitI64 ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
	  const MVT BuildEltVT = BuildVT.getVectorElementType();

	  SmallVector<SDValue, 32> Elts;
	  for (unsigned Idx = 0, NumBitsElts = Bits.size(); Idx != NumBitsElts;
	       ++Idx) {
	    if (Undefs[Idx]) {
	      Elts.append(SplitI64 ? 2 : 1, DAG.getUNDEF(BuildEltVT));
	      continue;
	    }

	    const APInt &EltBits = Bits[Idx];
	    assert(EltBits.getBitWidth() == VT.getScalarSizeInBits() &&
	           "Unexpected sizes");

	    if (SplitI64) {
	      // Low half first, then high half.
	      Elts.push_back(DAG.getConstant(EltBits.trunc(32), dl, BuildEltVT));
	      Elts.push_back(
	          DAG.getConstant(EltBits.lshr(32).trunc(32), dl, BuildEltVT));
	      continue;
	    }
	    if (BuildEltVT == MVT::f32) {
	      APFloat FPVal(APFloat::IEEEsingle(), EltBits);
	      Elts.push_back(DAG.getConstantFP(FPVal, dl, BuildEltVT));
	      continue;
	    }
	    if (BuildEltVT == MVT::f64) {
	      APFloat FPVal(APFloat::IEEEdouble(), EltBits);
	      Elts.push_back(DAG.getConstantFP(FPVal, dl, BuildEltVT));
	      continue;
	    }
	    Elts.push_back(DAG.getConstant(EltBits, dl, BuildEltVT));
	  }

	  SDValue Built = DAG.getBuildVector(BuildVT, dl, Elts);
	  return DAG.getBitcast(VT, Built);
	}

	/// Returns a vector of specified type with all zero elements.
	///
	/// \p VT must be a 128/256/512-bit vector or a vector of i1. The zero is
	/// built in a canonical type and then bitcast to \p VT.
	static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
	          VT.getVectorElementType() == MVT::i1) &&
	         "Unexpected vector type");

	  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
	  // type. This ensures they get CSE'd. But if the integer type is not
	  // available, use a floating-point +0.0 instead.
	  SDValue Vec;
	  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
	    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
	  } else if (VT.getVectorElementType() == MVT::i1) {
	    // Mask vectors are built directly in their own type.
	    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
	           "Unexpected vector type");
	    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
	           "Unexpected vector type");
	    Vec = DAG.getConstant(0, dl, VT);
	  } else {
	    unsigned Num32BitElts = VT.getSizeInBits() / 32;
	    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
	  }
	  return DAG.getBitcast(VT, Vec);
	}

	/// Extracts the \p vectorWidth-bit chunk of \p Vec that contains element
	/// \p IdxVal.
	///
	/// The index is rounded down to the first element of its chunk. A
	/// BUILD_VECTOR input is sliced into a smaller BUILD_VECTOR; anything else
	/// yields an EXTRACT_SUBVECTOR node.
	static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
	                                const SDLoc &dl, unsigned vectorWidth) {
	  const EVT VecVT = Vec.getValueType();
	  const EVT EltVT = VecVT.getVectorElementType();
	  const unsigned NumChunks = VecVT.getSizeInBits() / vectorWidth;
	  const EVT ChunkVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
	                                       VecVT.getVectorNumElements() / NumChunks);

	  // Number of elements in one vectorWidth-bit chunk.
	  const unsigned ChunkElts = vectorWidth / EltVT.getSizeInBits();
	  assert(isPowerOf2_32(ChunkElts) && "Elements per chunk not power of 2");

	  // ChunkElts is a power of 2, so clearing the low bits aligns the index to
	  // the first element of the chunk we want.
	  const unsigned AlignedIdx = IdxVal & ~(ChunkElts - 1);

	  if (Vec.getOpcode() != ISD::BUILD_VECTOR)
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ChunkVT, Vec,
	                       DAG.getIntPtrConstant(AlignedIdx, dl));

	  // If the input is a buildvector just emit a smaller one.
	  return DAG.getBuildVector(ChunkVT, dl,
	                            Vec->ops().slice(AlignedIdx, ChunkElts));
	}

	/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
	/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
	/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
	/// instructions or a simple subregister reference. Idx is an index in the
	/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
	/// lowering EXTRACT_VECTOR_ELT operations easier.
	static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
	                                   SelectionDAG &DAG, const SDLoc &dl) {
	  assert((Vec.getValueType().is256BitVector() ||
	          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
	  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
	}

	/// Generate a DAG to grab 256-bits from a 512-bit vector. \p IdxVal is an
	/// element index inside the wanted half and is forwarded to
	/// extractSubVector().
	static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
	                                   SelectionDAG &DAG, const SDLoc &dl) {
	  const EVT VecVT = Vec.getValueType();
	  assert(VecVT.is512BitVector() && "Unexpected vector size!");
	  (void)VecVT;
	  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
	}

	/// Inserts the \p vectorWidth-bit vector \p Vec into \p Result at the chunk
	/// that contains element \p IdxVal.
	///
	/// \p vectorWidth must be 128 or 256. The index is rounded down to the first
	/// element of its chunk. If \p Vec is undef, \p Result is returned
	/// unchanged; otherwise an INSERT_SUBVECTOR node is produced.
	static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                               SelectionDAG &DAG, const SDLoc &dl,
	                               unsigned vectorWidth) {
	  assert((vectorWidth == 128 || vectorWidth == 256) &&
	         "Unsupported vector width");
	  // Inserting UNDEF is Result
	  if (Vec.isUndef())
	    return Result;
	  EVT VT = Vec.getValueType();
	  EVT ElVT = VT.getVectorElementType();
	  EVT ResultVT = Result.getValueType();

	  // Insert the relevant vectorWidth bits.
	  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
	  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	  // This is the index of the first element of the vectorWidth-bit chunk
	  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
	  IdxVal &= ~(ElemsPerChunk - 1);

	  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
	}

	/// Generate a DAG to put 128-bits into a vector > 128 bits. This
	/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
	/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
	/// simple superregister reference. Idx is an index in the 128 bits
	/// we want. It need not be aligned to a 128-bit boundary. That makes
	/// lowering INSERT_VECTOR_ELT operations easier.
	static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                                  SelectionDAG &DAG, const SDLoc &dl) {
	  const EVT SubVT = Vec.getValueType();
	  assert(SubVT.is128BitVector() && "Unexpected vector size!");
	  (void)SubVT;
	  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
	}

	/// Inserts the 256-bit vector \p Vec into \p Result at the 256-bit chunk
	/// containing element \p IdxVal, by forwarding to insertSubVector().
	static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                                  SelectionDAG &DAG, const SDLoc &dl) {
	  const EVT SubVT = Vec.getValueType();
	  assert(SubVT.is256BitVector() && "Unexpected vector size!");
	  (void)SubVT;
	  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
	}

	// Return true if the instruction zeroes the unused upper part of the
	// destination and accepts mask. Only the test/compare opcodes listed
	// below qualify.
	static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
	  bool ZeroesUpper = false;
	  switch (Opcode) {
	  case X86ISD::TESTM:
	  case X86ISD::TESTNM:
	  case X86ISD::PCMPEQM:
	  case X86ISD::PCMPGTM:
	  case X86ISD::CMPM:
	  case X86ISD::CMPMU:
	  case X86ISD::CMPM_RND:
	    ZeroesUpper = true;
	    break;
	  default:
	    break;
	  }
	  return ZeroesUpper;
	}

	/// Insert i1-subvector to i1-vector.
	///
	/// \p Op supplies (Vec, SubVec, Idx) as operands 0, 1 and 2. The insertion
	/// is expressed with KSHIFTL/KSHIFTR, OR and XOR on a (possibly widened)
	/// mask type, and the result is extracted back to the type of \p Op.
	///
	/// \returns an empty SDValue if Idx is not a constant; Vec itself if SubVec
	/// is undef; \p Op unchanged if inserting at index 0 into an undef vector;
	/// otherwise the newly built value.
	static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {

	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue SubVec = Op.getOperand(1);
	  SDValue Idx = Op.getOperand(2);

	  if (!isa<ConstantSDNode>(Idx))
	    return SDValue();

	  // Inserting undef is a nop. We can just return the original vector.
	  if (SubVec.isUndef())
	    return Vec;

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
	    return Op;

	  MVT OpVT = Op.getSimpleValueType();
	  unsigned NumElems = OpVT.getVectorNumElements();

	  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);

	  // Extend to natively supported kshift.
	  MVT WideOpVT = OpVT;
	  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
	    WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;

	  // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
	  // if necessary.
	  if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
	    // May need to promote to a legal type.
	    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                     getZeroVector(WideOpVT, Subtarget, DAG, dl),
	                     SubVec, Idx);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  MVT SubVecVT = SubVec.getSimpleValueType();
	  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

	  assert(IdxVal + SubVecNumElems <= NumElems &&
	         IdxVal % SubVecVT.getSizeInBits() == 0 &&
	         "Unexpected index value in INSERT_SUBVECTOR");

	  SDValue Undef = DAG.getUNDEF(WideOpVT);

	  if (IdxVal == 0) {
	    // Zero lower bits of the Vec
	    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
	                      ZeroIdx);
	    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	    // Merge them together, SubVec should be zero extended.
	    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                         getZeroVector(WideOpVT, Subtarget, DAG, dl),
	                         SubVec, ZeroIdx);
	    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  // From here on IdxVal != 0. Place SubVec in the low bits of the wide type.
	  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                       Undef, SubVec, ZeroIdx);

	  if (Vec.isUndef()) {
	    assert(IdxVal != 0 && "Unexpected index");
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getConstant(IdxVal, dl, MVT::i8));
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
	  }

	  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
	    assert(IdxVal != 0 && "Unexpected index");
	    // Shift to the top to clear the bits above SubVec, then back down to
	    // the requested position so zeros are shifted in on both sides.
	    NumElems = WideOpVT.getVectorNumElements();
	    unsigned ShiftLeft = NumElems - SubVecNumElems;
	    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getConstant(ShiftLeft, dl, MVT::i8));
	    if (ShiftRight != 0)
	      SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
	                           DAG.getConstant(ShiftRight, dl, MVT::i8));
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
	  }

	  // Simple case when we put subvector in the upper part
	  if (IdxVal + SubVecNumElems == NumElems) {
	    SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
	                         DAG.getConstant(IdxVal, dl, MVT::i8));
	    if (SubVecNumElems * 2 == NumElems) {
	      // Special case, use legal zero extending insert_subvector. This allows
	      // isel to optimize when bits are known zero.
	      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
	      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                        getZeroVector(WideOpVT, Subtarget, DAG, dl),
	                        Vec, ZeroIdx);
	    } else {
	      // Otherwise use explicit shifts to zero the bits.
	      Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                        Undef, Vec, ZeroIdx);
	      NumElems = WideOpVT.getVectorNumElements();
	      SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
	      Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	      Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	    }
	    Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	  }

	  // Inserting into the middle is more complicated.

	  NumElems = WideOpVT.getVectorNumElements();

	  // Widen the vector if needed.
	  Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
	  // Move the current value of the bits to be replaced to the lsbs.
	  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
	                   DAG.getConstant(IdxVal, dl, MVT::i8));
	  // Xor with the new bit.
	  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
	  // Shift to MSB, filling bottom bits with 0.
	  unsigned ShiftLeft = NumElems - SubVecNumElems;
	  Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
	                   DAG.getConstant(ShiftLeft, dl, MVT::i8));
	  // Shift to the final position, filling upper bits with 0.
	  unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
	  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
	                   DAG.getConstant(ShiftRight, dl, MVT::i8));
	  // Xor with original vector leaving the new value.
	  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
	  // Reduce to original width if needed.
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
	}

	/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
	/// instructions. This is used because creating CONCAT_VECTOR nodes of
	/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
	/// large BUILD_VECTORS.
	///
	/// \p V1 lands at element 0 and \p V2 at element NumElems / 2 of a \p VT
	/// value that starts out undef.
	static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
	                                   unsigned NumElems, SelectionDAG &DAG,
	                                   const SDLoc &dl) {
	  SDValue Undef = DAG.getUNDEF(VT);
	  SDValue WithLower = insert128BitVector(Undef, V1, 0, DAG, dl);
	  const unsigned UpperIdx = NumElems / 2;
	  return insert128BitVector(WithLower, V2, UpperIdx, DAG, dl);
	}

	/// Concatenates two 256-bit vectors into a \p VT value: \p V1 is inserted
	/// at element 0 and \p V2 at element NumElems / 2 of an initially undef
	/// vector.
	static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
	                                   unsigned NumElems, SelectionDAG &DAG,
	                                   const SDLoc &dl) {
	  SDValue Undef = DAG.getUNDEF(VT);
	  SDValue WithLower = insert256BitVector(Undef, V1, 0, DAG, dl);
	  const unsigned UpperIdx = NumElems / 2;
	  return insert256BitVector(WithLower, V2, UpperIdx, DAG, dl);
	}

	/// Returns a vector of specified type with all bits set.
	/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
	/// Then bitcast to their original type, ensuring they get CSE'd.
	///
	/// \p VT must be a 128-, 256- or 512-bit vector type.
	static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
	         "Expected a 128/256/512-bit vector type");

	  APInt Ones = APInt::getAllOnesValue(32);
	  unsigned NumElts = VT.getSizeInBits() / 32;
	  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
	  return DAG.getBitcast(VT, Vec);
	}

	/// Builds a vector sign- or zero-extension of \p In to \p VT.
	///
	/// \p Opc must be X86ISD::VSEXT or X86ISD::VZEXT. When both types are
	/// 128-bit the generic *_EXTEND_VECTOR_INREG helpers are used. Otherwise an
	/// \p Opc node is emitted, after trimming an over-wide input to the part
	/// that is actually extended (never less than 128 bits).
	static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
	                              SelectionDAG &DAG) {
	  EVT InVT = In.getValueType();
	  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");

	  if (VT.is128BitVector() && InVT.is128BitVector())
	    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
	                                : DAG.getZeroExtendVectorInReg(In, DL, VT);

	  // For 256-bit vectors, we only need the lower (128-bit) input half.
	  // For 512-bit vectors, we only need the lower input half or quarter.
	  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
	    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
	    In = extractSubVector(In, 0, DAG, DL,
	                          std::max(128, (int)VT.getSizeInBits() / Scale));
	  }

	  return DAG.getNode(Opc, DL, VT, In);
	}

	/// Returns a vector_shuffle node for an unpackl operation.
	///
	/// The mask comes from createUnpackShuffleMask() with Lo = true and
	/// Unary = false, and is applied to \p V1 and \p V2.
	static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  // Each argument comment must be closed on its own, otherwise the Lo
	  // argument is swallowed by the comment.
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Returns a vector_shuffle node for an unpackh operation.
	///
	/// The mask comes from createUnpackShuffleMask() with Lo = false and
	/// Unary = false, and is applied to \p V1 and \p V2.
	static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  // Each argument comment must be closed on its own, otherwise the Lo
	  // argument is swallowed by the comment.
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Return a vector_shuffle of the specified vector of zero or undef vector.
	/// This produces a shuffle where the low element of V2 is swizzled into the
	/// zero/undef vector, landing at element Idx.
	/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
	///
	/// \p IsZero selects a zero vector (true) or an undef vector (false) as the
	/// first shuffle operand.
	static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
	                                           bool IsZero,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  const MVT VT = V2.getSimpleValueType();
	  const SDLoc DL(V2);

	  SDValue V1;
	  if (IsZero)
	    V1 = getZeroVector(VT, Subtarget, DAG, DL);
	  else
	    V1 = DAG.getUNDEF(VT);

	  // Start from the identity mask over V1 ...
	  const int NumElems = VT.getVectorNumElements();
	  SmallVector<int, 16> MaskVec(NumElems);
	  for (int Elt = 0; Elt != NumElems; ++Elt)
	    MaskVec[Elt] = Elt;

	  // ... then, if this is a valid insertion idx, put the low elt of V2 there.
	  if (Idx >= 0 && Idx < NumElems)
	    MaskVec[Idx] = NumElems;

	  return DAG.getVectorShuffle(VT, DL, V1, V2, MaskVec);
	}

	/// Strip any chain of ISD::BITCAST nodes from \p V and return the first value
	/// underneath that is not a bitcast. A null value is returned unchanged.
	static SDValue peekThroughBitcasts(SDValue V) {
	  for (;;) {
	    if (!V.getNode())
	      return V;
	    if (V.getOpcode() != ISD::BITCAST)
	      return V;
	    V = V.getOperand(0);
	  }
	}

	/// Like peekThroughBitcasts, but only steps through a bitcast while the
	/// bitcast's source operand has a single use.
	static SDValue peekThroughOneUseBitcasts(SDValue V) {
	  while (V.getNode() && V.getOpcode() == ISD::BITCAST) {
	    SDValue Src = V.getOperand(0);
	    if (!Src.hasOneUse())
	      break;
	    V = Src;
	  }
	  return V;
	}

	/// If \p Op (looking through bitcasts) is a load whose base pointer is a
	/// constant pool entry - optionally wrapped in X86ISD::Wrapper or
	/// X86ISD::WrapperRIP - return the IR constant stored in that entry.
	/// Returns nullptr for anything else, including machine constant pool
	/// entries.
	static const Constant *getTargetConstantFromNode(SDValue Op) {
	  Op = peekThroughBitcasts(Op);

	  auto *Load = dyn_cast<LoadSDNode>(Op);
	  if (!Load)
	    return nullptr;

	  // Look through the target address wrapper, if there is one.
	  SDValue Ptr = Load->getBasePtr();
	  if (Ptr->getOpcode() == X86ISD::Wrapper ||
	      Ptr->getOpcode() == X86ISD::WrapperRIP)
	    Ptr = Ptr->getOperand(0);

	  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
	  if (!CNode || CNode->isMachineConstantPoolEntry())
	    return nullptr;

	  return dyn_cast<Constant>(CNode->getConstVal());
	}

	/// Extract raw constant bits from \p Op, repacked into elements that are
	/// \p EltSizeInBits wide.
	///
	/// After looking through bitcasts the following sources are recognised: an
	/// UNDEF node, a scalar ConstantSDNode, a build vector of constants, a
	/// constant pool vector load, a VBROADCAST of a constant pool scalar, and a
	/// VZEXT_MOVL of a SCALAR_TO_VECTOR of a constant.
	///
	/// \param UndefElts  Receives one bit per result element, set if that element
	///                   is undef.
	/// \param EltBits    Must be empty on entry; receives the result elements.
	/// \param AllowWholeUndefs    When repacking, fail if a result element is
	///                            entirely undef and this is false.
	/// \param AllowPartialUndefs  When repacking, fail if a result element is
	///                            only partly undef and this is false; otherwise
	///                            the undef bits are treated as zero.
	/// \returns true if the constant bits could be extracted.
	///
	/// Note that when the source already has the requested element count no
	/// repacking happens, and undef elements are only rejected if both Allow*
	/// flags are false.
	static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
	                                          APInt &UndefElts,
	                                          SmallVectorImpl<APInt> &EltBits,
	                                          bool AllowWholeUndefs = true,
	                                          bool AllowPartialUndefs = true) {
	  assert(EltBits.empty() && "Expected an empty EltBits vector");

	  Op = peekThroughBitcasts(Op);

	  EVT VT = Op.getValueType();
	  unsigned SizeInBits = VT.getSizeInBits();
	  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
	  unsigned NumElts = SizeInBits / EltSizeInBits;

	  // Bitcast a source array of element bits to the target size.
	  auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
	    unsigned NumSrcElts = UndefSrcElts.getBitWidth();
	    unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
	    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
	           "Constant bit sizes don't match");

	    // Don't split if we don't allow undef bits.
	    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
	    if (UndefSrcElts.getBoolValue() && !AllowUndefs)
	      return false;

	    // If we're already the right size, don't bother bitcasting.
	    if (NumSrcElts == NumElts) {
	      UndefElts = UndefSrcElts;
	      EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
	      return true;
	    }

	    // Extract all the undef/constant element data and pack into single
	    // bitsets spanning the whole value.
	    APInt UndefBits(SizeInBits, 0);
	    APInt MaskBits(SizeInBits, 0);

	    for (unsigned i = 0; i != NumSrcElts; ++i) {
	      unsigned BitOffset = i * SrcEltSizeInBits;
	      if (UndefSrcElts[i])
	        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
	      MaskBits.insertBits(SrcEltBits[i], BitOffset);
	    }

	    // Split the undef/constant single bitset data into the target elements.
	    UndefElts = APInt(NumElts, 0);
	    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

	    for (unsigned i = 0; i != NumElts; ++i) {
	      unsigned BitOffset = i * EltSizeInBits;
	      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

	      // Only treat an element as UNDEF if all bits are UNDEF.
	      if (UndefEltBits.isAllOnesValue()) {
	        if (!AllowWholeUndefs)
	          return false;
	        UndefElts.setBit(i);
	        continue;
	      }

	      // If only some bits are UNDEF then treat them as zero (or bail if not
	      // supported).
	      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
	        return false;

	      // Store the extracted APInt directly. Round-tripping through
	      // getZExtValue() only works for values that fit in 64 bits, which
	      // breaks for elements wider than that.
	      EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
	    }
	    return true;
	  };

	  // Collect constant bits and insert into mask/undef bit masks.
	  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
	                                unsigned UndefBitIndex) {
	    if (!Cst)
	      return false;
	    if (isa<UndefValue>(Cst)) {
	      Undefs.setBit(UndefBitIndex);
	      return true;
	    }
	    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
	      Mask = CInt->getValue();
	      return true;
	    }
	    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
	      Mask = CFP->getValueAPF().bitcastToAPInt();
	      return true;
	    }
	    return false;
	  };

	  // Handle UNDEFs.
	  if (Op.isUndef()) {
	    APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
	    SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract scalar constant bits.
	  if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
	    APInt UndefSrcElts = APInt::getNullValue(1);
	    SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract constant bits from build vector.
	  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
	    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	      const SDValue &Src = Op.getOperand(i);
	      if (Src.isUndef()) {
	        UndefSrcElts.setBit(i);
	        continue;
	      }
	      // Operands are normalised to the vector's scalar width.
	      auto *Cst = cast<ConstantSDNode>(Src);
	      SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
	    }
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract constant bits from constant pool vector.
	  if (auto *Cst = getTargetConstantFromNode(Op)) {
	    Type *CstTy = Cst->getType();
	    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
	      return false;

	    unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
	    unsigned NumSrcElts = CstTy->getVectorNumElements();

	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	    for (unsigned i = 0; i != NumSrcElts; ++i)
	      if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
	                               UndefSrcElts, i))
	        return false;

	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract constant bits from a broadcasted constant pool scalar.
	  if (Op.getOpcode() == X86ISD::VBROADCAST &&
	      EltSizeInBits <= VT.getScalarSizeInBits()) {
	    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
	      unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
	      unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	      APInt UndefSrcElts(NumSrcElts, 0);
	      SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
	      if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
	        // An undef scalar makes every broadcast element undef.
	        if (UndefSrcElts[0])
	          UndefSrcElts.setBits(0, NumSrcElts);
	        SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
	        return CastBitData(UndefSrcElts, SrcEltBits);
	      }
	    }
	  }

	  // Extract a rematerialized scalar constant insertion.
	  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
	      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
	      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
	    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	    // Element 0 holds the constant, every other element is zero.
	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits;
	    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
	    SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
	    SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  return false;
	}

	/// Decode \p MaskNode as a constant made of \p MaskEltSizeInBits wide
	/// elements and append each element's zero-extended value to \p RawMask.
	/// Returns false (leaving \p RawMask untouched) if the constant bits cannot
	/// be extracted.
	static bool getTargetShuffleMaskIndices(SDValue MaskNode,
	                                        unsigned MaskEltSizeInBits,
	                                        SmallVectorImpl<uint64_t> &RawMask) {
	  APInt UndefElts;
	  SmallVector<APInt, 64> EltBits;

	  // Pull out the raw target constant bits.
	  // FIXME: We currently don't support UNDEF bits or mask entries.
	  bool Extracted = getTargetConstantBitsFromNode(
	      MaskNode, MaskEltSizeInBits, UndefElts, EltBits,
	      /* AllowWholeUndefs */ false, /* AllowPartialUndefs */ false);
	  if (!Extracted)
	    return false;

	  // Copy the extracted elements into the raw mask.
	  for (unsigned i = 0, e = EltBits.size(); i != e; ++i)
	    RawMask.push_back(EltBits[i].getZExtValue());

	  return true;
	}

	/// Build the shuffle mask equivalent to the truncation done by PACKSS/PACKUS.
	/// For each 128-bit lane the even elements of the first input are emitted,
	/// followed by the even elements of the second input (or of the first input
	/// again when \p Unary is set).
	/// Note: Saturation is not modelled, so callers must check the inputs first.
	static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
	                                  bool Unary) {
	  assert(Mask.empty() && "Expected an empty shuffle mask vector");
	  const unsigned NumElts = VT.getVectorNumElements();
	  const unsigned NumLanes = VT.getSizeInBits() / 128;
	  const unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
	  // Index offset that addresses the second shuffle input.
	  const unsigned SecondInputOffset = Unary ? 0 : NumElts;

	  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
	    const unsigned LaneBase = Lane * NumEltsPerLane;
	    for (unsigned Input = 0; Input != 2; ++Input) {
	      const unsigned Base = LaneBase + (Input ? SecondInputOffset : 0);
	      for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
	        Mask.push_back(Elt + Base);
	    }
	  }
	}

	/// Calculates the shuffle mask corresponding to the target-specific opcode.
	/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
	/// operands in \p Ops, and returns true.
	/// Sets \p IsUnary to true if only one source is used. Note that this will set
	/// IsUnary for shuffles which use a single input multiple times, and in those
	/// cases it will adjust the mask to only have indices within that single input.
	/// If \p AllowSentinelZero is false, a decoded mask that contains
	/// SM_SentinelZero entries is rejected.
	/// It is an error to call this with non-empty Mask/Ops vectors.
	static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
	                                 SmallVectorImpl<SDValue> &Ops,
	                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
	  unsigned NumElems = VT.getVectorNumElements();

	  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
	  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

	  IsUnary = false;
	  bool IsFakeUnary = false;

	  // Value of the immediate control operand, which is always the node's last
	  // operand for the opcodes that use it.
	  auto LastImm = [N]() -> uint64_t {
	    SDValue ImmN = N->getOperand(N->getNumOperands() - 1);
	    return cast<ConstantSDNode>(ImmN)->getZExtValue();
	  };

	  // A two-input shuffle whose inputs are the same node is "fake unary".
	  auto SetFakeUnary = [&](unsigned OpA, unsigned OpB) {
	    IsUnary = IsFakeUnary = N->getOperand(OpA) == N->getOperand(OpB);
	  };

	  switch (N->getOpcode()) {
	  case X86ISD::BLENDI:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeBLENDMask(VT, LastImm(), Mask);
	    SetFakeUnary(0, 1);
	    break;
	  case X86ISD::SHUFP:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeSHUFPMask(VT, LastImm(), Mask);
	    SetFakeUnary(0, 1);
	    break;
	  case X86ISD::INSERTPS:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeINSERTPSMask(LastImm(), Mask);
	    SetFakeUnary(0, 1);
	    break;
	  case X86ISD::EXTRQI:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    // Only decodable with constant length/index; otherwise Mask stays empty
	    // and the function fails below.
	    if (isa<ConstantSDNode>(N->getOperand(1)) &&
	        isa<ConstantSDNode>(N->getOperand(2))) {
	      int BitLen = N->getConstantOperandVal(1);
	      int BitIdx = N->getConstantOperandVal(2);
	      DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
	      IsUnary = true;
	    }
	    break;
	  case X86ISD::INSERTQI:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    // Only decodable with constant length/index; otherwise Mask stays empty
	    // and the function fails below.
	    if (isa<ConstantSDNode>(N->getOperand(2)) &&
	        isa<ConstantSDNode>(N->getOperand(3))) {
	      int BitLen = N->getConstantOperandVal(2);
	      int BitIdx = N->getConstantOperandVal(3);
	      DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
	      SetFakeUnary(0, 1);
	    }
	    break;
	  case X86ISD::UNPCKH:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeUNPCKHMask(VT, Mask);
	    SetFakeUnary(0, 1);
	    break;
	  case X86ISD::UNPCKL:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeUNPCKLMask(VT, Mask);
	    SetFakeUnary(0, 1);
	    break;
	  case X86ISD::MOVHLPS:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeMOVHLPSMask(NumElems, Mask);
	    SetFakeUnary(0, 1);
	    break;
	  case X86ISD::MOVLHPS:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeMOVLHPSMask(NumElems, Mask);
	    SetFakeUnary(0, 1);
	    break;
	  case X86ISD::PALIGNR:
	    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodePALIGNRMask(VT, LastImm(), Mask);
	    SetFakeUnary(0, 1);
	    // The shuffle operands are reported in swapped order for PALIGNR.
	    Ops.push_back(N->getOperand(1));
	    Ops.push_back(N->getOperand(0));
	    break;
	  case X86ISD::VSHLDQ:
	    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodePSLLDQMask(VT, LastImm(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::VSRLDQ:
	    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodePSRLDQMask(VT, LastImm(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::PSHUFD:
	  case X86ISD::VPERMILPI:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodePSHUFMask(VT, LastImm(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::PSHUFHW:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodePSHUFHWMask(VT, LastImm(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::PSHUFLW:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodePSHUFLWMask(VT, LastImm(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::VZEXT_MOVL:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodeZeroMoveLowMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::VBROADCAST: {
	    SDValue N0 = N->getOperand(0);
	    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
	    // add the pre-extracted value to the Ops vector.
	    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	        N0.getOperand(0).getValueType() == VT &&
	        N0.getConstantOperandVal(1) == 0)
	      Ops.push_back(N0.getOperand(0));

	    // We only decode broadcasts of same-sized vectors, unless the broadcast
	    // came from an extract from the original width. If we found one, we
	    // pushed it onto the Ops vector above.
	    if (N0.getValueType() == VT || !Ops.empty()) {
	      DecodeVectorBroadcast(VT, Mask);
	      IsUnary = true;
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMILPV: {
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    IsUnary = true;
	    SDValue MaskNode = N->getOperand(1);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    SmallVector<uint64_t, 32> RawMask;
	    // Prefer fully-defined raw indices; otherwise decode straight from the
	    // constant pool entry.
	    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	      DecodeVPERMILPMask(VT, RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMILPMask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::PSHUFB: {
	    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    IsUnary = true;
	    SDValue MaskNode = N->getOperand(1);
	    SmallVector<uint64_t, 32> RawMask;
	    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
	      DecodePSHUFBMask(RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodePSHUFBMask(C, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMI:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodeVPERMMask(VT, LastImm(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVSS:
	  case X86ISD::MOVSD:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
	    break;
	  case X86ISD::VPERM2X128:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    DecodeVPERM2X128Mask(VT, LastImm(), Mask);
	    SetFakeUnary(0, 1);
	    break;
	  case X86ISD::MOVSLDUP:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodeMOVSLDUPMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVSHDUP:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodeMOVSHDUPMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVDDUP:
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    DecodeMOVDDUPMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVLPD:
	  case X86ISD::MOVLPS:
	    // Not yet implemented
	    return false;
	  case X86ISD::VPERMIL2: {
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    SetFakeUnary(0, 1);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    SDValue MaskNode = N->getOperand(2);
	    SDValue CtrlNode = N->getOperand(3);
	    // Both the control immediate and the mask have to be constant.
	    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
	      unsigned CtrlImm = CtrlOp->getZExtValue();
	      SmallVector<uint64_t, 32> RawMask;
	      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
	        break;
	      }
	      if (auto *C = getTargetConstantFromNode(MaskNode)) {
	        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
	        break;
	      }
	    }
	    return false;
	  }
	  case X86ISD::VPPERM: {
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    SetFakeUnary(0, 1);
	    SDValue MaskNode = N->getOperand(2);
	    SmallVector<uint64_t, 32> RawMask;
	    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
	      DecodeVPPERMMask(RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPPERMMask(C, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMV: {
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    IsUnary = true;
	    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
	    Ops.push_back(N->getOperand(1));
	    SDValue MaskNode = N->getOperand(0);
	    SmallVector<uint64_t, 32> RawMask;
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	      DecodeVPERMVMask(RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMVMask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMV3: {
	    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
	    SetFakeUnary(0, 2);
	    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
	    Ops.push_back(N->getOperand(0));
	    Ops.push_back(N->getOperand(2));
	    SDValue MaskNode = N->getOperand(1);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMIV3: {
	    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
	    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
	    SetFakeUnary(1, 2);
	    // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
	    Ops.push_back(N->getOperand(1));
	    Ops.push_back(N->getOperand(2));
	    SDValue MaskNode = N->getOperand(0);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  default: llvm_unreachable("unknown target shuffle node");
	  }

	  // Empty mask indicates the decode failed.
	  if (Mask.empty())
	    return false;

	  // Check if we're getting a shuffle mask with zero'd elements.
	  if (!AllowSentinelZero)
	    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
	      return false;

	  // If we have a fake unary shuffle, the shuffle mask is spread across two
	  // inputs that are actually the same node. Re-map the mask to always point
	  // into the first input.
	  if (IsFakeUnary)
	    for (int &M : Mask)
	      if (M >= (int)Mask.size())
	        M -= Mask.size();

	  // If we didn't already add operands in the opcode-specific code, default to
	  // adding 1 or 2 operands starting at 0.
	  if (Ops.empty()) {
	    Ops.push_back(N->getOperand(0));
	    if (!IsUnary || IsFakeUnary)
	      Ops.push_back(N->getOperand(1));
	  }

	  return true;
	}

	/// Decode the target shuffle \p N into \p Mask / \p Ops and then refine the
	/// mask from what is known about the inputs: entries that read an element
	/// known to be zero become SM_SentinelZero, and entries that read an undef
	/// element become SM_SentinelUndef. "Known zero" here means derived from the
	/// inputs themselves, not merely zeroable.
	/// Returns true if the target shuffle mask was decoded.
	static bool setTargetShuffleZeroElements(SDValue N,
	                                         SmallVectorImpl<int> &Mask,
	                                         SmallVectorImpl<SDValue> &Ops) {
	  if (!isTargetShuffle(N.getOpcode()))
	    return false;

	  MVT VT = N.getSimpleValueType();
	  bool IsUnary;
	  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
	    return false;

	  // A unary shuffle uses its single input for both source slots.
	  SDValue V1 = peekThroughBitcasts(Ops[0]);
	  SDValue V2 = peekThroughBitcasts(IsUnary ? Ops[0] : Ops[1]);

	  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
	         "Illegal split of shuffle value type");
	  unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

	  // Try to read each input as a constant, split at mask-element granularity.
	  // Whole undef elements are accepted, partially undef ones are not.
	  APInt UndefSrcElts[2];
	  SmallVector<APInt, 32> SrcEltBits[2];
	  bool IsSrcConstant[2];
	  IsSrcConstant[0] = getTargetConstantBitsFromNode(
	      V1, EltSizeInBits, UndefSrcElts[0], SrcEltBits[0], true, false);
	  IsSrcConstant[1] = getTargetConstantBitsFromNode(
	      V2, EltSizeInBits, UndefSrcElts[1], SrcEltBits[1], true, false);

	  const int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    int M = Mask[i];

	    // Sentinels (SM_SentinelZero / SM_SentinelUndef) are already final.
	    if (M < 0)
	      continue;

	    // Work out which input is referenced and the index within that input.
	    unsigned SrcIdx = M / Size;
	    SDValue V = M < Size ? V1 : V2;
	    M %= Size;

	    // Reading from an UNDEF input.
	    if (V.isUndef()) {
	      Mask[i] = SM_SentinelUndef;
	      continue;
	    }

	    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
	    // TODO: We currently only set UNDEF for integer types - floats use the same
	    // registers as vectors and many of the scalar folded loads rely on the
	    // SCALAR_TO_VECTOR pattern.
	    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	        (Size % V.getValueType().getVectorNumElements()) == 0) {
	      int Scale = Size / V.getValueType().getVectorNumElements();
	      int Idx = M / Scale;
	      if (Idx == 0) {
	        if (X86::isZeroNode(V.getOperand(0)))
	          Mask[i] = SM_SentinelZero;
	      } else if (!VT.isFloatingPoint()) {
	        Mask[i] = SM_SentinelUndef;
	      }
	      continue;
	    }

	    // Otherwise fall back on the input's constant bits, if it has any.
	    if (!IsSrcConstant[SrcIdx])
	      continue;
	    if (UndefSrcElts[SrcIdx][M])
	      Mask[i] = SM_SentinelUndef;
	    else if (SrcEltBits[SrcIdx][M] == 0)
	      Mask[i] = SM_SentinelZero;
	  }

	  assert(VT.getVectorNumElements() == Mask.size() &&
	         "Different mask size from vector size!");
	  return true;
	}

	// Attempt to decode ops that could be represented as a shuffle mask.
	// The decoded shuffle mask may contain a different number of elements to the
	// destination value type.
	//
	// Handles AND/ANDNP with a per-byte constant mask, SCALAR_TO_VECTOR of a
	// vector extract, PINSRB/PINSRW, PACKSS/PACKUS, whole-byte VSHLI/VSRLI and
	// same-size ZERO_EXTEND_VECTOR_INREG/VZEXT. On success the shuffle inputs are
	// returned in Ops and the mask in Mask; both are cleared on entry.
	static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
	                               SmallVectorImpl<SDValue> &Ops,
	                               SelectionDAG &DAG) {
	  Mask.clear();
	  Ops.clear();

	  MVT VT = N.getSimpleValueType();
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned NumSizeInBits = VT.getSizeInBits();
	  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
	  assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
	         "Expected byte aligned value types");

	  unsigned Opcode = N.getOpcode();
	  switch (Opcode) {
	  case ISD::AND:
	  case X86ISD::ANDNP: {
	    // Attempt to decode as a per-byte mask.
	    APInt UndefElts;
	    SmallVector<APInt, 32> EltBits;
	    SDValue N0 = N.getOperand(0);
	    SDValue N1 = N.getOperand(1);
	    bool IsAndN = (X86ISD::ANDNP == Opcode);
	    // Byte value of the constant that forces the result byte to zero.
	    uint64_t ZeroMask = IsAndN ? 255 : 0;
	    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
	      return false;
	    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
	      if (UndefElts[i]) {
	        Mask.push_back(SM_SentinelUndef);
	        continue;
	      }
	      // Only all-zero or all-one bytes can be expressed as a shuffle.
	      uint64_t ByteBits = EltBits[i].getZExtValue();
	      if (ByteBits != 0 && ByteBits != 255)
	        return false;
	      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
	    }
	    Ops.push_back(IsAndN ? N1 : N0);
	    return true;
	  }
	  case ISD::SCALAR_TO_VECTOR: {
	    // Match against a scalar_to_vector of an extract from a vector,
	    // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
	    SDValue N0 = N.getOperand(0);
	    SDValue SrcExtract;

	    if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	         N0.getOperand(0).getValueType() == VT) ||
	        (N0.getOpcode() == X86ISD::PEXTRW &&
	         N0.getOperand(0).getValueType() == MVT::v8i16) ||
	        (N0.getOpcode() == X86ISD::PEXTRB &&
	         N0.getOperand(0).getValueType() == MVT::v16i8)) {
	      SrcExtract = N0;
	    }

	    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
	      return false;

	    SDValue SrcVec = SrcExtract.getOperand(0);
	    EVT SrcVT = SrcVec.getValueType();
	    unsigned NumSrcElts = SrcVT.getVectorNumElements();
	    unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();

	    // A result element narrower than the source element would make the
	    // zero-padding count below wrap around; treat that as not decodable.
	    if (NumBitsPerElt < NumSrcBitsPerElt)
	      return false;
	    unsigned NumZeros = (NumBitsPerElt / NumSrcBitsPerElt) - 1;

	    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
	    if (NumSrcElts <= SrcIdx)
	      return false;

	    // Mask is expressed in source elements: the extracted element, then
	    // zeros for the implicit zero extension, then undef for the rest.
	    Ops.push_back(SrcVec);
	    Mask.push_back(SrcIdx);
	    Mask.append(NumZeros, SM_SentinelZero);
	    Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
	    return true;
	  }
	  case X86ISD::PINSRB:
	  case X86ISD::PINSRW: {
	    SDValue InVec = N.getOperand(0);
	    SDValue InScl = N.getOperand(1);
	    uint64_t InIdx = N.getConstantOperandVal(2);
	    assert(InIdx < NumElts && "Illegal insertion index");

	    // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
	    if (X86::isZeroNode(InScl)) {
	      Ops.push_back(InVec);
	      for (unsigned i = 0; i != NumElts; ++i)
	        Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
	      return true;
	    }

	    // Attempt to recognise a PINSR(PEXTR) shuffle pattern.
	    // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
	    unsigned ExOp =
	        (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
	    if (InScl.getOpcode() != ExOp)
	      return false;

	    SDValue ExVec = InScl.getOperand(0);
	    uint64_t ExIdx = InScl.getConstantOperandVal(1);
	    assert(ExIdx < NumElts && "Illegal extraction index");
	    Ops.push_back(InVec);
	    Ops.push_back(ExVec);
	    // Identity on InVec except the inserted lane, which reads element ExIdx
	    // of the second input.
	    for (unsigned i = 0; i != NumElts; ++i)
	      Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
	    return true;
	  }
	  case X86ISD::PACKSS:
	  case X86ISD::PACKUS: {
	    SDValue N0 = N.getOperand(0);
	    SDValue N1 = N.getOperand(1);
	    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
	           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
	           "Unexpected input value type");

	    // If we know input saturation won't happen we can treat this
	    // as a truncation shuffle.
	    if (Opcode == X86ISD::PACKSS) {
	      if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
	          (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
	        return false;
	    } else {
	      // Unsigned pack: the upper half of every input element must be zero.
	      APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
	      if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
	          (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
	        return false;
	    }

	    bool IsUnary = (N0 == N1);

	    Ops.push_back(N0);
	    if (!IsUnary)
	      Ops.push_back(N1);

	    createPackShuffleMask(VT, Mask, IsUnary);
	    return true;
	  }
	  case X86ISD::VSHLI:
	  case X86ISD::VSRLI: {
	    uint64_t ShiftVal = N.getConstantOperandVal(1);
	    // Out of range bit shifts are guaranteed to be zero.
	    if (NumBitsPerElt <= ShiftVal) {
	      Mask.append(NumElts, SM_SentinelZero);
	      return true;
	    }

	    // We can only decode 'whole byte' bit shifts as shuffles.
	    if ((ShiftVal % 8) != 0)
	      break;

	    uint64_t ByteShift = ShiftVal / 8;
	    unsigned NumBytes = NumSizeInBits / 8;
	    unsigned NumBytesPerElt = NumBitsPerElt / 8;
	    Ops.push_back(N.getOperand(0));

	    // Clear mask to all zeros and insert the shifted byte indices.
	    Mask.append(NumBytes, SM_SentinelZero);

	    if (X86ISD::VSHLI == Opcode) {
	      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
	        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	          Mask[i + j] = i + j - ByteShift;
	    } else {
	      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
	        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	          Mask[i + j - ByteShift] = i + j;
	    }
	    return true;
	  }
	  case ISD::ZERO_EXTEND_VECTOR_INREG:
	  case X86ISD::VZEXT: {
	    // TODO - add support for VPMOVZX with smaller input vector types.
	    SDValue Src = N.getOperand(0);
	    MVT SrcVT = Src.getSimpleValueType();
	    if (NumSizeInBits != SrcVT.getSizeInBits())
	      break;
	    DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
	    Ops.push_back(Src);
	    return true;
	  }
	  }

	  return false;
	}

	/// Drop shuffle inputs that no mask element refers to and renumber \p Mask so
	/// that it indexes the surviving inputs. Mask elements that refer to an UNDEF
	/// input are first rewritten to SM_SentinelUndef.
	static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
	                                              SmallVectorImpl<int> &Mask) {
	  const int MaskWidth = Mask.size();
	  SmallVector<SDValue, 16> Kept;

	  for (const SDValue &Input : Inputs) {
	    // Index window this input occupies, given the inputs kept so far.
	    const int Lo = Kept.size() * MaskWidth;
	    const int Hi = Lo + MaskWidth;
	    auto RefersToInput = [Lo, Hi](int M) { return (Lo <= M) && (M < Hi); };

	    // Strip UNDEF input usage.
	    if (Input.isUndef())
	      for (int &M : Mask)
	        if (RefersToInput(M))
	          M = SM_SentinelUndef;

	    // Still referenced: keep the input and move on.
	    if (any_of(Mask, RefersToInput)) {
	      Kept.push_back(Input);
	      continue;
	    }

	    // Unused: shift every index at or above the window down by one input.
	    for (int &M : Mask)
	      if (Lo <= M)
	        M -= MaskWidth;
	  }

	  Inputs = Kept;
	}

	/// Decode \p Op as a shuffle: first via setTargetShuffleZeroElements (which
	/// also fills in SM_SentinelUndef / SM_SentinelZero), and if that fails via
	/// getFauxShuffleMask. The decoded inputs are then pruned with
	/// resolveTargetShuffleInputsAndMask, in case fewer inputs remain in use.
	/// Returns true if the target shuffle mask was decoded.
	static bool resolveTargetShuffleInputs(SDValue Op,
	                                       SmallVectorImpl<SDValue> &Inputs,
	                                       SmallVectorImpl<int> &Mask,
	                                       SelectionDAG &DAG) {
	  // The faux decode is only attempted when the real target decode fails.
	  if (!setTargetShuffleZeroElements(Op, Mask, Inputs) &&
	      !getFauxShuffleMask(Op, Mask, Inputs, DAG))
	    return false;

	  resolveTargetShuffleInputsAndMask(Inputs, Mask);
	  return true;
	}

	/// Returns the scalar element that will make up the ith
	/// element of the result of the vector shuffle.
	///
	/// Recurses through ISD::VECTOR_SHUFFLE and target shuffle nodes, following
	/// the mask entry for \p Index into the selected input, and finally reads the
	/// element from a SCALAR_TO_VECTOR or BUILD_VECTOR (optionally behind a
	/// bitcast that keeps the element count). \p Depth is the current recursion
	/// depth; the search gives up at depth 6. Returns an empty SDValue when the
	/// element cannot be determined.
	static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
	                                   unsigned Depth) {
	  if (Depth == 6)
	    return SDValue(); // Limit search depth.

	  SDValue V = SDValue(N, 0);
	  EVT VT = V.getValueType();
	  unsigned Opcode = V.getOpcode();

	  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
	  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
	    int Elt = SV->getMaskElt(Index);

	    // Negative mask entries yield an undef scalar.
	    if (Elt < 0)
	      return DAG.getUNDEF(VT.getVectorElementType());

	    unsigned NumElems = VT.getVectorNumElements();
	    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
	                                         : SV->getOperand(1);
	    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
	  }

	  // Recurse into target specific vector shuffles to find scalars.
	  if (isTargetShuffle(Opcode)) {
	    MVT ShufVT = V.getSimpleValueType();
	    MVT ShufSVT = ShufVT.getVectorElementType();
	    int NumElems = (int)ShufVT.getVectorNumElements();
	    SmallVector<int, 16> ShuffleMask;
	    SmallVector<SDValue, 16> ShuffleOps;
	    bool IsUnary;

	    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
	      return SDValue();

	    // Sentinel entries map directly to a zero constant or an undef scalar.
	    int Elt = ShuffleMask[Index];
	    if (Elt == SM_SentinelZero)
	      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
	                                 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
	    if (Elt == SM_SentinelUndef)
	      return DAG.getUNDEF(ShufSVT);

	    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
	    SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
	    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
	                               Depth+1);
	  }

	  // Actual nodes that may contain scalar elements
	  if (Opcode == ISD::BITCAST) {
	    V = V.getOperand(0);
	    EVT SrcVT = V.getValueType();
	    unsigned NumElems = VT.getVectorNumElements();

	    // Only look through bitcasts that keep the element count, so Index
	    // still addresses the same element.
	    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
	      return SDValue();
	  }

	  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
	    return (Index == 0) ? V.getOperand(0)
	                        : DAG.getUNDEF(VT.getVectorElementType());

	  if (V.getOpcode() == ISD::BUILD_VECTOR)
	    return V.getOperand(Index);

	  return SDValue();
	}

	/// Use PINSRB/PINSRW/PINSRD to create a build vector.
	///
	/// \param Op         The BUILD_VECTOR being lowered (v8i16 with SSE2, or
	///                   v16i8 / v4i32 with SSE4.1 - enforced by the assert).
	/// \param NonZeros   Bit i is set when operand i must be inserted.
	/// \param NumNonZero Not read by this function.
	/// \param NumZero    When non-zero, the insertion chain starts from a zero
	///                   vector.
	/// \returns The insertion chain; an empty SDValue when no NonZeros bit is set
	///          for any element.
	static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
	                                        unsigned NumNonZero, unsigned NumZero,
	                                        SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  unsigned NumElts = VT.getVectorNumElements();
	  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
	          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
	         "Illegal vector insertion");

	  SDLoc dl(Op);
	  SDValue V;
	  bool First = true;

	  for (unsigned i = 0; i < NumElts; ++i) {
	    bool IsNonZero = (NonZeros & (1 << i)) != 0;
	    if (!IsNonZero)
	      continue;

	    // If the build vector contains zeros or our first insertion is not the
	    // first index then insert into zero vector to break any register
	    // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
	    if (First) {
	      First = false;
	      if (NumZero || 0 != i)
	        V = getZeroVector(VT, Subtarget, DAG, dl);
	      else {
	        assert(0 == i && "Expected insertion into zero-index");
	        V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
	        V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
	        V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
	        V = DAG.getBitcast(VT, V);
	        // Element 0 is already in place; no INSERT_VECTOR_ELT needed.
	        continue;
	      }
	    }
	    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
	                    DAG.getIntPtrConstant(i, dl));
	  }

	  return V;
	}

	/// Custom lower build_vector of v16i8.
	///
	/// \p NonZeros is a bitmask in which bit i being set means operand i is
	/// inserted. \p NumNonZero and \p NumZero are consulted for the bail-out
	/// check and for choosing a zero vs. undef starting vector.
	/// Returns an empty SDValue when there are more than 8 non-zero elements and
	/// SSE4.1 is unavailable.
	static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
	unsigned NumNonZero, unsigned NumZero,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	if (NumNonZero > 8 && !Subtarget.hasSSE41())
	return SDValue();

	// SSE4.1 - use PINSRB to insert each byte directly.
	if (Subtarget.hasSSE41())
	return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
	Subtarget);

	SDLoc dl(Op);
	SDValue V;
	bool First = true;

	// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
	for (unsigned i = 0; i < 16; ++i) {
	bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
	// On the first non-zero byte, seed the v8i16 accumulator: zero vector if
	// the build vector has zero elements, undef otherwise.
	if (ThisIsNonZero && First) {
	if (NumZero)
	V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
	else
	V = DAG.getUNDEF(MVT::v8i16);
	First = false;
	}

	// Only odd indices do work: bytes (i - 1, i) are combined into one i16.
	if ((i & 1) != 0) {
	// FIXME: Investigate extending to i32 instead of just i16.
	// FIXME: Investigate combining the first 4 bytes as a i32 instead.
	SDValue ThisElt, LastElt;
	bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
	if (LastIsNonZero) {
	LastElt =
	DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
	}
	if (ThisIsNonZero) {
	// The odd byte goes into the upper 8 bits of the i16.
	ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
	ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
	DAG.getConstant(8, dl, MVT::i8));
	if (LastIsNonZero)
	ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
	} else
	ThisElt = LastElt;

	// ThisElt is empty when both bytes of the pair are zero; skip the pair.
	if (ThisElt) {
	if (1 == i) {
	// First pair: build the vector from the scalar via
	// SCALAR_TO_VECTOR + VZEXT_MOVL instead of inserting into V.
	V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
	: DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
	V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
	V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
	V = DAG.getBitcast(MVT::v8i16, V);
	} else {
	// Word index is half the odd byte index.
	V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
	DAG.getIntPtrConstant(i / 2, dl));
	}
	}
	}
	}

	// The result was assembled as v8i16; reinterpret it as v16i8.
	return DAG.getBitcast(MVT::v16i8, V);
	}

	/// Custom lower build_vector of v8i16.
	///
	/// Delegates to LowerBuildVectorAsInsert unless more than four elements are
	/// non-zero and SSE4.1 is unavailable, in which case an empty SDValue is
	/// returned.
	static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
	                                     unsigned NumNonZero, unsigned NumZero,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // Use PINSRW to insert each word directly.
	  if (NumNonZero <= 4 || Subtarget.hasSSE41())
	    return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
	                                    Subtarget);

	  return SDValue();
	}

	/// Custom lower build_vector of v4i32 or v4f32.
	///
	/// Handles build vectors whose elements are each either zeroable (undef or
	/// zero) or an EXTRACT_VECTOR_ELT with a constant index from a 128-bit
	/// vector. Two lowerings are attempted:
	///  - a shuffle of the source vector with a zero vector, when every
	///    non-zeroable element i is element i of the same source vector;
	///  - an X86ISD::INSERTPS node (SSE4.1 only) otherwise.
	/// Returns an empty SDValue when neither applies.
	static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // Find all zeroable elements.
	  std::bitset<4> Zeroable;
	  for (int i=0; i < 4; ++i) {
	    SDValue Elt = Op->getOperand(i);
	    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
	  }
	  assert(Zeroable.size() - Zeroable.count() > 1 &&
	         "We expect at least two non-zero elements!");

	  // We only know how to deal with build_vector nodes where elements are either
	  // zeroable or extract_vector_elt with constant index.
	  SDValue FirstNonZero;
	  // Initialized only to keep the variable well-defined on every path; it is
	  // always overwritten together with FirstNonZero before being read.
	  unsigned FirstNonZeroIdx = 0;
	  for (unsigned i=0; i < 4; ++i) {
	    if (Zeroable[i])
	      continue;
	    SDValue Elt = Op->getOperand(i);
	    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	        !isa<ConstantSDNode>(Elt.getOperand(1)))
	      return SDValue();
	    // Make sure that this node is extracting from a 128-bit vector.
	    MVT VT = Elt.getOperand(0).getSimpleValueType();
	    if (!VT.is128BitVector())
	      return SDValue();
	    if (!FirstNonZero.getNode()) {
	      FirstNonZero = Elt;
	      FirstNonZeroIdx = i;
	    }
	  }

	  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
	  SDValue V1 = FirstNonZero.getOperand(0);
	  MVT VT = V1.getSimpleValueType();

	  // See if this build_vector can be lowered as a blend with zero.
	  SDValue Elt;
	  // EltMaskIdx is assigned before the only 'break' below, which is the sole
	  // way to reach its later use; the initializer just avoids an indeterminate
	  // value.
	  unsigned EltMaskIdx = 0, EltIdx;
	  int Mask[4];
	  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
	    if (Zeroable[EltIdx]) {
	      // The zero vector will be on the right hand side.
	      Mask[EltIdx] = EltIdx+4;
	      continue;
	    }

	    Elt = Op->getOperand(EltIdx);
	    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
	    EltMaskIdx = Elt.getConstantOperandVal(1);
	    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
	      break;
	    Mask[EltIdx] = EltIdx;
	  }

	  if (EltIdx == 4) {
	    // Let the shuffle legalizer deal with blend operations.
	    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
	    if (V1.getSimpleValueType() != VT)
	      V1 = DAG.getBitcast(VT, V1);
	    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
	  }

	  // See if we can lower this build_vector to a INSERTPS.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  // Elt is the element that broke the blend pattern; it becomes the inserted
	  // scalar and its source vector becomes V2.
	  SDValue V2 = Elt.getOperand(0);
	  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
	    V1 = SDValue();

	  // Every remaining non-zeroable element must be element i of one common
	  // source vector (adopted as V1 if V1 was cleared above).
	  bool CanFold = true;
	  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
	    if (Zeroable[i])
	      continue;

	    SDValue Current = Op->getOperand(i);
	    SDValue SrcVector = Current->getOperand(0);
	    if (!V1.getNode())
	      V1 = SrcVector;
	    CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
	  }

	  if (!CanFold)
	    return SDValue();

	  assert(V1.getNode() && "Expected at least two non-zero elements!");
	  if (V1.getSimpleValueType() != MVT::v4f32)
	    V1 = DAG.getBitcast(MVT::v4f32, V1);
	  if (V2.getSimpleValueType() != MVT::v4f32)
	    V2 = DAG.getBitcast(MVT::v4f32, V2);

	  // Ok, we can emit an INSERTPS instruction.
	  unsigned ZMask = Zeroable.to_ulong();

	  // Immediate layout: source element in bits [7:6], destination element in
	  // bits [5:4], zero mask in bits [3:0].
	  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
	  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	  SDLoc DL(Op);
	  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
	                               DAG.getIntPtrConstant(InsertPSMask, DL));
	  return DAG.getBitcast(VT, Result);
	}

	/// Return a vector logical shift node.
	///
	/// Shifts the whole 128-bit value \p SrcOp left (\p isLeft) or right by
	/// \p NumBits, which must be a multiple of 8. The shift is performed on a
	/// v16i8 view of the operand and the result is bitcast back to \p VT.
	static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
	                         SelectionDAG &DAG, const TargetLowering &TLI,
	                         const SDLoc &dl) {
	  assert(VT.is128BitVector() && "Unknown type for VShift");
	  assert(NumBits % 8 == 0 && "Only support byte sized shifts");

	  const MVT ByteVecVT = MVT::v16i8;
	  SDValue Bytes = DAG.getBitcast(ByteVecVT, SrcOp);

	  // The shift amount is expressed in bytes.
	  MVT AmtVT = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	  SDValue Amt = DAG.getConstant(NumBits/8, dl, AmtVT);

	  unsigned ShiftOpc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
	  SDValue Shifted = DAG.getNode(ShiftOpc, dl, ByteVecVT, Bytes, Amt);
	  return DAG.getBitcast(VT, Shifted);
	}

	/// Tries to lower a splat of the scalar load \p SrcOp as a full-width vector
	/// load followed by a splat shuffle.
	///
	/// Only non-volatile normal loads of i32/f32 whose address is a frame index,
	/// or a frame index plus a constant offset, are handled. The stack object's
	/// alignment may be raised to the vector size in bytes when it is not a fixed
	/// object. Returns an empty SDValue if the transformation does not apply.
	static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
	                                      SelectionDAG &DAG) {

	  // Check if the scalar load can be widened into a vector load. And if
	  // the address is "base + cst" see if the cst can be "absorbed" into
	  // the shuffle mask.
	  LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp);
	  if (!LD)
	    return SDValue();

	  SDValue Ptr = LD->getBasePtr();
	  if (!ISD::isNormalLoad(LD) || LD->isVolatile())
	    return SDValue();
	  EVT PVT = LD->getValueType(0);
	  if (PVT != MVT::i32 && PVT != MVT::f32)
	    return SDValue();

	  // Split the address into a frame index and a constant byte offset.
	  int FI = -1;
	  int64_t Offset = 0;
	  if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
	    FI = FINode->getIndex();
	  } else if (DAG.isBaseWithConstantOffset(Ptr) &&
	             isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
	    FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
	    Offset = Ptr.getConstantOperandVal(1);
	    Ptr = Ptr.getOperand(0);
	  } else {
	    return SDValue();
	  }

	  // FIXME: 256-bit vector instructions don't require a strict alignment,
	  // improve this code to support it better.
	  unsigned RequiredAlign = VT.getSizeInBits()/8;
	  SDValue Chain = LD->getChain();
	  // Make sure the stack object alignment is at least 16 or 32.
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
	    if (MFI.isFixedObjectIndex(FI)) {
	      // Can't change the alignment. FIXME: It's possible to compute
	      // the exact stack offset and reference FI + adjust offset instead.
	      // If someone really cares about this. That's the way to implement it.
	      return SDValue();
	    } else {
	      MFI.setObjectAlignment(FI, RequiredAlign);
	    }
	  }

	  // (Offset % RequiredAlign) must be a multiple of 4. The vector load address
	  // is then Ptr + (Offset & ~(RequiredAlign - 1)).
	  if (Offset < 0)
	    return SDValue();
	  if ((Offset % RequiredAlign) & 3)
	    return SDValue();
	  int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
	  if (StartOffset) {
	    SDLoc DL(Ptr);
	    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
	                      DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
	  }

	  // Index of the original 4-byte scalar within the widened load.
	  int EltNo = (Offset - StartOffset) >> 2;
	  unsigned NumElems = VT.getVectorNumElements();

	  EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
	  SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
	                           LD->getPointerInfo().getWithOffset(StartOffset));

	  // Splat the loaded element across all lanes.
	  SmallVector<int, 8> Mask(NumElems, EltNo);

	  return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
	}

	/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
	/// elements can be replaced by a single large load which has the same value as
	/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
	///
	/// Example: <load i32 a, load i32 a+4, zero, undef> -> zextload a
	///
	/// Each element (looked at through bitcasts) must be undef, zero, or a
	/// non-extending load; anything else makes the function return an empty
	/// SDValue. Depending on the pattern the result is UNDEF, a zero constant, a
	/// single wide load (optionally shuffled with zero), or an X86ISD::VZEXT_LOAD.
	static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
	                                        const SDLoc &DL, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget,
	                                        bool isAfterLegalize) {
	  unsigned NumElems = Elts.size();

	  int LastLoadedElt = -1;
	  SmallBitVector LoadMask(NumElems, false);
	  SmallBitVector ZeroMask(NumElems, false);
	  SmallBitVector UndefMask(NumElems, false);

	  // For each element in the initializer, see if we've found a load, zero or an
	  // undef.
	  for (unsigned i = 0; i < NumElems; ++i) {
	    SDValue Elt = peekThroughBitcasts(Elts[i]);
	    if (!Elt.getNode())
	      return SDValue();

	    if (Elt.isUndef())
	      UndefMask[i] = true;
	    else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
	      ZeroMask[i] = true;
	    else if (ISD::isNON_EXTLoad(Elt.getNode())) {
	      LoadMask[i] = true;
	      LastLoadedElt = i;
	      // Each loaded element must be the correct fractional portion of the
	      // requested vector load.
	      if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
	        return SDValue();
	    } else
	      return SDValue();
	  }
	  assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
	         "Incomplete element masks");

	  // Handle Special Cases - all undef or undef/zero.
	  if (UndefMask.count() == NumElems)
	    return DAG.getUNDEF(VT);

	  // FIXME: Should we return this as a BUILD_VECTOR instead?
	  if ((ZeroMask | UndefMask).count() == NumElems)
	    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
	                          : DAG.getConstantFP(0.0, DL, VT);

	  // At this point at least one element is a load.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  int FirstLoadedElt = LoadMask.find_first();
	  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
	  LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
	  EVT LDBaseVT = EltBase.getValueType();

	  // Consecutive loads can contain UNDEFS but not ZERO elements.
	  // Consecutive loads with UNDEFs and ZEROs elements require a
	  // an additional shuffle stage to clear the ZERO elements.
	  bool IsConsecutiveLoad = true;
	  bool IsConsecutiveLoadWithZeros = true;
	  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
	    if (LoadMask[i]) {
	      SDValue Elt = peekThroughBitcasts(Elts[i]);
	      LoadSDNode *LD = cast<LoadSDNode>(Elt);
	      if (!DAG.areNonVolatileConsecutiveLoads(
	              LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
	              i - FirstLoadedElt)) {
	        IsConsecutiveLoad = false;
	        IsConsecutiveLoadWithZeros = false;
	        break;
	      }
	    } else if (ZeroMask[i]) {
	      IsConsecutiveLoad = false;
	    }
	  }

	  // Collect every participating load so the merged load can inherit their
	  // memory ordering.
	  SmallVector<LoadSDNode *, 8> Loads;
	  for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
	    if (LoadMask[i])
	      Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));

	  auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
	    auto MMOFlags = LDBase->getMemOperand()->getFlags();
	    assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
	           "Cannot merge volatile loads.");
	    SDValue NewLd =
	        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
	                    LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
	    for (auto *LD : Loads)
	      DAG.makeEquivalentMemoryOrdering(LD, NewLd);
	    return NewLd;
	  };

	  // LOAD - all consecutive load/undefs (must start/end with a load).
	  // If we have found an entire vector of loads and undefs, then return a large
	  // load of the entire vector width starting at the base pointer.
	  // If the vector contains zeros, then attempt to shuffle those elements.
	  if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
	      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
	    assert(LDBase && "Did not find base load for merging consecutive loads");
	    EVT EltVT = LDBase->getValueType(0);
	    // Ensure that the input vector size for the merged loads matches the
	    // cumulative size of the input elements.
	    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
	      return SDValue();

	    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
	      return SDValue();

	    // Don't create 256-bit non-temporal aligned loads without AVX2 as these
	    // will lower to regular temporal loads and use the cache.
	    if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
	        VT.is256BitVector() && !Subtarget.hasInt256())
	      return SDValue();

	    if (IsConsecutiveLoad)
	      return CreateLoad(VT, LDBase);

	    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
	    // vector and a zero vector to clear out the zero elements.
	    if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
	      SmallVector<int, 4> ClearMask(NumElems, -1);
	      for (unsigned i = 0; i < NumElems; ++i) {
	        if (ZeroMask[i])
	          ClearMask[i] = i + NumElems;
	        else if (LoadMask[i])
	          ClearMask[i] = i;
	      }
	      SDValue V = CreateLoad(VT, LDBase);
	      SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
	                                 : DAG.getConstantFP(0.0, DL, VT);
	      return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
	    }
	  }

	  // Size in bits spanned from the first to the last loaded element.
	  int LoadSize =
	      (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();

	  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
	  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
	      (LoadSize == 32 || LoadSize == 64) &&
	      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
	    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
	                                      : MVT::getIntegerVT(LoadSize);
	    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
	    if (TLI.isTypeLegal(VecVT)) {
	      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
	      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
	      SDValue ResNode =
	          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
	                                  LDBase->getPointerInfo(),
	                                  LDBase->getAlignment(),
	                                  MachineMemOperand::MOLoad);
	      for (auto *LD : Loads)
	        DAG.makeEquivalentMemoryOrdering(LD, ResNode);
	      return DAG.getBitcast(VT, ResNode);
	    }
	  }

	  return SDValue();
	}

	/// Builds an IR constant vector from the low \p SplatBitSize bits of
	/// \p SplatValue.
	///
	/// The bits are cut into pieces of VT's scalar size; each piece becomes one
	/// element, created as a float/double when \p VT is floating point (only
	/// 32- and 64-bit scalars are supported there) and as an integer otherwise.
	static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
	                                   unsigned SplatBitSize, LLVMContext &C) {
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  const unsigned EltCount = SplatBitSize / EltBits;

	  SmallVector<Constant *, 32> Elts;
	  for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
	    APInt Bits = SplatValue.extractBits(EltBits, EltBits * Idx);

	    if (!VT.isFloatingPoint()) {
	      Elts.push_back(
	          Constant::getIntegerValue(Type::getIntNTy(C, EltBits), Bits));
	      continue;
	    }

	    if (EltBits == 32) {
	      Elts.push_back(ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Bits)));
	      continue;
	    }

	    assert(EltBits == 64 && "Unsupported floating point scalar size");
	    Elts.push_back(ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Bits)));
	  }
	  return ConstantVector::get(ArrayRef<Constant *>(Elts));
	}

	/// Returns true if any user of \p N is a target shuffle, looking through
	/// chains of bitcasts.
	static bool isUseOfShuffle(SDNode *N) {
	  for (auto *U : N->uses()) {
	    if (isTargetShuffle(U->getOpcode()))
	      return true;
	    // Look through bitcasts, but keep scanning the remaining users when this
	    // bitcast does not lead to a shuffle; returning its result unconditionally
	    // would make the answer depend on the order of the use list.
	    if (U->getOpcode() == ISD::BITCAST && isUseOfShuffle(U))
	      return true;
	  }
	  return false;
	}

	/// Check if the current node of build vector is a zero extended vector.
	/// If so, return the value extended.
	/// For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
	///
	/// The code below looks for the value in operand 0 and then at every
	/// multiple of the repeat distance, with all other operands being zero or
	/// undef.
	///
	/// \param Op      The build vector to inspect.
	/// \param NumElt  Set on success to the number of zero extended identical
	///                values.
	/// \param EltType Set on success to the integer type of the value including
	///                the zero extend.
	/// \returns The repeated value, or an empty SDValue if the pattern does not
	///          match (in which case NumElt and EltType are left untouched).
	static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
	                                   unsigned &NumElt, MVT &EltType) {
	  SDValue ExtValue = Op->getOperand(0);
	  unsigned NumElts = Op->getNumOperands();
	  unsigned Delta = NumElts;

	  // Find the distance to the first repetition of operand 0; everything in
	  // between must be zero or undef.
	  for (unsigned i = 1; i < NumElts; i++) {
	    if (Op->getOperand(i) == ExtValue) {
	      Delta = i;
	      break;
	    }
	    if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
	      return SDValue();
	  }
	  if (!isPowerOf2_32(Delta) || Delta == 1)
	    return SDValue();

	  // Verify the same period holds for the rest of the vector.
	  for (unsigned i = Delta; i < NumElts; i++) {
	    if (i % Delta == 0) {
	      if (Op->getOperand(i) != ExtValue)
	        return SDValue();
	    } else if (!(isNullConstant(Op->getOperand(i)) ||
	                 Op->getOperand(i).isUndef()))
	      return SDValue();
	  }
	  unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
	  unsigned ExtVTSize = EltSize * Delta;
	  EltType = MVT::getIntegerVT(ExtVTSize);
	  NumElt = NumElts / Delta;
	  return ExtValue;
	}

	/// Attempt to use the vbroadcast instruction to generate a splat value
	/// from a splat BUILD_VECTOR which uses:
	/// a. A single scalar load, or a constant.
	/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
	///
	/// The VBROADCAST node is returned when a pattern is found,
	/// or SDValue() otherwise.
	static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  // VBROADCAST requires AVX.
	  // TODO: Splats could be generated for non-AVX CPUs using SSE
	  // instructions, but there's less potential gain for only 128-bit vectors.
	  if (!Subtarget.hasAVX())
	    return SDValue();

	  MVT VT = BVOp->getSimpleValueType(0);
	  SDLoc dl(BVOp);

	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
	         "Unsupported vector type for broadcast.");

	  BitVector UndefElements;
	  SDValue Ld = BVOp->getSplatValue(&UndefElements);

	  // Attempt to use VBROADCASTM
	  // From this pattern:
	  // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
	  // b. t1 = (build_vector t0 t0)
	  //
	  // Create (VBROADCASTM v2i1 X)
	  if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
	    MVT EltType = VT.getScalarType();
	    unsigned NumElts = VT.getVectorNumElements();
	    SDValue BOperand;
	    // On success this overwrites NumElts / EltType with the widened element
	    // count and type.
	    SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
	    if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
	        (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
	         Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
	      if (ZeroExtended)
	        BOperand = ZeroExtended.getOperand(0);
	      else
	        BOperand = Ld.getOperand(0).getOperand(0);
	      if (BOperand.getValueType().isVector() &&
	          BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
	        if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
	                                     NumElts == 8)) || // for broadcastmb2q
	            (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
	                                     NumElts == 16))) { // for broadcastmw2d
	          SDValue Brdcst =
	              DAG.getNode(X86ISD::VBROADCASTM, dl,
	                          MVT::getVectorVT(EltType, NumElts), BOperand);
	          return DAG.getBitcast(VT, Brdcst);
	        }
	      }
	    }
	  }

	  // We need a splat of a single value to use broadcast, and it doesn't
	  // make any sense if the value is only in one element of the vector.
	  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
	    APInt SplatValue, Undef;
	    unsigned SplatBitSize;
	    bool HasUndef;
	    // Check if this is a repeated constant pattern suitable for broadcasting.
	    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
	        SplatBitSize > VT.getScalarSizeInBits() &&
	        SplatBitSize < VT.getSizeInBits()) {
	      // Avoid replacing with broadcast when it's a use of a shuffle
	      // instruction to preserve the present custom lowering of shuffles.
	      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
	        return SDValue();
	      // replace BUILD_VECTOR with broadcast of the repeated constants.
	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      LLVMContext *Ctx = DAG.getContext();
	      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
	      // AVX is guaranteed here by the early return at the top of the function.
	      if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
	          !(SplatBitSize == 64 && Subtarget.is32Bit())) {
	        // Splatted value can fit in one INTEGER constant in constant pool.
	        // Load the constant and broadcast it.
	        MVT CVT = MVT::getIntegerVT(SplatBitSize);
	        Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
	        Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
	        SDValue CP = DAG.getConstantPool(C, PVT);
	        unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

	        unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	        Ld = DAG.getLoad(
	            CVT, dl, DAG.getEntryNode(), CP,
	            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	            Alignment);
	        SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
	                                     MVT::getVectorVT(CVT, Repeat), Ld);
	        return DAG.getBitcast(VT, Brdcst);
	      } else if (SplatBitSize == 32 || SplatBitSize == 64) {
	        // Splatted value can fit in one FLOAT constant in constant pool.
	        // Load the constant and broadcast it.
	        // AVX have support for 32 and 64 bit broadcast for floats only.
	        // No 64bit integer in 32bit subtarget.
	        MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
	        // Lower the splat via APFloat directly, to avoid any conversion.
	        Constant *C =
	            SplatBitSize == 32
	                ? ConstantFP::get(*Ctx,
	                                  APFloat(APFloat::IEEEsingle(), SplatValue))
	                : ConstantFP::get(*Ctx,
	                                  APFloat(APFloat::IEEEdouble(), SplatValue));
	        SDValue CP = DAG.getConstantPool(C, PVT);
	        unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

	        unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	        Ld = DAG.getLoad(
	            CVT, dl, DAG.getEntryNode(), CP,
	            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	            Alignment);
	        SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
	                                     MVT::getVectorVT(CVT, Repeat), Ld);
	        return DAG.getBitcast(VT, Brdcst);
	      } else if (SplatBitSize > 64) {
	        // Load the vector of constants and broadcast it.
	        MVT CVT = VT.getScalarType();
	        Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
	                                           *Ctx);
	        SDValue VCP = DAG.getConstantPool(VecC, PVT);
	        unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
	        unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
	        Ld = DAG.getLoad(
	            MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
	            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	            Alignment);
	        SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
	        return DAG.getBitcast(VT, Brdcst);
	      }
	    }
	    return SDValue();
	  }

	  bool ConstSplatVal =
	      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);

	  // Make sure that all of the users of a non-constant load are from the
	  // BUILD_VECTOR node.
	  if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
	    return SDValue();

	  unsigned ScalarSize = Ld.getValueSizeInBits();
	  bool IsGE256 = (VT.getSizeInBits() >= 256);

	  // When optimizing for size, generate up to 5 extra bytes for a broadcast
	  // instruction to save 8 or more bytes of constant pool data.
	  // TODO: If multiple splats are generated to load the same constant,
	  // it may be detrimental to overall size. There needs to be a way to detect
	  // that condition to know if this is truly a size win.
	  bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();

	  // Handle broadcasting a single constant scalar from the constant pool
	  // into a vector.
	  // On Sandybridge (no AVX2), it is still better to load a constant vector
	  // from the constant pool and not to broadcast it from a scalar.
	  // But override that restriction when optimizing for size.
	  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
	  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
	    EVT CVT = Ld.getValueType();
	    assert(!CVT.isVector() && "Must not broadcast a vector type");

	    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
	    // For size optimization, also splat v2f64 and v2i64, and for size opt
	    // with AVX2, also splat i8 and i16.
	    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
	    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
	        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
	      const Constant *C = nullptr;
	      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
	        C = CI->getConstantIntValue();
	      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
	        C = CF->getConstantFPValue();

	      assert(C && "Invalid constant type");

	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      SDValue CP =
	          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
	      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	      Ld = DAG.getLoad(
	          CVT, dl, DAG.getEntryNode(), CP,
	          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	          Alignment);

	      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
	    }
	  }

	  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

	  // Handle AVX2 in-register broadcasts.
	  if (!IsLoad && Subtarget.hasInt256() &&
	      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
	    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

	  // The scalar source must be a normal load.
	  if (!IsLoad)
	    return SDValue();

	  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
	      (Subtarget.hasVLX() && ScalarSize == 64))
	    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

	  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
	  // double since there is no vbroadcastsd xmm
	  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
	    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
	      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
	  }

	  // Unsupported broadcast.
	  return SDValue();
	}

	/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
	/// underlying vector and index.
	///
	/// When \p ExtractedFromVec is a vector shuffle and the mask entry selected
	/// by \p ExtIdx is undef or refers to the shuffle's first operand,
	/// \p ExtractedFromVec is replaced by that operand and the mask entry is
	/// returned. In every other case \p ExtractedFromVec is left alone and the
	/// original constant index is returned.
	static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
	                                         SDValue ExtIdx) {
	  const int ExtractIdx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();

	  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(ExtractedFromVec);
	  if (!Shuffle)
	    return ExtractIdx;

	  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
	  // lowered this:
	  //   (extract_vector_elt (v8f32 %1), Constant<6>)
	  // to:
	  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
	  //                           (extract_subvector (v8f32 %0), Constant<4>),
	  //                           undef)
	  //                       Constant<0>)
	  // In this case the vector is the extract_subvector expression and the index
	  // is 2, as specified by the shuffle.
	  SDValue Src = Shuffle->getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();
	  assert(SrcVT.getVectorElementType() ==
	         ExtractedFromVec.getSimpleValueType().getVectorElementType());

	  const int MaskIdx = Shuffle->getMaskElt(ExtractIdx);
	  if (!isUndefOrInRange(MaskIdx, 0, SrcVT.getVectorNumElements()))
	    return ExtractIdx;

	  ExtractedFromVec = Src;
	  return MaskIdx;
	}

	/// Lowers a build vector whose operands are mostly constant-index
	/// EXTRACT_VECTOR_ELTs from at most two vectors of the same type as a vector
	/// shuffle, followed by INSERT_VECTOR_ELT nodes for the few remaining
	/// operands. Returns an empty SDValue when the pattern does not match.
	static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
	MVT VT = Op.getSimpleValueType();

	// Skip if insert_vec_elt is not supported.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
	return SDValue();

	SDLoc DL(Op);
	unsigned NumElems = Op.getNumOperands();

	// The (up to) two shuffle sources, the operand positions that need an
	// explicit insert, and the shuffle mask (undef by default).
	SDValue VecIn1;
	SDValue VecIn2;
	SmallVector<unsigned, 4> InsertIndices;
	SmallVector<int, 8> Mask(NumElems, -1);

	for (unsigned i = 0; i != NumElems; ++i) {
	unsigned Opc = Op.getOperand(i).getOpcode();

	if (Opc == ISD::UNDEF)
	continue;

	if (Opc != ISD::EXTRACT_VECTOR_ELT) {
	// Quit once two elements are already queued for insertion. The size is
	// checked before the push_back, so up to two insertions are accepted.
	if (InsertIndices.size() > 1)
	return SDValue();

	InsertIndices.push_back(i);
	continue;
	}

	SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
	SDValue ExtIdx = Op.getOperand(i).getOperand(1);

	// Quit if non-constant index.
	if (!isa<ConstantSDNode>(ExtIdx))
	return SDValue();
	// May replace ExtractedFromVec with the first operand of a shuffle.
	int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

	// Quit if extracted from vector of different type.
	if (ExtractedFromVec.getValueType() != VT)
	return SDValue();

	if (!VecIn1.getNode())
	VecIn1 = ExtractedFromVec;
	else if (VecIn1 != ExtractedFromVec) {
	if (!VecIn2.getNode())
	VecIn2 = ExtractedFromVec;
	else if (VecIn2 != ExtractedFromVec)
	// Quit if more than 2 vectors to shuffle
	return SDValue();
	}

	// Elements of the second source are numbered after those of the first.
	if (ExtractedFromVec == VecIn1)
	Mask[i] = Idx;
	else if (ExtractedFromVec == VecIn2)
	Mask[i] = Idx + NumElems;
	}

	// Nothing to shuffle from.
	if (!VecIn1.getNode())
	return SDValue();

	VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
	SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

	// Patch in the operands that were not extracts.
	for (unsigned Idx : InsertIndices)
	NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
	DAG.getIntPtrConstant(Idx, DL));

	return NV;
	}

	/// Packs a build vector of constant i1 elements into one integer constant.
	///
	/// Bit idx of the result is the low bit of operand idx; undef operands
	/// contribute 0. The constant's type is an integer whose width is the value
	/// size of \p Op in bits, but at least 8.
	static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
	  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
	         Op.getScalarValueSizeInBits() == 1 &&
	         "Can not convert non-constant vector");
	  uint64_t Immediate = 0;
	  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
	    SDValue In = Op.getOperand(idx);
	    if (!In.isUndef())
	      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
	  }
	  SDLoc dl(Op);
	  MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
	  return DAG.getConstant(Immediate, dl, VT);
	}
	// Lower a BUILD_VECTOR whose element type is i1 (mask vector).
	//
	// Strategy, in order:
	//  1. All-zeros and all-ones build_vectors are returned unchanged.
	//  2. All-constant vectors are packed into an integer immediate and bitcast
	//     to the mask type (v64i1 on a non-64-bit target is split into two v32i1
	//     halves first).
	//  3. A splat of a single non-constant value becomes a select between an
	//     all-ones and an all-zeros vector.
	//  4. Otherwise the constant elements form a base vector and every
	//     non-constant element is inserted with INSERT_VECTOR_ELT.
	static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {

	MVT VT = Op.getSimpleValueType();
	assert((VT.getVectorElementType() == MVT::i1) &&
	"Unexpected type in LowerBUILD_VECTORvXi1!");

	SDLoc dl(Op);
	// All-zeros and all-ones vectors are left as they are.
	if (ISD::isBuildVectorAllZeros(Op.getNode()))
	return Op;

	if (ISD::isBuildVectorAllOnes(Op.getNode()))
	return Op;

	// Every operand is a constant (or undef): materialize through an integer.
	if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
	if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	// Split the pieces.
	SDValue Lower =
	DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
	SDValue Upper =
	DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
	// We have to manually lower both halves so getNode doesn't try to
	// reassemble the build_vector.
	Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
	Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
	return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
	}
	SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
	// Same bit width: a plain bitcast is enough.
	if (Imm.getValueSizeInBits() == VT.getSizeInBits())
	return DAG.getBitcast(VT, Imm);
	// The immediate was widened to 8 bits; go through v8i1 and take the low
	// subvector of the requested type.
	SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	DAG.getIntPtrConstant(0, dl));
	}

	// Vector has one or more non-const elements
	uint64_t Immediate = 0;
	SmallVector<unsigned, 16> NonConstIdx;
	bool IsSplat = true;
	bool HasConstElts = false;
	int SplatIdx = -1;
	// One pass over the operands: gather constant bits into Immediate, remember
	// the positions of non-constant operands, and check whether all non-undef
	// operands are the same value as the first non-undef one.
	for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
	SDValue In = Op.getOperand(idx);
	if (In.isUndef())
	continue;
	if (!isa<ConstantSDNode>(In))
	NonConstIdx.push_back(idx);
	else {
	Immediate \|= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
	HasConstElts = true;
	}
	if (SplatIdx < 0)
	SplatIdx = idx;
	else if (In != Op.getOperand(SplatIdx))
	IsSplat = false;
	}

	// for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
	if (IsSplat)
	return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
	DAG.getConstant(1, dl, VT),
	DAG.getConstant(0, dl, VT));

	// insert elements one by one
	SDValue DstVec;
	SDValue Imm;
	// Pick the base value: the packed constant bits, an all-zero vector when
	// the only constants are zero, or UNDEF when there are no constants.
	// NOTE(review): unlike the all-constant path above, a non-zero Immediate is
	// emitted here as an integer of the full mask width with no v64i1 split for
	// non-64-bit targets - confirm that case is handled elsewhere.
	if (Immediate) {
	MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
	Imm = DAG.getConstant(Immediate, dl, ImmVT);
	}
	else if (HasConstElts)
	Imm = DAG.getConstant(0, dl, VT);
	else
	Imm = DAG.getUNDEF(VT);
	if (Imm.getValueSizeInBits() == VT.getSizeInBits())
	DstVec = DAG.getBitcast(VT, Imm);
	else {
	// Base value is wider than the mask type: bitcast to v8i1 and extract the
	// low subvector.
	SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
	DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	DAG.getIntPtrConstant(0, dl));
	}

	// Patch each non-constant operand into its original lane.
	for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
	unsigned InsertIdx = NonConstIdx[i];
	DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
	Op.getOperand(InsertIdx),
	DAG.getIntPtrConstant(InsertIdx, dl));
	}
	return DstVec;
	}

	/// \brief Return true if \p N implements a horizontal binop and return the
	/// operands for the horizontal binop into V0 and V1.
	///
	/// This is a helper function of LowerToHorizontalOp().
	/// This function checks that the build_vector \p N in input implements a
	/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
	/// operation to match.
	/// For example, if \p Opcode is equal to ISD::ADD, then this function
	/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
	/// is equal to ISD::SUB, then this function checks if this is a horizontal
	/// arithmetic sub.
	///
	/// This function only analyzes elements of \p N whose indices are
	/// in range [BaseIdx, LastIdx).
	///
	/// \p V0 and \p V1 are reset to UNDEF on entry. Elements in the first half of
	/// the analyzed range must all extract from one vector, which is stored in
	/// \p V0; elements in the second half must all extract from one vector, which
	/// is stored in \p V1. An output stays UNDEF when its half contains only UNDEF
	/// elements. Every matched element must have exactly one use.
	static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
	SelectionDAG &DAG,
	unsigned BaseIdx, unsigned LastIdx,
	SDValue &V0, SDValue &V1) {
	EVT VT = N->getValueType(0);

	assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
	assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
	"Invalid Vector in input!");

	// For commutable opcodes the two extracts may appear in either order.
	bool IsCommutable = (Opcode == ISD::ADD \|\| Opcode == ISD::FADD);
	bool CanFold = true;
	// Index of the first of the two adjacent source lanes the next element is
	// expected to combine.
	unsigned ExpectedVExtractIdx = BaseIdx;
	unsigned NumElts = LastIdx - BaseIdx;
	V0 = DAG.getUNDEF(VT);
	V1 = DAG.getUNDEF(VT);

	// Check if N implements a horizontal binop.
	for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
	SDValue Op = N->getOperand(i + BaseIdx);

	// Skip UNDEFs.
	if (Op->isUndef()) {
	// Update the expected vector extract index.
	// At the midpoint the source switches from V0 to V1, so lane counting
	// restarts at BaseIdx.
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	ExpectedVExtractIdx += 2;
	continue;
	}

	CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

	if (!CanFold)
	break;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
	CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op0.getOperand(0) == Op1.getOperand(0) &&
	isa<ConstantSDNode>(Op0.getOperand(1)) &&
	isa<ConstantSDNode>(Op1.getOperand(1)));
	if (!CanFold)
	break;

	unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
	unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

	// The first non-undef element of each half fixes that half's source
	// vector, which must have the same type as the build_vector.
	if (i * 2 < NumElts) {
	if (V0.isUndef()) {
	V0 = Op0.getOperand(0);
	if (V0.getValueType() != VT)
	return false;
	}
	} else {
	if (V1.isUndef()) {
	V1 = Op0.getOperand(0);
	if (V1.getValueType() != VT)
	return false;
	}
	// Entering the second half: restart lane counting.
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	}

	// The extracts must read the expected adjacent lanes of the source vector
	// recorded for this half.
	SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
	if (I0 == ExpectedVExtractIdx)
	CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
	else if (IsCommutable && I1 == ExpectedVExtractIdx) {
	// Try to match the following dag sequence:
	// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
	CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
	} else
	CanFold = false;

	ExpectedVExtractIdx += 2;
	}

	return CanFold;
	}

	/// \brief Build a 256-bit horizontal add/sub out of two 128-bit horizontal
	/// binops joined by a concat_vectors.
	///
	/// This is a helper function of LowerToHorizontalOp().
	/// \p V0 and \p V1 must be 256-bit vectors of the same type. Each is split
	/// into a low and a high 128-bit half, and the halves are fed to two
	/// \p X86Opcode nodes.
	///
	/// \p Mode selects the pairing of the halves:
	///   - Mode set:   LO = X86Opcode V0_LO, V0_HI
	///                 HI = X86Opcode V1_LO, V1_HI
	///   - Mode clear: LO = X86Opcode V0_LO, V1_LO
	///                 HI = X86Opcode V0_HI, V1_HI
	///
	/// If \p isUndefLO (resp. \p isUndefHI) is set, the low (resp. high) 128 bits
	/// of the result are left UNDEF and no binop is emitted for that half. A binop
	/// is also skipped when its inputs are UNDEF.
	static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
	                                     const SDLoc &DL, SelectionDAG &DAG,
	                                     unsigned X86Opcode, bool Mode,
	                                     bool isUndefLO, bool isUndefHI) {
	  MVT VT = V0.getSimpleValueType();
	  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
	         "Invalid nodes in input!");

	  // Split both inputs into their 128-bit halves.
	  const unsigned HalfElts = VT.getVectorNumElements() / 2;
	  SDValue Lo0 = extract128BitVector(V0, 0, DAG, DL);
	  SDValue Hi0 = extract128BitVector(V0, HalfElts, DAG, DL);
	  SDValue Lo1 = extract128BitVector(V1, 0, DAG, DL);
	  SDValue Hi1 = extract128BitVector(V1, HalfElts, DAG, DL);
	  MVT HalfVT = Lo0.getSimpleValueType();

	  // Both result halves default to UNDEF.
	  SDValue ResLo = DAG.getUNDEF(HalfVT);
	  SDValue ResHi = DAG.getUNDEF(HalfVT);

	  // Choose the operands of each binop according to Mode, and work out
	  // whether each binop would only consume UNDEF inputs.
	  SDValue LoLHS = Lo0;
	  SDValue LoRHS = Mode ? Hi0 : Lo1;
	  SDValue HiLHS = Mode ? Lo1 : Hi0;
	  SDValue HiRHS = Hi1;
	  const bool LoInputsUndef =
	      Mode ? V0->isUndef() : (Lo0->isUndef() && Lo1->isUndef());
	  const bool HiInputsUndef =
	      Mode ? V1->isUndef() : (Hi0->isUndef() && Hi1->isUndef());

	  // Don't emit a horizontal binop if the result is expected to be UNDEF.
	  if (!isUndefLO && !LoInputsUndef)
	    ResLo = DAG.getNode(X86Opcode, DL, HalfVT, LoLHS, LoRHS);
	  if (!isUndefHI && !HiInputsUndef)
	    ResHi = DAG.getNode(X86Opcode, DL, HalfVT, HiLHS, HiRHS);

	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
	}

	/// Returns true iff \p BV builds a vector with the result equivalent to
	/// the result of ADDSUB operation.
	/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
	/// are written to the parameters \p Opnd0 and \p Opnd1.
	///
	/// \p NumExtracts is reset to zero once the type/feature check passes and is
	/// then incremented for every non-undef element of \p BV that matches the
	/// pattern. \p Opnd0 and \p Opnd1 are only written when true is returned.
	static bool isAddSub(const BuildVectorSDNode *BV,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	SDValue &Opnd0, SDValue &Opnd1,
	unsigned &NumExtracts) {

	MVT VT = BV->getSimpleValueType(0);
	// Only floating-point vector types paired with the matching subtarget
	// feature are considered: 128-bit with SSE3, 256-bit with AVX, 512-bit with
	// AVX512.
	if ((!Subtarget.hasSSE3() \|\| (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
	(!Subtarget.hasAVX() \|\| (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
	(!Subtarget.hasAVX512() \|\| (VT != MVT::v16f32 && VT != MVT::v8f64)))
	return false;

	unsigned NumElts = VT.getVectorNumElements();
	SDValue InVec0 = DAG.getUNDEF(VT);
	SDValue InVec1 = DAG.getUNDEF(VT);

	NumExtracts = 0;

	// Odd-numbered elements in the input build vector are obtained from
	// adding two floating-point elements (ISD::FADD).
	// Even-numbered elements in the input build vector are obtained from
	// subtracting two floating-point elements (ISD::FSUB).
	unsigned ExpectedOpcode = ISD::FSUB;
	unsigned NextExpectedOpcode = ISD::FADD;
	bool AddFound = false;
	bool SubFound = false;

	for (unsigned i = 0, e = NumElts; i != e; ++i) {
	SDValue Op = BV->getOperand(i);

	// Skip 'undef' values.
	unsigned Opcode = Op.getOpcode();
	if (Opcode == ISD::UNDEF) {
	// Keep the FSUB/FADD alternation in step with the element index.
	std::swap(ExpectedOpcode, NextExpectedOpcode);
	continue;
	}

	// Early exit if we found an unexpected opcode.
	if (Opcode != ExpectedOpcode)
	return false;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
	// Early exit if we cannot match that sequence.
	if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(Op0.getOperand(1)) \|\|
	!isa<ConstantSDNode>(Op1.getOperand(1)) \|\|
	Op0.getOperand(1) != Op1.getOperand(1))
	return false;

	// Both extracts must read lane i, the lane being built.
	unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
	if (I0 != i)
	return false;

	// We found a valid add/sub node. Update the information accordingly.
	if (i & 1)
	AddFound = true;
	else
	SubFound = true;

	// Update InVec0 and InVec1.
	// The first matched element fixes the two source vectors; both must have
	// the same type as the build_vector.
	if (InVec0.isUndef()) {
	InVec0 = Op0.getOperand(0);
	if (InVec0.getSimpleValueType() != VT)
	return false;
	}
	if (InVec1.isUndef()) {
	InVec1 = Op1.getOperand(0);
	if (InVec1.getSimpleValueType() != VT)
	return false;
	}

	// Make sure that operands in input to each add/sub node always
	// come from a same pair of vectors.
	if (InVec0 != Op0.getOperand(0)) {
	if (ExpectedOpcode == ISD::FSUB)
	return false;

	// FADD is commutable. Try to commute the operands
	// and then test again.
	std::swap(Op0, Op1);
	if (InVec0 != Op0.getOperand(0))
	return false;
	}

	if (InVec1 != Op1.getOperand(0))
	return false;

	// Update the pair of expected opcodes.
	std::swap(ExpectedOpcode, NextExpectedOpcode);

	// Increment the number of extractions done.
	++NumExtracts;
	}

	// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
	// At least one FADD lane and one FSUB lane must have been matched as well.
	if (!AddFound \|\| !SubFound \|\| InVec0.isUndef() \|\| InVec1.isUndef())
	return false;

	Opnd0 = InVec0;
	Opnd1 = InVec1;
	return true;
	}

	/// Returns true if is possible to fold MUL and an idiom that has already been
	/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
	/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
	/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
	///
	/// Prior to calling this function it should be known that there is some
	/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
	/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
	/// before replacement of such SDNode with ADDSUB operation. Thus the number
	/// of \p Opnd0 uses is expected to be equal to 2.
	/// For example, this function may be called for the following IR:
	/// %AB = fmul fast <2 x double> %A, %B
	/// %Sub = fsub fast <2 x double> %AB, %C
	/// %Add = fadd fast <2 x double> %AB, %C
	/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
	/// <2 x i32> <i32 0, i32 3>
	/// There is a def for %Addsub here, which potentially can be replaced by
	/// X86ISD::ADDSUB operation:
	/// %Addsub = X86ISD::ADDSUB %AB, %C
	/// and such ADDSUB can further be replaced with FMADDSUB:
	/// %Addsub = FMADDSUB %A, %B, %C.
	///
	/// The main reason why this method is called before the replacement of the
	/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
	/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
	/// FMADDSUB is.
	static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
	SelectionDAG &DAG,
	SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
	unsigned ExpectedUses) {
	if (Opnd0.getOpcode() != ISD::FMUL \|\|
	!Opnd0->hasNUsesOfValue(ExpectedUses, 0) \|\| !Subtarget.hasAnyFMA())
	return false;

	// FIXME: These checks must match the similar ones in
	// DAGCombiner::visitFADDForFMACombine. It would be good to have one
	// function that would answer if it is Ok to fuse MUL + ADD to FMADD
	// or MUL + ADDSUB to FMADDSUB.
	const TargetOptions &Options = DAG.getTarget().Options;
	bool AllowFusion =
	(Options.AllowFPOpFusion == FPOpFusion::Fast \|\| Options.UnsafeFPMath);
	if (!AllowFusion)
	return false;

	Opnd2 = Opnd1;
	Opnd1 = Opnd0.getOperand(1);
	Opnd0 = Opnd0.getOperand(0);

	return true;
	}

	/// Recognize a build_vector that computes an 'addsub' (optionally with a
	/// fusable multiply, i.e. 'fmaddsub') and replace it with an X86ISD::ADDSUB
	/// or X86ISD::FMADDSUB node. Returns an empty SDValue when nothing matches or
	/// when the matched form cannot be emitted.
	static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  unsigned ExtractCount;
	  SDValue Src0, Src1;
	  const bool IsAddSubIdiom =
	      isAddSub(BV, Subtarget, DAG, Src0, Src1, ExtractCount);
	  if (!IsAddSubIdiom)
	    return SDValue();

	  SDLoc Loc(BV);
	  MVT ResVT = BV->getSimpleValueType(0);

	  // Prefer the fused multiply form when it is available.
	  // TODO: According to coverage reports, the FMADDSUB transform is not
	  // triggered by any tests.
	  SDValue Src2;
	  const bool CanFuseMul =
	      isFMAddSubOrFMSubAdd(Subtarget, DAG, Src0, Src1, Src2, ExtractCount);
	  if (CanFuseMul)
	    return DAG.getNode(X86ISD::FMADDSUB, Loc, ResVT, Src0, Src1, Src2);

	  // There are no known X86 targets with 512-bit ADDSUB instructions, so the
	  // plain ADDSUB node is never built for 512-bit types. Recognizing the
	  // idiom at that width was only needed for the FMADDSUB case above.
	  return ResVT.is512BitVector()
	             ? SDValue()
	             : DAG.getNode(X86ISD::ADDSUB, Loc, ResVT, Src0, Src1);
	}

	/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
	static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = BV->getSimpleValueType(0);
	unsigned NumElts = VT.getVectorNumElements();
	unsigned NumUndefsLO = 0;
	unsigned NumUndefsHI = 0;
	unsigned Half = NumElts/2;

	// Count the number of UNDEF operands in the build_vector in input.
	for (unsigned i = 0, e = Half; i != e; ++i)
	if (BV->getOperand(i)->isUndef())
	NumUndefsLO++;

	for (unsigned i = Half, e = NumElts; i != e; ++i)
	if (BV->getOperand(i)->isUndef())
	NumUndefsHI++;

	// Early exit if this is either a build_vector of all UNDEFs or all the
	// operands but one are UNDEF.
	if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
	return SDValue();

	SDLoc DL(BV);
	SDValue InVec0, InVec1;
	if ((VT == MVT::v4f32 \|\| VT == MVT::v2f64) && Subtarget.hasSSE3()) {
	// Try to match an SSE3 float HADD/HSUB.
	if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
	} else if ((VT == MVT::v4i32 \|\| VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
	// Try to match an SSSE3 integer HADD/HSUB.
	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
	}

	if (!Subtarget.hasAVX())
	return SDValue();

	if ((VT == MVT::v8f32 \|\| VT == MVT::v4f64)) {
	// Try to match an AVX horizontal add/sub of packed single/double
	// precision floating point values from 256-bit vectors.
	SDValue InVec2, InVec3;
	if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
	} else if (VT == MVT::v8i32 \|\| VT == MVT::v16i16) {
	// Try to match an AVX2 horizontal add/sub of signed integers.
	SDValue InVec2, InVec3;
	unsigned X86Opcode;
	bool CanFold = true;

	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	X86Opcode = X86ISD::HADD;
	else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	X86Opcode = X86ISD::HSUB;
	else
	CanFold = false;

	if (CanFold) {
	// Fold this build_vector into a single horizontal add/sub.
	// Do this only if the target has AVX2.
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

	// Do not try to expand this build_vector into a pair of horizontal
	// add/sub if we can emit a pair of scalar add/sub.
	if (NumUndefsLO + 1 == Half \|\| NumUndefsHI + 1 == Half)
	return SDValue();

	// Convert this build_vector into a pair of horizontal binop followed by
	// a concat vector.
	bool isUndefLO = NumUndefsLO == Half;
	bool isUndefHI = NumUndefsHI == Half;
	return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
	isUndefLO, isUndefHI);
	}
	}

	if ((VT == MVT::v8f32 \|\| VT == MVT::v4f64 \|\| VT == MVT::v8i32 \|\|
	VT == MVT::v16i16) && Subtarget.hasAVX()) {
	unsigned X86Opcode;
	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::HADD;
	else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::HSUB;
	else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::FHADD;
	else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::FHSUB;
	else
	return SDValue();

	// Don't try to expand this build_vector into a pair of horizontal add/sub
	// if we can simply emit a pair of scalar add/sub.
	if (NumUndefsLO + 1 == Half \|\| NumUndefsHI + 1 == Half)
	return SDValue();

	// Convert this build_vector into two horizontal add/sub followed by
	// a concat vector.
	bool isUndefLO = NumUndefsLO == Half;
	bool isUndefHI = NumUndefsHI == Half;
	return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
	isUndefLO, isUndefHI);
	}

	return SDValue();
	}

	/// When every element of a BUILD_VECTOR is the same bitwise operation
	/// (AND, XOR or OR) with a constant right-hand operand, rebuild it as one
	/// vector bit operation applied to two BUILD_VECTORs: one holding the
	/// left-hand operands, the other holding the constants.
	/// NOTE: It's not in our interest to turn this into a general purpose
	/// vectorizer, but enough scalar bit operations are created by the later
	/// legalization + scalarization stages to need basic support.
	/// Returns an empty SDValue when the pattern does not apply.
	static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
	                                       SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  MVT VT = Op->getSimpleValueType(0);
	  const unsigned EltCount = VT.getVectorNumElements();

	  // Check that all elements have the same opcode.
	  // TODO: Should we allow UNDEFS and if so how many?
	  const unsigned BitOpc = Op->getOperand(0).getOpcode();
	  for (unsigned Pos = 1; Pos < EltCount; ++Pos) {
	    if (Op->getOperand(Pos).getOpcode() != BitOpc)
	      return SDValue();
	  }

	  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
	  const bool IsBitwise =
	      BitOpc == ISD::AND || BitOpc == ISD::XOR || BitOpc == ISD::OR;
	  if (!IsBitwise)
	    return SDValue();

	  // Don't do this if the buildvector is a splat - we'd replace one
	  // constant with an entire vector.
	  if (Op->getSplatValue())
	    return SDValue();

	  // The vector form of the operation has to be usable on this type.
	  if (!DAG.getTargetLoweringInfo().isOperationLegalOrPromote(BitOpc, VT))
	    return SDValue();

	  // Split every element into its left operand and its constant right
	  // operand; the canonical form keeps the constant on the right.
	  SmallVector<SDValue, 4> VarOps, ConstOps;
	  for (SDValue Elt : Op->ops()) {
	    SDValue Rhs = Elt.getOperand(1);
	    if (!isa<ConstantSDNode>(Rhs))
	      return SDValue();
	    VarOps.push_back(Elt.getOperand(0));
	    ConstOps.push_back(Rhs);
	  }

	  SDValue VarVec = DAG.getBuildVector(VT, DL, VarOps);
	  SDValue ConstVec = DAG.getBuildVector(VT, DL, ConstOps);
	  return DAG.getNode(BitOpc, DL, VT, VarVec, ConstVec);
	}

	/// Materialize a vector constant without loading it from memory. Only the
	/// patterns that SSE/AVX can produce cheaply are handled: all-zeros and
	/// all-ones build_vectors. Returns an empty SDValue for anything else.
	static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDNode *N = Op.getNode();

	  // All-zeros vectors can be matched by pxor and xorps.
	  if (ISD::isBuildVectorAllZeros(N)) {
	    switch (VT.SimpleTy) {
	    case MVT::v4i32:
	    case MVT::v8i32:
	    case MVT::v16i32:
	      // Already in the canonical i32-element form, which 1) ensures the
	      // zero vectors are CSE'd and 2) ensures that i64 scalars are
	      // eliminated on x86-32 hosts.
	      return Op;
	    default:
	      return getZeroVector(VT, Subtarget, DAG, DL);
	    }
	  }

	  // All-ones vectors can be matched by pcmpeqd on 128-bit width vectors or
	  // broken into v4i32 operations on 256-bit vectors. AVX2 can use vpcmpeqd
	  // on 256-bit vectors.
	  if (!Subtarget.hasSSE2() || !ISD::isBuildVectorAllOnes(N))
	    return SDValue();

	  const bool AlreadyCanonical =
	      VT == MVT::v4i32 || VT == MVT::v16i32 ||
	      (VT == MVT::v8i32 && Subtarget.hasInt256());
	  return AlreadyCanonical ? Op : getOnesVector(VT, DAG, DL);
	}

	// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
	// reasoned to be a permutation of a vector by indices in a non-constant vector.
	// (build_vector (extract_elt V, (extract_elt I, 0)),
	//               (extract_elt V, (extract_elt I, 1)),
	//               ...
	// ->
	// (vpermv I, V)
	//
	// v16i8 is lowered to X86ISD::PSHUFB, every other supported type to
	// X86ISD::VPERMV. An empty SDValue is returned when the type is not
	// supported by the subtarget, when the build_vector does not have the shape
	// above, or when the source/index vectors cannot be adapted to the result
	// type.
	//
	// TODO: Handle undefs
	// TODO: Utilize pshufb and zero mask blending to support more efficient
	// construction of vectors with constant-0 elements.
	// TODO: Use smaller-element vectors of same width, and "interpolate" the indices,
	// when no native operation available.
	// TODO: Support source or index vectors with a different element count than
	// the result instead of bailing out.
	static SDValue
	LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  // Look for VPERMV and PSHUFB opportunities.
	  MVT VT = V.getSimpleValueType();
	  switch (VT.SimpleTy) {
	  default:
	    return SDValue();
	  case MVT::v16i8:
	    // PSHUFB is an SSSE3 instruction; SSE3 alone is not sufficient.
	    if (!Subtarget.hasSSSE3())
	      return SDValue();
	    break;
	  case MVT::v8f32:
	  case MVT::v8i32:
	    if (!Subtarget.hasAVX2())
	      return SDValue();
	    break;
	  case MVT::v4i64:
	  case MVT::v4f64:
	    if (!Subtarget.hasVLX())
	      return SDValue();
	    break;
	  case MVT::v16f32:
	  case MVT::v8f64:
	  case MVT::v16i32:
	  case MVT::v8i64:
	    if (!Subtarget.hasAVX512())
	      return SDValue();
	    break;
	  case MVT::v32i16:
	    if (!Subtarget.hasBWI())
	      return SDValue();
	    break;
	  case MVT::v8i16:
	  case MVT::v16i16:
	    if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
	      return SDValue();
	    break;
	  case MVT::v64i8:
	    if (!Subtarget.hasVBMI())
	      return SDValue();
	    break;
	  case MVT::v32i8:
	    if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
	      return SDValue();
	    break;
	  }

	  SDValue SrcVec, IndicesVec;
	  // Check for a match of the permute source vector and permute index elements.
	  // This is done by checking that the i-th build_vector operand is of the form:
	  // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
	  for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
	    SDValue Op = V.getOperand(Idx);
	    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // If this is the first extract encountered in V, set the source vector,
	    // otherwise verify the extract is from the previously defined source
	    // vector.
	    if (!SrcVec)
	      SrcVec = Op.getOperand(0);
	    else if (SrcVec != Op.getOperand(0))
	      return SDValue();
	    SDValue ExtractedIndex = Op->getOperand(1);
	    // Peek through extends.
	    if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
	        ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
	      ExtractedIndex = ExtractedIndex.getOperand(0);
	    if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // If this is the first extract from the index vector candidate, set the
	    // indices vector, otherwise verify the extract is from the previously
	    // defined indices vector.
	    if (!IndicesVec)
	      IndicesVec = ExtractedIndex.getOperand(0);
	    else if (IndicesVec != ExtractedIndex.getOperand(0))
	      return SDValue();

	    auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
	    if (!PermIdx || PermIdx->getZExtValue() != Idx)
	      return SDValue();
	  }

	  // The source vector is only ever widened below (INSERT_SUBVECTOR into an
	  // UNDEF of type VT), so it must have the result's element type and must
	  // not be wider than the result. An extract may legally return a type
	  // wider than the source element, so this is not implied by the match.
	  EVT SrcVT = SrcVec.getValueType();
	  if (SrcVT.getVectorElementType() != EVT(VT.getVectorElementType()) ||
	      SrcVec.getValueSizeInBits() > VT.getSizeInBits())
	    return SDValue();

	  // The index vector is only zero-extended or truncated element-wise, which
	  // requires it to have exactly as many elements as the result.
	  if (IndicesVec.getValueType().getVectorNumElements() !=
	      VT.getVectorNumElements())
	    return SDValue();

	  // Indices are always integers: for a floating-point result use the integer
	  // vector type with the same element width and count.
	  MVT IndicesVT = VT;
	  if (VT.isFloatingPoint())
	    IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
	                                 VT.getVectorNumElements());
	  IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);

	  // Widen a narrower source to the result width; the extra lanes are UNDEF.
	  if (SrcVec.getValueSizeInBits() < IndicesVT.getSizeInBits()) {
	    SrcVec =
	        DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(SrcVec), VT, DAG.getUNDEF(VT),
	                    SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
	  }

	  // Note the operand order: PSHUFB takes (source, indices) while VPERMV
	  // takes (indices, source).
	  if (VT == MVT::v16i8)
	    return DAG.getNode(X86ISD::PSHUFB, SDLoc(V), VT, SrcVec, IndicesVec);
	  return DAG.getNode(X86ISD::VPERMV, SDLoc(V), VT, IndicesVec, SrcVec);
	}

	SDValue
	X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
	SDLoc dl(Op);

	MVT VT = Op.getSimpleValueType();
	MVT ExtVT = VT.getVectorElementType();
	unsigned NumElems = Op.getNumOperands();

	// Generate vectors for predicate vectors.
	if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
	return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);

	if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
	return VectorConstant;

	BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
	// TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
	// transform here.
	if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
	return AddSub;
	if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
	return HorizontalOp;
	if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
	return Broadcast;
	if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
	return BitOp;

	unsigned EVTBits = ExtVT.getSizeInBits();

	unsigned NumZero = 0;
	unsigned NumNonZero = 0;
	uint64_t NonZeros = 0;
	bool IsAllConstants = true;
	SmallSet<SDValue, 8> Values;
	unsigned NumConstants = NumElems;
	for (unsigned i = 0; i < NumElems; ++i) {
	SDValue Elt = Op.getOperand(i);
	if (Elt.isUndef())
	continue;
	Values.insert(Elt);
	if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
	IsAllConstants = false;
	NumConstants--;
	}
	if (X86::isZeroNode(Elt))
	NumZero++;
	else {
	assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
	NonZeros \|= ((uint64_t)1 << i);
	NumNonZero++;
	}
	}

	// All undef vector. Return an UNDEF. All zero vectors were handled above.
	if (NumNonZero == 0)
	return DAG.getUNDEF(VT);

	// If we are inserting one variable into a vector of non-zero constants, try
	// to avoid loading each constant element as a scalar. Load the constants as a
	// vector and then insert the variable scalar element. If insertion is not
	// supported, we assume that we will fall back to a shuffle to get the scalar
	// blended with the constants. Insertion into a zero vector is handled as a
	// special-case somewhere below here.
	LLVMContext &Context = *DAG.getContext();
	if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
	(isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) \|\|
	isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
	// Create an all-constant vector. The variable element in the old
	// build vector is replaced by undef in the constant vector. Save the
	// variable scalar element and its index for use in the insertelement.
	Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
	SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
	SDValue VarElt;
	SDValue InsIndex;
	for (unsigned i = 0; i != NumElems; ++i) {
	SDValue Elt = Op.getOperand(i);
	if (auto *C = dyn_cast<ConstantSDNode>(Elt))
	ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
	else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
	ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
	else if (!Elt.isUndef()) {
	assert(!VarElt.getNode() && !InsIndex.getNode() &&
	"Expected one variable element in this vector");
	VarElt = Elt;
	InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
	}
	}
	Constant *CV = ConstantVector::get(ConstVecOps);
	SDValue DAGConstVec = DAG.getConstantPool(CV, VT);

	// The constants we just created may not be legal (eg, floating point). We
	// must lower the vector right here because we can not guarantee that we'll
	// legalize it before loading it. This is also why we could not just create
	// a new build vector here. If the build vector contains illegal constants,
	// it could get split back up into a series of insert elements.
	// TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
	SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
	MachineFunction &MF = DAG.getMachineFunction();
	MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
	SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
	return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
	}

	// Special case for single non-zero, non-undef, element.
	if (NumNonZero == 1) {
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue Item = Op.getOperand(Idx);

	// If this is an insertion of an i64 value on x86-32, and if the top bits of
	// the value are obviously zero, truncate the value to i32 and do the
	// insertion that way. Only do this if the value is non-constant or if the
	// value is a constant being inserted into element 0. It is cheaper to do
	// a constant pool load than it is to do a movd + shuffle.
	if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
	(!IsAllConstants \|\| Idx == 0)) {
	if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
	// Handle SSE only.
	assert(VT == MVT::v2i64 && "Expected an SSE value type!");
	MVT VecVT = MVT::v4i32;

	// Truncate the value (which may itself be a constant) to i32, and
	// convert it to a vector with movd (S2V+shuffle to zero extend).
	Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
	return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
	Item, Idx * 2, true, Subtarget, DAG));
	}
	}

	// If we have a constant or non-constant insertion into the low element of
	// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
	// the rest of the elements. This will be matched as movd/movq/movss/movsd
	// depending on what the source datatype is.
	if (Idx == 0) {
	if (NumZero == 0)
	return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

	if (ExtVT == MVT::i32 \|\| ExtVT == MVT::f32 \|\| ExtVT == MVT::f64 \|\|
	(ExtVT == MVT::i64 && Subtarget.is64Bit())) {
	assert((VT.is128BitVector() \|\| VT.is256BitVector() \|\|
	VT.is512BitVector()) &&
	"Expected an SSE value type!");
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
	return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	}

	// We can't directly insert an i8 or i16 into a vector, so zero extend
	// it to i32 first.
	if (ExtVT == MVT::i16 \|\| ExtVT == MVT::i8) {
	Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
	if (VT.getSizeInBits() >= 256) {
	MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
	if (Subtarget.hasAVX()) {
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
	Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	} else {
	// Without AVX, we need to extend to a 128-bit vector and then
	// insert into the 256-bit vector.
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
	SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
	Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
	}
	} else {
	assert(VT.is128BitVector() && "Expected an SSE value type!");
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
	Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	}
	return DAG.getBitcast(VT, Item);
	}
	}

	// Is it a vector logical left shift?
	if (NumElems == 2 && Idx == 1 &&
	X86::isZeroNode(Op.getOperand(0)) &&
	!X86::isZeroNode(Op.getOperand(1))) {
	unsigned NumBits = VT.getSizeInBits();
	return getVShift(true, VT,
	DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
	VT, Op.getOperand(1)),
	NumBits/2, DAG, *this, dl);
	}

	if (IsAllConstants) // Otherwise, it's better to do a constpool load.
	return SDValue();

	// Otherwise, if this is a vector with i32 or f32 elements, and the element
	// is a non-constant being inserted into an element other than the low one,
	// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
	// movd/movss) to move this into the low element, then shuffle it into
	// place.
	if (EVTBits == 32) {
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
	}
	}

	// Splat is obviously ok. Let legalizer expand it to a shuffle.
	if (Values.size() == 1) {
	if (EVTBits == 32) {
	// Instead of a shuffle like this:
	// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
	// Check if it's possible to issue this instead.
	// shuffle (vload ptr)), undef, <1, 1, 1, 1>
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue Item = Op.getOperand(Idx);
	if (Op.getNode()->isOnlyUserOf(Item.getNode()))
	return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
	}
	return SDValue();
	}

	// A vector full of immediates; various special cases are already
	// handled, so this is best done with a single constant-pool load.
	if (IsAllConstants)
	return SDValue();

	if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
	return V;

	// See if we can use a vector load to get all of the elements.
	if (VT.is128BitVector() \|\| VT.is256BitVector() \|\| VT.is512BitVector()) {
	SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
	if (SDValue LD =
	EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
	return LD;
	}

	// For AVX-length vectors, build the individual 128-bit pieces and use
	// shuffles to put them in place.
	if (VT.is256BitVector() \|\| VT.is512BitVector()) {
	EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2);

	// Build both the lower and upper subvector.
	SDValue Lower =
	DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
	SDValue Upper = DAG.getBuildVector(
	HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));

	// Recreate the wider vector with the lower and upper part.
	if (VT.is256BitVector())
	return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
	return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
	}

	// Let legalizer expand 2-wide build_vectors.
	if (EVTBits == 64) {
	if (NumNonZero == 1) {
	// One half is zero or undef.
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
	Op.getOperand(Idx));
	return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
	}
	return SDValue();
	}

	// If element VT is < 32 bits, convert it to inserts into a zero vector.
	if (EVTBits == 8 && NumElems == 16)
	if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
	DAG, Subtarget))
	return V;

	if (EVTBits == 16 && NumElems == 8)
	if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
	DAG, Subtarget))
	return V;

	// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
	if (EVTBits == 32 && NumElems == 4)
	if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
	return V;

	// If element VT is == 32 bits, turn it into a number of shuffles.
	if (NumElems == 4 && NumZero > 0) {
	SmallVector<SDValue, 8> Ops(NumElems);
	for (unsigned i = 0; i < 4; ++i) {
	bool isZero = !(NonZeros & (1ULL << i));
	if (isZero)
	Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
	else
	Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	}

	for (unsigned i = 0; i < 2; ++i) {
	switch ((NonZeros >> (i*2)) & 0x3) {
	default: llvm_unreachable("Unexpected NonZero count");
	case 0:
	Ops[i] = Ops[i*2]; // Must be a zero vector.
	break;
	case 1:
	Ops[i] = getMOVL(DAG, dl, VT, Ops[i2+1], Ops[i2]);
	break;
	case 2:
	Ops[i] = getMOVL(DAG, dl, VT, Ops[i2], Ops[i2+1]);
	break;
	case 3:
	Ops[i] = getUnpackl(DAG, dl, VT, Ops[i2], Ops[i2+1]);
	break;
	}
	}

	bool Reverse1 = (NonZeros & 0x3) == 2;
	bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
	int MaskVec[] = {
	Reverse1 ? 1 : 0,
	Reverse1 ? 0 : 1,
	static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
	static_cast<int>(Reverse2 ? NumElems : NumElems+1)
	};
	return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
	}

	assert(Values.size() > 1 && "Expected non-undef and non-splat vector");

	// Check for a build vector from mostly shuffle plus few inserting.
	if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
	return Sh;

	// For SSE 4.1, use insertps to put the high elements into the low element.
	if (Subtarget.hasSSE41()) {
	SDValue Result;
	if (!Op.getOperand(0).isUndef())
	Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
	else
	Result = DAG.getUNDEF(VT);

	for (unsigned i = 1; i < NumElems; ++i) {
	if (Op.getOperand(i).isUndef()) continue;
	Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
	Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
	}
	return Result;
	}

	// Otherwise, expand into a number of unpckl*, start by extending each of
	// our (non-undef) elements to the full vector width with the element in the
	// bottom slot of the vector (which generates no code for SSE).
	SmallVector<SDValue, 8> Ops(NumElems);
	for (unsigned i = 0; i < NumElems; ++i) {
	if (!Op.getOperand(i).isUndef())
	Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	else
	Ops[i] = DAG.getUNDEF(VT);
	}

	// Next, we iteratively mix elements, e.g. for v4f32:
	// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
	// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
	// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
	for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
	// Generate scaled UNPCKL shuffle mask.
	SmallVector<int, 16> Mask;
	for(unsigned i = 0; i != Scale; ++i)
	Mask.push_back(i);
	for (unsigned i = 0; i != Scale; ++i)
	Mask.push_back(NumElems+i);
	Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

	for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
	Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2i], Ops[(2i)+1], Mask);
	}
	return Ops[0];
	}

	// Lower a 256-bit or 512-bit CONCAT_VECTORS node with the subvector
	// concatenation helpers (e.g. vinsertf128 for two 128-bit halves).
	//
	// A 256-bit result is formed from operands 0 and 1. A 512-bit result is
	// formed either from operands 0 and 1 directly, or, when the node has four
	// operands, from two halves that are each built out of a pair of operands.
	static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT ResVT = Op.getSimpleValueType();

	  assert((ResVT.is256BitVector() \|\|
	  ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  const unsigned NumElems = ResVT.getVectorNumElements();

	  // 256-bit result: a single concatenation of the two operands.
	  if (ResVT.is256BitVector())
	    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

	  // 512-bit result made of exactly two operands.
	  if (Op.getNumOperands() != 4)
	    return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

	  // 512-bit result made of four operands: pair them up into two halves
	  // first, then join the halves.
	  MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
	                                ResVT.getVectorNumElements() / 2);
	  SDValue V3 = Op.getOperand(2);
	  SDValue V4 = Op.getOperand(3);
	  return concat256BitVectors(
	      concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
	      concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
	      NumElems, DAG, dl);
	}

	// Return true if every operand of the given CONCAT_VECTORS node other than
	// the first is an all-zeros build vector, i.e. the node has the shape
	// (CONCAT_VECTORS Op, 0, 0,...,0). Operand 0 itself is not inspected.
	static bool isExpandWithZeros(const SDValue &Op) {
	  assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
	         "Expand with zeros only possible in CONCAT_VECTORS nodes!");

	  for (unsigned OpIdx = 1; OpIdx < Op.getNumOperands(); ++OpIdx) {
	    const bool IsZeroVec =
	        ISD::isBuildVectorAllZeros(Op.getOperand(OpIdx).getNode());
	    if (!IsZeroVec)
	      return false;
	  }

	  return true;
	}

	// Checks whether the given CONCAT_VECTORS node (with i1 elements) is a type
	// promotion (by concatenating i1 zeros) of the result of a node that already
	// zeros all upper bits of k-register.
	//
	// On success the function returns the underlying node that was promoted
	// (a non-empty SDValue, which converts to true in a condition); otherwise it
	// returns an empty SDValue().
	static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
	unsigned Opc = Op.getOpcode();

	assert(Opc == ISD::CONCAT_VECTORS &&
	Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	"Unexpected node to check for type promotion!");

	// As long as we are concatenating zeros to the upper part of a previous node
	// result, climb up the tree until a node with different opcode is
	// encountered
	while (Opc == ISD::INSERT_SUBVECTOR \|\| Opc == ISD::CONCAT_VECTORS) {
	if (Opc == ISD::INSERT_SUBVECTOR) {
	// Only an insertion at index 0 into an all-zeros vector counts as a
	// promotion; step through to the inserted subvector.
	if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
	Op.getConstantOperandVal(2) == 0)
	Op = Op.getOperand(1);
	else
	return SDValue();
	} else { // Opc == ISD::CONCAT_VECTORS
	// Only (CONCAT_VECTORS X, 0, ..., 0) counts; step through to X.
	if (isExpandWithZeros(Op))
	Op = Op.getOperand(0);
	else
	return SDValue();
	}
	Opc = Op.getOpcode();
	}

	// Check if the first inserted node zeroes the upper bits, or an 'and' result
	// of a node that zeros the upper bits (its masked version).
	if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) \|\|
	(Op.getOpcode() == ISD::AND &&
	(isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) \|\|
	isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
	return Op;
	}

	// The innermost node does not qualify.
	return SDValue();
	}

	// Lower a CONCAT_VECTORS node whose result has i1 elements (it is reached
	// from LowerCONCAT_VECTORS when the element type is MVT::i1).
	//
	// The strategies are tried in this order:
	//  1. A zero-extension of a node that already zeroes its upper bits becomes
	//     an INSERT_SUBVECTOR into an all-zeros vector.
	//  2. With at most one non-zero, non-undef operand, that operand (if any) is
	//     inserted into a zero or undef vector.
	//  3. With more than two operands, the node is split into two half-width
	//     CONCAT_VECTORS nodes that are concatenated again.
	//  4. With exactly two non-zero operands, the node is either returned
	//     unchanged (16 or more result elements) or expressed as two
	//     INSERT_SUBVECTOR nodes.
	static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
	const X86Subtarget &Subtarget,
	SelectionDAG & DAG) {
	SDLoc dl(Op);
	MVT ResVT = Op.getSimpleValueType();
	unsigned NumOperands = Op.getNumOperands();

	assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
	"Unexpected number of operands in CONCAT_VECTORS");

	// If this node promotes - by concatenating zeroes - the type of the result
	// of a node with instruction that zeroes all upper (irrelevant) bits of the
	// output register, mark it as legal and catch the pattern in instruction
	// selection to avoid emitting extra instructions (for zeroing upper bits).
	if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
	SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
	SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
	ZeroC);
	}

	// Classify the operands: undef operands are ignored, all-zeros operands are
	// counted in NumZero, and every other operand sets its bit in NonZeros.
	unsigned NumZero = 0;
	unsigned NumNonZero = 0;
	uint64_t NonZeros = 0;
	for (unsigned i = 0; i != NumOperands; ++i) {
	SDValue SubVec = Op.getOperand(i);
	if (SubVec.isUndef())
	continue;
	if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	++NumZero;
	else {
	assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
	NonZeros \|= (uint64_t)1 << i;
	++NumNonZero;
	}
	}


	// If there are zero or one non-zeros we can handle this very simply.
	if (NumNonZero <= 1) {
	// Use a zero base vector if any operand was zero, otherwise undef suffices.
	SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
	: DAG.getUNDEF(ResVT);
	if (!NumNonZero)
	return Vec;
	// Insert the single non-zero operand at its element offset in the result.
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue SubVec = Op.getOperand(Idx);
	unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
	DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
	}

	// More than two operands: split into a low and a high CONCAT_VECTORS of half
	// the width and concatenate those two results.
	if (NumOperands > 2) {
	MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
	ResVT.getVectorNumElements()/2);
	ArrayRef<SDUse> Ops = Op->ops();
	SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	Ops.slice(0, NumOperands/2));
	SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
	Ops.slice(NumOperands/2));
	return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
	}

	// Exactly two operands remain here and both are non-zero, non-undef.
	assert(NumNonZero == 2 && "Simple cases not handled?");

	if (ResVT.getVectorNumElements() >= 16)
	return Op; // The operation is legal with KUNPCK

	// Otherwise insert operand 0 into the low half of an undef vector and
	// operand 1 into the high half.
	SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
	DAG.getUNDEF(ResVT), Op.getOperand(0),
	DAG.getIntPtrConstant(0, dl));
	unsigned NumElems = ResVT.getVectorNumElements();
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
	DAG.getIntPtrConstant(NumElems/2, dl));
	}

	// Entry point for lowering CONCAT_VECTORS. Results with i1 elements are
	// handed to LowerCONCAT_VECTORSvXi1; everything else must be a 256-bit
	// vector with two operands or a 512-bit vector with two or four operands
	// and is handed to LowerAVXCONCAT_VECTORS.
	static SDValue LowerCONCAT_VECTORS(SDValue Op,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  const bool IsMaskVector = VT.getVectorElementType() == MVT::i1;
	  if (IsMaskVector)
	    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

	  assert((VT.is256BitVector() && Op.getNumOperands() == 2) \|\|
	  (VT.is512BitVector() && (Op.getNumOperands() == 2 \|\|
	  Op.getNumOperands() == 4)));

	  // AVX can build a 256-bit vector out of two 128-bit ones (vinsertf128).
	  // A 512-bit vector may be made of two 256-bit or four 128-bit vectors.
	  return LowerAVXCONCAT_VECTORS(Op, DAG);
	}

	//===----------------------------------------------------------------------===//
	// Vector shuffle lowering
	//
	// This is an experimental code path for lowering vector shuffles on x86. It is
	// designed to handle arbitrary vector shuffles and blends, gracefully
	// degrading performance as necessary. It works hard to recognize idiomatic
	// shuffles and lower them to optimal instruction patterns without leaving
	// a framework that allows reasonably efficient handling of all vector shuffle
	// patterns.
	//===----------------------------------------------------------------------===//

	/// \brief Returns true if a single-input shuffle mask moves nothing.
	///
	/// The mask is assumed to be of the single-input kind used by the X86
	/// shuffle instructions rather than a fully general ShuffleVectorSDNode
	/// mask. An element is a no-op when it is undef (negative) or when it
	/// selects its own position; the mask is a no-op when every element is.
	static bool isNoopShuffleMask(ArrayRef<int> Mask) {
	  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
	    assert(Mask[i] >= -1 && "Out of bound mask element!");
	    const int M = Mask[i];
	    // Undef elements impose no constraint.
	    if (M < 0)
	      continue;
	    // A defined element must stay in place.
	    if (M != i)
	      return false;
	  }
	  return true;
	}

	/// \brief Returns true if any defined mask element reads from a different
	/// 128-bit lane than the one it writes to.
	///
	/// X86 splits its shuffles into in-lane and cross-lane operations, so this
	/// distinction is tested routinely. Indices that refer to the second input
	/// are reduced modulo the mask size before their lane is computed.
	static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
	  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
	  const int NumElts = Mask.size();
	  for (int Dst = 0; Dst < NumElts; ++Dst) {
	    const int M = Mask[Dst];
	    if (M < 0)
	      continue; // Undef never crosses a lane.
	    const int SrcLane = (M % NumElts) / EltsPerLane;
	    const int DstLane = Dst / EltsPerLane;
	    if (SrcLane != DstLane)
	      return true;
	  }
	  return false;
	}

	/// \brief Returns true if the shuffle performs the same lane-relative
	/// permutation in every sub-lane of \p LaneSizeInBits bits.
	///
	/// A repeated mask never crosses lanes, although it may blend in elements
	/// from the same lane of the second input.
	///
	/// \p RepeatedMask receives the per-lane pattern (undef slots stay -1).
	/// Elements that come from the second input are remapped to the range
	/// [LaneSize, 2*LaneSize), so the result can be used directly with the
	/// existing 128-bit shuffle helpers. When the function returns false the
	/// contents of \p RepeatedMask are only partially filled in.
	static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                  ArrayRef<int> Mask,
	                                  SmallVectorImpl<int> &RepeatedMask) {
	  auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(LaneSize, -1);
	  int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    assert(Mask[i] == SM_SentinelUndef \|\| Mask[i] >= 0);
	    const int M = Mask[i];
	    if (M < 0)
	      continue;

	    // An element taken from another lane cannot be part of a repeated
	    // in-lane pattern.
	    if ((M % Size) / LaneSize != i / LaneSize)
	      return false;

	    // Lane-relative index; second-input elements are offset by LaneSize
	    // rather than by Size.
	    int LocalM = M % LaneSize;
	    if (M >= Size)
	      LocalM += LaneSize;

	    int &Slot = RepeatedMask[i % LaneSize];
	    if (Slot < 0) {
	      // First defined entry seen for this slot of the lane.
	      Slot = LocalM;
	    } else if (Slot != LocalM) {
	      // Another lane already fixed a different value for this slot.
	      return false;
	    }
	  }
	  return true;
	}

	/// Convenience wrapper around isRepeatedShuffleMask for 128-bit lanes:
	/// returns true if \p Mask repeats the same pattern in every 128-bit lane
	/// and stores that pattern in \p RepeatedMask.
	static bool
	is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneSizeInBits = 128;
	  return isRepeatedShuffleMask(LaneSizeInBits, VT, Mask, RepeatedMask);
	}

	/// Convenience wrapper around isRepeatedShuffleMask for 256-bit lanes:
	/// returns true if \p Mask repeats the same pattern in every 256-bit lane
	/// and stores that pattern in \p RepeatedMask.
	static bool
	is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneSizeInBits = 256;
	  return isRepeatedShuffleMask(LaneSizeInBits, VT, Mask, RepeatedMask);
	}

	/// Target-shuffle counterpart of isRepeatedShuffleMask: returns true if the
	/// mask performs the same lane-relative shuffle in every sub-lane of
	/// \p LaneSizeInBits bits and stores the per-lane pattern in
	/// \p RepeatedMask.
	///
	/// In addition to undef, SM_SentinelZero must be respected: a slot may be
	/// zero only if it is zero (or undef) in every lane.
	static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                        ArrayRef<int> Mask,
	                                        SmallVectorImpl<int> &RepeatedMask) {
	  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
	  int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    assert(isUndefOrZero(Mask[i]) \|\| (Mask[i] >= 0));
	    const int M = Mask[i];
	    if (M == SM_SentinelUndef)
	      continue;

	    int &Slot = RepeatedMask[i % LaneSize];
	    if (M == SM_SentinelZero) {
	      // A zero is only compatible with a slot that is still undef or is
	      // already zero.
	      if (!isUndefOrZero(Slot))
	        return false;
	      Slot = SM_SentinelZero;
	      continue;
	    }

	    // An element taken from another lane cannot be modelled.
	    if ((M % Size) / LaneSize != i / LaneSize)
	      return false;

	    // Lane-relative index; second-input elements start at LaneSize
	    // instead of Size.
	    int LocalM = M % LaneSize;
	    if (M >= Size)
	      LocalM += LaneSize;

	    if (Slot == SM_SentinelUndef) {
	      // First defined entry seen for this slot of the lane.
	      Slot = LocalM;
	    } else if (Slot != LocalM) {
	      // Mismatch with what an earlier lane established.
	      return false;
	    }
	  }
	  return true;
	}

	/// \brief Checks whether a shuffle mask is equivalent to an expected mask.
	///
	/// This is a fast way to test a shuffle mask against a fixed pattern:
	///
	///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
	///
	/// Returns true if both masks have the same width and every element of
	/// \p Mask is undef (-1), equals the corresponding element of
	/// \p ExpectedMask, or - when the inputs involved are BUILD_VECTOR nodes -
	/// selects the same build-vector operand as the expected element does.
	static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
	                                ArrayRef<int> ExpectedMask) {
	  if (Mask.size() != ExpectedMask.size())
	    return false;

	  int Size = Mask.size();

	  // Build vectors can be looked through: two different indices are still
	  // equivalent if they name the same scalar operand.
	  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
	  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

	  for (int i = 0; i < Size; ++i) {
	    assert(Mask[i] >= -1 && "Out of bound mask element!");
	    const int M = Mask[i];
	    const int Expected = ExpectedMask[i];
	    if (M < 0)
	      continue; // Undef matches anything.
	    if (M == Expected)
	      continue;

	    // The indices differ; fall back to comparing the scalars they select.
	    auto *MaskBV = M < Size ? BV1 : BV2;
	    auto *ExpectedBV = Expected < Size ? BV1 : BV2;
	    if (!MaskBV)
	      return false;
	    if (!ExpectedBV)
	      return false;
	    if (MaskBV->getOperand(M % Size) !=
	        ExpectedBV->getOperand(Expected % Size))
	      return false;
	  }

	  return true;
	}

	/// Checks whether a target shuffle mask matches an explicit pattern.
	///
	/// Both masks must have exactly the same width. An SM_SentinelUndef (-1)
	/// element of \p Mask accepts any value in \p ExpectedMask. Every other
	/// element must be equal in both masks; SM_SentinelZero is the only other
	/// negative value tolerated in \p Mask, and it must then also appear at the
	/// same position in \p ExpectedMask.
	static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
	                                      ArrayRef<int> ExpectedMask) {
	  int Size = Mask.size();
	  if (Size != (int)ExpectedMask.size())
	    return false;

	  for (int i = 0; i < Size; ++i) {
	    const int M = Mask[i];
	    if (M == SM_SentinelUndef)
	      continue;
	    // Reject negative values that are not a known sentinel.
	    const bool IsUnknownSentinel = M < 0 && M != SM_SentinelZero;
	    if (IsUnknownSentinel)
	      return false;
	    if (M != ExpectedMask[i])
	      return false;
	  }

	  return true;
	}

	// Combines a general DAG shuffle mask with a zeroable bit mask into a target
	// shuffle mask: undef elements stay SM_SentinelUndef, defined elements whose
	// Zeroable bit is set become SM_SentinelZero, and all remaining elements
	// keep their original index.
	static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
	                                                    const APInt &Zeroable) {
	  int NumElts = Mask.size();
	  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

	  // Start from an all-undef mask and fill in only the defined elements.
	  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
	  for (int i = 0; i != NumElts; ++i) {
	    int M = Mask[i];
	    if (M != SM_SentinelUndef) {
	      assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
	      if (Zeroable[i])
	        TargetMask[i] = SM_SentinelZero;
	      else
	        TargetMask[i] = M;
	    }
	  }
	  return TargetMask;
	}

	// Returns true if the shuffle mask of a v8i32/v8f32 shuffle matches the
	// binary v8i16 unpack-low or unpack-high pattern, i.e. it is suitable for
	// the AVX vpunpcklwd or vpunpckhwd instructions. Any other value type is
	// rejected.
	static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
	  if (VT != MVT::v8i32 && VT != MVT::v8f32)
	    return false;

	  SmallVector<int, 8> LoPattern;
	  createUnpackShuffleMask(MVT::v8i16, LoPattern, /* Lo = */ true,
	                          /* Unary = */ false);
	  SmallVector<int, 8> HiPattern;
	  createUnpackShuffleMask(MVT::v8i16, HiPattern, /* Lo = */ false,
	                          /* Unary = */ false);

	  // Try the low pattern first; only consult the high one if that fails.
	  if (isTargetShuffleEquivalent(Mask, LoPattern))
	    return true;
	  return isTargetShuffleEquivalent(Mask, HiPattern);
	}

	/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
	///
	/// This helper function produces an 8-bit shuffle immediate corresponding to
	/// the ubiquitous shuffle encoding scheme used in x86 instructions for
	/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
	/// example.
	///
	/// Each lane contributes a 2-bit field: lane 0 occupies bits [1:0], lane 1
	/// bits [3:2], lane 2 bits [5:4] and lane 3 bits [7:6].
	///
	/// NB: We rely heavily on "undef" masks preserving the input lane.
	static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
	assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
	assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
	assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
	assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
	assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

	// An undef (negative) element is encoded as the lane's own index, so that
	// lane is passed through unchanged.
	unsigned Imm = 0;
	Imm \|= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
	Imm \|= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
	Imm \|= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
	Imm \|= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
	return Imm;
	}

	/// Wraps the 4-lane shuffle immediate computed by getV4X86ShuffleImm for
	/// \p Mask in an i8 constant node.
	static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
	                                          SelectionDAG &DAG) {
	  const unsigned Imm = getV4X86ShuffleImm(Mask);
	  return DAG.getConstant(Imm, DL, MVT::i8);
	}

	/// \brief Compute whether each element of a shuffle is zeroable.
	///
	/// A "zeroable" vector shuffle element is one which can be lowered to zero.
	/// Either it is an undef element in the shuffle mask, the element of the input
	/// referenced is undef, or the element of the input referenced is known to be
	/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
	/// as many lanes with this technique as possible to simplify the remaining
	/// shuffle.
	///
	/// The result has one bit per mask element; bit i is set when element i of
	/// the shuffle is zeroable. Both inputs are inspected through bitcasts.
	static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
	SDValue V1, SDValue V2) {
	APInt Zeroable(Mask.size(), 0);
	V1 = peekThroughBitcasts(V1);
	V2 = peekThroughBitcasts(V2);

	bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
	bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

	// Size in bits of one shuffle element, derived from the (bitcast-stripped)
	// first input and the number of mask elements.
	int VectorSizeInBits = V1.getValueSizeInBits();
	int ScalarSizeInBits = VectorSizeInBits / Mask.size();
	assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

	for (int i = 0, Size = Mask.size(); i < Size; ++i) {
	int M = Mask[i];
	// Handle the easy cases.
	// (Undef mask element, or the referenced input is entirely zero.)
	if (M < 0 \|\| (M >= 0 && M < Size && V1IsZero) \|\| (M >= Size && V2IsZero)) {
	Zeroable.setBit(i);
	continue;
	}

	// Determine shuffle input and normalize the mask.
	SDValue V = M < Size ? V1 : V2;
	M %= Size;

	// Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
	if (V.getOpcode() != ISD::BUILD_VECTOR)
	continue;

	// If the BUILD_VECTOR has fewer elements than the bitcasted portion of
	// the (larger) source element must be UNDEF/ZERO.
	if ((Size % V.getNumOperands()) == 0) {
	// Scale is the number of shuffle elements covered by one build-vector
	// operand.
	int Scale = Size / V->getNumOperands();
	SDValue Op = V.getOperand(M / Scale);
	if (Op.isUndef() \|\| X86::isZeroNode(Op))
	Zeroable.setBit(i);
	else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
	// Extract the ScalarSizeInBits-wide slice of the integer constant that
	// this shuffle element refers to and test it for zero.
	APInt Val = Cst->getAPIntValue();
	Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
	Val = Val.getLoBits(ScalarSizeInBits);
	if (Val == 0)
	Zeroable.setBit(i);
	} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
	// Same slice test, applied to the bit pattern of the FP constant.
	APInt Val = Cst->getValueAPF().bitcastToAPInt();
	Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
	Val = Val.getLoBits(ScalarSizeInBits);
	if (Val == 0)
	Zeroable.setBit(i);
	}
	continue;
	}

	// If the BUILD_VECTOR has more elements than all the (smaller) source
	// elements must be UNDEF or ZERO.
	if ((V.getNumOperands() % Size) == 0) {
	// Scale is the number of build-vector operands that make up one shuffle
	// element.
	int Scale = V->getNumOperands() / Size;
	bool AllZeroable = true;
	for (int j = 0; j < Scale; ++j) {
	SDValue Op = V.getOperand((M * Scale) + j);
	AllZeroable &= (Op.isUndef() \|\| X86::isZeroNode(Op));
	}
	if (AllZeroable)
	Zeroable.setBit(i);
	continue;
	}
	}

	return Zeroable;
	}

	// Returns true if the non-zeroable elements of the shuffle reference
	// consecutive, increasing source indices, with zeroable elements allowed
	// anywhere in between.
	//
	// Each bit of Zeroable corresponds to the mask element at the same position,
	// as described for computeZeroableShuffleElements. The sequence of
	// non-zeroable elements must start either at index 0 or at index
	// VectorType.getVectorNumElements(); IsZeroSideLeft is set to true in the
	// latter case and to false in the former. IsZeroSideLeft is left untouched
	// if no non-zeroable element is encountered. Any undef mask element makes
	// the function return false.
	static bool isNonZeroElementsInOrder(const APInt &Zeroable,
	                                     ArrayRef<int> Mask, const EVT &VectorType,
	                                     bool &IsZeroSideLeft) {
	  // Source index the next non-zeroable element has to reference; negative
	  // until the first such element has been seen.
	  int Expected = -1;
	  for (int i = 0, e = Mask.size(); i < e; i++) {
	    assert(Mask[i] >= -1 && "Out of bound mask element!");
	    const int M = Mask[i];
	    if (M < 0)
	      return false;
	    if (Zeroable[i])
	      continue;

	    // The first non-zeroable element decides where the sequence starts.
	    if (Expected < 0) {
	      Expected = 0;
	      if (M != 0)
	        Expected = VectorType.getVectorNumElements();
	      IsZeroSideLeft = (Expected != 0);
	    }

	    // Every non-zeroable element must continue the increasing sequence.
	    if (M != Expected)
	      return false;
	    ++Expected;
	  }
	  return true;
	}

	/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
	///
	/// A byte-level control mask is built from \p Mask and \p Zeroable. The
	/// function returns an empty SDValue if the non-zeroable elements draw from
	/// both inputs or if any of them would have to cross a 128-bit lane;
	/// otherwise it returns the PSHUFB result bitcast back to \p VT.
	static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
	ArrayRef<int> Mask, SDValue V1,
	SDValue V2,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	int Size = Mask.size();
	// Number of vector elements per 128-bit lane.
	int LaneSize = 128 / VT.getScalarSizeInBits();
	const int NumBytes = VT.getSizeInBits() / 8;
	const int NumEltBytes = VT.getScalarSizeInBits() / 8;

	assert((Subtarget.hasSSSE3() && VT.is128BitVector()) \|\|
	(Subtarget.hasAVX2() && VT.is256BitVector()) \|\|
	(Subtarget.hasBWI() && VT.is512BitVector()));

	SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
	// Sign bit set in i8 mask means zero element.
	SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

	// The single input all non-zeroable elements come from; empty until the
	// first such element is seen.
	SDValue V;
	for (int i = 0; i < NumBytes; ++i) {
	// Byte i belongs to vector element i / NumEltBytes.
	int M = Mask[i / NumEltBytes];
	if (M < 0) {
	PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
	continue;
	}
	if (Zeroable[i / NumEltBytes]) {
	PSHUFBMask[i] = ZeroMask;
	continue;
	}

	// We can only use a single input of V1 or V2.
	SDValue SrcV = (M >= Size ? V2 : V1);
	if (V && V != SrcV)
	return SDValue();
	V = SrcV;
	M %= Size;

	// PSHUFB can't cross lanes, ensure this doesn't happen.
	if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
	return SDValue();

	// Convert the element index into a lane-relative byte index.
	M = M % LaneSize;
	M = M * NumEltBytes + (i % NumEltBytes);
	PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
	}
	assert(V && "Failed to find a source input");

	// PSHUFB operates on byte vectors, so bitcast in and out around it.
	MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
	DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
	}

	// Forward declaration: getMaskNode is called by lowerVectorShuffleToEXPAND
	// below before its definition has been seen.
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	const SDLoc &dl);

	// X86 has dedicated shuffle that can be lowered to VEXPAND
	//
	// Returns an empty SDValue unless isNonZeroElementsInOrder accepts the mask.
	// Otherwise the result is a select, controlled by a mask of the non-zeroable
	// elements, between an X86ISD::EXPAND of one input and a zero vector.
	static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
	const APInt &Zeroable,
	ArrayRef<int> Mask, SDValue &V1,
	SDValue &V2, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	bool IsLeftZeroSide = true;
	if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
	IsLeftZeroSide))
	return SDValue();
	// One bit per element, set for every element that is NOT zeroable.
	unsigned VEXPANDMask = (~Zeroable).getZExtValue();
	// The integer holding the mask is at least 8 bits wide.
	MVT IntegerType =
	MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
	unsigned NumElts = VT.getVectorNumElements();
	assert((NumElts == 4 \|\| NumElts == 8 \|\| NumElts == 16) &&
	"Unexpected number of vector elements");
	SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
	Subtarget, DAG, DL);
	SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
	// IsLeftZeroSide is true when the in-order elements were found to start at
	// index NumElements, i.e. they come from V2; otherwise they come from V1.
	SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
	return DAG.getSelect(DL, VT, VMask,
	DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
	ZeroVector);
	}

	// Try to match a target shuffle mask against the UNPCKL/UNPCKH patterns.
	//
	// On success the function returns true, stores X86ISD::UNPCKL or
	// X86ISD::UNPCKH in UnpackOpcode, and rewrites V1/V2 in place to the
	// operands the unpack node should use (which may be undef, a zero vector, a
	// duplicate of V1, or the two inputs swapped). On failure it returns false
	// and leaves V1, V2 and UnpackOpcode unmodified.
	static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
	unsigned &UnpackOpcode, bool IsUnary,
	ArrayRef<int> TargetMask, SDLoc &DL,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	int NumElts = VT.getVectorNumElements();

	// Classify the even (first unpack operand) and odd (second unpack operand)
	// result positions: are they all undef, or all undef-or-zero?
	bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
	for (int i = 0; i != NumElts; i += 2) {
	int M1 = TargetMask[i + 0];
	int M2 = TargetMask[i + 1];
	Undef1 &= (SM_SentinelUndef == M1);
	Undef2 &= (SM_SentinelUndef == M2);
	Zero1 &= isUndefOrZero(M1);
	Zero2 &= isUndefOrZero(M2);
	}
	assert(!((Undef1 \|\| Zero1) && (Undef2 \|\| Zero2)) &&
	"Zeroable shuffle detected");

	// Attempt to match the target mask against the unpack lo/hi mask patterns.
	SmallVector<int, 64> Unpckl, Unpckh;
	createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
	if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	UnpackOpcode = X86ISD::UNPCKL;
	// V2 must be computed before V1 is overwritten, since the unary case
	// reuses the original V1.
	V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
	V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
	return true;
	}

	createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
	if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	UnpackOpcode = X86ISD::UNPCKH;
	V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
	V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
	return true;
	}

	// If an unary shuffle, attempt to match as an unpack lo/hi with zero.
	if (IsUnary && (Zero1 \|\| Zero2)) {
	// Don't bother if we can blend instead.
	if ((Subtarget.hasSSE41() \|\| VT == MVT::v2i64 \|\| VT == MVT::v2f64) &&
	isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
	return false;

	bool MatchLo = true, MatchHi = true;
	for (int i = 0; (i != NumElts) && (MatchLo \|\| MatchHi); ++i) {
	int M = TargetMask[i];

	// Ignore if the input is known to be zero or the index is undef.
	if ((((i & 1) == 0) && Zero1) \|\| (((i & 1) == 1) && Zero2) \|\|
	(M == SM_SentinelUndef))
	continue;

	MatchLo &= (M == Unpckl[i]);
	MatchHi &= (M == Unpckh[i]);
	}

	if (MatchLo \|\| MatchHi) {
	UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
	// The zeroable side is replaced by an explicit zero vector; the other
	// side uses the original V1 (V2 is assigned first for that reason).
	V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	return true;
	}
	}

	// If a binary shuffle, commute and try again.
	if (!IsUnary) {
	ShuffleVectorSDNode::commuteMask(Unpckl);
	if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	UnpackOpcode = X86ISD::UNPCKL;
	std::swap(V1, V2);
	return true;
	}

	ShuffleVectorSDNode::commuteMask(Unpckh);
	if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	UnpackOpcode = X86ISD::UNPCKH;
	std::swap(V1, V2);
	return true;
	}
	}

	return false;
	}

	// X86 has dedicated unpack instructions that can handle specific blend
	// operations: UNPCKH and UNPCKL.
	//
	// Tries to lower the two-input shuffle described by Mask as a single
	// X86ISD::UNPCKL or X86ISD::UNPCKH node of type VT. The operands are tried
	// in the given order first and in swapped order second. Returns an empty
	// SDValue if Mask matches none of the four patterns.
	static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
	                                           ArrayRef<int> Mask, SDValue V1,
	                                           SDValue V2, SelectionDAG &DAG) {
	  // Binary (two-input) unpack-lo pattern. Each argument comment must be
	  // closed individually; otherwise the `true` argument is swallowed by the
	  // comment and the call is made with one argument too few.
	  SmallVector<int, 8> Unpckl;
	  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

	  // Binary unpack-hi pattern.
	  SmallVector<int, 8> Unpckh;
	  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

	  // Commute and try again: the masks are commuted in place and, on a match,
	  // the node is built with the operands swapped.
	  ShuffleVectorSDNode::commuteMask(Unpckl);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

	  ShuffleVectorSDNode::commuteMask(Unpckh);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

	  return SDValue();
	}

	// X86 has dedicated pack instructions that can handle specific truncation
	// operations: PACKSS and PACKUS.
	//
	// Try to match TargetMask (a mask over VT elements) as a PACK of inputs that
	// have twice the element width and half the element count of VT. On success
	// this returns true, sets PackOpcode to X86ISD::PACKSS or X86ISD::PACKUS,
	// sets SrcVT to the wide input type and replaces V1/V2 with bitcasts of the
	// chosen inputs to that type. On failure it returns false and leaves the
	// reference parameters untouched.
	static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
	SDValue &V2, unsigned &PackOpcode,
	ArrayRef<int> TargetMask,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	unsigned NumElts = VT.getVectorNumElements();
	unsigned BitSize = VT.getScalarSizeInBits();
	// Type of the pack inputs: elements twice as wide, half as many of them.
	MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
	MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);

	// Decide whether N1/N2 can be packed without changing the kept bits and, if
	// so, publish the result through the by-reference outputs.
	auto MatchPACK = [&](SDValue N1, SDValue N2) {
	SDValue VV1 = DAG.getBitcast(PackVT, N1);
	SDValue VV2 = DAG.getBitcast(PackVT, N2);
	// PACKSS: every non-undef input must have more than BitSize sign bits in
	// each wide element.
	if ((N1.isUndef() \|\| DAG.ComputeNumSignBits(VV1) > BitSize) &&
	(N2.isUndef() \|\| DAG.ComputeNumSignBits(VV2) > BitSize)) {
	V1 = VV1;
	V2 = VV2;
	SrcVT = PackVT;
	PackOpcode = X86ISD::PACKSS;
	return true;
	}

	// PACKUS: the upper BitSize bits of each wide element of every non-undef
	// input must be known zero.
	// NOTE(review): the SSE4.1-or-i16 guard presumably reflects which PACKUS
	// forms exist on the subtarget - confirm against the ISA reference.
	if (Subtarget.hasSSE41() \|\| PackSVT == MVT::i16) {
	APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
	if ((N1.isUndef() \|\| DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
	(N2.isUndef() \|\| DAG.MaskedValueIsZero(VV2, ZeroMask))) {
	V1 = VV1;
	V2 = VV2;
	SrcVT = PackVT;
	PackOpcode = X86ISD::PACKUS;
	return true;
	}
	}

	return false;
	};

	// Try binary shuffle.
	SmallVector<int, 32> BinaryMask;
	createPackShuffleMask(VT, BinaryMask, false);
	if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
	if (MatchPACK(V1, V2))
	return true;

	// Try unary shuffle.
	// Here V1 is used for both pack operands.
	SmallVector<int, 32> UnaryMask;
	createPackShuffleMask(VT, UnaryMask, true);
	if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
	if (MatchPACK(V1, V1))
	return true;

	return false;
	}

	/// Lower a shuffle to a single PACKSS/PACKUS node when
	/// matchVectorShuffleWithPACK accepts \p Mask for \p V1 / \p V2.
	///
	/// Returns the pack node of type \p VT, or an empty SDValue if the matcher
	/// rejects the shuffle.
	static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
	                                          ArrayRef<int> Mask, SDValue V1,
	                                          SDValue V2, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget) {
	  MVT WideVT;
	  unsigned Opc;
	  const bool Matched = matchVectorShuffleWithPACK(VT, WideVT, V1, V2, Opc,
	                                                  Mask, DAG, Subtarget);
	  if (!Matched)
	    return SDValue();

	  // The matcher rewrote V1/V2 and reported the wide source type; feed both
	  // operands to the pack node in that type.
	  return DAG.getNode(Opc, DL, VT, DAG.getBitcast(WideVT, V1),
	                     DAG.getBitcast(WideVT, V2));
	}

	/// \brief Try to emit a bitmask instruction for a shuffle.
	///
	/// A blend in which every non-zeroable element stays in its own lane and
	/// comes from one single input can be written as an AND of that input with
	/// a constant vector holding all-ones in the kept lanes and zero in the
	/// lanes flagged by \p Zeroable. Returns an empty SDValue when the shuffle
	/// does not have that shape.
	static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
	                                           SDValue V2, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           SelectionDAG &DAG) {
	  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
	  MVT ScalarVT = VT.getVectorElementType();
	  SDValue ZeroElt = DAG.getConstant(0, DL, ScalarVT);
	  SDValue OnesElt = DAG.getAllOnesConstant(DL, ScalarVT);

	  // Start from an all-zero mask and open up the lanes that must survive.
	  SmallVector<SDValue, 16> AndMaskElts(Mask.size(), ZeroElt);
	  SDValue Kept;
	  const int NumElts = Mask.size();
	  for (int Lane = 0; Lane < NumElts; ++Lane) {
	    if (Zeroable[Lane])
	      continue;

	    const int M = Mask[Lane];
	    if (M % NumElts != Lane)
	      return SDValue(); // Not a blend.

	    SDValue Source = (M < NumElts) ? V1 : V2;
	    if (Kept && Kept != Source)
	      return SDValue(); // Can only let one input through the mask.
	    Kept = Source;

	    AndMaskElts[Lane] = OnesElt;
	  }

	  if (!Kept)
	    return SDValue(); // No non-zeroable elements!

	  SDValue AndMask = DAG.getBuildVector(VT, DL, AndMaskElts);
	  return DAG.getNode(ISD::AND, DL, VT, Kept, AndMask);
	}

	/// \brief Try to emit a blend instruction for a shuffle using bit math.
	///
	/// Fallback for when first class blend instructions are unavailable: the
	/// result is (V1 & M) | (~M & V2), where M has all-ones in the lanes taken
	/// from \p V1. Only integer vectors are handled at the moment, although the
	/// scheme could be extended to floating point vectors. Returns an empty
	/// SDValue if any element is moved to a different lane.
	static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                            SDValue V2, ArrayRef<int> Mask,
	                                            SelectionDAG &DAG) {
	  assert(VT.isInteger() && "Only supports integer vector types!");
	  MVT ScalarVT = VT.getVectorElementType();
	  SDValue ZeroElt = DAG.getConstant(0, DL, ScalarVT);
	  SDValue OnesElt = DAG.getAllOnesConstant(DL, ScalarVT);

	  SmallVector<SDValue, 16> SelectElts;
	  const int NumElts = Mask.size();
	  for (int Lane = 0; Lane < NumElts; ++Lane) {
	    const int M = Mask[Lane];
	    const bool StaysInLane = M < 0 || M == Lane || M == Lane + NumElts;
	    if (!StaysInLane)
	      return SDValue(); // Shuffled input!
	    // Undef lanes (negative M) are treated like lanes taken from V1.
	    SelectElts.push_back(M < NumElts ? OnesElt : ZeroElt);
	  }

	  SDValue V1Mask = DAG.getBuildVector(VT, DL, SelectElts);
	  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);

	  // We have to cast V2 around: the ANDNP node is built on a vector of i64
	  // elements of the same total width.
	  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
	  SDValue NotMaskAndV2 =
	      DAG.getNode(X86ISD::ANDNP, DL, MaskVT, DAG.getBitcast(MaskVT, V1Mask),
	                  DAG.getBitcast(MaskVT, V2));
	  V2 = DAG.getBitcast(VT, NotMaskAndV2);
	  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
	}

	// Forward declaration: the definition is not in this part of the file. It is
	// needed here because lowerVectorShuffleAsBlend below calls it.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	SDValue PreservedSrc,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG);

	/// Check whether \p TargetMask describes a per-element blend of \p V1 and
	/// \p V2 and, if so, compute the blend immediate.
	///
	/// Bit i of \p BlendMask is set when result element i comes from \p V2.
	/// SM_SentinelZero elements are accepted only if one of the inputs is undef
	/// or an all-zeros build vector; the element in \p TargetMask is then
	/// rewritten to reference that input and the matching ForceV?Zero flag is
	/// set. Returns false as soon as an element cannot be expressed this way;
	/// the output parameters may already have been partially updated by then.
	static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
	                                      MutableArrayRef<int> TargetMask,
	                                      bool &ForceV1Zero, bool &ForceV2Zero,
	                                      uint64_t &BlendMask) {
	  const bool V1IsZeroOrUndef =
	      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
	  const bool V2IsZeroOrUndef =
	      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

	  BlendMask = 0;
	  ForceV1Zero = false;
	  ForceV2Zero = false;
	  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

	  // Attempt to generate the binary blend mask. If an input is zero then
	  // we can use any lane.
	  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
	  const int NumElts = TargetMask.size();
	  for (int Lane = 0; Lane < NumElts; ++Lane) {
	    const int M = TargetMask[Lane];

	    // Undef lanes and lanes already taken from V1 need no mask bit.
	    if (M == SM_SentinelUndef || M == Lane)
	      continue;

	    // Same lane of V2: select it through the blend mask.
	    if (M == Lane + NumElts) {
	      BlendMask |= 1ull << Lane;
	      continue;
	    }

	    // Anything else must be a zero request that one of the inputs can serve.
	    if (M != SM_SentinelZero)
	      return false;

	    if (V1IsZeroOrUndef) {
	      ForceV1Zero = true;
	      TargetMask[Lane] = Lane;
	    } else if (V2IsZeroOrUndef) {
	      ForceV2Zero = true;
	      BlendMask |= 1ull << Lane;
	      TargetMask[Lane] = Lane + NumElts;
	    } else {
	      return false;
	    }
	  }
	  return true;
	}

	/// Widen each bit of a blend mask to \p Scale adjacent bits.
	///
	/// Bit i of \p BlendMask (for i in [0, Size)) is replicated into bits
	/// [i * Scale, (i + 1) * Scale) of the result; bits of \p BlendMask at or
	/// above \p Size are ignored. The scaled mask has to fit into 64 bits.
	static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
	                                            int Scale) {
	  // Shifting a 64-bit value by 64 or more is undefined behaviour, so insist
	  // on inputs for which every shift below stays in range.
	  assert(0 <= Scale && Scale < 64 && "Unsupported blend mask scale");
	  assert(0 <= Size && Size <= 64 && Size * Scale <= 64 &&
	         "Scaled blend mask does not fit into 64 bits");

	  // Scale consecutive one bits; the same pattern is stamped once per set
	  // input bit.
	  const uint64_t LaneBits = (1ull << Scale) - 1;
	  uint64_t ScaledMask = 0;
	  for (int i = 0; i != Size; ++i)
	    if (BlendMask & (1ull << i))
	      ScaledMask |= LaneBits << (i * Scale);
	  return ScaledMask;
	}

	/// \brief Try to emit a blend instruction for a shuffle.
	///
	/// This doesn't do any checks for the availability of instructions for blending
	/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
	/// be matched in the backend with the type given. What it does check for is
	/// that the shuffle mask is a blend, or convertible into a blend with zero.
	///
	/// \p Original is the shuffle mask and \p Zeroable flags the elements that
	/// may be treated as zero; both are combined via createTargetShuffleMask
	/// before matching. Returns an empty SDValue when the mask is not a blend.
	/// Reaching the switch with a type that has no case is treated as
	/// unreachable.
	static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Original,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);

	// The matcher may rewrite zero elements of Mask to point at a zero input and
	// reports through the Force flags which input has to be a real zero vector.
	uint64_t BlendMask = 0;
	bool ForceV1Zero = false, ForceV2Zero = false;
	if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
	BlendMask))
	return SDValue();

	// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
	if (ForceV1Zero)
	V1 = getZeroVector(VT, Subtarget, DAG, DL);
	if (ForceV2Zero)
	V2 = getZeroVector(VT, Subtarget, DAG, DL);

	switch (VT.SimpleTy) {
	// Floating point types: the blend mask is used directly as the immediate.
	case MVT::v2f64:
	case MVT::v4f32:
	case MVT::v4f64:
	case MVT::v8f32:
	return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
	DAG.getConstant(BlendMask, DL, MVT::i8));

	case MVT::v4i64:
	case MVT::v8i32:
	assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
	LLVM_FALLTHROUGH;
	case MVT::v2i64:
	case MVT::v4i32:
	// If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
	// that instruction.
	if (Subtarget.hasAVX2()) {
	// Scale the blend by the number of 32-bit dwords per element.
	int Scale = VT.getScalarSizeInBits() / 32;
	BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
	MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
	V1 = DAG.getBitcast(BlendVT, V1);
	V2 = DAG.getBitcast(BlendVT, V2);
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
	DAG.getConstant(BlendMask, DL, MVT::i8)));
	}
	LLVM_FALLTHROUGH;
	case MVT::v8i16: {
	// For integer shuffles we need to expand the mask and cast the inputs to
	// v8i16s prior to blending.
	int Scale = 8 / VT.getVectorNumElements();
	BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
	V1 = DAG.getBitcast(MVT::v8i16, V1);
	V2 = DAG.getBitcast(MVT::v8i16, V2);
	return DAG.getBitcast(VT,
	DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
	DAG.getConstant(BlendMask, DL, MVT::i8)));
	}

	case MVT::v16i16: {
	assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
	SmallVector<int, 8> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
	// We can lower these with PBLENDW which is mirrored across 128-bit lanes.
	assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
	// Rebuild the immediate from the 8-element per-lane mask: a bit is set
	// for every element that comes from the second input.
	BlendMask = 0;
	for (int i = 0; i < 8; ++i)
	if (RepeatedMask[i] >= 8)
	BlendMask \|= 1ull << i;
	return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
	DAG.getConstant(BlendMask, DL, MVT::i8));
	}
	// Masks that do not repeat per 128-bit lane are handled like byte blends.
	LLVM_FALLTHROUGH;
	}
	case MVT::v16i8:
	case MVT::v32i8: {
	assert((VT.is128BitVector() \|\| Subtarget.hasAVX2()) &&
	"256-bit byte-blends require AVX2 support!");

	// With both BWI and VLX the blend is emitted through getVectorMaskingNode
	// using an integer mask constant that is at least 8 bits wide.
	if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
	MVT IntegerType =
	MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	}

	// Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
	if (SDValue Masked =
	lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
	return Masked;

	// Scale the blend by the number of bytes per element.
	int Scale = VT.getScalarSizeInBits() / 8;

	// This form of blend is always done on bytes. Compute the byte vector
	// type.
	MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

	// Compute the VSELECT mask. Note that VSELECT is really confusing in the
	// mix of LLVM's code generator and the x86 backend. We tell the code
	// generator that boolean values in the elements of an x86 vector register
	// are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
	// mapping a select to operand #1, and 'false' mapping to operand #2. The
	// reality in x86 is that vector masks (pre-AVX-512) use only the high bit
	// of the element (the remaining are ignored) and 0 in that high bit would
	// mean operand #1 while 1 in the high bit would mean operand #2. So while
	// the LLVM model for boolean values in vector elements gets the relevant
	// bit set, it is set backwards and over constrained relative to x86's
	// actual model.
	// Each mask element is expanded to Scale byte-sized select conditions:
	// undef for negative mask entries, -1 to pick V1 and 0 to pick V2.
	SmallVector<SDValue, 32> VSELECTMask;
	for (int i = 0, Size = Mask.size(); i < Size; ++i)
	for (int j = 0; j < Scale; ++j)
	VSELECTMask.push_back(
	Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
	: DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
	MVT::i8));

	V1 = DAG.getBitcast(BlendVT, V1);
	V2 = DAG.getBitcast(BlendVT, V2);
	return DAG.getBitcast(
	VT,
	DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
	V1, V2));
	}
	// 512-bit types: always emitted through getVectorMaskingNode with an integer
	// mask constant of at least 8 bits.
	case MVT::v16f32:
	case MVT::v8f64:
	case MVT::v8i64:
	case MVT::v16i32:
	case MVT::v32i16:
	case MVT::v64i8: {
	MVT IntegerType =
	MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	}
	default:
	llvm_unreachable("Not a supported integer vector type!");
	}
	}

	/// \brief Try to lower as a blend of elements from two inputs followed by
	/// a single-input permutation.
	///
	/// The shuffle is split into a first shuffle that gathers, lane by lane, the
	/// needed element from either \p V1 or \p V2, and a second single-input
	/// shuffle that moves the gathered elements to their final positions.
	/// Returns an empty SDValue if the same lane would be needed from both
	/// inputs.
	static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
	                                                   SDValue V1, SDValue V2,
	                                                   ArrayRef<int> Mask,
	                                                   SelectionDAG &DAG) {
	  // Both masks start out fully undef; the blend mask is filled in while we
	  // verify that a blend is a viable first step.
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);
	  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

	  const int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    if (Mask[i] < 0)
	      continue;

	    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

	    // Lane the wanted element occupies in its source vector.
	    const int SrcLane = Mask[i] % Size;
	    int &Chosen = BlendMask[SrcLane];
	    if (Chosen >= 0 && Chosen != Mask[i])
	      return SDValue(); // Can't blend in the needed input!
	    if (Chosen < 0)
	      Chosen = Mask[i];

	    PermuteMask[i] = SrcLane;
	  }

	  SDValue Blended = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
	  return DAG.getVectorShuffle(VT, DL, Blended, DAG.getUNDEF(VT), PermuteMask);
	}

	/// \brief Generic routine to decompose a shuffle and blend into independent
	/// blends and permutes.
	///
	/// Combined shuffle+blend operations are very common, and newer X86 ISAs
	/// have very fast blend operations. This routine either blends first and
	/// permutes afterwards, or shuffles each input on its own and blends the two
	/// results, picking whichever arrangement looks better.
	static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
	                                                          MVT VT, SDValue V1,
	                                                          SDValue V2,
	                                                          ArrayRef<int> Mask,
	                                                          SelectionDAG &DAG) {
	  // Per-input shuffles that put every element into its final lane, plus the
	  // in-lane blend that merges the two shuffled inputs.
	  SmallVector<int, 32> V1Mask(Mask.size(), -1);
	  SmallVector<int, 32> V2Mask(Mask.size(), -1);
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);
	  const int Size = Mask.size();
	  for (int Lane = 0; Lane < Size; ++Lane) {
	    const int M = Mask[Lane];
	    if (M < 0)
	      continue; // Undef stays undef in all three masks.

	    if (M < Size) {
	      V1Mask[Lane] = M;
	      BlendMask[Lane] = Lane;
	    } else {
	      V2Mask[Lane] = M - Size;
	      BlendMask[Lane] = Lane + Size;
	    }
	  }

	  // Try to lower with the simpler initial blend strategy unless one of the
	  // input shuffles would be a no-op. We prefer to shuffle inputs as the
	  // shuffle may be able to fold with a load or other benefit. However, when
	  // we'll have to do 2x as many shuffles in order to achieve this, blending
	  // first is a better strategy.
	  const bool BothInputsMove =
	      !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask);
	  if (BothInputsMove) {
	    SDValue BlendPerm =
	        lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG);
	    if (BlendPerm)
	      return BlendPerm;
	  }

	  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
	  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
	}

	/// \brief Try to lower a vector shuffle as a rotation.
	///
	/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
	///
	/// Returns the rotation amount in mask elements, or -1 if \p Mask is not a
	/// (non-identity) rotation. On success \p V1 and \p V2 are overwritten with
	/// the low and high inputs of the rotation; if only one of them was
	/// referenced by the mask, both are set to that value. On failure they are
	/// left untouched. The asserts after the loop expect at least one non-undef
	/// mask element.
	static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
	ArrayRef<int> Mask) {
	int NumElts = Mask.size();

	// We need to detect various ways of spelling a rotation:
	// [11, 12, 13, 14, 15, 0, 1, 2]
	// [-1, 12, 13, 14, -1, -1, 1, -1]
	// [-1, -1, -1, -1, -1, -1, 1, 2]
	// [ 3, 4, 5, 6, 7, 8, 9, 10]
	// [-1, 4, 5, 6, -1, -1, 9, -1]
	// [-1, 4, 5, 6, -1, -1, -1, -1]
	// Rotation stays 0 until the first non-undef element fixes it; Lo and Hi
	// stay null until an element assigns them.
	int Rotation = 0;
	SDValue Lo, Hi;
	for (int i = 0; i < NumElts; ++i) {
	int M = Mask[i];
	assert((M == SM_SentinelUndef \|\| (0 <= M && M < (2*NumElts))) &&
	"Unexpected mask index.");
	if (M < 0)
	continue;

	// Determine where a rotated vector would have started.
	int StartIdx = i - (M % NumElts);
	if (StartIdx == 0)
	// The identity rotation isn't interesting, stop.
	return -1;

	// If we found the tail of a vector the rotation must be the missing
	// front. If we found the head of a vector, it must be how much of the
	// head.
	int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

	if (Rotation == 0)
	Rotation = CandidateRotation;
	else if (Rotation != CandidateRotation)
	// The rotations don't match, so we can't match this mask.
	return -1;

	// Compute which value this mask is pointing at.
	SDValue MaskV = M < NumElts ? V1 : V2;

	// Compute which of the two target values this index should be assigned
	// to. This reflects whether the high elements are remaining or the low
	// elements are remaining.
	SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

	// Either set up this value if we've not encountered it before, or check
	// that it remains consistent.
	if (!TargetV)
	TargetV = MaskV;
	else if (TargetV != MaskV)
	// This may be a rotation, but it pulls from the inputs in some
	// unsupported interleaving.
	return -1;
	}

	// Check that we successfully analyzed the mask, and normalize the results.
	assert(Rotation != 0 && "Failed to locate a viable rotation!");
	assert((Lo \|\| Hi) && "Failed to find a rotated input vector!");
	// If only one side was seen, use the same value for both operands.
	if (!Lo)
	Lo = Hi;
	else if (!Hi)
	Hi = Lo;

	V1 = Lo;
	V2 = Hi;

	return Rotation;
	}

	/// \brief Try to lower a vector shuffle as a byte rotation.
	///
	/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
	/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
	/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
	/// try to generically lower a vector shuffle through such a pattern. It
	/// does not check for the profitability of lowering either as PALIGNR or
	/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
	/// This matches shuffle vectors that look like:
	///
	/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
	///
	/// Essentially it concatenates V1 and V2, shifts right by some number of
	/// elements, and takes the low elements as the result. Note that while this is
	/// specified as a right shift because x86 is little-endian, it is a *left
	/// rotate* of the vector lanes.
	///
	/// Returns the rotation in bytes, or -1 if the mask cannot be lowered this
	/// way. \p V1 and \p V2 are updated by matchVectorShuffleAsRotate.
	static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
	                                          ArrayRef<int> Mask) {
	  // Don't accept any shuffles with zero elements.
	  for (int M : Mask)
	    if (M == SM_SentinelZero)
	      return -1;

	  // PALIGNR works on 128-bit lanes, so every lane has to use the same
	  // pattern.
	  SmallVector<int, 16> LaneMask;
	  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask))
	    return -1;

	  const int EltRotation = matchVectorShuffleAsRotate(V1, V2, LaneMask);
	  if (EltRotation <= 0)
	    return -1;

	  // PALIGNR rotates bytes, so we need to scale the
	  // rotation based on how many bytes are in the vector lane.
	  const int EltsPerLane = LaneMask.size();
	  const int BytesPerElt = 16 / EltsPerLane;
	  return EltRotation * BytesPerElt;
	}

	/// Lower a shuffle that matchVectorShuffleAsByteRotate accepts.
	///
	/// With SSSE3 a single X86ISD::PALIGNR on the byte vector type is emitted.
	/// Otherwise the rotation is built from a VSHLDQ of the low input, a VSRLDQ
	/// of the high input and an OR of the two; that path asserts a 128-bit
	/// vector. Returns an empty SDValue if the mask is not a byte rotation.
	static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
	                                              SDValue V1, SDValue V2,
	                                              ArrayRef<int> Mask,
	                                              const X86Subtarget &Subtarget,
	                                              SelectionDAG &DAG) {
	  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

	  SDValue Lo = V1;
	  SDValue Hi = V2;
	  const int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
	  if (ByteRotation <= 0)
	    return SDValue();

	  // Cast the inputs to i8 vector of correct length to match PALIGNR or
	  // PSLLDQ/PSRLDQ.
	  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
	  Lo = DAG.getBitcast(ByteVT, Lo);
	  Hi = DAG.getBitcast(ByteVT, Hi);

	  if (!Subtarget.hasSSSE3()) {
	    assert(VT.is128BitVector() &&
	           "Rotate-based lowering only supports 128-bit lowering!");
	    assert(Mask.size() <= 16 &&
	           "Can shuffle at most 16 bytes in a 128-bit vector!");
	    assert(ByteVT == MVT::v16i8 &&
	           "SSE2 rotate lowering only needed for v16i8!");

	    // Default SSE2 implementation: shift both halves towards each other and
	    // merge them.
	    SDValue LoShift =
	        DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
	                    DAG.getConstant(16 - ByteRotation, DL, MVT::i8));
	    SDValue HiShift =
	        DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
	                    DAG.getConstant(ByteRotation, DL, MVT::i8));
	    SDValue Merged = DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift);
	    return DAG.getBitcast(VT, Merged);
	  }

	  // SSSE3 targets can use the palignr instruction.
	  assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
	         "512-bit PALIGNR requires BWI instructions");
	  SDValue Rotated = DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
	                                DAG.getConstant(ByteRotation, DL, MVT::i8));
	  return DAG.getBitcast(VT, Rotated);
	}

	/// \brief Try to lower a vector shuffle as a dword/qword rotation.
	///
	/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
	/// rotation of the concatenation of two vectors; this routine will
	/// try to generically lower a vector shuffle through such a pattern.
	///
	/// Essentially it concatenates V1 and V2, shifts right by some number of
	/// elements, and takes the low elements as the result. Note that while this is
	/// specified as a right shift because x86 is little-endian, it is a *left
	/// rotate* of the vector lanes.
	///
	/// Returns an X86ISD::VALIGN node, or an empty SDValue if \p Mask is not a
	/// rotation.
	static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
	                                          SDValue V1, SDValue V2,
	                                          ArrayRef<int> Mask,
	                                          const X86Subtarget &Subtarget,
	                                          SelectionDAG &DAG) {
	  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
	         "Only 32-bit and 64-bit elements are supported!");

	  // 128/256-bit vectors are only supported with VLX.
	  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
	         && "VLX required for 128/256-bit vectors");

	  // The matcher works on copies so the caller's operands are not disturbed.
	  SDValue RotLo = V1;
	  SDValue RotHi = V2;
	  const int EltRotation = matchVectorShuffleAsRotate(RotLo, RotHi, Mask);
	  if (EltRotation > 0)
	    return DAG.getNode(X86ISD::VALIGN, DL, VT, RotLo, RotHi,
	                       DAG.getConstant(EltRotation, DL, MVT::i8));

	  return SDValue();
	}

	/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
	///
	/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
	/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
	/// matches elements from one of the input vectors shuffled to the left or
	/// right with zeroable elements 'shifted in'. It handles both the strictly
	/// bit-wise element shifts and the byte shift across an entire 128-bit double
	/// quad word lane.
	///
	/// PSLL : (little-endian) left bit shift.
	/// [ zz, 0, zz, 2 ]
	/// [ -1, 4, zz, -1 ]
	/// PSRL : (little-endian) right bit shift.
	/// [ 1, zz, 3, zz]
	/// [ -1, -1, 7, zz]
	/// PSLLDQ : (little-endian) left byte shift
	/// [ zz, 0, 1, 2, 3, 4, 5, 6]
	/// [ zz, zz, -1, -1, 2, 3, 4, -1]
	/// [ zz, zz, zz, zz, zz, zz, -1, 1]
	/// PSRLDQ : (little-endian) right byte shift
	/// [ 5, 6, 7, zz, zz, zz, zz, zz]
	/// [ -1, 5, 6, 7, zz, zz, zz, zz]
	/// [ 1, 2, -1, -1, -1, -1, zz, zz]
	///
	/// On a match, returns the (positive) shift amount, sets \p Opcode to the
	/// X86ISD shift opcode and \p ShiftVT to the vector type the shift must be
	/// performed in. The amount is in bits for VSHLI/VSRLI and in bytes for
	/// VSHLDQ/VSRLDQ. \p MaskOffset is added to the expected source indices
	/// (the caller in this file passes 0 for the first input and the mask size
	/// for the second). Returns -1 if nothing matches.
	static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
	unsigned ScalarSizeInBits,
	ArrayRef<int> Mask, int MaskOffset,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget) {
	int Size = Mask.size();
	unsigned SizeInBits = Size * ScalarSizeInBits;

	// For every group of Scale elements, check that the Shift elements that
	// would be shifted in (at the start of the group for a left shift, at its
	// end for a right shift) are all zeroable.
	auto CheckZeros = [&](int Shift, int Scale, bool Left) {
	for (int i = 0; i < Size; i += Scale)
	for (int j = 0; j < Shift; ++j)
	if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
	return false;

	return true;
	};

	// Check that the surviving elements of every group are sequential (or
	// undef) starting at the expected source index. On success this writes
	// Opcode and ShiftVT and returns the shift amount; otherwise it returns -1
	// without touching them.
	auto MatchShift = [&](int Shift, int Scale, bool Left) {
	for (int i = 0; i != Size; i += Scale) {
	unsigned Pos = Left ? i + Shift : i;
	unsigned Low = Left ? i : i + Shift;
	unsigned Len = Scale - Shift;
	if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
	return -1;
	}

	// Groups wider than 64 bits are handled with the whole-lane byte shifts.
	int ShiftEltBits = ScalarSizeInBits * Scale;
	bool ByteShift = ShiftEltBits > 64;
	Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
	: (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
	int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

	// Normalize the scale for byte shifts to still produce an i64 element
	// type.
	Scale = ByteShift ? Scale / 2 : Scale;

	// We need to round trip through the appropriate type for the shift.
	MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
	ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
	: MVT::getVectorVT(ShiftSVT, Size / Scale);
	return (int)ShiftAmt;
	};

	// SSE/AVX supports logical shifts up to 64-bit integers - so we can just
	// keep doubling the size of the integer elements up to that. We can
	// then shift the elements of the integer vector by whole multiples of
	// their width within the elements of the larger integer vector. Test each
	// multiple to see if we can find a match with the moved element indices
	// and that the shifted in elements are all zeroable.
	// The 128-bit group width (byte shifts) is not tried for 512-bit vectors
	// unless the subtarget has BWI.
	unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
	for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
	for (int Shift = 1; Shift != Scale; ++Shift)
	for (bool Left : {true, false})
	if (CheckZeros(Shift, Scale, Left)) {
	int ShiftAmt = MatchShift(Shift, Scale, Left);
	if (0 < ShiftAmt)
	return ShiftAmt;
	}

	// no match
	return -1;
	}

	/// Lower a shuffle as a logical shift of one of its inputs.
	///
	/// The mask is first matched against a shift of \p V1 and, failing that,
	/// against a shift of \p V2. The chosen input is bitcast to the type
	/// reported by matchVectorShuffleAsShift, shifted, and cast back to \p VT.
	/// Returns an empty SDValue if neither input matches.
	static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
	                                         SDValue V2, ArrayRef<int> Mask,
	                                         const APInt &Zeroable,
	                                         const X86Subtarget &Subtarget,
	                                         SelectionDAG &DAG) {
	  int Size = Mask.size();
	  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

	  MVT ShiftVT;
	  unsigned Opcode;
	  const unsigned EltBits = VT.getScalarSizeInBits();

	  // Try to match shuffle against V1 shift.
	  SDValue Src = V1;
	  int ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, EltBits, Mask,
	                                           /*MaskOffset=*/0, Zeroable,
	                                           Subtarget);

	  // If V1 failed, try to match shuffle against V2 shift; its elements are
	  // numbered starting at Size in the mask.
	  if (ShiftAmt < 0) {
	    Src = V2;
	    ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, EltBits, Mask,
	                                         /*MaskOffset=*/Size, Zeroable,
	                                         Subtarget);
	    if (ShiftAmt < 0)
	      return SDValue();
	  }

	  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
	         "Illegal integer vector type");
	  SDValue Casted = DAG.getBitcast(ShiftVT, Src);
	  SDValue Shifted = DAG.getNode(Opcode, DL, ShiftVT, Casted,
	                                DAG.getConstant(ShiftAmt, DL, MVT::i8));
	  return DAG.getBitcast(VT, Shifted);
	}

	// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
	// Remainder of lower half result is zero and upper half is all undef.
	//
	// Returns true if Mask has that shape. In that case BitLen and BitIdx
	// receive the extraction length and start position in bits (each reduced
	// to its low 6 bits) and V1 is replaced by the source vector the elements
	// are taken from. Nothing is written on failure.
	static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
	                                      ArrayRef<int> Mask, uint64_t &BitLen,
	                                      uint64_t &BitIdx, const APInt &Zeroable) {
	  int Size = Mask.size();
	  int HalfSize = Size / 2;
	  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
	  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

	  // Upper half must be undefined.
	  if (!isUndefInRange(Mask, HalfSize, HalfSize))
	    return false;

	  // Determine the extraction length from the part of the
	  // lower half that isn't zeroable: trim zeroable elements off its end.
	  int Len = HalfSize;
	  while (Len > 0 && Zeroable[Len - 1])
	    --Len;
	  assert(Len > 0 && "Zeroable shuffle mask");

	  // Attempt to match first Len sequential elements from the lower half.
	  SDValue Src;
	  int Idx = -1;
	  for (int i = 0; i != Len; ++i) {
	    const int M = Mask[i];
	    if (M == SM_SentinelUndef)
	      continue;

	    SDValue &Input = (M < Size) ? V1 : V2;
	    const int SrcElt = M % Size;

	    // The extracted elements must start at a valid index and all mask
	    // elements must be in the lower half.
	    if (SrcElt < i || SrcElt >= HalfSize)
	      return false;

	    // The first defined element fixes the source and the start index; all
	    // later ones have to agree with both.
	    const int Start = SrcElt - i;
	    const bool Consistent = Idx < 0 || (Src == Input && Idx == Start);
	    if (!Consistent)
	      return false;

	    Src = Input;
	    Idx = Start;
	  }

	  if (!Src || Idx < 0)
	    return false;

	  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
	  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
	  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
	  V1 = Src;
	  return true;
	}

	// INSERTQ: Extract lowest Len elements from lower half of second source and
	// insert over first source, starting at Idx.
	// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
	//
	// Returns true if Mask has that shape for some Idx/Len. In that case BitLen
	// and BitIdx receive the insertion length and position in bits (each
	// reduced to its low 6 bits), V1 is replaced by the base vector A and V2 by
	// the inserted vector B. V1 may end up as a null SDValue when every base
	// element is undef. Nothing is written on failure.
	static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
	ArrayRef<int> Mask, uint64_t &BitLen,
	uint64_t &BitIdx) {
	int Size = Mask.size();
	int HalfSize = Size / 2;
	assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

	// Upper half must be undefined.
	if (!isUndefInRange(Mask, HalfSize, HalfSize))
	return false;

	// Try every insertion point in the lower half.
	for (int Idx = 0; Idx != HalfSize; ++Idx) {
	SDValue Base;

	// Attempt to match first source from mask before insertion point.
	if (isUndefInRange(Mask, 0, Idx)) {
	/* EMPTY */
	} else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
	Base = V1;
	} else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
	Base = V2;
	} else {
	continue;
	}

	// Extend the extraction length looking to match both the insertion of
	// the second source and the remaining elements of the first.
	for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
	SDValue Insert;
	int Len = Hi - Idx;

	// Match insertion.
	// The inserted elements must be the lowest Len elements of V1 or of V2.
	if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
	Insert = V1;
	} else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
	Insert = V2;
	} else {
	continue;
	}

	// Match the remaining elements of the lower half.
	// They must be undef or continue in place from the same base vector that
	// was matched before the insertion point (if any).
	if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
	/* EMPTY */
	} else if ((!Base \|\| (Base == V1)) &&
	isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
	Base = V1;
	} else if ((!Base \|\| (Base == V2)) &&
	isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
	Size + Hi)) {
	Base = V2;
	} else {
	continue;
	}

	BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
	BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
	V1 = Base;
	V2 = Insert;
	return true;
	}
	}

	return false;
	}

	/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
	///
	/// The EXTRQ form is tried first, then the INSERTQ form. Returns the
	/// matching X86ISD::EXTRQI or X86ISD::INSERTQI node, or an empty SDValue if
	/// neither matcher accepts \p Mask.
	static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
	                                           SDValue V2, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           SelectionDAG &DAG) {
	  uint64_t BitLen, BitIdx;

	  const bool IsExtract =
	      matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable);
	  if (IsExtract)
	    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
	                       DAG.getConstant(BitLen, DL, MVT::i8),
	                       DAG.getConstant(BitIdx, DL, MVT::i8));

	  const bool IsInsert =
	      matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx);
	  if (!IsInsert)
	    return SDValue();

	  // The matcher may hand back an empty operand; undef is substituted for it.
	  return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
	                     V2 ? V2 : DAG.getUNDEF(VT),
	                     DAG.getConstant(BitLen, DL, MVT::i8),
	                     DAG.getConstant(BitIdx, DL, MVT::i8));
	}

	/// \brief Lower a vector shuffle as a zero or any extension.
	///
	/// Given a specific number of elements, element bit width, and extension
	/// stride, produce either a zero or any extension based on the available
	/// features of the subtarget. The extended elements are consecutive and
	/// can start from an offset element index in the input; to avoid excess
	/// shuffling the offset must either be in the bottom lane or at the start
	/// of a higher lane. All extended elements must be from the same lane.
	static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
	const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
	ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(Scale > 1 && "Need a scale to extend.");
	int EltBits = VT.getScalarSizeInBits();
	int NumElements = VT.getVectorNumElements();
	int NumEltsPerLane = 128 / EltBits;
	int OffsetLane = Offset / NumEltsPerLane;
	assert((EltBits == 8 \|\| EltBits == 16 \|\| EltBits == 32) &&
	"Only 8, 16, and 32 bit elements can be extended.");
	assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
	assert(0 <= Offset && "Extension offset must be positive.");
	assert((Offset < NumEltsPerLane \|\| Offset % NumEltsPerLane == 0) &&
	"Extension offset must be in the first lane or start an upper lane.");

	// Check that an index is in same lane as the base offset.
	auto SafeOffset = [&](int Idx) {
	return OffsetLane == (Idx / NumEltsPerLane);
	};

	// Shift along an input so that the offset base moves to the first element.
	auto ShuffleOffset = [&](SDValue V) {
	if (!Offset)
	return V;

	SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	for (int i = 0; i * Scale < NumElements; ++i) {
	int SrcIdx = i + Offset;
	ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
	}
	return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
	};

	// Found a valid zext mask! Try various lowering strategies based on the
	// input type and available ISA extensions.
	if (Subtarget.hasSSE41()) {
	// Not worth offsetting 128-bit vectors if scale == 2, a pattern using
	// PUNPCK will catch this in a later shuffle match.
	if (Offset && Scale == 2 && VT.is128BitVector())
	return SDValue();
	MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
	NumElements / Scale);
	InputV = ShuffleOffset(InputV);
	InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
	return DAG.getBitcast(VT, InputV);
	}

	assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

	// For any extends we can cheat for larger element sizes and use shuffle
	// instructions that can fold with a load and/or copy.
	if (AnyExt && EltBits == 32) {
	int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
	-1};
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	DAG.getBitcast(MVT::v4i32, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	}
	if (AnyExt && EltBits == 16 && Scale > 2) {
	int PSHUFDMask[4] = {Offset / 2, -1,
	SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
	InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	DAG.getBitcast(MVT::v4i32, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	int PSHUFWMask[4] = {1, -1, -1, -1};
	unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
	return DAG.getBitcast(
	VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
	DAG.getBitcast(MVT::v8i16, InputV),
	getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
	}

	// The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
	// to 64-bits.
	if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
	assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
	assert(VT.is128BitVector() && "Unexpected vector width!");

	int LoIdx = Offset * EltBits;
	SDValue Lo = DAG.getBitcast(
	MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	DAG.getConstant(EltBits, DL, MVT::i8),
	DAG.getConstant(LoIdx, DL, MVT::i8)));

	if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) \|\|
	!SafeOffset(Offset + 1))
	return DAG.getBitcast(VT, Lo);

	int HiIdx = (Offset + 1) * EltBits;
	SDValue Hi = DAG.getBitcast(
	MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	DAG.getConstant(EltBits, DL, MVT::i8),
	DAG.getConstant(HiIdx, DL, MVT::i8)));
	return DAG.getBitcast(VT,
	DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
	}

	// If this would require more than 2 unpack instructions to expand, use
	// pshufb when available. We can only use more than 2 unpack instructions
	// when zero extending i8 elements which also makes it easier to use pshufb.
	if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
	assert(NumElements == 16 && "Unexpected byte vector width!");
	SDValue PSHUFBMask[16];
	for (int i = 0; i < 16; ++i) {
	int Idx = Offset + (i / Scale);
	PSHUFBMask[i] = DAG.getConstant(
	(i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
	}
	InputV = DAG.getBitcast(MVT::v16i8, InputV);
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
	DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
	}

	// If we are extending from an offset, ensure we start on a boundary that
	// we can unpack from.
	int AlignToUnpack = Offset % (NumElements / Scale);
	if (AlignToUnpack) {
	SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	for (int i = AlignToUnpack; i < NumElements; ++i)
	ShMask[i - AlignToUnpack] = i;
	InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
	Offset -= AlignToUnpack;
	}

	// Otherwise emit a sequence of unpacks.
	do {
	unsigned UnpackLoHi = X86ISD::UNPCKL;
	if (Offset >= (NumElements / 2)) {
	UnpackLoHi = X86ISD::UNPCKH;
	Offset -= (NumElements / 2);
	}

	MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
	SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
	: getZeroVector(InputVT, Subtarget, DAG, DL);
	InputV = DAG.getBitcast(InputVT, InputV);
	InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
	Scale /= 2;
	EltBits *= 2;
	NumElements /= 2;
	} while (Scale > 1);
	return DAG.getBitcast(VT, InputV);
	}

	/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
	///
	/// This routine will try to do everything in its power to cleverly lower
	/// a shuffle which happens to match the pattern of a zero extend. It doesn't
	/// check for the profitability of this lowering, it tries to aggressively
	/// match this pattern. It will use all of the micro-architectural details it
	/// can to emit an efficient lowering. It handles both blends with all-zero
	/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
	/// masking out later).
	///
	/// The reason we have dedicated lowering for zext-style shuffles is that they
	/// are both incredibly common and often quite performance sensitive.
	///
	/// \param Mask      Shuffle mask; asserted to have one entry per element of
	///                  \p VT.
	/// \param Zeroable  Per-element flags; a set bit marks a result element that
	///                  may be treated as zero.
	/// \returns The lowered extension, or an empty SDValue if no extension
	///          pattern (nor the 128-bit MOVQ fallback) matches.
	static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	int Bits = VT.getSizeInBits();
	int NumLanes = Bits / 128;
	int NumElements = VT.getVectorNumElements();
	int NumEltsPerLane = NumElements / NumLanes;
	assert(VT.getScalarSizeInBits() <= 32 &&
	"Exceeds 32-bit integer zero extension limit");
	assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

	// Define a helper function to check a particular ext-scale and lower to it if
	// valid.
	auto Lower = [&](int Scale) -> SDValue {
	// InputV: the single source vector feeding the extension (null until the
	//         first base element is seen).
	// AnyExt: stays true while every padding position is undef.
	// Offset: source index of the element that maps to result group 0.
	// Matches: number of base elements that matched the pattern.
	SDValue InputV;
	bool AnyExt = true;
	int Offset = 0;
	int Matches = 0;
	for (int i = 0; i < NumElements; ++i) {
	int M = Mask[i];
	if (M < 0)
	continue; // Valid anywhere but doesn't tell us anything.
	if (i % Scale != 0) {
	// Each of the extended elements need to be zeroable.
	if (!Zeroable[i])
	return SDValue();

	// We no longer are in the anyext case.
	AnyExt = false;
	continue;
	}

	// Each of the base elements needs to be consecutive indices into the
	// same input vector.
	SDValue V = M < NumElements ? V1 : V2;
	M = M % NumElements;
	if (!InputV) {
	InputV = V;
	Offset = M - (i / Scale);
	} else if (InputV != V)
	return SDValue(); // Flip-flopping inputs.

	// Offset must start in the lowest 128-bit lane or at the start of an
	// upper lane.
	// FIXME: Is it ever worth allowing a negative base offset?
	if (!((0 <= Offset && Offset < NumEltsPerLane) \|\|
	(Offset % NumEltsPerLane) == 0))
	return SDValue();

	// If we are offsetting, all referenced entries must come from the same
	// lane.
	if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
	return SDValue();

	// M was already reduced modulo NumElements above.
	if ((M % NumElements) != (Offset + (i / Scale)))
	return SDValue(); // Non-consecutive strided elements.
	Matches++;
	}

	// If we fail to find an input, we have a zero-shuffle which should always
	// have already been handled.
	// FIXME: Maybe handle this here in case during blending we end up with one?
	if (!InputV)
	return SDValue();

	// If we are offsetting, don't extend if we only match a single input, we
	// can always do better by using a basic PSHUF or PUNPCK.
	if (Offset != 0 && Matches < 2)
	return SDValue();

	return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
	DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
	};

	// The widest scale possible for extending is to a 64-bit integer.
	assert(Bits % 64 == 0 &&
	"The number of bits in a vector must be divisible by 64 on x86!");
	int NumExtElements = Bits / 64;

	// Each iteration, try extending the elements half as much, but into twice as
	// many elements.
	for (; NumExtElements < NumElements; NumExtElements *= 2) {
	assert(NumElements % NumExtElements == 0 &&
	"The input vector size must be divisible by the extended size.");
	if (SDValue V = Lower(NumElements / NumExtElements))
	return V;
	}

	// General extends failed, but 128-bit vectors may be able to use MOVQ.
	if (Bits != 128)
	return SDValue();

	// Returns one of the source operands if the shuffle can be reduced to a
	// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
	// Requires every upper-half element to be zeroable and the lower half to be
	// an in-order (or undef) copy of the low half of V1 or of V2.
	auto CanZExtLowHalf = [&]() {
	for (int i = NumElements / 2; i != NumElements; ++i)
	if (!Zeroable[i])
	return SDValue();
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
	return V1;
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
	return V2;
	return SDValue();
	};

	if (SDValue V = CanZExtLowHalf()) {
	V = DAG.getBitcast(MVT::v2i64, V);
	V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
	return DAG.getBitcast(VT, V);
	}

	// No viable ext lowering found.
	return SDValue();
	}

	/// \brief Try to recover the scalar that defines one element of a vector.
	///
	/// Bitcasts are looked through first. If the underlying value is a
	/// BUILD_VECTOR (any index) or a SCALAR_TO_VECTOR (index 0 only) whose
	/// scalar operand has the same bit size as an element of \p V, that operand
	/// is returned bitcast to the element type. Otherwise an empty SDValue is
	/// returned.
	static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
	                                              SelectionDAG &DAG) {
	  const MVT OrigVT = V.getSimpleValueType();
	  const MVT ScalarVT = OrigVT.getVectorElementType();
	  SDValue Src = peekThroughBitcasts(V);

	  // A bitcast that changes the element size means there is no operand that
	  // corresponds one-to-one with the requested element.
	  const MVT SrcVT = Src.getSimpleValueType();
	  if (!SrcVT.isVector())
	    return SDValue();
	  if (SrcVT.getScalarSizeInBits() != OrigVT.getScalarSizeInBits())
	    return SDValue();

	  const unsigned Opc = Src.getOpcode();
	  const bool HasScalarOperand =
	      Opc == ISD::BUILD_VECTOR ||
	      (Idx == 0 && Opc == ISD::SCALAR_TO_VECTOR);
	  if (!HasScalarOperand)
	    return SDValue();

	  // Ensure the scalar operand is the same size as the destination.
	  // FIXME: Add support for scalar truncation where possible.
	  SDValue Scalar = Src.getOperand(Idx);
	  if (ScalarVT.getSizeInBits() != Scalar.getSimpleValueType().getSizeInBits())
	    return SDValue();

	  return DAG.getBitcast(ScalarVT, Scalar);
	}

	/// \brief Check whether a value is a load that x86 shuffles can fold.
	///
	/// Bitcasts are looked through before the check; the underlying node must be
	/// a non-extending load. Callers use this because the set of usable
	/// instructions differs significantly depending on whether the operand is a
	/// load.
	static bool isShuffleFoldableLoad(SDValue V) {
	  SDValue Underlying = peekThroughBitcasts(V);
	  return ISD::isNON_EXTLoad(Underlying.getNode());
	}

	/// \brief Try to lower insertion of a single element into a zero vector.
	///
	/// This is a common pattern that we have especially efficient patterns to lower
	/// across all subtarget feature sets.
	///
	/// \param Mask      Shuffle mask; entries >= Mask.size() refer to \p V2.
	/// \param Zeroable  Per-element flags; a set bit marks a result element that
	///                  may be treated as zero.
	/// \returns The lowered insertion, or an empty SDValue when none of the
	///          supported forms applies.
	static SDValue lowerVectorShuffleAsElementInsertion(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT ExtVT = VT;
	MVT EltVT = VT.getVectorElementType();

	// Position of the first mask entry that selects from V2.
	// NOTE(review): if no entry selects from V2 this equals Mask.size() and the
	// Mask[V2Index] reads below would be out of range; presumably callers
	// guarantee at least one V2 entry -- confirm.
	int V2Index =
	find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
	Mask.begin();
	// V1 counts as zeroable when every position other than V2Index is zeroable.
	bool IsV1Zeroable = true;
	for (int i = 0, Size = Mask.size(); i < Size; ++i)
	if (i != V2Index && !Zeroable[i]) {
	IsV1Zeroable = false;
	break;
	}

	// Check for a single input from a SCALAR_TO_VECTOR node.
	// FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
	// all the smarts here sunk into that routine. However, the current
	// lowering of BUILD_VECTOR makes that nearly impossible until the old
	// vector shuffle lowering is dead.
	SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
	DAG);
	if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
	// We need to zext the scalar if it is smaller than an i32.
	V2S = DAG.getBitcast(EltVT, V2S);
	if (EltVT == MVT::i8 \|\| EltVT == MVT::i16) {
	// Using zext to expand a narrow element won't work for non-zero
	// insertions.
	if (!IsV1Zeroable)
	return SDValue();

	// Zero-extend directly to i32.
	ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
	V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
	}
	// Rebuild V2 as a vector whose element 0 is the (possibly widened) scalar.
	V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
	} else if (Mask[V2Index] != (int)Mask.size() \|\| EltVT == MVT::i8 \|\|
	EltVT == MVT::i16) {
	// Either not inserting from the low element of the input or the input
	// element size is too small to use VZEXT_MOVL to clear the high bits.
	return SDValue();
	}

	if (!IsV1Zeroable) {
	// If V1 can't be treated as a zero vector we have fewer options to lower
	// this. We can't support integer vectors or non-zero targets cheaply, and
	// the V1 elements can't be permuted in any way.
	assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
	if (!VT.isFloatingPoint() \|\| V2Index != 0)
	return SDValue();
	// Ignoring the inserted slot, the remaining V1 elements must stay in
	// place.
	SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
	V1Mask[V2Index] = -1;
	if (!isNoopShuffleMask(V1Mask))
	return SDValue();
	if (!VT.is128BitVector())
	return SDValue();

	// Otherwise, use MOVSD or MOVSS.
	assert((EltVT == MVT::f32 \|\| EltVT == MVT::f64) &&
	"Only two types of floating point element types to handle!");
	return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
	ExtVT, V1, V2);
	}

	// This lowering only works for the low element with floating point vectors.
	if (VT.isFloatingPoint() && V2Index != 0)
	return SDValue();

	// Keep the low element of V2 and clear the rest, then return to VT if the
	// element type was widened above.
	V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
	if (ExtVT != VT)
	V2 = DAG.getBitcast(VT, V2);

	if (V2Index != 0) {
	// If we have 4 or fewer lanes we can cheaply shuffle the element into
	// the desired position. Otherwise it is more efficient to do a vector
	// shift left. We know that we can do a vector shift left because all
	// the inputs are zero.
	if (VT.isFloatingPoint() \|\| VT.getVectorNumElements() <= 4) {
	// Element 0 goes to V2Index; every other slot reads element 1 of V2.
	SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
	V2Shuffle[V2Index] = 0;
	V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
	} else {
	// Byte-wise left shift by the byte offset of element V2Index.
	V2 = DAG.getBitcast(MVT::v16i8, V2);
	V2 = DAG.getNode(
	X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
	DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
	DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
	DAG.getDataLayout(), VT)));
	V2 = DAG.getBitcast(VT, V2);
	}
	}
	return V2;
	}

	/// Try to lower a broadcast of one integer element that is a truncated piece
	/// of a wider scalar feeding the scalar_to_vector/build_vector node \p V0.
	///
	/// AVX2 is required (asserted). An empty SDValue is returned when \p V0 does
	/// not have integer elements strictly wider than those of \p VT, or when the
	/// wider element is not directly available as an operand of \p V0.
	static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
	                                                  SDValue V0, int BroadcastIdx,
	                                                  const X86Subtarget &Subtarget,
	                                                  SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX2() &&
	         "We can only lower integer broadcasts with AVX2!");

	  EVT DstEltVT = VT.getVectorElementType();
	  EVT SrcVT = V0.getValueType();

	  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
	  assert(SrcVT.isVector() && "Unexpected non-vector vector-sized value!");

	  EVT SrcEltVT = SrcVT.getVectorElementType();
	  if (!SrcEltVT.isInteger())
	    return SDValue();

	  const unsigned DstEltBits = DstEltVT.getSizeInBits();
	  const unsigned SrcEltBits = SrcEltVT.getSizeInBits();

	  // Nothing is truncated unless the source elements are strictly wider.
	  if (SrcEltBits <= DstEltBits)
	    return SDValue();

	  assert(((SrcEltBits % DstEltBits) == 0) &&
	         "Scalar type sizes must all be powers of 2 on x86!");

	  // Number of destination-sized pieces per source element, and the source
	  // element that contains the requested piece.
	  const unsigned Ratio = SrcEltBits / DstEltBits;
	  const unsigned SrcIdx = BroadcastIdx / Ratio;

	  // The scalar can be read from any BUILD_VECTOR operand, but only from
	  // operand 0 of a SCALAR_TO_VECTOR.
	  const unsigned SrcOpc = V0.getOpcode();
	  const bool HasScalarOperand =
	      SrcOpc == ISD::BUILD_VECTOR ||
	      (SrcOpc == ISD::SCALAR_TO_VECTOR && SrcIdx == 0);
	  if (!HasScalarOperand)
	    return SDValue();

	  SDValue Wide = V0.getOperand(SrcIdx);

	  // If we're extracting non-least-significant bits, shift so we can truncate.
	  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
	  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
	  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
	  const int PieceIdx = BroadcastIdx % Ratio;
	  if (PieceIdx != 0) {
	    SDValue ShiftAmt =
	        DAG.getConstant(PieceIdx * DstEltBits, DL, Wide.getValueType());
	    Wide = DAG.getNode(ISD::SRL, DL, Wide.getValueType(), Wide, ShiftAmt);
	  }

	  SDValue Narrow = DAG.getNode(ISD::TRUNCATE, DL, DstEltVT, Wide);
	  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Narrow);
	}

	/// \brief Try to lower broadcast of a single element.
	///
	/// For convenience, this code also bundles all of the subtarget feature set
	/// filtering. While a little annoying to re-dispatch on type here, there isn't
	/// a convenient way to factor it out.
	///
	/// \param DL         Location used for every node created here.
	/// \param VT         Result type of the shuffle.
	/// \param V1         First shuffle input; the broadcast element is asserted
	///                   to come from it.
	/// \param V2         Second shuffle input; only used when testing whether
	///                   the mask is equivalent to a broadcast mask.
	/// \param Mask       Shuffle mask with one entry per element of \p VT.
	/// \param Subtarget  Queried for SSE3 / AVX / AVX2 and 64-bit mode.
	/// \returns The broadcast bitcast to \p VT, or an empty SDValue when the
	///          mask is not a broadcast or no supported form is available.
	static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
	                                             SDValue V1, SDValue V2,
	                                             ArrayRef<int> Mask,
	                                             const X86Subtarget &Subtarget,
	                                             SelectionDAG &DAG) {
	  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
	        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
	        (Subtarget.hasAVX2() && VT.isInteger())))
	    return SDValue();

	  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
	  // we can only broadcast from a register with AVX2.
	  unsigned NumElts = Mask.size();
	  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
	                        ? X86ISD::MOVDDUP
	                        : X86ISD::VBROADCAST;
	  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

	  // Check that the mask is a broadcast.
	  int BroadcastIdx = -1;
	  for (int i = 0; i != (int)NumElts; ++i) {
	    SmallVector<int, 8> BroadcastMask(NumElts, i);
	    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
	      BroadcastIdx = i;
	      break;
	    }
	  }

	  if (BroadcastIdx < 0)
	    return SDValue();
	  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
	                                            "a sorted mask where the broadcast "
	                                            "comes from V1.");

	  // Go up the chain of (vector) values to find a scalar load that we can
	  // combine with the broadcast. Throughout this walk BroadcastIdx is kept in
	  // units of the elements of the current V.
	  SDValue V = V1;
	  for (;;) {
	    switch (V.getOpcode()) {
	    case ISD::BITCAST: {
	      // Peek through bitcasts as long as BroadcastIdx can be adjusted.
	      SDValue VSrc = V.getOperand(0);
	      unsigned NumEltBits = V.getScalarValueSizeInBits();
	      unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
	      if ((NumEltBits % NumSrcBits) == 0)
	        BroadcastIdx *= (NumEltBits / NumSrcBits);
	      else if ((NumSrcBits % NumEltBits) == 0 &&
	               (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
	        BroadcastIdx /= (NumSrcBits / NumEltBits);
	      else
	        break;
	      V = VSrc;
	      continue;
	    }
	    case ISD::CONCAT_VECTORS: {
	      // The BITCAST case above may have rescaled BroadcastIdx to an element
	      // count that differs from the shuffle mask, so the number of elements
	      // per concat operand must come from the operand type, not from
	      // Mask.size().
	      int OperandSize =
	          V.getOperand(0).getSimpleValueType().getVectorNumElements();
	      V = V.getOperand(BroadcastIdx / OperandSize);
	      BroadcastIdx %= OperandSize;
	      continue;
	    }
	    case ISD::INSERT_SUBVECTOR: {
	      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
	      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
	      if (!ConstantIdx)
	        break;

	      // Descend into the inserted subvector when it covers BroadcastIdx,
	      // otherwise keep following the outer vector.
	      int BeginIdx = (int)ConstantIdx->getZExtValue();
	      int EndIdx =
	          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
	      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
	        BroadcastIdx -= BeginIdx;
	        V = VInner;
	      } else {
	        V = VOuter;
	      }
	      continue;
	    }
	    }
	    break;
	  }

	  // Ensure the source vector and BroadcastIdx are for a suitable type.
	  if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
	    unsigned NumEltBits = VT.getScalarSizeInBits();
	    unsigned NumSrcBits = V.getScalarValueSizeInBits();
	    if ((NumSrcBits % NumEltBits) == 0)
	      BroadcastIdx *= (NumSrcBits / NumEltBits);
	    else if ((NumEltBits % NumSrcBits) == 0 &&
	             (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
	      BroadcastIdx /= (NumEltBits / NumSrcBits);
	    else
	      return SDValue();

	    unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
	    MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
	    V = DAG.getBitcast(SrcVT, V);
	  }

	  // Check if this is a broadcast of a scalar. We special case lowering
	  // for scalars so that we can more effectively fold with loads.
	  // First, look through bitcast: if the original value has a larger element
	  // type than the shuffle, the broadcast element is in essence truncated.
	  // Make that explicit to ease folding.
	  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
	    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
	            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
	      return TruncBroadcast;

	  MVT BroadcastVT = VT;

	  // Peek through any bitcast (only useful for loads).
	  SDValue BC = peekThroughBitcasts(V);

	  // Also check the simpler case, where we can directly reuse the scalar.
	  if (V.getOpcode() == ISD::BUILD_VECTOR ||
	      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
	    V = V.getOperand(BroadcastIdx);

	    // If we can't broadcast from a register, check that the input is a load.
	    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
	      return SDValue();
	  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
	    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
	    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
	      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
	      Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
	                   ? X86ISD::MOVDDUP
	                   : Opcode;
	    }

	    // If we are broadcasting a load that is only used by the shuffle
	    // then we can reduce the vector load to the broadcasted scalar load.
	    LoadSDNode *Ld = cast<LoadSDNode>(BC);
	    SDValue BaseAddr = Ld->getOperand(1);
	    EVT SVT = BroadcastVT.getScalarType();
	    // Byte offset of the broadcast element from the start of the load.
	    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
	    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
	    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
	                    DAG.getMachineFunction().getMachineMemOperand(
	                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
	    DAG.makeEquivalentMemoryOrdering(Ld, V);
	  } else if (!BroadcastFromReg) {
	    // We can't broadcast from a vector register.
	    return SDValue();
	  } else if (BroadcastIdx != 0) {
	    // We can only broadcast from the zero-element of a vector register,
	    // but it can be advantageous to broadcast from the zero-element of a
	    // subvector.
	    if (!VT.is256BitVector() && !VT.is512BitVector())
	      return SDValue();

	    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
	    if (VT == MVT::v4f64 || VT == MVT::v4i64)
	      return SDValue();

	    // Only broadcast the zero-element of a 128-bit subvector.
	    unsigned EltSize = VT.getScalarSizeInBits();
	    if (((BroadcastIdx * EltSize) % 128) != 0)
	      return SDValue();

	    // The shuffle input might have been a bitcast we looked through; look at
	    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
	    // later bitcast it to BroadcastVT.
	    assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
	           "Unexpected vector element size");
	    assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
	           "Unexpected vector size");
	    V = extract128BitVector(V, BroadcastIdx, DAG, DL);
	  }

	  // MOVDDUP takes a vector operand, so wrap a scalar source in a v2f64.
	  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
	    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
	                    DAG.getBitcast(MVT::f64, V));

	  // Bitcast back to the same scalar type as BroadcastVT.
	  MVT SrcVT = V.getSimpleValueType();
	  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
	    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
	           "Unexpected vector element size");
	    if (SrcVT.isVector()) {
	      unsigned NumSrcElts = SrcVT.getVectorNumElements();
	      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
	    } else {
	      SrcVT = BroadcastVT.getScalarType();
	    }
	    V = DAG.getBitcast(SrcVT, V);
	  }

	  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
	  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
	    V = DAG.getBitcast(MVT::f64, V);
	    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
	    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
	  }

	  // We only support broadcasting from 128-bit vectors to minimize the
	  // number of patterns we need to deal with in isel. So extract down to
	  // 128-bits, removing as many bitcasts as possible.
	  if (SrcVT.getSizeInBits() > 128) {
	    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
	                                 128 / SrcVT.getScalarSizeInBits());
	    V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
	    V = DAG.getBitcast(ExtVT, V);
	  }

	  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
	}

	// Check for whether we can use INSERTPS to perform the shuffle. We only use
	// INSERTPS when the V1 elements are already in the correct locations
	// because otherwise we can just always use two SHUFPS instructions which
	// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
	// perform INSERTPS if a single V1 element is out of place and all V2
	// elements are zeroable.
	//
	// On success returns true and rewrites V1, V2 and InsertPSMask in place; on
	// failure returns false and leaves all three untouched. The operands are
	// tried first as given and then commuted.
	static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
	unsigned &InsertPSMask,
	const APInt &Zeroable,
	ArrayRef<int> Mask,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
	assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	// Attempt to match INSERTPS with one element from VA or VB being
	// inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
	// are updated.
	auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
	ArrayRef<int> CandidateMask) {
	// ZMask: one bit per result element that may be zeroed.
	// VADstIndex / VBDstIndex: result position of the single out-of-place
	// element taken from VA / VB, or -1 if none was found.
	unsigned ZMask = 0;
	int VADstIndex = -1;
	int VBDstIndex = -1;
	bool VAUsedInPlace = false;

	for (int i = 0; i < 4; ++i) {
	// Synthesize a zero mask from the zeroable elements (includes undefs).
	if (Zeroable[i]) {
	ZMask \|= 1 << i;
	continue;
	}

	// Flag if we use any VA inputs in place.
	if (i == CandidateMask[i]) {
	VAUsedInPlace = true;
	continue;
	}

	// We can only insert a single non-zeroable element.
	if (VADstIndex >= 0 \|\| VBDstIndex >= 0)
	return false;

	if (CandidateMask[i] < 4) {
	// VA input out of place for insertion.
	VADstIndex = i;
	} else {
	// VB input for insertion.
	VBDstIndex = i;
	}
	}

	// Don't bother if we have no (non-zeroable) element for insertion.
	if (VADstIndex < 0 && VBDstIndex < 0)
	return false;

	// Determine element insertion src/dst indices. The src index is from the
	// start of the inserted vector, not the start of the concatenated vector.
	unsigned VBSrcIndex = 0;
	if (VADstIndex >= 0) {
	// If we have a VA input out of place, we use VA as the V2 element
	// insertion and don't use the original V2 at all.
	VBSrcIndex = CandidateMask[VADstIndex];
	VBDstIndex = VADstIndex;
	VB = VA;
	} else {
	VBSrcIndex = CandidateMask[VBDstIndex] - 4;
	}

	// If no V1 inputs are used in place, then the result is created only from
	// the zero mask and the V2 insertion - so remove V1 dependency.
	if (!VAUsedInPlace)
	VA = DAG.getUNDEF(MVT::v4f32);

	// Update V1, V2 and InsertPSMask accordingly.
	V1 = VA;
	V2 = VB;

	// Insert the V2 element into the desired position.
	// Layout: bits 7:6 = source index, bits 5:4 = destination index,
	// bits 3:0 = zero mask.
	InsertPSMask = VBSrcIndex << 6 \| VBDstIndex << 4 \| ZMask;
	assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	return true;
	};

	if (matchAsInsertPS(V1, V2, Mask))
	return true;

	// Commute and try again.
	SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
	ShuffleVectorSDNode::commuteMask(CommutedMask);
	if (matchAsInsertPS(V2, V1, CommutedMask))
	return true;

	return false;
	}

	/// Lower a v4f32 shuffle to a single INSERTPS node when the mask matches.
	///
	/// Both inputs are asserted to be v4f32. Returns an empty SDValue when
	/// matchVectorShuffleAsInsertPS rejects the mask.
	static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
	                                            SDValue V2, ArrayRef<int> Mask,
	                                            const APInt &Zeroable,
	                                            SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

	  // The matcher rewrites V1 / V2 and produces the immediate on success.
	  unsigned Imm;
	  const bool Matched =
	      matchVectorShuffleAsInsertPS(V1, V2, Imm, Zeroable, Mask, DAG);
	  if (Matched) {
	    // Insert the V2 element into the desired position.
	    SDValue ImmOp = DAG.getConstant(Imm, DL, MVT::i8);
	    return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, ImmOp);
	  }

	  return SDValue();
	}

	/// \brief Try to lower a shuffle as a permute of the inputs followed by an
	/// UNPCK instruction.
	///
	/// This specifically targets cases where we end up with alternating between
	/// the two inputs, and so can permute them into something that feeds a single
	/// UNPCK instruction. Note that this routine only targets integer vectors
	/// because for floating point vectors we have a generalized SHUFPS lowering
	/// strategy that handles everything that doesn't exactly match an unpack,
	/// making this clever lowering unnecessary.
	///
	/// \param VT    Integer 128-bit vector type (both properties are asserted).
	/// \param V2    Second input; asserted not to be undef.
	/// \param Mask  Shuffle mask with at least two entries (asserted).
	/// \returns The lowered value, or an empty SDValue if neither the
	///          permute-then-unpack nor the unpack-then-permute form applies.
	static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
	SDValue V1, SDValue V2,
	ArrayRef<int> Mask,
	SelectionDAG &DAG) {
	assert(!VT.isFloatingPoint() &&
	"This routine only supports integer vectors.");
	assert(VT.is128BitVector() &&
	"This routine only works on 128-bit vectors.");
	assert(!V2.isUndef() &&
	"This routine should only be used when blending two inputs.");
	assert(Mask.size() >= 2 && "Single element masks are invalid.");

	int Size = Mask.size();

	// Count the mask entries that read the low half / high half of either
	// input. Negative entries are excluded from both counts: explicitly in the
	// first lambda, and in the second because their remainder is negative.
	int NumLoInputs =
	count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
	int NumHiInputs =
	count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

	// Unpack from whichever half supplies more of the elements.
	bool UnpackLo = NumLoInputs >= NumHiInputs;

	// Try to permute V1 and V2 so that one unpack of ScalarSize-bit elements
	// (each covering Scale original elements) produces the shuffle. V1 and V2
	// are only overwritten once all checks have passed.
	auto TryUnpack = [&](int ScalarSize, int Scale) {
	SmallVector<int, 16> V1Mask((unsigned)Size, -1);
	SmallVector<int, 16> V2Mask((unsigned)Size, -1);

	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;

	// Each element of the unpack contains Scale elements from this mask.
	int UnpackIdx = i / Scale;

	// We only handle the case where V1 feeds the first slots of the unpack.
	// We rely on canonicalization to ensure this is the case.
	if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
	return SDValue();

	// Setup the mask for this input. The indexing is tricky as we have to
	// handle the unpack stride.
	SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
	VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
	Mask[i] % Size;
	}

	// If we will have to shuffle both inputs to use the unpack, check whether
	// we can just unpack first and shuffle the result. If so, skip this unpack.
	if ((NumLoInputs == 0 \|\| NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
	!isNoopShuffleMask(V2Mask))
	return SDValue();

	// Shuffle the inputs into place.
	V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

	// Cast the inputs to the type we will use to unpack them.
	MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
	V1 = DAG.getBitcast(UnpackVT, V1);
	V2 = DAG.getBitcast(UnpackVT, V2);

	// Unpack the inputs and cast the result back to the desired type.
	return DAG.getBitcast(
	VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	UnpackVT, V1, V2));
	};

	// We try each unpack from the largest to the smallest to try and find one
	// that fits this mask.
	int OrigScalarSize = VT.getScalarSizeInBits();
	for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
	if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
	return Unpack;

	// If none of the unpack-rooted lowerings worked (or were profitable) try an
	// initial unpack.
	if (NumLoInputs == 0 \|\| NumHiInputs == 0) {
	assert((NumLoInputs > 0 \|\| NumHiInputs > 0) &&
	"We have to have some inputs!");
	// First element index of the half that every used input comes from.
	int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

	// FIXME: We could consider the total complexity of the permute of each
	// possible unpacking. Or at the least we should consider how many
	// half-crossings are created.
	// FIXME: We could consider commuting the unpacks.

	SmallVector<int, 32> PermMask((unsigned)Size, -1);
	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;

	assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

	// After the unpack, element k of the chosen half sits at index 2 * k
	// when it came from V1 and at 2 * k + 1 when it came from V2.
	PermMask[i] =
	2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
	}
	return DAG.getVectorShuffle(
	VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
	DL, VT, V1, V2),
	DAG.getUNDEF(VT), PermMask);
	}

	return SDValue();
	}

	/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
	///
	/// This is the basis function for the 2-lane 64-bit shuffles as we have full
	/// support for floating point shuffles but not integer shuffles. These
	/// instructions will incur a domain crossing penalty on some chips though so
	/// it is better to avoid lowering through this for integer vectors where
	/// possible.
	///
	/// Strategies are tried in the order written and the first one that yields a
	/// value is returned. Both the single-input path (V2 undef) and the two-input
	/// path end in an unconditional DAG.getNode call.
	static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Straight shuffle of a single input vector. Simulate this by using the
	// single input as both of the "inputs" to this instruction.
	// Bit 0 of the immediate is set when lane 0 reads element 1, and bit 1 is
	// set when lane 1 reads element 1. Undef (negative) lanes leave their bit
	// clear.
	unsigned SHUFPDMask = (Mask[0] == 1) \| ((Mask[1] == 1) << 1);

	if (Subtarget.hasAVX()) {
	// If we have AVX, we can use VPERMILPD which will allow folding a load
	// into the shuffle.
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}

	// Without AVX, emit SHUFP with V1 on both sides. An operand whose
	// corresponding mask element is undef is replaced by an UNDEF node.
	return DAG.getNode(
	X86ISD::SHUFP, DL, MVT::v2f64,
	Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
	Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}
	// Two-input case. The asserts record the expected form of the mask: lane 0
	// holds an index in [0, 2) and lane 1 holds an index >= 2.
	assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
	assert(Mask[1] >= 2 && "Non-canonicalized blend!");

	// If we have a single input, insert that into V1 if we can do so cheaply.
	// NOTE(review): given the two asserts above this sum is always 0 + 1 == 1,
	// so the guard (and the "< 0" checks on InverseMask below) only make a
	// difference in builds where asserts are compiled out.
	if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Insertion;
	// Try inverting the insertion since for v2 masks it is easy to do and we
	// can't reliably sort the mask one way or the other.
	// XOR with 2 moves an index between the [0, 2) and [2, 4) ranges, which
	// matches passing the operands in swapped order (V2, V1) below.
	int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
	Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
	return Insertion;
	}

	// Try to use one of the special instruction patterns to handle two common
	// blend patterns if a zero-blend above didn't work.
	if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) \|\|
	isShuffleEquivalent(V1, V2, Mask, {1, 3}))
	if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
	// We can either use a special instruction to load over the low double or
	// to move just the low double.
	return DAG.getNode(
	isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
	DL, MVT::v2f64, V2,
	DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

	// Blend lowering is only attempted when the subtarget reports SSE4.1.
	if (Subtarget.hasSSE41())
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
	return V;

	// Final fallback: a two-operand SHUFP. Bit 0 is set when lane 0 reads
	// element 1 (of V1); bit 1 is set when Mask[1] - 2 == 1, i.e. when lane 1
	// reads mask index 3.
	unsigned SHUFPDMask = (Mask[0] == 1) \| (((Mask[1] - 2) == 1) << 1);
	return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
	DAG.getConstant(SHUFPDMask, DL, MVT::i8));
	}

	/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
	///
	/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
	/// the integer unit to minimize domain crossing penalties. However, for blends
	/// it falls back to the floating point shuffle operation with appropriate bit
	/// casting.
	///
	/// Strategies are tried in the order written and the first one that yields a
	/// value is returned.
	static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Straight shuffle of a single input vector. For everything from SSE2
	// onward this has a single fast instruction with no scary immediates.
	// We have to map the mask as it is actually a v4i32 shuffle instruction.
	V1 = DAG.getBitcast(MVT::v4i32, V1);
	// Each 64-bit lane index m becomes the 32-bit lane pair (2m, 2m + 1).
	// std::max clamps an undef (negative) lane to element 0.
	int WidenedMask[4] = {
	std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
	std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
	return DAG.getBitcast(
	MVT::v2i64,
	DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
	getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
	}
	// Two-input case: both lanes must be defined, lane 0 must hold an index
	// below 2 and lane 1 an index of at least 2.
	assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
	assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
	assert(Mask[0] < 2 && "We sort V1 to be the first input.");
	assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// When loading a scalar and then shuffling it into a vector we can often do
	// the insertion cheaply.
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Insertion;
	// Try inverting the insertion since for v2 masks it is easy to do and we
	// can't reliably sort the mask one way or the other.
	// XOR with 2 moves an index between the [0, 2) and [2, 4) ranges, which
	// matches passing the operands in swapped order (V2, V1) below.
	int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
	return Insertion;

	// We have different paths for blend lowering, but they all must use the
	// exact same predicate.
	bool IsBlendSupported = Subtarget.hasSSE41();
	if (IsBlendSupported)
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
	return V;

	// Try to use byte rotation instructions.
	// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
	if (Subtarget.hasSSSE3()) {
	// The element-rotate lowering is tried first, and only when the subtarget
	// reports VLX.
	if (Subtarget.hasVLX())
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
	return Rotate;
	}

	// If we have direct support for blends, we should lower by decomposing into
	// a permute. That will be faster than the domain cross.
	if (IsBlendSupported)
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
	Mask, DAG);

	// We implement this with SHUFPD which is pretty lame because it will likely
	// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
	// However, all the alternatives are still more cycles and newer chips don't
	// have this problem. It would be really nice if x86 had better shuffles here.
	// The inputs are bitcast to v2f64, a generic v2f64 vector shuffle with the
	// same mask is built, and the result is bitcast back to v2i64.
	V1 = DAG.getBitcast(MVT::v2f64, V1);
	V2 = DAG.getBitcast(MVT::v2f64, V2);
	return DAG.getBitcast(MVT::v2i64,
	DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
	}

	/// \brief Check whether a four-element mask fits a single SHUFPS.
	///
	/// Specialized lowerings consult this predicate so that they can step aside
	/// whenever the plain shufps lowering is already going to be efficient.
	static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
	  // Only the 128-bit (four lane) form of shufps is handled here.
	  assert(Mask.size() == 4 && "Unsupported mask size!");
	  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
	  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
	  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
	  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

	  // A half is a problem only when both of its lanes are defined and they
	  // pull from different inputs (indices below 4 versus 4 and above).
	  auto HalfMixesInputs = [](int First, int Second) {
	    if (First < 0)
	      return false;
	    if (Second < 0)
	      return false;
	    const bool FirstBelowFour = First < 4;
	    const bool SecondBelowFour = Second < 4;
	    return FirstBelowFour != SecondBelowFour;
	  };

	  // The low half is inspected first; the high half is only looked at when
	  // the low half is acceptable.
	  if (HalfMixesInputs(Mask[0], Mask[1]))
	    return false;
	  return !HalfMixesInputs(Mask[2], Mask[3]);
	}

	/// \brief Lower a vector shuffle using the SHUFPS instruction.
	///
	/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
	/// It makes no assumptions about whether this is the best lowering, it simply
	/// uses it.
	///
	/// \p Mask is read at indices 0..3; elements >= 4 are counted as coming from
	/// \p V2. Only the cases of exactly one or exactly two V2 elements adjust the
	/// operands and mask; for any other count the mask is handed to the final
	/// SHUFP node unchanged with (V1, V2) as operands.
	/// NOTE(review): presumably callers guarantee at most two V2 elements (or
	/// none at all) -- confirm against the call sites.
	static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
	ArrayRef<int> Mask, SDValue V1,
	SDValue V2, SelectionDAG &DAG) {
	// LowV/HighV are the two operands of the final SHUFP node and NewMask is the
	// mask used to build its immediate; both start out as the inputs unchanged.
	SDValue LowV = V1, HighV = V2;
	int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

	int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

	if (NumV2Elements == 1) {
	// Position (0..3) of the single V2 element within the mask.
	int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

	// Compute the index adjacent to V2Index and in the same half by toggling
	// the low bit.
	int V2AdjIndex = V2Index ^ 1;

	if (Mask[V2AdjIndex] < 0) {
	// Handles all the cases where we have a single V2 element and an undef.
	// This will only ever happen in the high lanes because we commute the
	// vector otherwise.
	if (V2Index < 2)
	std::swap(LowV, HighV);
	// Rebase the V2 index into the 0..3 range.
	NewMask[V2Index] -= 4;
	} else {
	// Handle the case where the V2 element ends up adjacent to a V1 element.
	// To make this work, blend them together as the first step.
	int V1Index = V2AdjIndex;
	// Operands are (V2, V1): slot 0 picks the wanted V2 element and slot 2
	// picks the adjacent V1 element; slots 1 and 3 are filled with index 0.
	int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
	V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
	getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

	// Now proceed to reconstruct the final blend as we have the necessary
	// high or low half formed.
	if (V2Index < 2) {
	LowV = V2;
	HighV = V1;
	} else {
	HighV = V2;
	}
	NewMask[V1Index] = 2; // We put the V1 element in V2[2].
	NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
	}
	} else if (NumV2Elements == 2) {
	if (Mask[0] < 4 && Mask[1] < 4) {
	// Handle the easy case where we have V1 in the low lanes and V2 in the
	// high lanes.
	NewMask[2] -= 4;
	NewMask[3] -= 4;
	} else if (Mask[2] < 4 && Mask[3] < 4) {
	// We also handle the reversed case because this utility may get called
	// when we detect a SHUFPS pattern but can't easily commute the shuffle to
	// arrange things in the right direction.
	NewMask[0] -= 4;
	NewMask[1] -= 4;
	HighV = V1;
	LowV = V2;
	} else {
	// We have a mixture of V1 and V2 in both low and high lanes. Rather than
	// trying to place elements directly, just blend them and set up the final
	// shuffle to place them.

	// The first two blend mask elements are for V1, the second two are for
	// V2.
	// Slot 0/1: the V1 element wanted by the low/high half of the result.
	// Slot 2/3: the V2 element wanted by the low/high half, rebased by -4.
	int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
	Mask[2] < 4 ? Mask[2] : Mask[3],
	(Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
	(Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
	V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
	getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

	// Now we do a normal shuffle of V1 by giving V1 as both operands to
	// a blend.
	LowV = HighV = V1;
	// Indices refer to the blended vector built above: 0 and 1 hold the V1
	// elements, 2 and 3 hold the V2 elements.
	NewMask[0] = Mask[0] < 4 ? 0 : 2;
	NewMask[1] = Mask[0] < 4 ? 2 : 0;
	NewMask[2] = Mask[2] < 4 ? 1 : 3;
	NewMask[3] = Mask[2] < 4 ? 3 : 1;
	}
	}
	// Emit the final SHUFP with whatever operands and mask were settled on.
	return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
	getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
	}

	/// \brief Lower 4-lane 32-bit floating point shuffles.
	///
	/// Uses instructions exclusively from the floating point unit to minimize
	/// domain crossing penalties, as these are sufficient to implement all v4f32
	/// shuffles.
	///
	/// Strategies are tried in the order written and the first one that yields a
	/// value is returned; the last resort is lowerVectorShuffleWithSHUFPS.
	static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	// Mask elements >= 4 are counted as coming from V2.
	int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

	// Single-input case: every path through this block returns.
	if (NumV2Elements == 0) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Use even/odd duplicate instructions for masks that match their pattern.
	if (Subtarget.hasSSE3()) {
	if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
	if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
	return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
	}

	if (Subtarget.hasAVX()) {
	// If we have AVX, we can use VPERMILPS which will allow folding a load
	// into the shuffle.
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	}

	// Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
	// in SSE1 because otherwise they are widened to v2f64 and never get here.
	if (!Subtarget.hasSSE2()) {
	if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
	return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
	if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
	return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
	}

	// Otherwise, use a straight shuffle of a single input vector. We pass the
	// input vector to both operands to simulate this with a SHUFPS.
	return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	}

	// There are special ways we can lower some single-element blends. However, we
	// have custom ways we can lower more complex single-element blends below that
	// we defer to if both this and BLENDPS fail to match, so restrict this to
	// when the V2 input is targeting element 0 of the mask -- that is the fast
	// case here.
	if (NumV2Elements == 1 && Mask[0] >= 4)
	if (SDValue V = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	// The next three strategies are only attempted when the subtarget reports
	// SSE4.1.
	if (Subtarget.hasSSE41()) {
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use INSERTPS if we can complete the shuffle efficiently.
	if (SDValue V =
	lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
	return V;

	// Blend-and-permute is skipped when the mask already fits one SHUFPS.
	if (!isSingleSHUFPSMask(Mask))
	if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
	DL, MVT::v4f32, V1, V2, Mask, DAG))
	return BlendPerm;
	}

	// Use low/high mov instructions. These are only valid in SSE1 because
	// otherwise they are widened to v2f64 and never get here.
	// Note the operand order: MOVLHPS takes (V1, V2) while MOVHLPS takes
	// (V2, V1).
	if (!Subtarget.hasSSE2()) {
	if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
	return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
	if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
	return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
	}

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
	return V;

	// Otherwise fall back to a SHUFPS lowering strategy.
	return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
	}

	/// \brief Lower 4-lane i32 vector shuffles.
	///
	/// We try to handle these with integer-domain shuffles where we can, but for
	/// blends we use the floating point domain blend instructions.
	///
	/// Strategies are tried in the order written and the first one that yields a
	/// value is returned; the last resort bitcasts to v4f32 and builds a v4f32
	/// vector shuffle with the same mask.
	static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// Mask elements >= 4 are counted as coming from V2.
	int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

	// Single-input case: every path through this block returns.
	if (NumV2Elements == 0) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Straight shuffle of a single input vector. For everything from SSE2
	// onward this has a single fast instruction with no scary immediates.
	// We coerce the shuffle pattern to be compatible with UNPCK instructions
	// but we aren't actually going to use the UNPCK instruction because doing
	// so prevents folding a load into this instruction or making a copy.
	// Mask may be rebound to one of these local arrays; they stay in scope
	// until the return at the end of this block.
	const int UnpackLoMask[] = {0, 0, 1, 1};
	const int UnpackHiMask[] = {2, 2, 3, 3};
	if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
	Mask = UnpackLoMask;
	else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
	Mask = UnpackHiMask;

	return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	}

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// There are special ways we can lower some single-element blends.
	if (NumV2Elements == 1)
	if (SDValue V = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	// We have different paths for blend lowering, but they all must use the
	// exact same predicate.
	bool IsBlendSupported = Subtarget.hasSSE41();
	if (IsBlendSupported)
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Try a bit-mask based lowering; unlike the blend above this is attempted
	// regardless of subtarget features.
	if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
	Zeroable, DAG))
	return Masked;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
	return V;

	// Try to use byte rotation instructions.
	// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
	if (Subtarget.hasSSSE3()) {
	// The element-rotate lowering is tried first, and only when the subtarget
	// reports VLX.
	if (Subtarget.hasVLX())
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
	return Rotate;
	}

	// Assume that a single SHUFPS is faster than an alternative sequence of
	// multiple instructions (even if the CPU has a domain penalty).
	// If some CPU is harmed by the domain switch, we can fix it in a later pass.
	if (!isSingleSHUFPSMask(Mask)) {
	// If we have direct support for blends, we should lower by decomposing into
	// a permute. That will be faster than the domain cross.
	if (IsBlendSupported)
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
	Mask, DAG);

	// Try to lower by permuting the inputs into an unpack instruction.
	if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
	DL, MVT::v4i32, V1, V2, Mask, DAG))
	return Unpack;
	// Falling out of this block (no blend support and no unpack match)
	// continues to the SHUFPS-based fallback below.
	}

	// We implement this with SHUFPS because it can blend from two vectors.
	// Because we're going to eventually use SHUFPS, we use SHUFPS even to build
	// up the inputs, bypassing domain shift penalties that we would incur if we
	// directly used PSHUFD on Nehalem and older. For newer chips, this isn't
	// relevant.
	SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
	SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
	SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
	return DAG.getBitcast(MVT::v4i32, ShufPS);
	}

	/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
	/// shuffle lowering, and the most complex part.
	///
	/// The lowering strategy is to try to form pairs of input lanes which are
	/// targeted at the same half of the final vector, and then use a dword shuffle
	/// to place them onto the right half, and finally unpack the paired lanes into
	/// their final position.
	///
	/// The exact breakdown of how to form these dword pairs and align them on the
	/// correct sides is really tricky. See the comments within the function for
	/// more of the details.
	///
	/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
	/// lane must shuffle the exact same way. In fact, you must pass a v8 Mask to
	/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
	/// vector, form the analogous 128-bit 8-element Mask.
	static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
	const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
	MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

	assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
	MutableArrayRef<int> LoMask = Mask.slice(0, 4);
	MutableArrayRef<int> HiMask = Mask.slice(4, 4);

	// Attempt to directly match PSHUFLW or PSHUFHW.
	if (isUndefOrInRange(LoMask, 0, 4) &&
	isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
	return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
	}
	if (isUndefOrInRange(HiMask, 4, 8) &&
	isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
	for (int i = 0; i != 4; ++i)
	HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
	return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
	}

	SmallVector<int, 4> LoInputs;
	copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
	std::sort(LoInputs.begin(), LoInputs.end());
	LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
	SmallVector<int, 4> HiInputs;
	copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
	std::sort(HiInputs.begin(), HiInputs.end());
	HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
	int NumLToL =
	std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
	int NumHToL = LoInputs.size() - NumLToL;
	int NumLToH =
	std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
	int NumHToH = HiInputs.size() - NumLToH;
	MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
	MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
	MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
	MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

	// If we are shuffling values from one half - check how many different DWORD
	// pairs we need to create. If only 1 or 2 then we can perform this as a
	// PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
	auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
	ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
	V = DAG.getNode(ShufWOp, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
	V = DAG.getBitcast(PSHUFDVT, V);
	V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	return DAG.getBitcast(VT, V);
	};

	if ((NumHToL + NumHToH) == 0 \|\| (NumLToL + NumLToH) == 0) {
	int PSHUFDMask[4] = { -1, -1, -1, -1 };
	SmallVector<std::pair<int, int>, 4> DWordPairs;
	int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);

	// Collect the different DWORD pairs.
	for (int DWord = 0; DWord != 4; ++DWord) {
	int M0 = Mask[2 * DWord + 0];
	int M1 = Mask[2 * DWord + 1];
	M0 = (M0 >= 0 ? M0 % 4 : M0);
	M1 = (M1 >= 0 ? M1 % 4 : M1);
	if (M0 < 0 && M1 < 0)
	continue;

	bool Match = false;
	for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
	auto &DWordPair = DWordPairs[j];
	if ((M0 < 0 \|\| isUndefOrEqual(DWordPair.first, M0)) &&
	(M1 < 0 \|\| isUndefOrEqual(DWordPair.second, M1))) {
	DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
	DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
	PSHUFDMask[DWord] = DOffset + j;
	Match = true;
	break;
	}
	}
	if (!Match) {
	PSHUFDMask[DWord] = DOffset + DWordPairs.size();
	DWordPairs.push_back(std::make_pair(M0, M1));
	}
	}

	if (DWordPairs.size() <= 2) {
	DWordPairs.resize(2, std::make_pair(-1, -1));
	int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
	DWordPairs[1].first, DWordPairs[1].second};
	if ((NumHToL + NumHToH) == 0)
	return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
	if ((NumLToL + NumLToH) == 0)
	return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
	}
	}

	// Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
	// such inputs we can swap two of the dwords across the half mark and end up
	// with <=2 inputs to each half in each half. Once there, we can fall through
	// to the generic code below. For example:
	//
	// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	// Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
	//
	// However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
	// and an existing 2-into-2 on the other half. In this case we may have to
	// pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
	// 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
	// Fortunately, we don't have to handle anything but a 2-into-2 pattern
	// because any other situation (including a 3-into-1 or 1-into-3 in the other
	// half than the one we target for fixing) will be fixed when we re-enter this
	// path. We will also combine away any sequence of PSHUFD instructions that
	// result into a single instruction. Here is an example of the tricky case:
	//
	// Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
	//
	// This now has a 1-into-3 in the high half! Instead, we do two shuffles:
	//
	// Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
	//
	// Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
	// Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
	//
	// The result is fine to be handled by the generic logic.
	auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
	ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
	int AOffset, int BOffset) {
	assert((AToAInputs.size() == 3 \|\| AToAInputs.size() == 1) &&
	"Must call this with A having 3 or 1 inputs from the A half.");
	assert((BToAInputs.size() == 1 \|\| BToAInputs.size() == 3) &&
	"Must call this with B having 1 or 3 inputs from the B half.");
	assert(AToAInputs.size() + BToAInputs.size() == 4 &&
	"Must call this with either 3:1 or 1:3 inputs (summing to 4).");

	bool ThreeAInputs = AToAInputs.size() == 3;

	// Compute the index of dword with only one word among the three inputs in
	// a half by taking the sum of the half with three inputs and subtracting
	// the sum of the actual three inputs. The difference is the remaining
	// slot.
	int ADWord, BDWord;
	int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
	int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
	int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
	ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
	int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
	int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
	int TripleNonInputIdx =
	TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
	TripleDWord = TripleNonInputIdx / 2;

	// We use xor with one to compute the adjacent DWord to whichever one the
	// OneInput is in.
	OneInputDWord = (OneInput / 2) ^ 1;

	// Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
	// and BToA inputs. If there is also such a problem with the BToB and AToB
	// inputs, we don't try to fix it necessarily -- we'll recurse and see it in
	// the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
	// is essential that we don't create a 3<-1 as then we might oscillate.
	if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
	// Compute how many inputs will be flipped by swapping these DWords. We
	// need
	// to balance this to ensure we don't form a 3-1 shuffle in the other
	// half.
	int NumFlippedAToBInputs =
	std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
	std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
	int NumFlippedBToBInputs =
	std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
	std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
	if ((NumFlippedAToBInputs == 1 &&
	(NumFlippedBToBInputs == 0 \|\| NumFlippedBToBInputs == 2)) \|\|
	(NumFlippedBToBInputs == 1 &&
	(NumFlippedAToBInputs == 0 \|\| NumFlippedAToBInputs == 2))) {
	// We choose whether to fix the A half or B half based on whether that
	// half has zero flipped inputs. At zero, we may not be able to fix it
	// with that half. We also bias towards fixing the B half because that
	// will more commonly be the high half, and we have to bias one way.
	auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
	ArrayRef<int> Inputs) {
	int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
	bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
	// Determine whether the free index is in the flipped dword or the
	// unflipped dword based on where the pinned index is. We use this bit
	// in an xor to conditionally select the adjacent dword.
	int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
	bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	if (IsFixIdxInput == IsFixFreeIdxInput)
	FixFreeIdx += 1;
	IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	assert(IsFixIdxInput != IsFixFreeIdxInput &&
	"We need to be changing the number of flipped inputs!");
	int PSHUFHalfMask[] = {0, 1, 2, 3};
	std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
	V = DAG.getNode(
	FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
	MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
	getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

	for (int &M : Mask)
	if (M >= 0 && M == FixIdx)
	M = FixFreeIdx;
	else if (M >= 0 && M == FixFreeIdx)
	M = FixIdx;
	};
	if (NumFlippedBToBInputs != 0) {
	int BPinnedIdx =
	BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
	FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
	} else {
	assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
	int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
	FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
	}
	}
	}

	int PSHUFDMask[] = {0, 1, 2, 3};
	PSHUFDMask[ADWord] = BDWord;
	PSHUFDMask[BDWord] = ADWord;
	V = DAG.getBitcast(
	VT,
	DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	// Adjust the mask to match the new locations of A and B.
	for (int &M : Mask)
	if (M >= 0 && M/2 == ADWord)
	M = 2 * BDWord + M % 2;
	else if (M >= 0 && M/2 == BDWord)
	M = 2 * ADWord + M % 2;

	// Recurse back into this routine to re-compute state now that this isn't
	// a 3 and 1 problem.
	return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
	DAG);
	};
	if ((NumLToL == 3 && NumHToL == 1) \|\| (NumLToL == 1 && NumHToL == 3))
	return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
	if ((NumHToH == 3 && NumLToH == 1) \|\| (NumHToH == 1 && NumLToH == 3))
	return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

	// At this point there are at most two inputs to the low and high halves from
	// each half. That means the inputs can always be grouped into dwords and
	// those dwords can then be moved to the correct half with a dword shuffle.
	// We use at most one low and one high word shuffle to collect these paired
	// inputs into dwords, and finally a dword shuffle to place them.
	int PSHUFLMask[4] = {-1, -1, -1, -1};
	int PSHUFHMask[4] = {-1, -1, -1, -1};
	int PSHUFDMask[4] = {-1, -1, -1, -1};

	// First fix the masks for all the inputs that are staying in their
	// original halves. This will then dictate the targets of the cross-half
	// shuffles.
	auto fixInPlaceInputs =
	[&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
	MutableArrayRef<int> SourceHalfMask,
	MutableArrayRef<int> HalfMask, int HalfOffset) {
	if (InPlaceInputs.empty())
	return;
	if (InPlaceInputs.size() == 1) {
	SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	InPlaceInputs[0] - HalfOffset;
	PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
	return;
	}
	if (IncomingInputs.empty()) {
	// Just fix all of the in place inputs.
	for (int Input : InPlaceInputs) {
	SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
	PSHUFDMask[Input / 2] = Input / 2;
	}
	return;
	}

	assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
	SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	InPlaceInputs[0] - HalfOffset;
	// Put the second input next to the first so that they are packed into
	// a dword. We find the adjacent index by toggling the low bit.
	int AdjIndex = InPlaceInputs[0] ^ 1;
	SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
	std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
	PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
	};
	fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
	fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

	// Now gather the cross-half inputs and place them into a free dword of
	// their target half.
	// FIXME: This operation could almost certainly be simplified dramatically to
	// look more like the 3-1 fixing operation.
	auto moveInputsToRightHalf = [&PSHUFDMask](
	MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
	MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
	MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
	int DestOffset) {
	auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
	return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
	};
	auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
	int Word) {
	int LowWord = Word & ~1;
	int HighWord = Word \| 1;
	return isWordClobbered(SourceHalfMask, LowWord) \|\|
	isWordClobbered(SourceHalfMask, HighWord);
	};

	if (IncomingInputs.empty())
	return;

	if (ExistingInputs.empty()) {
	// Map any dwords with inputs from them into the right half.
	for (int Input : IncomingInputs) {
	// If the source half mask maps over the inputs, turn those into
	// swaps and use the swapped lane.
	if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
	if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
	SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
	Input - SourceOffset;
	// We have to swap the uses in our half mask in one sweep.
	for (int &M : HalfMask)
	if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
	M = Input;
	else if (M == Input)
	M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	} else {
	assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
	Input - SourceOffset &&
	"Previous placement doesn't match!");
	}
	// Note that this correctly re-maps both when we do a swap and when
	// we observe the other side of the swap above. We rely on that to
	// avoid swapping the members of the input list directly.
	Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	}

	// Map the input's dword into the correct half.
	if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
	PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
	else
	assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
	Input / 2 &&
	"Previous placement doesn't match!");
	}

	// And just directly shift any other-half mask elements to be same-half
	// as we will have mirrored the dword containing the element into the
	// same position within that half.
	for (int &M : HalfMask)
	if (M >= SourceOffset && M < SourceOffset + 4) {
	M = M - SourceOffset + DestOffset;
	assert(M >= 0 && "This should never wrap below zero!");
	}
	return;
	}

	// Ensure we have the input in a viable dword of its current half. This
	// is particularly tricky because the original position may be clobbered
	// by inputs being moved and staying in that half.
	if (IncomingInputs.size() == 1) {
	if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
	SourceOffset;
	SourceHalfMask[InputFixed - SourceOffset] =
	IncomingInputs[0] - SourceOffset;
	std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
	InputFixed);
	IncomingInputs[0] = InputFixed;
	}
	} else if (IncomingInputs.size() == 2) {
	if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 \|\|
	isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	// We have two non-adjacent or clobbered inputs we need to extract from
	// the source half. To do this, we need to map them into some adjacent
	// dword slot in the source mask.
	int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
	IncomingInputs[1] - SourceOffset};

	// If there is a free slot in the source half mask adjacent to one of
	// the inputs, place the other input in it. We use (Index XOR 1) to
	// compute an adjacent index.
	if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
	SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
	SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
	SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	InputsFixed[1] = InputsFixed[0] ^ 1;
	} else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
	SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
	SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
	SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
	InputsFixed[0] = InputsFixed[1] ^ 1;
	} else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
	// The two inputs are in the same DWord but it is clobbered and the
	// adjacent DWord isn't used at all. Move both inputs to the free
	// slot.
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
	SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
	InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
	InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
	} else {
	// The only way we hit this point is if there is no clobbering
	// (because there are no off-half inputs to this half) and there is no
	// free slot adjacent to one of the inputs. In this case, we have to
	// swap an input with a non-input.
	for (int i = 0; i < 4; ++i)
	assert((SourceHalfMask[i] < 0 \|\| SourceHalfMask[i] == i) &&
	"We can't handle any clobbers here!");
	assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
	"Cannot have adjacent inputs here!");

	SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

	// We also have to update the final source mask in this case because
	// it may need to undo the above swap.
	for (int &M : FinalSourceHalfMask)
	if (M == (InputsFixed[0] ^ 1) + SourceOffset)
	M = InputsFixed[1] + SourceOffset;
	else if (M == InputsFixed[1] + SourceOffset)
	M = (InputsFixed[0] ^ 1) + SourceOffset;

	InputsFixed[1] = InputsFixed[0] ^ 1;
	}

	// Point everything at the fixed inputs.
	for (int &M : HalfMask)
	if (M == IncomingInputs[0])
	M = InputsFixed[0] + SourceOffset;
	else if (M == IncomingInputs[1])
	M = InputsFixed[1] + SourceOffset;

	IncomingInputs[0] = InputsFixed[0] + SourceOffset;
	IncomingInputs[1] = InputsFixed[1] + SourceOffset;
	}
	} else {
	llvm_unreachable("Unhandled input size!");
	}

	// Now hoist the DWord down to the right half.
	int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
	assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
	PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
	for (int &M : HalfMask)
	for (int Input : IncomingInputs)
	if (M == Input)
	M = FreeDWord * 2 + Input % 2;
	};
	moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
	/SourceOffset/ 4, /DestOffset/ 0);
	moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
	/SourceOffset/ 0, /DestOffset/ 4);

	// Now enact all the shuffles we've computed to move the inputs into their
	// target half.
	if (!isNoopShuffleMask(PSHUFLMask))
	V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
	if (!isNoopShuffleMask(PSHUFHMask))
	V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
	if (!isNoopShuffleMask(PSHUFDMask))
	V = DAG.getBitcast(
	VT,
	DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	// At this point, each half should contain all its inputs, and we can then
	// just shuffle them into their final position.
	assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
	"Failed to lift all the high half inputs to the low mask!");
	assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
	"Failed to lift all the low half inputs to the high mask!");

	// Do a half shuffle for the low mask.
	if (!isNoopShuffleMask(LoMask))
	V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

	// Do a half shuffle with the high mask after shifting its values down.
	for (int &M : HiMask)
	if (M >= 0)
	M -= 4;
	if (!isNoopShuffleMask(HiMask))
	V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

	return V;
	}

	/// Build a shuffle of V1 and V2 out of per-input PSHUFB nodes, OR-ing the two
	/// results together only when both inputs actually contribute bytes.
	///
	/// Each of the 16 result bytes gets one control byte per input. A control
	/// byte is undef when the corresponding mask element is undef, ZeroMask when
	/// the byte does not come from that input (or the element is zeroable), and
	/// the source byte index otherwise.
	///
	/// \param V1InUse set to true iff some control byte for V1 is not ZeroMask.
	/// \param V2InUse set to true iff some control byte for V2 is not ZeroMask.
	/// \returns the combined value bitcast to \p VT.
	static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
	    bool &V2InUse) {
	  // Control byte value used for lanes that take nothing from an input.
	  const int ZeroMask = 0x80;

	  SDValue V1Mask[16];
	  SDValue V2Mask[16];
	  V1InUse = false;
	  V2InUse = false;

	  int Size = Mask.size();
	  // Number of bytes covered by one mask element.
	  int Scale = 16 / Size;
	  for (int Byte = 0; Byte < 16; ++Byte) {
	    int Elt = Byte / Scale;
	    int M = Mask[Elt];

	    // Undef mask elements leave both control bytes undef.
	    if (M < 0) {
	      SDValue Undef = DAG.getUNDEF(MVT::i8);
	      V2Mask[Byte] = Undef;
	      V1Mask[Byte] = Undef;
	      continue;
	    }

	    // Start with both inputs contributing nothing, then point exactly one of
	    // them at the source byte unless the whole element is known zero.
	    int V1Idx = ZeroMask;
	    int V2Idx = ZeroMask;
	    if (!Zeroable[Elt]) {
	      int ByteInElt = Byte % Scale;
	      if (M < Size)
	        V1Idx = M * Scale + ByteInElt;
	      else
	        V2Idx = (M - Size) * Scale + ByteInElt;
	    }

	    V1Mask[Byte] = DAG.getConstant(V1Idx, DL, MVT::i8);
	    V2Mask[Byte] = DAG.getConstant(V2Idx, DL, MVT::i8);
	    if (V1Idx != ZeroMask)
	      V1InUse = true;
	    if (V2Idx != ZeroMask)
	      V2InUse = true;
	  }

	  // Emit a v16i8 PSHUFB of Src using the given control bytes.
	  auto ShuffleBytes = [&DAG, &DL](SDValue Src, ArrayRef<SDValue> Control) {
	    return DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
	                       DAG.getBitcast(MVT::v16i8, Src),
	                       DAG.getBuildVector(MVT::v16i8, DL, Control));
	  };
	  if (V1InUse)
	    V1 = ShuffleBytes(V1, V1Mask);
	  if (V2InUse)
	    V2 = ShuffleBytes(V2, V2Mask);

	  // Two live inputs need the OR blend; otherwise forward whichever input is
	  // live (V2 when neither is). The result is cast back to the caller's type.
	  if (V1InUse && V2InUse)
	    return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2));
	  return DAG.getBitcast(VT, V1InUse ? V1 : V2);
	}

	/// \brief Generic lowering of 8-lane i16 shuffles.
	///
	/// Both single-input shuffles and two-input shuffle/blends are handled here.
	/// A zero/any-extend lowering is tried first for every mask. When no mask
	/// element refers to V2, a short list of cheap single-input patterns is tried
	/// and the remainder is delegated to the dedicated single-input routine.
	///
	/// Two-input shuffles walk a fixed priority list of strategies (shift, SSE4A,
	/// element insertion, blend, bit mask, unpack, pack, byte rotate, bit blend,
	/// permute-and-unpack). If none matches, the shuffle is lowered through
	/// PSHUFBs when SSSE3 is available without SSE4.1, and otherwise decomposed
	/// into single-input permutes plus a blend.
	static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  const MVT VT = MVT::v8i16;
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  // A zext is strictly faster than any alternative, so try it before
	  // anything else.
	  if (SDValue Res = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	    return Res;
	  }

	  // Mask entries of 8 and above select lanes of V2.
	  const int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

	  if (NumV2Inputs == 0) {
	    // Single-input path.

	    // Broadcast of a single element.
	    if (SDValue Res = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                    Subtarget, DAG)) {
	      return Res;
	    }

	    // Shift instructions (V1 is used for both operands here).
	    if (SDValue Res = lowerVectorShuffleAsShift(DL, VT, V1, V1, Mask,
	                                                Zeroable, Subtarget, DAG)) {
	      return Res;
	    }

	    // Masks that match a dedicated unpack pattern.
	    if (SDValue Res = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	      return Res;
	    }

	    // Masks that match a dedicated pack pattern.
	    if (SDValue Res = lowerVectorShuffleWithPACK(DL, VT, Mask, V1, V2, DAG,
	                                                 Subtarget)) {
	      return Res;
	    }

	    // Byte rotation (again with V1 as both operands).
	    if (SDValue Res = lowerVectorShuffleAsByteRotate(DL, VT, V1, V1, Mask,
	                                                     Subtarget, DAG)) {
	      return Res;
	    }

	    // The general single-input routine edits its mask, so give it a copy.
	    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
	    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V1, MutableMask,
	                                                     Subtarget, DAG);
	  }

	  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
	         "All single-input shuffles should be canonicalized to be V1-input "
	         "shuffles.");

	  // Two-input path: shift instructions.
	  if (SDValue Res = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
	                                              Subtarget, DAG)) {
	    return Res;
	  }

	  // SSE4A extraction / insertion.
	  if (Subtarget.hasSSE4A()) {
	    if (SDValue Res = lowerVectorShuffleWithSSE4A(DL, VT, V1, V2, Mask,
	                                                  Zeroable, DAG)) {
	      return Res;
	    }
	  }

	  // Some single-element blends have special lowerings.
	  if (NumV2Inputs == 1) {
	    if (SDValue Res = lowerVectorShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	      return Res;
	    }
	  }

	  // Every blend-related decision below must use this exact same predicate.
	  const bool IsBlendSupported = Subtarget.hasSSE41();
	  if (IsBlendSupported) {
	    if (SDValue Res = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                                Subtarget, DAG)) {
	      return Res;
	    }
	  }

	  if (SDValue Res = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
	                                                DAG)) {
	    return Res;
	  }

	  // Masks that match a dedicated unpack pattern.
	  if (SDValue Res = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	    return Res;
	  }

	  // Masks that match a dedicated pack pattern.
	  if (SDValue Res = lowerVectorShuffleWithPACK(DL, VT, Mask, V1, V2, DAG,
	                                               Subtarget)) {
	    return Res;
	  }

	  // Byte rotation instructions.
	  if (SDValue Res = lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask,
	                                                   Subtarget, DAG)) {
	    return Res;
	  }

	  if (SDValue Res = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) {
	    return Res;
	  }

	  // Permute the inputs so that an unpack instruction finishes the job.
	  if (SDValue Res = lowerVectorShuffleAsPermuteAndUnpack(DL, VT, V1, V2, Mask,
	                                                         DAG)) {
	    return Res;
	  }

	  // Without a direct blend, PSHUFB is preferable when available: it shuffles
	  // and prepares the (inefficient) blend in one go. The in-use flags it
	  // reports are not needed here.
	  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
	    bool V1InUse, V2InUse;
	    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, VT, V1, V2, Mask, Zeroable,
	                                              DAG, V1InUse, V2InUse);
	  }

	  // Last resort: a bit-blend is always possible, so decompose into
	  // single-input permutes followed by a blend.
	  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Decide whether a shuffle is a pure compaction that can be lowered by
	/// repeatedly dropping even elements, and report how many rounds are needed.
	///
	/// A mask qualifies for N rounds when every defined entry i equals
	/// (i * 2^N) modulo the shuffle modulus, i.e. the shuffle takes every
	/// 2^N-th element. Example shuffle masks:
	///
	///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
	///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
	///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
	///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
	///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
	///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
	///
	/// Undef (negative) entries are compatible with every N.
	///
	/// Only N <= 3 is supported.
	/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
	/// for larger N.
	///
	/// \param Mask the shuffle mask; its size must make the modulus a power of 2.
	/// \param IsSingleInput true when indices wrap at Mask.size() rather than at
	/// twice that.
	/// \returns the smallest viable N, or zero when no N in [1, 3] fits.
	static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
	                                          bool IsSingleInput) {
	  // Indices range over one input's worth of elements, or two.
	  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
	  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
	         "We should only be called with masks with a power-of-2 size!");

	  // Power-of-2 modulus, so reduction is a bitwise AND with (modulus - 1).
	  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

	  // Strides 2^1, 2^2 and 2^3 are tracked in parallel because undef lanes can
	  // leave more than one of them plausible until late in the mask.
	  bool ViableForN[3] = {true, true, true};
	  const unsigned NumCandidates = array_lengthof(ViableForN);

	  for (int Idx = 0, End = Mask.size(); Idx < End; ++Idx) {
	    int M = Mask[Idx];
	    // Undef lanes are optimistically assumed to match.
	    if (M < 0)
	      continue;

	    bool StillViable = false;
	    for (unsigned Cand = 0; Cand != NumCandidates; ++Cand) {
	      if (!ViableForN[Cand])
	        continue;

	      // Candidate Cand stands for N = Cand + 1; the entry must equal
	      // (Idx * 2^N) % modulus.
	      uint64_t N = Cand + 1;
	      uint64_t Expected = ((uint64_t)Idx << N) & ModMask;
	      if ((uint64_t)M == Expected)
	        StillViable = true;
	      else
	        ViableForN[Cand] = false;
	    }

	    // Every candidate has been ruled out, so there is no viable power of two.
	    if (!StillViable)
	      return 0;
	  }

	  // Report the smallest N that survived.
	  for (unsigned Cand = 0; Cand != NumCandidates; ++Cand) {
	    if (ViableForN[Cand])
	      return Cand + 1;
	  }

	  return 0;
	}

	/// Lower a shuffle to a variable-permute node driven by a constant index
	/// vector built from \p Mask.
	///
	/// The index vector has as many elements as \p VT, each an integer of VT's
	/// scalar width. A VPERMV of V1 is emitted when V2 is undef; otherwise a
	/// VPERMV3 of V1 and V2 is emitted.
	static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
	                                           ArrayRef<int> Mask, SDValue V1,
	                                           SDValue V2, SelectionDAG &DAG) {
	  MVT IndexVecVT = MVT::getVectorVT(
	      MVT::getIntegerVT(VT.getScalarSizeInBits()), VT.getVectorNumElements());
	  SDValue Indices = getConstVector(Mask, IndexVecVT, DAG, DL, true);

	  // Note the operand order differs: indices come first for VPERMV and sit
	  // between the two sources for VPERMV3.
	  return V2.isUndef()
	             ? DAG.getNode(X86ISD::VPERMV, DL, VT, Indices, V1)
	             : DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, Indices, V2);
	}

	/// \brief Generic lowering of v16i8 shuffles.
	///
	/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
	/// detect any complexity reducing interleaving. If that doesn't help, it uses
	/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
	/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
	/// back together.
	///
	/// \p Mask must have 16 entries: negative entries are treated as undef and
	/// entries >= 16 are counted as V2 elements. Strategies are tried in a fixed
	/// order and the first one that produces a value is returned.
	static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use a zext lowering.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// See if we can use SSE4A Extraction / Insertion.
	if (Subtarget.hasSSE4A())
	if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, DAG))
	return V;

	// Number of mask entries that select a byte of V2.
	int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

	// For single-input shuffles, there are some nicer lowering tricks we can use.
	if (NumV2Elements == 0) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Check whether we can widen this to an i16 shuffle by duplicating bytes.
	// Notably, this handles splat and partial-splat shuffles more efficiently.
	// However, it only makes sense if the pre-duplication shuffle simplifies
	// things significantly. Currently, this means we need to be able to
	// express the pre-duplication shuffle as an i16 shuffle.
	//
	// FIXME: We should check for other patterns which can be widened into an
	// i16 shuffle as well.
	// True when every even/odd pair of output bytes either agrees on its source
	// byte or has at least one undef member.
	auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
	for (int i = 0; i < 16; i += 2)
	if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
	return false;

	return true;
	};
	// Attempts the pre-shuffle / byte-duplicating unpack / post-shuffle
	// sequence. Returns an empty SDValue when it does not apply. Note that it
	// overwrites the captured V1 once it gets past the bail-out checks.
	auto tryToWidenViaDuplication = [&]() -> SDValue {
	if (!canWidenViaDuplication(Mask))
	return SDValue();
	// Sorted, de-duplicated source bytes taken from the low 8 bytes of V1.
	SmallVector<int, 4> LoInputs;
	copy_if(Mask, std::back_inserter(LoInputs),
	[](int M) { return M >= 0 && M < 8; });
	std::sort(LoInputs.begin(), LoInputs.end());
	LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
	LoInputs.end());
	// Likewise for source bytes taken from the high 8 bytes of V1.
	SmallVector<int, 4> HiInputs;
	copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
	std::sort(HiInputs.begin(), HiInputs.end());
	HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
	HiInputs.end());

	// Gather everything into the half that already holds at least as many
	// distinct inputs; ties go to the low half.
	bool TargetLo = LoInputs.size() >= HiInputs.size();
	ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
	ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

	// PreDupI16Shuffle is an i16-lane shuffle applied before the unpack.
	// LaneMap records, for each original source byte, the byte index it
	// occupies after that shuffle.
	int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
	SmallDenseMap<int, int, 8> LaneMap;
	for (int I : InPlaceInputs) {
	PreDupI16Shuffle[I/2] = I/2;
	LaneMap[I] = I;
	}
	// [j, je) is the range of i16 lanes making up the target half.
	int j = TargetLo ? 0 : 4, je = j + 4;
	for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
	// Check if j is already a shuffle of this input. This happens when
	// there are two adjacent bytes after we move the low one.
	if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
	// If we haven't yet mapped the input, search for a slot into which
	// we can map it.
	while (j < je && PreDupI16Shuffle[j] >= 0)
	++j;

	if (j == je)
	// We can't place the inputs into a single half with a simple i16 shuffle, so bail.
	return SDValue();

	// Map this input with the i16 shuffle.
	PreDupI16Shuffle[j] = MovingInputs[i] / 2;
	}

	// Update the lane map based on the mapping we ended up with.
	LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
	}
	V1 = DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

	// Unpack the bytes to form the i16s that will be shuffled into place.
	V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	MVT::v16i8, V1, V1);

	// Build the i16 shuffle applied after the unpack: output lane i / 2 reads
	// the lane given by the remapped source byte, rebased by 8 when the high
	// half was the target.
	int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0) {
	int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
	assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
	if (PostDupI16Shuffle[i / 2] < 0)
	PostDupI16Shuffle[i / 2] = MappedMask;
	else
	assert(PostDupI16Shuffle[i / 2] == MappedMask &&
	"Conflicting entries in the original shuffle!");
	}
	return DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
	};
	if (SDValue V = tryToWidenViaDuplication())
	return V;
	}

	if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, DAG))
	return Masked;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
	return V;

	// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
	// with PSHUFB. It is important to do this before we attempt to generate any
	// blends but after all of the single-input lowerings. If the single input
	// lowerings can find an instruction sequence that is faster than a PSHUFB, we
	// want to preserve that and we can DAG combine any longer sequences into
	// a PSHUFB in the end. But once we start blending from multiple inputs,
	// the complexity of DAG combining bad patterns back into PSHUFB is too high,
	// and there are very few patterns that would actually be faster than the
	// PSHUFB approach because of its ability to zero lanes.
	//
	// FIXME: The only exceptions to the above are blends which are exact
	// interleavings with direct instructions supporting them. We currently don't
	// handle those well here.
	if (Subtarget.hasSSSE3()) {
	// Both flags are output parameters of the helper below; they report which
	// inputs the PSHUFB lowering actually reads.
	bool V1InUse = false;
	bool V2InUse = false;

	// The PSHUFB form is always built here, but it is only returned if none of
	// the alternatives below produces a value.
	SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

	// If both V1 and V2 are in use and we can use a direct blend or an unpack,
	// do so. This avoids using them to handle blends-with-zero which is
	// important as a single pshufb is significantly faster for that.
	if (V1InUse && V2InUse) {
	if (Subtarget.hasSSE41())
	if (SDValue Blend = lowerVectorShuffleAsBlend(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Blend;

	// We can use an unpack to do the blending rather than an or in some
	// cases. Even though the or may be (very minorly) more efficient, we
	// preference this lowering because there are common cases where part of
	// the complexity of the shuffles goes away when we do the final blend as
	// an unpack.
	// FIXME: It might be worth trying to detect if the unpack-feeding
	// shuffles will both be pshufb, in which case we shouldn't bother with
	// this.
	if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
	DL, MVT::v16i8, V1, V2, Mask, DAG))
	return Unpack;

	// If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
	if (Subtarget.hasVBMI() && Subtarget.hasVLX())
	return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
	}

	return PSHUFB;
	}

	// Everything below is only reached when SSSE3 is unavailable.

	// There are special ways we can lower some single-element blends.
	if (NumV2Elements == 1)
	if (SDValue V = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	if (SDValue BitBlend =
	lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
	return BitBlend;

	// Check whether a compaction lowering can be done. This handles shuffles
	// which take every Nth element for some even N. See the helper function for
	// details.
	//
	// We special case these as they can be particularly efficiently handled with
	// the PACKUSB instruction on x86 and they show up in common patterns of
	// rearranging bytes to truncate wide elements.
	bool IsSingleInput = V2.isUndef();
	if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
	// NumEvenDrops is the power of two stride of the elements. Another way of
	// thinking about it is that we need to drop the even elements this many
	// times to get the original input.

	// First we need to zero all the dropped bytes.
	assert(NumEvenDrops <= 3 &&
	"No support for dropping even elements more than 3 times.");
	// We use the mask type to pick which bytes are preserved based on how many
	// elements are dropped.
	// A splat of 0xFF in i16/i32/i64 elements, viewed as v16i8, keeps only the
	// low byte of every 2/4/8-byte group when AND-ed with the input.
	MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
	SDValue ByteClearMask = DAG.getBitcast(
	MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
	V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
	if (!IsSingleInput)
	V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

	// Now pack things back together.
	V1 = DAG.getBitcast(MVT::v8i16, V1);
	V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
	SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
	// Each further drop packs the previous result with itself.
	for (int i = 1; i < NumEvenDrops; ++i) {
	Result = DAG.getBitcast(MVT::v8i16, Result);
	Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
	}

	return Result;
	}

	// Handle multi-input cases by blending single-input shuffles.
	if (NumV2Elements > 0)
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
	Mask, DAG);

	// The fallback path for single-input shuffles widens this into two v8i16
	// vectors with unpacks, shuffles those, and then pulls them back together
	// with a pack.
	SDValue V = V1;

	// LoBlendMask holds the mask entries for output bytes 0-7 and HiBlendMask
	// those for output bytes 8-15; undef entries stay at -1.
	std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0)
	(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

	SDValue VLoHalf, VHiHalf;
	// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
	// them out and avoid using UNPCK{L,H} to extract the elements of V as
	// i16s.
	if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
	none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
	// Use a mask to drop the high bytes.
	VLoHalf = DAG.getBitcast(MVT::v8i16, V);
	VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
	DAG.getConstant(0x00FF, DL, MVT::v8i16));

	// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
	VHiHalf = DAG.getUNDEF(MVT::v8i16);

	// Squash the masks to point directly into VLoHalf.
	// Only even byte indices remain here, so halving gives the i16 lane.
	for (int &M : LoBlendMask)
	if (M >= 0)
	M /= 2;
	for (int &M : HiBlendMask)
	if (M >= 0)
	M /= 2;
	} else {
	// Otherwise just unpack the low half of V into VLoHalf and the high half into
	// VHiHalf so that we can blend them as i16s.
	SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

	VLoHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
	VHiHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
	}

	// Shuffle the widened halves as v8i16 vectors, one shuffle per half of the
	// result, then pack the two v8i16 results into the final v16i8.
	SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
	SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

	return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
	}

	/// \brief Entry point for lowering 128-bit x86 vector shuffles.
	///
	/// Selects the type-specific lowering routine from the simple value type of
	/// \p VT and forwards every other argument to it unchanged. Supported types
	/// are v2i64, v2f64, v4i32, v4f32, v8i16 and v16i8; any other type is
	/// unreachable.
	static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        MVT VT, SDValue V1, SDValue V2,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  SDValue Lowered;
	  switch (VT.SimpleTy) {
	  default:
	    llvm_unreachable("Unimplemented!");

	  // Two 64-bit lanes.
	  case MVT::v2i64:
	    Lowered =
	        lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	    break;
	  case MVT::v2f64:
	    Lowered =
	        lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	    break;

	  // Four 32-bit lanes.
	  case MVT::v4i32:
	    Lowered =
	        lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	    break;
	  case MVT::v4f32:
	    Lowered =
	        lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	    break;

	  // Eight 16-bit and sixteen 8-bit integer lanes.
	  case MVT::v8i16:
	    Lowered =
	        lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	    break;
	  case MVT::v16i8:
	    Lowered =
	        lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	    break;
	  }
	  return Lowered;
	}

	/// \brief Generic routine to split vector shuffle into half-sized shuffles.
	///
	/// This routine just extracts two subvectors, shuffles them independently, and
	/// then concatenates them back together. This should work effectively with all
	/// AVX vector shuffle types.
	///
	/// \param VT   Type of the shuffle; asserted to be at least 256 bits wide and
	///             to equal the simple value type of both \p V1 and \p V2.
	/// \param Mask Shuffle mask. Entries >= the element count of \p VT select
	///             from \p V2; negative entries leave the result lane undefined.
	/// \returns A CONCAT_VECTORS node joining the two half-width results.
	static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Mask,
	SelectionDAG &DAG) {
	assert(VT.getSizeInBits() >= 256 &&
	"Only for 256-bit or wider vector shuffles!");
	assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	assert(V2.getSimpleValueType() == VT && "Bad operand type!");

	// The first half of the mask describes the low half of the result and the
	// second half of the mask describes the high half.
	ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
	ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

	int NumElements = VT.getVectorNumElements();
	int SplitNumElements = NumElements / 2;
	MVT ScalarVT = VT.getVectorElementType();
	MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

	// Rather than splitting build-vectors, just build two narrower build
	// vectors. This helps shuffling with splats and zeros.
	// Returns the (low, high) halves of V, both bitcast to SplitVT.
	auto SplitVector = [&](SDValue V) {
	V = peekThroughBitcasts(V);

	// From here on work in the element type and count of the value returned by
	// peekThroughBitcasts, which may differ from VT.
	MVT OrigVT = V.getSimpleValueType();
	int OrigNumElements = OrigVT.getVectorNumElements();
	int OrigSplitNumElements = OrigNumElements / 2;
	MVT OrigScalarVT = OrigVT.getVectorElementType();
	MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

	SDValue LoV, HiV;

	auto *BV = dyn_cast<BuildVectorSDNode>(V);
	if (!BV) {
	// Not a build vector: extract the two halves as subvectors.
	LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
	DAG.getIntPtrConstant(0, DL));
	HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
	DAG.getIntPtrConstant(OrigSplitNumElements, DL));
	} else {

	// Build vector: distribute its operands over two narrower build vectors.
	SmallVector<SDValue, 16> LoOps, HiOps;
	for (int i = 0; i < OrigSplitNumElements; ++i) {
	LoOps.push_back(BV->getOperand(i));
	HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
	}
	LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
	HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
	}
	return std::make_pair(DAG.getBitcast(SplitVT, LoV),
	DAG.getBitcast(SplitVT, HiV));
	};

	SDValue LoV1, HiV1, LoV2, HiV2;
	std::tie(LoV1, HiV1) = SplitVector(V1);
	std::tie(LoV2, HiV2) = SplitVector(V2);

	// Now create two 4-way blends of these half-width vectors.
	// HalfMask is one half of the original mask; the result is one half-width
	// value of type SplitVT.
	auto HalfBlend = [&](ArrayRef<int> HalfMask) {
	bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
	// V1BlendMask / V2BlendMask index into the (Lo, Hi) pair of the respective
	// input; BlendMask selects between the two per-input results.
	SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
	SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
	SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
	for (int i = 0; i < SplitNumElements; ++i) {
	int M = HalfMask[i];
	if (M >= NumElements) {
	// Element comes from V2; record which half of V2 it lives in.
	if (M >= NumElements + SplitNumElements)
	UseHiV2 = true;
	else
	UseLoV2 = true;
	V2BlendMask[i] = M - NumElements;
	BlendMask[i] = SplitNumElements + i;
	} else if (M >= 0) {
	// Element comes from V1; record which half of V1 it lives in.
	if (M >= SplitNumElements)
	UseHiV1 = true;
	else
	UseLoV1 = true;
	V1BlendMask[i] = M;
	BlendMask[i] = i;
	}
	// Negative entries leave all three masks at -1 for this slot.
	}

	// Because the lowering happens after all combining takes place, we need to
	// manually combine these blend masks as much as possible so that we create
	// a minimal number of high-level vector shuffle nodes.

	// First try just blending the halves of V1 or V2.
	if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
	return DAG.getUNDEF(SplitVT);
	if (!UseLoV2 && !UseHiV2)
	return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
	if (!UseLoV1 && !UseHiV1)
	return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

	// Both inputs contribute: produce one value per input, then blend them.
	SDValue V1Blend, V2Blend;
	if (UseLoV1 && UseHiV1) {
	V1Blend =
	DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
	} else {
	// We only use half of V1 so map the usage down into the final blend mask.
	V1Blend = UseLoV1 ? LoV1 : HiV1;
	for (int i = 0; i < SplitNumElements; ++i)
	if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
	BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
	}
	if (UseLoV2 && UseHiV2) {
	V2Blend =
	DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
	} else {
	// We only use half of V2 so map the usage down into the final blend mask.
	V2Blend = UseLoV2 ? LoV2 : HiV2;
	for (int i = 0; i < SplitNumElements; ++i)
	if (BlendMask[i] >= SplitNumElements)
	BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
	}
	return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
	};
	SDValue Lo = HalfBlend(LoMask);
	SDValue Hi = HalfBlend(HiMask);
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	}

	/// \brief Choose between splitting a shuffle in halves and decomposing it
	/// into per-input shuffles followed by a blend.
	///
	/// Intended as a fallback for two-input shuffles that span more than one
	/// 128-bit lane. The shuffle is split when each input contributes elements
	/// from at most one 128-bit lane; in every other case (including the
	/// "blend of two broadcasts" pattern) it is lowered as decomposed
	/// single-input shuffles plus a blend.
	///
	/// \p V2 must not be undef (asserted); single-input shuffles must not reach
	/// this routine because it could then recurse on itself.
	static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
	                                                SDValue V1, SDValue V2,
	                                                ArrayRef<int> Mask,
	                                                SelectionDAG &DAG) {
	  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
	                          "shuffles as it could then recurse on itself.");
	  const int NumMaskElts = Mask.size();

	  // True when every defined mask entry taken from V1 names the same element
	  // and every defined entry taken from V2 names the same element, i.e. the
	  // shuffle is a blend of (at most) two broadcasts. Broadcasts can often fold
	  // with memory operands, so this form is preferred.
	  auto IsBlendOfTwoBroadcasts = [&]() -> bool {
	    int SeenV1Idx = -1, SeenV2Idx = -1;
	    for (int M : Mask) {
	      if (M < 0)
	        continue;
	      const bool FromV2 = M >= NumMaskElts;
	      int &SeenIdx = FromV2 ? SeenV2Idx : SeenV1Idx;
	      const int Idx = FromV2 ? M - NumMaskElts : M;
	      if (SeenIdx < 0)
	        SeenIdx = Idx;
	      else if (SeenIdx != Idx)
	        return false;
	    }
	    return true;
	  };

	  if (IsBlendOfTwoBroadcasts())
	    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
	                                                      DAG);

	  // Record, per input, which 128-bit lanes are referenced by the mask. If
	  // each input is only read from a single lane, splitting decomposes into
	  // unusually few instructions.
	  const int NumLanes = VT.getSizeInBits() / 128;
	  const int EltsPerLane = NumMaskElts / NumLanes;
	  SmallBitVector UsedLanes[2];
	  UsedLanes[0].resize(NumLanes, false);
	  UsedLanes[1].resize(NumLanes, false);
	  for (int Idx = 0; Idx < NumMaskElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue;
	    UsedLanes[M / NumMaskElts][(M % NumMaskElts) / EltsPerLane] = true;
	  }

	  const bool OneLanePerInput =
	      UsedLanes[0].count() <= 1 && UsedLanes[1].count() <= 1;
	  if (OneLanePerInput)
	    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

	  // Everything else becomes decomposed shuffles and a blend. This requires
	  // that the decomposed single-input shuffles don't end up here again.
	  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
	/// a permutation and blend of those lanes.
	///
	/// This essentially blends the out-of-lane inputs to each lane into the lane
	/// from a permuted copy of the vector. This lowering strategy results in four
	/// instructions in the worst case for a single-input cross lane shuffle which
	/// is lower than any other fully general cross-lane shuffle strategy known
	/// here. Special cases for each particular shuffle pattern should be handled
	/// prior to trying this lowering.
	///
	/// Only 256-bit types are accepted (asserted). If the mask only involves one
	/// of the two 128-bit lanes (see the checks below) the shuffle is handed to
	/// splitAndLowerVectorShuffle instead. Past that point \p V2 must be undef
	/// (asserted).
	static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
	                                                       SDValue V1, SDValue V2,
	                                                       ArrayRef<int> Mask,
	                                                       SelectionDAG &DAG,
	                                                       const X86Subtarget &Subtarget) {
	  // FIXME: This should probably be generalized for 512-bit vectors as well.
	  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
	  int Size = Mask.size();
	  int LaneSize = Size / 2;

	  // If there are only inputs from one 128-bit lane, splitting will in fact be
	  // less expensive. The flags track whether the given lane contains an element
	  // that crosses to another lane.
	  if (!Subtarget.hasAVX2()) {
	    bool LaneCrossing[2] = {false, false};
	    for (int i = 0; i < Size; ++i)
	      if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
	        LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
	    if (!(LaneCrossing[0] && LaneCrossing[1]))
	      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	  } else {
	    // With AVX2, only split when a source lane is entirely unused.
	    bool LaneUsed[2] = {false, false};
	    for (int i = 0; i < Size; ++i)
	      if (Mask[i] >= 0)
	        // Reduce modulo Size first, exactly like the branch above, so that a
	        // mask entry referring to V2 (>= Size) cannot index past the
	        // two-element array.
	        LaneUsed[(Mask[i] % Size) / LaneSize] = true;
	    if (!(LaneUsed[0] && LaneUsed[1]))
	      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	  }

	  assert(V2.isUndef() &&
	         "This last part of this routine only works on single input shuffles");

	  // Build the mask for the final blend of V1 (first operand) with its
	  // lane-swapped copy (second operand, hence the "+ Size"): in-lane elements
	  // keep their index, cross-lane elements are taken from the same in-lane
	  // position of the flipped vector.
	  SmallVector<int, 32> FlippedBlendMask(Size);
	  for (int i = 0; i < Size; ++i) {
	    int M = Mask[i];
	    if (M < 0) {
	      FlippedBlendMask[i] = -1;
	      continue;
	    }
	    bool InLane = (M % Size) / LaneSize == i / LaneSize;
	    FlippedBlendMask[i] =
	        InLane ? M : M % LaneSize + (i / LaneSize) * LaneSize + Size;
	  }

	  // Flip the vector, and blend the results which should now be in-lane.
	  MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
	  SDValue Flipped = DAG.getBitcast(PVT, V1);
	  Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
	                                 { 2, 3, 0, 1 });
	  Flipped = DAG.getBitcast(VT, Flipped);
	  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
	}

	/// \brief Handle lowering 2-lane 128-bit shuffles.
	///
	/// Treats the 4-element mask as a shuffle of two 128-bit halves and tries, in
	/// order: a blend, a subvector extract/concat, SHUF128 (with VLX), and finally
	/// a VPERM2X128 node.
	///
	/// \returns The lowered node, or an empty SDValue when the subtarget has AVX2
	/// and \p V2 is undef, or when the mask cannot be widened to 128-bit elements.
	static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
	SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
	if (Subtarget.hasAVX2() && V2.isUndef())
	return SDValue();

	// WidenedMask gets one entry per 128-bit half of the result.
	SmallVector<int, 4> WidenedMask;
	if (!canWidenShuffleElements(Mask, WidenedMask))
	return SDValue();

	// TODO: If minimizing size and one of the inputs is a zero vector and the
	// zero vector has only one use, we could use a VPERM2X128 to save the
	// instruction bytes needed to explicitly generate the zero vector.

	// Blends are faster and handle all the non-lane-crossing cases.
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Bits 0-1 of Zeroable cover the low half of the result, bits 2-3 the high
	// half; a half counts as zero only if both of its bits are set.
	bool IsLowZero = (Zeroable & 0x3) == 0x3;
	bool IsHighZero = (Zeroable & 0xc) == 0xc;

	// If either input operand is a zero vector, use VPERM2X128 because its mask
	// allows us to replace the zero input with an implicit zero.
	// The alternatives inside this block are therefore only tried when neither
	// half of the result is zeroable.
	if (!IsLowZero && !IsHighZero) {
	// Check for patterns which can be matched with a single insert of a 128-bit
	// subvector.
	bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
	if (OnlyUsesV1 \|\| isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {

	// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
	// this will likely become vinsertf128 which can't fold a 256-bit memop.
	if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
	VT.getVectorNumElements() / 2);
	// Both halves of the result are low halves: V1's, and then V1's again or
	// V2's depending on which pattern matched.
	SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
	DAG.getIntPtrConstant(0, DL));
	SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
	OnlyUsesV1 ? V1 : V2,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
	}
	}

	// Try to use SHUF128 if possible.
	if (Subtarget.hasVLX()) {
	// Only when the low half comes from V1 and the high half from V2.
	if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
	unsigned PermMask = ((WidenedMask[0] % 2) << 0) \|
	((WidenedMask[1] % 2) << 1);
	return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
	DAG.getConstant(PermMask, DL, MVT::i8));
	}
	}
	}

	// Otherwise form a 128-bit permutation. After accounting for undefs,
	// convert the 64-bit shuffle mask selection values into 128-bit
	// selection bits by dividing the indexes by 2 and shifting into positions
	// defined by a vperm2*128 instruction's immediate control byte.

	// The immediate permute control byte looks like this:
	// [1:0] - select 128 bits from sources for low half of destination
	// [2] - ignore
	// [3] - zero low half of destination
	// [5:4] - select 128 bits from sources for high half of destination
	// [6] - ignore
	// [7] - zero high half of destination

	assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");

	unsigned PermMask = 0;
	PermMask \|= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
	PermMask \|= IsHighZero ? 0x80 : (WidenedMask[1] << 4);

	// Check the immediate mask and replace unused sources with undef.
	// Masking with 0x0a / 0xa0 keeps the zeroing bit and the upper selector bit
	// of each half: 0x00 / 0x00 means the half reads V1, 0x02 / 0x20 means it
	// reads V2.
	if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
	V1 = DAG.getUNDEF(VT);
	if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
	V2 = DAG.getUNDEF(VT);

	return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
	DAG.getConstant(PermMask, DL, MVT::i8));
	}

	/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
	/// shuffling each lane.
	///
	/// This will only succeed when the result of fixing the 128-bit lanes results
	/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
	/// each 128-bit lanes. This handles many cases where we can quickly blend away
	/// the lane crosses early and then use simpler shuffles within each lane.
	///
	/// FIXME: It might be worthwhile at some point to support this without
	/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
	/// in x86 only floating point has interesting non-repeating shuffles, and even
	/// those are still marginally more expensive.
	///
	/// \p V2 must not be undef (asserted) and the type must span more than one
	/// 128-bit lane (asserted). \p Subtarget is not read by this routine.
	///
	/// \returns The two-step shuffle, or an empty SDValue if some destination lane
	/// draws from more than one source lane or the in-lane pattern does not repeat.
	static SDValue lowerVectorShuffleByMerging128BitLanes(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	assert(!V2.isUndef() && "This is only useful with multiple inputs.");

	int Size = Mask.size();
	int LaneSize = 128 / VT.getScalarSizeInBits();
	int NumLanes = Size / LaneSize;
	assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

	// See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
	// check whether the in-128-bit lane shuffles share a repeating pattern.
	// Lanes[j] is the source lane feeding destination lane j (source lanes are
	// numbered across V1 then V2); InLaneMask[k] is the in-lane source position
	// for in-lane destination position k.
	SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
	SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;

	int j = i / LaneSize;

	if (Lanes[j] < 0) {
	// First entry we've seen for this lane.
	Lanes[j] = Mask[i] / LaneSize;
	} else if (Lanes[j] != Mask[i] / LaneSize) {
	// This doesn't match the lane selected previously!
	return SDValue();
	}

	// Check that within each lane we have a consistent shuffle mask.
	int k = i % LaneSize;
	if (InLaneMask[k] < 0) {
	InLaneMask[k] = Mask[i] % LaneSize;
	} else if (InLaneMask[k] != Mask[i] % LaneSize) {
	// This doesn't fit a repeating in-lane mask.
	return SDValue();
	}
	}

	// First shuffle the lanes into place.
	// The lane shuffle is done on 64-bit elements, two per 128-bit lane.
	MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
	VT.getSizeInBits() / 64);
	SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
	for (int i = 0; i < NumLanes; ++i)
	if (Lanes[i] >= 0) {
	LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
	LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
	}

	V1 = DAG.getBitcast(LaneVT, V1);
	V2 = DAG.getBitcast(LaneVT, V2);
	SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

	// Cast it back to the type we actually want.
	LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

	// Now do a simple shuffle that isn't lane crossing.
	// Each defined element keeps its in-lane source position but now reads from
	// its own destination lane of LaneShuffle.
	SmallVector<int, 8> NewMask((unsigned)Size, -1);
	for (int i = 0; i < Size; ++i)
	if (Mask[i] >= 0)
	NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
	assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
	"Must not introduce lane crosses at this point!");

	return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
	}

	/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
	/// This allows for fast cases such as subvector extraction/insertion
	/// or shuffling smaller vector types which can lower more efficiently.
	///
	/// \returns The lowered node, or an empty SDValue when neither half of the
	/// mask is entirely undef, when more than two input halves are referenced, or
	/// when one of the profitability checks below rejects the half-width form.
	static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
	SDValue V1, SDValue V2,
	ArrayRef<int> Mask,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert((VT.is256BitVector() \|\| VT.is512BitVector()) &&
	"Expected 256-bit or 512-bit vector");

	unsigned NumElts = VT.getVectorNumElements();
	unsigned HalfNumElts = NumElts / 2;
	MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

	bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
	bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
	if (!UndefLower && !UndefUpper)
	return SDValue();

	// Upper half is undef and lower half is whole upper subvector.
	// e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
	if (UndefUpper &&
	isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
	SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
	DAG.getIntPtrConstant(HalfNumElts, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
	DAG.getIntPtrConstant(0, DL));
	}

	// Lower half is undef and upper half is whole lower subvector.
	// e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
	if (UndefLower &&
	isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
	// Despite its name, Hi holds the lower half of V1 here (extracted at index
	// 0); it is inserted into the upper half of the result.
	SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
	DAG.getIntPtrConstant(HalfNumElts, DL));
	}

	// If the shuffle only uses two of the four halves of the input operands,
	// then extract them and perform the 'half' shuffle at half width.
	// e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
	int HalfIdx1 = -1, HalfIdx2 = -1;
	SmallVector<int, 8> HalfMask(HalfNumElts);
	// Offset is the start of the half of the mask that is not entirely undef.
	unsigned Offset = UndefLower ? HalfNumElts : 0;
	for (unsigned i = 0; i != HalfNumElts; ++i) {
	int M = Mask[i + Offset];
	if (M < 0) {
	// Negative (sentinel) entries are copied through unchanged.
	HalfMask[i] = M;
	continue;
	}

	// Determine which of the 4 half vectors this element is from.
	// i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
	int HalfIdx = M / HalfNumElts;

	// Determine the element index into its half vector source.
	int HalfElt = M % HalfNumElts;

	// We can shuffle with up to 2 half vectors, set the new 'half'
	// shuffle mask accordingly.
	if (HalfIdx1 < 0 \|\| HalfIdx1 == HalfIdx) {
	HalfMask[i] = HalfElt;
	HalfIdx1 = HalfIdx;
	continue;
	}
	if (HalfIdx2 < 0 \|\| HalfIdx2 == HalfIdx) {
	HalfMask[i] = HalfElt + HalfNumElts;
	HalfIdx2 = HalfIdx;
	continue;
	}

	// Too many half vectors referenced.
	return SDValue();
	}
	assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

	// Only shuffle the halves of the inputs when useful.
	// Count how many of the (up to two) referenced halves are lower halves
	// (index 0 or 2) and how many are upper halves (index 1 or 3).
	int NumLowerHalves =
	(HalfIdx1 == 0 \|\| HalfIdx1 == 2) + (HalfIdx2 == 0 \|\| HalfIdx2 == 2);
	int NumUpperHalves =
	(HalfIdx1 == 1 \|\| HalfIdx1 == 3) + (HalfIdx2 == 1 \|\| HalfIdx2 == 3);

	// uuuuXXXX - don't extract uppers just to insert again.
	if (UndefLower && NumUpperHalves != 0)
	return SDValue();

	// XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
	if (UndefUpper && NumUpperHalves == 2)
	return SDValue();

	// AVX2 - XXXXuuuu - always extract lowers.
	if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
	// AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
	if (VT == MVT::v4f64 \|\| VT == MVT::v4i64)
	return SDValue();
	// AVX2 supports variable 32-bit element cross-lane shuffles.
	if (VT == MVT::v8f32 \|\| VT == MVT::v8i32) {
	// XXXXuuuu - don't extract lowers and uppers.
	if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
	return SDValue();
	}
	}

	// AVX512 - XXXXuuuu - always extract lowers.
	if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
	return SDValue();

	// Maps a half index (0..3, or negative for "unused") to the corresponding
	// extracted half of V1 or V2, or to UNDEF.
	auto GetHalfVector = [&](int HalfIdx) {
	if (HalfIdx < 0)
	return DAG.getUNDEF(HalfVT);
	SDValue V = (HalfIdx < 2 ? V1 : V2);
	HalfIdx = (HalfIdx % 2) * HalfNumElts;
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
	DAG.getIntPtrConstant(HalfIdx, DL));
	};

	// Shuffle at half width, then place the result in the half of the full-width
	// vector that the mask actually defines.
	SDValue Half1 = GetHalfVector(HalfIdx1);
	SDValue Half2 = GetHalfVector(HalfIdx2);
	SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
	DAG.getIntPtrConstant(Offset, DL));
	}

	/// \brief Check whether one shuffle input (0 or 1) is used purely in place.
	///
	/// An input is "in place" when every mask entry that selects from it already
	/// names the slot it is written to, so that input needs no permutation.
	/// Negative mask entries and entries selecting the other input are ignored.
	static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
	  assert(0 <= Input && Input <= 1 && "Only two inputs to shuffles.");
	  const int NumElts = Mask.size();

	  for (int Slot = 0; Slot != NumElts; ++Slot) {
	    const int M = Mask[Slot];
	    if (M < 0)
	      continue; // Undefined slot: nothing to check.
	    if (M / NumElts != Input)
	      continue; // Element is taken from the other input.
	    if (M % NumElts != Slot)
	      return false; // Element of this input would have to move.
	  }

	  return true;
	}

	/// Handle case where shuffle sources are coming from the same 128-bit lane and
	/// every lane can be represented as the same repeating mask - allowing us to
	/// shuffle the sources with the repeating shuffle and then permute the result
	/// to the destination lanes.
	///
	/// \returns A pair of chained shuffles (repeated in-lane shuffle followed by a
	/// broadcast or (sub-)lane permute), or an empty SDValue when the mask does not
	/// cross 128-bit lanes, is already lane-repeated, or no suitable repeating
	/// pattern is found.
	static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	int NumElts = VT.getVectorNumElements();
	int NumLanes = VT.getSizeInBits() / 128;
	int NumLaneElts = NumElts / NumLanes;

	// On AVX2 we may be able to just shuffle the lowest elements and then
	// broadcast the result.
	if (Subtarget.hasAVX2()) {
	for (unsigned BroadcastSize : {16, 32, 64}) {
	// Only broadcast widths strictly larger than one element are of interest.
	if (BroadcastSize <= VT.getScalarSizeInBits())
	continue;
	int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

	// Attempt to match a repeating pattern every NumBroadcastElts,
	// accounting for UNDEFs but only references the lowest 128-bit
	// lane of the inputs.
	auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j) {
	int M = Mask[i + j];
	if (M < 0)
	continue;
	int &R = RepeatMask[j];
	// Reject elements that are not in lane 0 of their input.
	if (0 != ((M % NumElts) / NumLaneElts))
	return false;
	// Reject a conflict with what an earlier group chose for position j.
	if (0 <= R && R != M)
	return false;
	R = M;
	}
	return true;
	};

	// Only the first NumBroadcastElts entries are ever written; the rest stay
	// undef (-1).
	SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
	if (!FindRepeatingBroadcastMask(RepeatMask))
	continue;

	// Shuffle the (lowest) repeated elements in place for broadcast.
	SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

	// Shuffle the actual broadcast.
	SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j)
	BroadcastMask[i + j] = j;
	return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
	BroadcastMask);
	}
	}

	// Bail if the shuffle mask doesn't cross 128-bit lanes.
	if (!is128BitLaneCrossingShuffleMask(VT, Mask))
	return SDValue();

	// Bail if we already have a repeated lane shuffle mask.
	SmallVector<int, 8> RepeatedShuffleMask;
	if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
	return SDValue();

	// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
	// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
	int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
	int NumSubLanes = NumLanes * SubLaneScale;
	int NumSubLaneElts = NumLaneElts / SubLaneScale;

	// Check that all the sources are coming from the same lane and see if we can
	// form a repeating shuffle mask (local to each sub-lane). At the same time,
	// determine the source sub-lane for each destination sub-lane.
	int TopSrcSubLane = -1;
	SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
	// One candidate repeated mask per sub-lane position within a lane; only the
	// first SubLaneScale entries are consulted.
	SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};

	for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
	// Extract the sub-lane mask, check that it all comes from the same lane
	// and normalize the mask entries to come from the first lane.
	int SrcLane = -1;
	SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
	if (M < 0)
	continue;
	int Lane = (M % NumElts) / NumLaneElts;
	if ((0 <= SrcLane) && (SrcLane != Lane))
	return SDValue();
	SrcLane = Lane;
	// Keep the in-lane position and the V1/V2 distinction, drop the lane.
	int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
	SubLaneMask[Elt] = LocalM;
	}

	// Whole sub-lane is UNDEF.
	if (SrcLane < 0)
	continue;

	// Attempt to match against the candidate repeated sub-lane masks.
	for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
	// Two masks match when they agree wherever both are defined.
	auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
	for (int i = 0; i != NumSubLaneElts; ++i) {
	if (M1[i] < 0 \|\| M2[i] < 0)
	continue;
	if (M1[i] != M2[i])
	return false;
	}
	return true;
	};

	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
	if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
	continue;

	// Merge the sub-lane mask into the matching repeated sub-lane mask.
	for (int i = 0; i != NumSubLaneElts; ++i) {
	int M = SubLaneMask[i];
	if (M < 0)
	continue;
	assert((RepeatedSubLaneMask[i] < 0 \|\| RepeatedSubLaneMask[i] == M) &&
	"Unexpected mask element");
	RepeatedSubLaneMask[i] = M;
	}

	// Track the top most source sub-lane - by setting the remaining to UNDEF
	// we can greatly simplify shuffle matching.
	int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
	TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
	Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
	break;
	}

	// Bail if we failed to find a matching repeated sub-lane mask.
	if (Dst2SrcSubLanes[DstSubLane] < 0)
	return SDValue();
	}
	assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
	"Unexpected source lane");

	// Create a repeating shuffle mask for the entire vector.
	// Sub-lanes above TopSrcSubLane are left undef.
	SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
	for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
	int Lane = SubLane / SubLaneScale;
	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = RepeatedSubLaneMask[Elt];
	if (M < 0)
	continue;
	int Idx = (SubLane * NumSubLaneElts) + Elt;
	RepeatedMask[Idx] = M + (Lane * NumLaneElts);
	}
	}
	SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

	// Shuffle each source sub-lane to its destination.
	SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumSubLaneElts) {
	int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
	if (SrcSubLane < 0)
	continue;
	for (int j = 0; j != NumSubLaneElts; ++j)
	SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
	}

	return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
	SubLaneMask);
	}

	/// Test whether \p Mask fits the element pattern checked below for VSHUFPD,
	/// either directly or with the two inputs commuted.
	///
	/// \param VT         Vector type; asserted to have 64-bit elements and 2, 4
	///                   or 8 of them.
	/// \param V1,V2      Shuffle inputs; swapped in place when only the commuted
	///                   pattern matches.
	/// \param ShuffleImm Reset to 0 and then filled with one bit per defined mask
	///                   element (bit i is Mask[i] % 2). It is overwritten even
	///                   when the function returns false.
	/// \returns true if the mask or its commuted form matches; false otherwise,
	/// including when the mask holds a negative entry other than SM_SentinelUndef.
	static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
	unsigned &ShuffleImm,
	ArrayRef<int> Mask) {
	int NumElts = VT.getVectorNumElements();
	assert(VT.getScalarSizeInBits() == 64 &&
	(NumElts == 2 \|\| NumElts == 4 \|\| NumElts == 8) &&
	"Unexpected data type for VSHUFPD");

	// Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
	// Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
	ShuffleImm = 0;
	bool ShufpdMask = true;
	bool CommutableMask = true;
	for (int i = 0; i < NumElts; ++i) {
	if (Mask[i] == SM_SentinelUndef)
	continue;
	if (Mask[i] < 0)
	return false;
	// Val is the lower of the two indices allowed in slot i: even slots read
	// from V1, odd slots from V2 (offset by NumElts). CommutVal is the same with
	// the inputs exchanged. Since i < 8, (i & 6) and (i & 0xe) are equal.
	int Val = (i & 6) + NumElts * (i & 1);
	int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
	if (Mask[i] < Val \|\| Mask[i] > Val + 1)
	ShufpdMask = false;
	if (Mask[i] < CommutVal \|\| Mask[i] > CommutVal + 1)
	CommutableMask = false;
	ShuffleImm \|= (Mask[i] % 2) << i;
	}

	if (ShufpdMask)
	return true;
	if (CommutableMask) {
	// Only the commuted form matches: exchange the caller's operands.
	std::swap(V1, V2);
	return true;
	}

	return false;
	}

	/// Try to lower a v2f64, v4f64 or v8f64 shuffle to a single X86ISD::SHUFP
	/// node.
	///
	/// \p V1 and \p V2 are taken by value, so any operand swap performed by
	/// matchVectorShuffleWithSHUFPD only affects the node built here.
	///
	/// \returns The SHUFP node with its i8 immediate, or an empty SDValue when
	/// matchVectorShuffleWithSHUFPD rejects \p Mask.
	static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
	ArrayRef<int> Mask, SDValue V1,
	SDValue V2, SelectionDAG &DAG) {
	assert((VT == MVT::v2f64 \|\| VT == MVT::v4f64 \|\| VT == MVT::v8f64)&&
	"Unexpected data type for VSHUFPD");

	unsigned Immediate = 0;
	if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
	return SDValue();

	return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
	DAG.getConstant(Immediate, DL, MVT::i8));
	}

	/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
	///
	/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
	/// isn't available.
	///
	/// Both inputs must be v4f64 and \p Mask must have four entries (asserted).
	/// The strategies below are tried in order and the first one that produces a
	/// node wins; the final fallbacks always return a value.
	static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	// First see whether this is really a shuffle of whole 128-bit halves.
	if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return V;

	// Single-input shuffles: every path inside this block returns.
	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Use low duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

	if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
	// Non-half-crossing single input shuffles can be lowered with an
	// interleaved permutation.
	// Bit i of the immediate is set when result element i takes the odd
	// element of its own 128-bit lane (index 1 for the low lane, 3 for the
	// high lane).
	unsigned VPERMILPMask = (Mask[0] == 1) \| ((Mask[1] == 1) << 1) \|
	((Mask[2] == 3) << 2) \| ((Mask[3] == 3) << 3);
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
	DAG.getConstant(VPERMILPMask, DL, MVT::i8));
	}

	// With AVX2 we have direct support for this permutation.
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Otherwise, fall back.
	return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
	DAG, Subtarget);
	}

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check if the blend happens to exactly fit that of SHUFPD.
	if (SDValue Op =
	lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
	return Op;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle. However, if we have AVX2 and either input is already in place,
	// we will be able to shuffle the other input, even across lanes, in a single
	// instruction so skip this pattern.
	if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) \|\|
	isShuffleMaskInputInPlace(1, Mask))))
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return Result;
	// If we have VLX support, we can use VEXPAND.
	if (Subtarget.hasVLX())
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;

	// If we have AVX2 then we always want to lower with a blend because on a v4
	// we can fully permute the elements.
	if (Subtarget.hasAVX2())
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
	Mask, DAG);

	// Otherwise fall back on generic lowering.
	return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
	}

	/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v4i64 shuffling.
	static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");

	if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	if (V2.isUndef()) {
	// When the shuffle is mirrored between the 128-bit lanes of the unit, we
	// can use lower latency instructions that will operate on both lanes.
	SmallVector<int, 2> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
	SmallVector<int, 4> PSHUFDMask;
	scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
	return DAG.getBitcast(
	MVT::v4i64,
	DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
	DAG.getBitcast(MVT::v8i32, V1),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	}

	// AVX2 provides a direct instruction for permuting a single input across
	// lanes.
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	}

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// If we have VLX support, we can use VALIGN or VEXPAND.
	if (Subtarget.hasVLX()) {
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;
	}

	// Try to use PALIGNR.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
	return V;

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle. However, if we have AVX2 and either inputs are already in place,
	// we will be able to shuffle even across lanes the other input in a single
	// instruction so skip this pattern.
	if (!isShuffleMaskInputInPlace(0, Mask) &&
	!isShuffleMaskInputInPlace(1, Mask))
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic blend lowering.
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
	Mask, DAG);
	}

	/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
	///
	/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
	/// isn't available.
	///
	/// Strategies are attempted in priority order; the first one that yields a
	/// node is returned.
	///
	/// \param DL        Location passed to every node created and helper called.
	/// \param Mask      Shuffle mask; asserted to have exactly 8 entries.
	/// \param Zeroable  Forwarded unchanged to the helpers that accept it.
	/// \param V1        First shuffle input; asserted to be MVT::v8f32.
	/// \param V2        Second shuffle input; asserted to be MVT::v8f32.
	/// \param Subtarget Queried for AVX2, AVX512 and VLX support.
	/// \param DAG       Selection DAG in which the new nodes are created.
	/// \returns The result of the first strategy that matches, ending with
	/// lowerVectorShuffleAsSplitOrBlend as the generic fallback.
	static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	// Blends are tried first; the SHUFPS fallback further down relies on direct
	// blends having been handled already.
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// If the shuffle mask is repeated in each 128-bit lane, we have many more
	// options to efficiently lower the shuffle.
	// Every path inside this block returns.
	SmallVector<int, 4> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
	assert(RepeatedMask.size() == 4 &&
	"Repeated masks must be half the mask width!");

	// Use even/odd duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
	return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

	// A single-input repeated shuffle is an immediate-controlled in-lane
	// permute using the 4-entry repeated mask.
	if (V2.isUndef())
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
	return V;

	// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
	// have already handled any direct blends.
	return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
	}

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
	return V;

	// If we have a single input shuffle with different shuffle patterns in the
	// two 128-bit lanes use the variable mask to VPERMILPS.
	// Every path inside this block returns.
	if (V2.isUndef()) {
	// The mask is materialized as a v8i32 constant vector.
	// NOTE(review): the trailing `true` is presumably getConstVector's
	// "is a mask" flag -- confirm against its declaration.
	SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
	return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);

	// Lane-crossing single-input shuffle: with AVX2 use VPERMV. Note the
	// operand order differs from VPERMILPV above (mask first, then V1).
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);

	// Otherwise, fall back.
	return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
	DAG, Subtarget);
	}

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
	return Result;
	// If we have VLX support, we can use VEXPAND.
	if (Subtarget.hasVLX())
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;

	// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
	// since after split we get a more efficient code using vpunpcklwd and
	// vpunpckhwd instrs than vblend.
	if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
	if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
	Mask, DAG))
	return V;

	// If we have AVX2 then we always want to lower with a blend because at v8 we
	// can fully permute the elements.
	if (Subtarget.hasAVX2())
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
	Mask, DAG);

	// Otherwise fall back on generic lowering.
	return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
	}

	/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v8i32 shuffling.
	///
	/// Strategies are attempted in priority order; the first one that yields a
	/// node is returned.
	///
	/// \param DL        Location passed to every node created and helper called.
	/// \param Mask      Shuffle mask; asserted to have exactly 8 entries.
	/// \param Zeroable  Forwarded unchanged to the helpers that accept it.
	/// \param V1        First shuffle input; asserted to be MVT::v8i32.
	/// \param V2        Second shuffle input; asserted to be MVT::v8i32.
	/// \param Subtarget Queried for AVX2 (asserted), AVX512 and VLX support.
	/// \param DAG       Selection DAG in which the new nodes are created.
	/// \returns The result of the first strategy that matches, falling through to
	/// lowerVectorShuffleAsDecomposedShuffleBlend when nothing earlier matched.
	static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
	// since after split we get a more efficient code than vblend by using
	// vpunpcklwd and vpunpckhwd instrs.
	// This is only attempted for genuine two-input shuffles (V2 not undef).
	if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
	!Subtarget.hasAVX512())
	if (SDValue V =
	lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// If the shuffle mask is repeated in each 128-bit lane we can use more
	// efficient instructions that mirror the shuffles across the two 128-bit
	// lanes.
	// The result of the check is kept because the SHUFPS path further down
	// reuses both the flag and RepeatedMask.
	SmallVector<int, 4> RepeatedMask;
	bool Is128BitLaneRepeatedShuffle =
	is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
	if (Is128BitLaneRepeatedShuffle) {
	assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
	if (V2.isUndef())
	return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
	return V;
	}

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// If we have VLX support, we can use VALIGN or EXPAND.
	if (Subtarget.hasVLX()) {
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;
	}

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
	return V;

	// If the shuffle patterns aren't repeated but it is a single input, directly
	// generate a cross-lane VPERMD instruction.
	if (V2.isUndef()) {
	// NOTE(review): the trailing `true` is presumably getConstVector's
	// "is a mask" flag -- confirm against its declaration.
	SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
	}

	// Assume that a single SHUFPS is faster than an alternative sequence of
	// multiple instructions (even if the CPU has a domain penalty).
	// If some CPU is harmed by the domain switch, we can fix it in a later pass.
	// The inputs are bitcast to v8f32 for the SHUFPS and the result is bitcast
	// back to v8i32.
	if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
	SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
	SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
	SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
	CastV1, CastV2, DAG);
	return DAG.getBitcast(MVT::v8i32, ShufPS);
	}

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic blend lowering.
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
	Mask, DAG);
	}

	/// \brief Handle lowering of 16-lane 16-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v16i16 shuffling.
	///
	/// Strategies are attempted in priority order; the first one that yields a
	/// node is returned.
	///
	/// \param DL        Location passed to every helper called.
	/// \param Mask      Shuffle mask; asserted to have exactly 16 entries.
	/// \param Zeroable  Forwarded unchanged to the helpers that accept it.
	/// \param V1        First shuffle input; asserted to be MVT::v16i16.
	/// \param V2        Second shuffle input; asserted to be MVT::v16i16.
	/// \param Subtarget Queried for AVX2 (asserted), BWI and VLX support.
	/// \param DAG       Selection DAG in which the new nodes are created.
	/// \returns The result of the first strategy that matches, ending with
	/// lowerVectorShuffleAsSplitOrBlend as the generic fallback.
	static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
	return V;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Single-input shuffles: handle the lane-crossing and the lane-repeated
	// cases here. A single-input mask that is neither falls out of this block
	// and continues with the strategies below.
	if (V2.isUndef()) {
	// There are no generalized cross-lane shuffle operations available on i16
	// element types.
	if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
	return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
	Mask, DAG, Subtarget);

	SmallVector<int, 8> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
	// As this is a single-input shuffle, the repeated mask should be
	// a strictly valid v8i16 mask that we can pass through to the v8i16
	// lowering to handle even the v16 case.
	return lowerV8I16GeneralSingleInputVectorShuffle(
	DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
	}
	}

	// Try a byte-shuffle (PSHUFB) based lowering.
	if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
	DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
	return PSHUFB;

	// AVX512BWVL can lower to VPERMW.
	// This return is unconditional once both features are present, so the
	// strategies below are only reached without BWI+VLX.
	if (Subtarget.hasBWI() && Subtarget.hasVLX())
	return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic lowering.
	return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
	}

	/// \brief Handle lowering of 32-lane 8-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v32i8 shuffling.
	///
	/// Strategies are attempted in priority order; the first one that yields a
	/// node is returned.
	///
	/// \param DL        Location passed to every helper called.
	/// \param Mask      Shuffle mask; asserted to have exactly 32 entries.
	/// \param Zeroable  Forwarded unchanged to the helpers that accept it.
	/// \param V1        First shuffle input; asserted to be MVT::v32i8.
	/// \param V2        Second shuffle input; asserted to be MVT::v32i8.
	/// \param Subtarget Queried for AVX2 (asserted), VBMI and VLX support.
	/// \param DAG       Selection DAG in which the new nodes are created.
	/// \returns The result of the first strategy that matches, ending with
	/// lowerVectorShuffleAsSplitOrBlend as the generic fallback.
	static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
	assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
	return V;

	// Use dedicated pack instructions for masks that match their pattern.
	if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
	Subtarget))
	return V;

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
	return V;

	// There are no generalized cross-lane shuffle operations available on i8
	// element types.
	// Only single-input lane-crossing shuffles take this path.
	if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
	return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
	DAG, Subtarget);

	// Try a byte-shuffle (PSHUFB) based lowering.
	if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
	DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
	return PSHUFB;

	// AVX512VBMIVL can lower to VPERMB.
	// This return is unconditional once both features are present, so the
	// strategies below are only reached without VBMI+VLX.
	if (Subtarget.hasVBMI() && Subtarget.hasVLX())
	return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic lowering.
	return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
	}

	/// \brief Top-level dispatcher for lowering 256-bit x86 vector shuffles.
	///
	/// A few type-independent strategies are tried first. Integer shuffles on
	/// targets without AVX2 are then either bit-masked/bit-blended/split (elements
	/// narrower than 32 bits) or re-expressed as a floating point shuffle of the
	/// same shape. Everything else is handed to the lowering routine for the
	/// specific vector type.
	static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        MVT VT, SDValue V1, SDValue V2,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  // Count the mask entries that select from V2, i.e. whose index is at or
	  // beyond the element count of a single input.
	  const int EltCount = VT.getVectorNumElements();
	  int FromV2 = 0;
	  for (int M : Mask)
	    if (M >= EltCount)
	      ++FromV2;

	  // If V2 contributes exactly one element and it lands in element zero, try
	  // to insert it into V1 cheaply.
	  const bool OnlyElt0FromV2 = FromV2 == 1 && Mask[0] >= EltCount;
	  if (OnlyElt0FromV2) {
	    SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	        DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG);
	    if (Inserted)
	      return Inserted;
	  }

	  // Shuffles whose lower or upper half is UNDEF get a dedicated lowering.
	  SDValue UndefHalf =
	      lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG);
	  if (UndefHalf)
	    return UndefHalf;

	  // There is a hard cut-over between AVX1 and AVX2: without AVX2 there is
	  // essentially no way to manipulate a 256-bit vector with integer types, so
	  // integer shuffles are diverted here and never reach the per-type routines.
	  if (VT.isInteger() && !Subtarget.hasAVX2()) {
	    const int EltBits = VT.getScalarSizeInBits();

	    // Elements of at least 32 bits have a floating point type of the same
	    // width: cast to it, shuffle in that domain, and cast the result back.
	    if (EltBits >= 32) {
	      MVT FpEltVT = MVT::getFloatingPointVT(EltBits);
	      MVT FpVT = MVT::getVectorVT(FpEltVT, VT.getVectorNumElements());
	      SDValue FpV1 = DAG.getBitcast(FpVT, V1);
	      SDValue FpV2 = DAG.getBitcast(FpVT, V2);
	      SDValue FpShuffle = DAG.getVectorShuffle(FpVT, DL, FpV1, FpV2, Mask);
	      return DAG.getBitcast(VT, FpShuffle);
	    }

	    // No floating point type is available for narrower elements. Use bit
	    // operations for masking/blending if possible, otherwise decompose into
	    // 128-bit vectors.
	    SDValue Masked =
	        lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG);
	    if (Masked)
	      return Masked;

	    SDValue BitBlend = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG);
	    if (BitBlend)
	      return BitBlend;

	    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	  }

	  // Dispatch on the concrete 256-bit vector type.
	  switch (VT.SimpleTy) {
	  case MVT::v4f64:
	    return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v4i64:
	    return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8f32:
	    return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8i32:
	    return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i16:
	    return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v32i8:
	    return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  default:
	    llvm_unreachable("Not a valid 256-bit x86 vector type!");
	  }
	}

	/// \brief Try to lower a 512-bit vector shuffle as a shuffle of 128-bit lanes.
	///
	/// Three patterns are recognized, in this order: a concatenation of two
	/// 256-bit low halves, an insertion of the low 128 bits of V2 into V1, and a
	/// general X86ISD::SHUF128 with an immediate control byte.
	///
	/// \param DL   Location passed to every node created here.
	/// \param VT   Vector type; asserted to be 512 bits wide with 64-bit elements.
	/// \param Mask Element-level shuffle mask (8 entries for the asserted type).
	/// \param V1   First shuffle input.
	/// \param V2   Second shuffle input.
	/// \param DAG  Selection DAG in which the new nodes are created.
	/// \returns The lowered node, or an empty SDValue when the mask cannot be
	/// widened or when one half of the result would need lanes from both inputs.
	static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
	ArrayRef<int> Mask, SDValue V1,
	SDValue V2, SelectionDAG &DAG) {
	assert(VT.getScalarSizeInBits() == 64 &&
	"Unexpected element type size for 128bit shuffle.");

	// To handle 256 bit vector requires VLX and most probably
	// function lowerV2X128VectorShuffle() is better solution.
	assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

	// Widen the mask to one entry per 128-bit lane (asserted to be 4 entries
	// below); give up if the element mask cannot be widened.
	SmallVector<int, 4> WidenedMask;
	if (!canWidenShuffleElements(Mask, WidenedMask))
	return SDValue();

	// Check for patterns which can be matched with a single insert of a 256-bit
	// subvector.
	// In both patterns the result is the low 256 bits of V1 followed by the low
	// 256 bits of either V1 (OnlyUsesV1) or V2.
	bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
	{0, 1, 2, 3, 0, 1, 2, 3});
	if (OnlyUsesV1 \|\| isShuffleEquivalent(V1, V2, Mask,
	{0, 1, 2, 3, 8, 9, 10, 11})) {
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
	SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
	DAG.getIntPtrConstant(0, DL));
	SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
	OnlyUsesV1 ? V1 : V2,
	DAG.getIntPtrConstant(0, DL));
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
	}

	assert(WidenedMask.size() == 4);

	// See if this is an insertion of the lower 128-bits of V2 into V1.
	// Widened indices 0-3 name lanes of V1, 4 and above name lanes of V2, and
	// negative entries are undef and match anything.
	bool IsInsert = true;
	int V2Index = -1;
	for (int i = 0; i < 4; ++i) {
	assert(WidenedMask[i] >= -1);
	if (WidenedMask[i] < 0)
	continue;

	// Make sure all V1 subvectors are in place.
	if (WidenedMask[i] < 4) {
	if (WidenedMask[i] != i) {
	IsInsert = false;
	break;
	}
	} else {
	// Make sure we only have a single V2 index and it's the lowest 128-bits.
	if (V2Index >= 0 \|\| WidenedMask[i] != 4) {
	IsInsert = false;
	break;
	}
	V2Index = i;
	}
	}
	if (IsInsert && V2Index >= 0) {
	MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
	SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
	DAG.getIntPtrConstant(0, DL));
	// V2Index counts 128-bit lanes; each lane holds two 64-bit elements, so the
	// insertion position passed on is V2Index * 2.
	return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
	}

	// Try to lower to vshuf64x2/vshuf32x4.
	// Ops[0] supplies result lanes 0-1 and Ops[1] supplies result lanes 2-3.
	SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
	unsigned PermMask = 0;
	// Ensure both lanes of each result half come from the same Op.
	for (int i = 0; i < 4; ++i) {
	assert(WidenedMask[i] >= -1);
	if (WidenedMask[i] < 0)
	continue;

	SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
	unsigned OpIndex = i / 2;
	if (Ops[OpIndex].isUndef())
	Ops[OpIndex] = Op;
	else if (Ops[OpIndex] != Op)
	return SDValue();

	// Convert the 128-bit shuffle mask selection values into 128-bit selection
	// bits defined by a vshuf64x2 instruction's immediate control byte.
	// Each result lane i owns two bits at position i * 2; "% 4" reduces the
	// widened index to a lane number within the chosen operand.
	PermMask \|= (WidenedMask[i] % 4) << (i * 2);
	}

	return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
	DAG.getConstant(PermMask, DL, MVT::i8));
	}

	/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
	///
	/// Strategies are attempted in priority order; the first one that yields a
	/// node is returned.
	///
	/// \param DL        Location passed to every node created and helper called.
	/// \param Mask      Shuffle mask; asserted to have exactly 8 entries.
	/// \param Zeroable  Forwarded unchanged to the helpers that accept it.
	/// \param V1        First shuffle input; asserted to be MVT::v8f64.
	/// \param V2        Second shuffle input; asserted to be MVT::v8f64.
	/// \param Subtarget Forwarded to the helpers that accept it.
	/// \param DAG       Selection DAG in which the new nodes are created.
	/// \returns The result of the first strategy that matches, ending with
	/// lowerVectorShuffleWithPERMV as the fallback.
	static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	// Single-input special cases. A single-input mask matching none of them
	// falls out of this block and continues with the strategies below.
	if (V2.isUndef()) {
	// Use low duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
	return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

	if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
	// Non-half-crossing single input shuffles can be lowered with an
	// interleaved permutation.
	// Bit i of the immediate is set exactly when Mask[i] is the odd index of
	// its element pair (1, 3, 5 or 7); any other value, including undef,
	// leaves the bit clear.
	unsigned VPERMILPMask = (Mask[0] == 1) \| ((Mask[1] == 1) << 1) \|
	((Mask[2] == 3) << 2) \| ((Mask[3] == 3) << 3) \|
	((Mask[4] == 5) << 4) \| ((Mask[5] == 5) << 5) \|
	((Mask[6] == 7) << 6) \| ((Mask[7] == 7) << 7);
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
	DAG.getConstant(VPERMILPMask, DL, MVT::i8));
	}

	// A mask repeated in both 256-bit halves is an immediate-controlled
	// permute built from the 4-entry repeated mask.
	SmallVector<int, 4> RepeatedMask;
	if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
	}

	// Try to express the shuffle in terms of whole 128-bit lanes.
	if (SDValue Shuf128 =
	lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
	return Shuf128;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue Unpck =
	lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
	return Unpck;

	// Check if the blend happens to exactly fit that of SHUFPD.
	if (SDValue Op =
	lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
	return Op;

	// Try VEXPAND.
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
	V2, DAG, Subtarget))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Otherwise fall back on a variable permute.
	return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
	}

	/// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
	///
	/// Strategies are attempted in priority order; the first one that yields a
	/// node is returned.
	///
	/// \param DL        Location passed to every node created and helper called.
	/// \param Mask      Shuffle mask; asserted to have exactly 16 entries.
	/// \param Zeroable  Forwarded unchanged to the helpers that accept it.
	/// \param V1        First shuffle input; asserted to be MVT::v16f32.
	/// \param V2        Second shuffle input; asserted to be MVT::v16f32.
	/// \param Subtarget Forwarded to the helpers that accept it.
	/// \param DAG       Selection DAG in which the new nodes are created.
	/// \returns The result of the first strategy that matches, ending with
	/// lowerVectorShuffleWithPERMV as the fallback.
	static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	// If the shuffle mask is repeated in each 128-bit lane, we have many more
	// options to efficiently lower the shuffle.
	// Every path inside this block returns.
	SmallVector<int, 4> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
	assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");

	// Use even/odd duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
	return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);

	// A single-input repeated shuffle is an immediate-controlled in-lane
	// permute using the 4-entry repeated mask.
	if (V2.isUndef())
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue Unpck =
	lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
	return Unpck;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Otherwise, fall back to a SHUFPS sequence.
	return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
	}

	// If we have a single input shuffle with different shuffle patterns in the
	// 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
	if (V2.isUndef() &&
	!is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
	// The mask is materialized as a v16i32 constant vector.
	// NOTE(review): the trailing `true` is presumably getConstVector's
	// "is a mask" flag -- confirm against its declaration.
	SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
	return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
	}

	// If we have AVX512F support, we can use VEXPAND.
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;

	// Otherwise fall back on a variable permute.
	return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
	}

	/// \brief Handle lowering of 8-lane 64-bit integer shuffles.
	///
	/// Strategies are attempted in priority order; the first one that yields a
	/// node is returned.
	///
	/// \param DL        Location passed to every node created and helper called.
	/// \param Mask      Shuffle mask; asserted to have exactly 8 entries.
	/// \param Zeroable  Forwarded unchanged to the helpers that accept it.
	/// \param V1        First shuffle input; asserted to be MVT::v8i64.
	/// \param V2        Second shuffle input; asserted to be MVT::v8i64.
	/// \param Subtarget Forwarded to the helpers that accept it.
	/// \param DAG       Selection DAG in which the new nodes are created.
	/// \returns The result of the first strategy that matches, ending with
	/// lowerVectorShuffleWithPERMV as the fallback.
	static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	// Single-input special cases. A single-input mask matching neither of them
	// falls out of this block and continues with the strategies below.
	if (V2.isUndef()) {
	// When the shuffle is mirrored between the 128-bit lanes of the unit, we
	// can use lower latency instructions that will operate on all four
	// 128-bit lanes.
	SmallVector<int, 2> Repeated128Mask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
	// PSHUFD is issued on the v16i32 bitcast of V1, so the repeated i64 mask is
	// scaled by 2 to obtain the 4-entry mask the immediate is built from.
	SmallVector<int, 4> PSHUFDMask;
	scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
	return DAG.getBitcast(
	MVT::v8i64,
	DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
	DAG.getBitcast(MVT::v16i32, V1),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	}

	// A mask repeated in both 256-bit halves is an immediate-controlled
	// permute built from the 4-entry repeated mask.
	SmallVector<int, 4> Repeated256Mask;
	if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
	getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
	}

	// Try to express the shuffle in terms of whole 128-bit lanes.
	if (SDValue Shuf128 =
	lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
	return Shuf128;

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use VALIGN.
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	// Try to use PALIGNR.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue Unpck =
	lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
	return Unpck;
	// If we have AVX512F support, we can use VEXPAND.
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
	V2, DAG, Subtarget))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Otherwise fall back on a variable permute.
	return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
	}

	/// \brief Handle lowering of 16-lane 32-bit integer shuffles.
	///
	/// Strategies are attempted in priority order; the first one that yields a
	/// node is returned.
	///
	/// \param DL        Location passed to every node created and helper called.
	/// \param Mask      Shuffle mask; asserted to have exactly 16 entries.
	/// \param Zeroable  Forwarded unchanged to the helpers that accept it.
	/// \param V1        First shuffle input; asserted to be MVT::v16i32.
	/// \param V2        Second shuffle input; asserted to be MVT::v16i32.
	/// \param Subtarget Queried for BWI support and forwarded to helpers.
	/// \param DAG       Selection DAG in which the new nodes are created.
	/// \returns The result of the first strategy that matches, ending with
	/// lowerVectorShuffleWithPERMV as the fallback.
	static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// If the shuffle mask is repeated in each 128-bit lane we can use more
	// efficient instructions that mirror the shuffles across the four 128-bit
	// lanes.
	// The result of the check is kept because the SHUFPS path further down
	// reuses both the flag and RepeatedMask.
	SmallVector<int, 4> RepeatedMask;
	bool Is128BitLaneRepeatedShuffle =
	is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
	if (Is128BitLaneRepeatedShuffle) {
	assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
	if (V2.isUndef())
	return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
	return V;
	}

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use VALIGN.
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	// Try to use byte rotation instructions.
	// This is only attempted when the subtarget has BWI.
	if (Subtarget.hasBWI())
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Assume that a single SHUFPS is faster than using a permv shuffle.
	// If some CPU is harmed by the domain switch, we can fix it in a later pass.
	// The inputs are bitcast to v16f32 for the SHUFPS and the result is bitcast
	// back to v16i32.
	if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
	SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
	SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
	SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
	CastV1, CastV2, DAG);
	return DAG.getBitcast(MVT::v16i32, ShufPS);
	}
	// If we have AVX512F support, we can use VEXPAND.
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;
	// Otherwise fall back on a variable permute.
	return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
	}

	/// \brief Handle lowering of 32-lane 16-bit integer shuffles.
	///
	/// Asserts that both operands are v32i16, that the mask has 32 entries and
	/// that the subtarget has BWI. The strategies below are attempted strictly in
	/// the order written; the first helper that yields a non-null SDValue wins.
	/// If none of them matches, the shuffle is lowered through
	/// lowerVectorShuffleWithPERMV.
	static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
	assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
	return V;

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Single-input shuffles whose mask repeats in every 128-bit lane are handed
	// to the v8i16 single-input lowering, using the per-lane repeated mask.
	if (V2.isUndef()) {
	SmallVector<int, 8> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
	// As this is a single-input shuffle, the repeated mask should be
	// a strictly valid v8i16 mask that we can pass through to the v8i16
	// lowering to handle even the v32 case.
	return lowerV8I16GeneralSingleInputVectorShuffle(
	DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
	}
	}

	// Try to lower the shuffle as a blend of the two inputs.
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Try the PSHUFB-based helper.
	if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
	DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
	return PSHUFB;

	// Nothing cheaper matched: fall back to the PERMV helper.
	return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
	}

	/// \brief Handle lowering of 64-lane 8-bit integer shuffles.
	///
	/// Asserts that both operands are v64i8, that the mask has 64 entries and
	/// that the subtarget has BWI. The strategies below are attempted strictly in
	/// the order written; the first helper that yields a non-null SDValue wins.
	/// With VBMI the PERMV helper is used unconditionally once the cheaper
	/// patterns have failed; without it the final fallback is
	/// splitAndLowerVectorShuffle.
	static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
	assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
	assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
	return V;

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Try the PSHUFB-based helper.
	if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
	DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
	return PSHUFB;

	// VBMI can use VPERMV/VPERMV3 byte shuffles.
	if (Subtarget.hasVBMI())
	return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Try to lower the shuffle as a blend of the two inputs.
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// FIXME: Implement direct support for this type!
	return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
	}

	/// \brief High-level routine to lower various 512-bit x86 vector shuffles.
	///
	/// This routine either breaks down the specific type of a 512-bit x86 vector
	/// shuffle or splits it into two 256-bit shuffles and fuses the results back
	/// together based on the available instructions.
	///
	/// Asserts that the subtarget has AVX-512. Three type-independent patterns
	/// (element insertion, undef half, broadcast) are tried first; after that the
	/// shuffle is dispatched on \p VT to the matching per-type routine. Any type
	/// not listed in the switch is treated as unreachable.
	static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	MVT VT, SDValue V1, SDValue V2,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(Subtarget.hasAVX512() &&
	"Cannot lower 512-bit vectors w/ basic ISA!");

	// If we have a single input to the zero element, insert that into V1 if we
	// can do so cheaply.
	int NumElts = Mask.size();
	// Mask entries >= NumElts select from V2.
	int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });

	// Only attempt the insertion when exactly one entry uses V2 and that entry
	// is element 0.
	if (NumV2Elements == 1 && Mask[0] >= NumElts)
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Insertion;

	// Handle special cases where the lower or upper half is UNDEF.
	if (SDValue V =
	lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast =
	lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Dispatch to each element type for lowering. If we don't have support for
	// specific element type shuffles at 512 bits, immediately split them and
	// lower them. Each lowering routine of a given type is allowed to assume that
	// the requisite ISA extensions for that element type are available.
	switch (VT.SimpleTy) {
	case MVT::v8f64:
	return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	case MVT::v16f32:
	return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	case MVT::v8i64:
	return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	case MVT::v16i32:
	return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	case MVT::v32i16:
	return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	case MVT::v64i8:
	return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	default:
	llvm_unreachable("Not a valid 512-bit x86 vector type!");
	}
	}

	// Lower vXi1 vector shuffles.
	// AVX-512 has no dedicated instruction that shuffles mask registers. The only
	// way to shuffle the bits is to sign-extend the mask operands to a regular
	// SIMD vector type, shuffle there, and then convert the result back to a
	// mask.
	static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                      MVT VT, SDValue V1, SDValue V2,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "Cannot lower 512-bit vectors w/o basic ISA!");

	  // Choose the SIMD type, with the same element count, to shuffle in.
	  MVT WideVT;
	  switch (VT.SimpleTy) {
	  case MVT::v2i1:
	    WideVT = MVT::v2i64;
	    break;
	  case MVT::v4i1:
	    WideVT = MVT::v4i32;
	    break;
	  case MVT::v8i1:
	    // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
	    // shuffle.
	    if (Subtarget.hasVLX())
	      WideVT = MVT::v8i32;
	    else
	      WideVT = MVT::v8i64;
	    break;
	  case MVT::v16i1:
	    WideVT = MVT::v16i32;
	    break;
	  case MVT::v32i1:
	    WideVT = MVT::v32i16;
	    break;
	  case MVT::v64i1:
	    WideVT = MVT::v64i8;
	    break;
	  default:
	    llvm_unreachable("Expected a vector of i1 elements");
	  }

	  // Widen one mask operand: all-zeros and all-ones build vectors are
	  // materialized directly in the wide type, anything else is sign-extended.
	  auto WidenOperand = [&](SDValue V) -> SDValue {
	    if (ISD::isBuildVectorAllZeros(V.getNode()))
	      return getZeroVector(WideVT, Subtarget, DAG, DL);
	    if (ISD::isBuildVectorAllOnes(V.getNode()))
	      return getOnesVector(WideVT, DAG, DL);
	    return DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, V);
	  };

	  V1 = WidenOperand(V1);
	  // Only the second operand gets an explicit undef check.
	  if (V2.isUndef())
	    V2 = DAG.getUNDEF(WideVT);
	  else
	    V2 = WidenOperand(V2);

	  SDValue WideShuffle = DAG.getVectorShuffle(WideVT, DL, V1, V2, Mask);

	  // i1 was sign extended we can use X86ISD::CVT2MASK, provided the feature
	  // matching the element count is present (BWI for >= 32 elements, DQI
	  // below that).
	  int NumElems = VT.getVectorNumElements();
	  bool CanCvt2Mask = NumElems >= 32 ? Subtarget.hasBWI() : Subtarget.hasDQI();
	  if (CanCvt2Mask)
	    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, WideShuffle);

	  return DAG.getNode(ISD::TRUNCATE, DL, VT, WideShuffle);
	}

	/// Helper function that returns true if the shuffle mask should be
	/// commuted to improve canonicalization.
	///
	/// Mask entries < 0 are ignored, entries in [0, Mask.size()) count as V1
	/// references and entries >= Mask.size() count as V2 references.
	static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
	  const int Size = Mask.size();

	  // Count how many defined entries select from each input.
	  int FromV1 = 0, FromV2 = 0;
	  for (int Elt : Mask) {
	    if (Elt < 0)
	      continue;
	    if (Elt < Size)
	      ++FromV1;
	    else
	      ++FromV2;
	  }

	  // Commute the shuffle as needed such that more elements come from V1 than
	  // V2. This allows us to match the shuffle pattern strictly on how many
	  // elements come from V1 without handling the symmetric cases.
	  if (FromV2 > FromV1)
	    return true;

	  assert(FromV1 > 0 && "No V1 indices");

	  // Only an exact tie between the two inputs needs the tie-breakers below.
	  if (FromV2 == 0 || FromV1 != FromV2)
	    return false;

	  // Tie-breaker 1: minimize the number of uses of V2 in the low half of the
	  // vector.
	  int LowFromV1 = 0, LowFromV2 = 0;
	  for (int Pos = 0, Half = Size / 2; Pos != Half; ++Pos) {
	    int Elt = Mask[Pos];
	    if (Elt >= Size)
	      ++LowFromV2;
	    else if (Elt >= 0)
	      ++LowFromV1;
	  }
	  if (LowFromV2 != LowFromV1)
	    return LowFromV2 > LowFromV1;

	  // Tie-breaker 2: the sum of positions fed by V1 should be equal to or lower
	  // than the sum of positions fed by V2.
	  // Tie-breaker 3: V1 should not feed more odd positions than V2.
	  // Both statistics are gathered in one pass over the mask.
	  int PosSumV1 = 0, PosSumV2 = 0;
	  int OddPosV1 = 0, OddPosV2 = 0;
	  for (int Pos = 0; Pos != Size; ++Pos) {
	    if (Mask[Pos] >= Size) {
	      PosSumV2 += Pos;
	      OddPosV2 += Pos % 2;
	    } else if (Mask[Pos] >= 0) {
	      PosSumV1 += Pos;
	      OddPosV1 += Pos % 2;
	    }
	  }
	  if (PosSumV2 != PosSumV1)
	    return PosSumV2 < PosSumV1;

	  return OddPosV2 < OddPosV1;
	}

	/// \brief Top-level lowering for x86 vector shuffles.
	///
	/// This handles decomposition, canonicalization, and lowering of all x86
	/// vector shuffles. Most of the specific lowering strategies are encapsulated
	/// above in helper routines. The canonicalization attempts to widen shuffles
	/// to involve fewer lanes of wider elements, consolidate symmetric patterns
	/// s.t. only one of the two inputs needs to be tested, etc.
	///
	/// Several of the early exits below do not lower anything themselves: they
	/// return a new, more canonical shuffle node (commuted, with a cleaned-up
	/// mask, or with wider elements) built through the DAG.
	static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
	ArrayRef<int> Mask = SVOp->getMask();
	SDValue V1 = Op.getOperand(0);
	SDValue V2 = Op.getOperand(1);
	MVT VT = Op.getSimpleValueType();
	int NumElements = VT.getVectorNumElements();
	SDLoc DL(Op);
	bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

	// 64-bit vectors are only accepted here when they are vXi1 mask vectors.
	assert((VT.getSizeInBits() != 64 \|\| Is1BitVector) &&
	"Can't lower MMX shuffles");

	// A shuffle of two undef inputs is itself undef.
	bool V1IsUndef = V1.isUndef();
	bool V2IsUndef = V2.isUndef();
	if (V1IsUndef && V2IsUndef)
	return DAG.getUNDEF(VT);

	// When we create a shuffle node we put the UNDEF node to second operand,
	// but in some cases the first operand may be transformed to UNDEF.
	// In this case we should just commute the node.
	if (V1IsUndef)
	return DAG.getCommutedVectorShuffle(*SVOp);

	// Check for non-undef masks pointing at an undef vector and make the masks
	// undef as well. This makes it easier to match the shuffle based solely on
	// the mask.
	if (V2IsUndef)
	for (int M : Mask)
	if (M >= NumElements) {
	// Found at least one reference into the undef V2: rebuild the whole mask
	// with every such entry replaced by -1 and return the new shuffle.
	SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
	for (int &M : NewMask)
	if (M >= NumElements)
	M = -1;
	return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
	}

	// Check for illegal shuffle mask element index values.
	// The (void) cast keeps MaskUpperLimit referenced when the assert below is
	// compiled out.
	int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
	assert(llvm::all_of(Mask,
	[&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
	"Out of bounds shuffle index");

	// We actually see shuffles that are entirely re-arrangements of a set of
	// zero inputs. This mostly happens while decomposing complex shuffles into
	// simple ones. Directly lower these as a buildvector of zeros.
	APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
	if (Zeroable.isAllOnesValue())
	return getZeroVector(VT, Subtarget, DAG, DL);

	// Try to collapse shuffles into using a vector type with fewer elements but
	// wider element types. We cap this to not form integers or floating point
	// elements wider than 64 bits, but it might be interesting to form i128
	// integers to handle flipping the low and high halves of AVX 256-bit vectors.
	SmallVector<int, 16> WidenedMask;
	if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
	canWidenShuffleElements(Mask, WidenedMask)) {
	// The widened type has elements twice as wide and half as many of them.
	MVT NewEltVT = VT.isFloatingPoint()
	? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
	: MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
	MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
	// Make sure that the new vector type is legal. For example, v2f64 isn't
	// legal on SSE1.
	if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
	V1 = DAG.getBitcast(NewVT, V1);
	V2 = DAG.getBitcast(NewVT, V2);
	return DAG.getBitcast(
	VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
	}
	}

	// Commute the shuffle if it will improve canonicalization.
	if (canonicalizeShuffleMaskWithCommute(Mask))
	return DAG.getCommutedVectorShuffle(*SVOp);

	// For each vector width, delegate to a specialized lowering routine.
	if (VT.is128BitVector())
	return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	DAG);

	if (VT.is256BitVector())
	return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	DAG);

	if (VT.is512BitVector())
	return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	DAG);

	// vXi1 mask vectors that matched none of the widths above get their own
	// lowering.
	if (Is1BitVector)
	return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);

	llvm_unreachable("Unimplemented!");
	}

	/// \brief Try to lower a VSELECT instruction to a vector shuffle.
	///
	/// Applies only when the condition operand is a build-vector of constants;
	/// otherwise a null SDValue is returned. Each condition element becomes one
	/// shuffle-mask entry: a non-zero constant keeps lane i of the first data
	/// operand, a zero constant takes lane i of the second data operand, and any
	/// element that is not a ConstantSDNode yields an undef (-1) entry.
	static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  SDValue Cond = Op.getOperand(0);
	  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
	    return SDValue();

	  SDValue TrueVal = Op.getOperand(1);
	  SDValue FalseVal = Op.getOperand(2);
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);
	  auto *CondBV = cast<BuildVectorSDNode>(Cond);

	  // Only non-legal VSELECTs reach this lowering, convert those into generic
	  // shuffles and re-use the shuffle lowering path for blends.
	  const int NumElts = VT.getVectorNumElements();
	  SmallVector<int, 32> BlendMask;
	  for (int Lane = 0; Lane != NumElts; ++Lane) {
	    SDValue CondElt = CondBV->getOperand(Lane);
	    int Entry = -1;
	    if (isa<ConstantSDNode>(CondElt))
	      Entry = isNullConstant(CondElt) ? Lane + NumElts : Lane;
	    BlendMask.push_back(Entry);
	  }
	  return DAG.getVectorShuffle(VT, dl, TrueVal, FalseVal, BlendMask);
	}

	/// Custom lowering for VSELECT.
	///
	/// Returns \p Op unchanged when the node can be kept as it is, a replacement
	/// node when it is rewritten (blend shuffle, or a mask-based select for
	/// 512-bit types), and a null SDValue when the lowering declines to handle
	/// the node.
	SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Cond = Op.getOperand(0);
	  SDValue TrueVal = Op.getOperand(1);
	  SDValue FalseVal = Op.getOperand(2);

	  // A vselect where all conditions and data are constants can be optimized into
	  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
	  const bool AllOperandsConstant =
	      ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
	      ISD::isBuildVectorOfConstantSDNodes(TrueVal.getNode()) &&
	      ISD::isBuildVectorOfConstantSDNodes(FalseVal.getNode());
	  if (AllOperandsConstant)
	    return SDValue();

	  // Try to lower this to a blend-style vector shuffle. This can handle all
	  // constant condition cases.
	  SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG);
	  if (BlendOp)
	    return BlendOp;

	  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
	  // with patterns on the mask registers on AVX-512.
	  if (Cond.getValueType().getScalarSizeInBits() == 1)
	    return Op;

	  // Variable blends are only legal from SSE4.1 onward.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
	  // into an i1 condition so that we can use the mask-based 512-bit blend
	  // instructions.
	  if (VT.getSizeInBits() == 512) {
	    // The vNi1 condition case should be handled above as it can be trivially
	    // lowered.
	    assert(Cond.getValueType().getScalarSizeInBits() ==
	               VT.getScalarSizeInBits() &&
	           "Should have a size-matched integer condition!");
	    // Build a mask by testing the condition against itself (tests for zero).
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
	    // Now return a new VSELECT using the mask.
	    return DAG.getSelect(dl, VT, Mask, TrueVal, FalseVal);
	  }

	  // Only some types will be legal on some subtargets. If we can emit a legal
	  // VSELECT-matching blend, return Op, but if we need to expand, return
	  // a null value.

	  // FIXME: We should custom lower this by fixing the condition and using i8
	  // blends.
	  if (VT == MVT::v8i16 || VT == MVT::v16i16)
	    return SDValue();

	  // The byte blends for AVX vectors were introduced only in AVX2.
	  if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
	    return SDValue();

	  // Most of the vector types have blends past SSE4.1.
	  return Op;
	}

	/// SSE4.1-specific lowering of EXTRACT_VECTOR_ELT from a 128-bit vector.
	///
	/// Returns a null SDValue when the source vector is not 128 bits wide or when
	/// none of the cases below applies, \p Op itself when the node can be kept
	/// as it is, and a replacement node otherwise:
	///  - 8-bit results are extracted with PEXTRB into an i32 and truncated.
	///  - f32 results are extracted as i32 from the vector bitcast to v4i32 and
	///    bitcast back, but only for the single-use store/bitcast cases described
	///    in the body.
	///  - i32/i64 results with a constant index are left untouched.
	static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
	    return SDValue();

	  if (VT.getSizeInBits() == 8) {
	    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
	                                  Op.getOperand(0), Op.getOperand(1));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
	  }

	  if (VT == MVT::f32) {
	    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
	    // the result back to FR32 register. It's only worth matching if the
	    // result has a single use which is a store or a bitcast to i32. And in
	    // the case of a store, it's not worth it if the index is a constant 0,
	    // because a MOVSSmr can be used instead, which is smaller and faster.
	    if (!Op.hasOneUse())
	      return SDValue();
	    // The single user of this node; it is accessed through '->' below, so it
	    // must be held as a pointer obtained by dereferencing the use iterator.
	    SDNode *User = *Op.getNode()->use_begin();
	    if ((User->getOpcode() != ISD::STORE ||
	         isNullConstant(Op.getOperand(1))) &&
	        (User->getOpcode() != ISD::BITCAST ||
	         User->getValueType(0) != MVT::i32))
	      return SDValue();
	    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
	                                  Op.getOperand(1));
	    return DAG.getBitcast(MVT::f32, Extract);
	  }

	  if (VT == MVT::i32 || VT == MVT::i64) {
	    // ExtractPS/pextrq works with constant index.
	    if (isa<ConstantSDNode>(Op.getOperand(1)))
	      return Op;
	  }

	  return SDValue();
	}

	/// Extract one bit from mask vector, like v16i1 or v8i1.
	/// AVX-512 feature.
	///
	/// With a variable index the mask is sign-extended to a vector of wider
	/// integer elements, the element is extracted there and truncated to the
	/// result type. With a constant index the result type is first canonicalized
	/// to i32; an extract of element 0 is then returned unchanged, and any other
	/// index is shifted down to bit 0 with KSHIFTR before extracting element 0.
	static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	SDValue Vec = Op.getOperand(0);
	SDLoc dl(Vec);
	MVT VecVT = Vec.getSimpleValueType();
	SDValue Idx = Op.getOperand(1);
	MVT EltVT = Op.getSimpleValueType();

	// Mask vectors with more than 16 elements are only expected with BWI.
	assert((VecVT.getVectorNumElements() <= 16 \|\| Subtarget.hasBWI()) &&
	"Unexpected vector type in ExtractBitFromMaskVector");

	// variable index can't be handled in mask registers,
	// extend vector to VR512/128
	if (!isa<ConstantSDNode>(Idx)) {
	unsigned NumElts = VecVT.getVectorNumElements();
	// Extending v8i1/v16i1 to 512-bit get better performance on KNL
	// than extending to 128/256bit.
	// Up to 8 elements: each element becomes 128 / NumElts bits wide.
	// More than 8 elements: each element becomes i8.
	MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
	MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
	SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
	SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
	return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
	}

	// Canonicalize result type to MVT::i32.
	if (EltVT != MVT::i32) {
	SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	Vec, Idx);
	return DAG.getAnyExtOrTrunc(Extract, dl, EltVT);
	}

	unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

	// Extracts from element 0 are always allowed.
	if (IdxVal == 0)
	return Op;

	// If the kshift instructions of the correct width aren't natively supported
	// then we need to promote the vector to the native size to get the correct
	// zeroing behavior.
	if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) \|\|
	(VecVT.getVectorNumElements() < 8)) {
	// Place the original mask at element 0 of an otherwise undef v16i1.
	VecVT = MVT::v16i1;
	Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
	DAG.getUNDEF(VecVT),
	Vec,
	DAG.getIntPtrConstant(0, dl));
	}

	// Use kshiftr instruction to move to the lower element.
	Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	DAG.getConstant(IdxVal, dl, MVT::i8));
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec,
	DAG.getIntPtrConstant(0, dl));
	}

	/// Custom lowering for EXTRACT_VECTOR_ELT.
	///
	/// vXi1 sources are forwarded to ExtractBitFromMaskVector. Variable indices
	/// on other vectors return a null SDValue (see the throughput notes below).
	/// For constant indices, 256-bit and 512-bit sources are first narrowed to
	/// the 128-bit chunk that holds the element; the remaining code handles
	/// 128-bit sources by result width (16, 8, 32 and 64 bits), with the SSE4.1
	/// helper tried after the 16-bit case when SSE4.1 is available. Returns
	/// \p Op when the node can be kept as it is and a null SDValue when no case
	/// applies.
	SDValue
	X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc dl(Op);
	SDValue Vec = Op.getOperand(0);
	MVT VecVT = Vec.getSimpleValueType();
	SDValue Idx = Op.getOperand(1);

	if (VecVT.getVectorElementType() == MVT::i1)
	return ExtractBitFromMaskVector(Op, DAG, Subtarget);

	if (!isa<ConstantSDNode>(Idx)) {
	// It's more profitable to go through memory (1 cycle throughput)
	// than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
	// The IACA tool was used to get the performance estimation
	// (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
	//
	// example : extractelement <16 x i8> %a, i32 %i
	//
	// Block Throughput: 3.00 Cycles
	// Throughput Bottleneck: Port5
	//
	// \| Num Of \| Ports pressure in cycles \| \|
	// \| Uops \| 0 - DV \| 5 \| 6 \| 7 \| \|
	// ---------------------------------------------
	// \| 1 \| \| 1.0 \| \| \| CP \| vmovd xmm1, edi
	// \| 1 \| \| 1.0 \| \| \| CP \| vpshufb xmm0, xmm0, xmm1
	// \| 2 \| 1.0 \| 1.0 \| \| \| CP \| vpextrb eax, xmm0, 0x0
	// Total Num Of Uops: 4
	//
	//
	// Block Throughput: 1.00 Cycles
	// Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
	//
	// \| \| Ports pressure in cycles \| \|
	// \|Uops\| 1 \| 2 - D \|3 - D \| 4 \| 5 \| \|
	// ---------------------------------------------------------
	// \|2^ \| \| 0.5 \| 0.5 \|1.0\| \|CP\| vmovaps xmmword ptr [rsp-0x18], xmm0
	// \|1 \|0.5\| \| \| \|0.5\| \| lea rax, ptr [rsp-0x18]
	// \|1 \| \|0.5, 0.5\|0.5, 0.5\| \| \|CP\| mov al, byte ptr [rdi+rax*1]
	// Total Num Of Uops: 4

	return SDValue();
	}

	unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

	// If this is a 256-bit or 512-bit vector, first extract the 128-bit chunk
	// and then extract the element from that 128-bit vector.
	if (VecVT.is256BitVector() \|\| VecVT.is512BitVector()) {
	// Get the 128-bit vector.
	Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
	MVT EltVT = VecVT.getVectorElementType();

	unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
	assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	// Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
	// this can be done with a mask.
	IdxVal &= ElemsPerChunk - 1;
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
	DAG.getConstant(IdxVal, dl, MVT::i32));
	}

	assert(VecVT.is128BitVector() && "Unexpected vector length");

	MVT VT = Op.getSimpleValueType();

	if (VT.getSizeInBits() == 16) {
	// If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
	// we're going to zero extend the register or fold the store (SSE41 only).
	if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
	!(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
	return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	DAG.getBitcast(MVT::v4i32, Vec), Idx));

	// Transform it so it matches pextrw, which produces a 32-bit result.
	SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
	Op.getOperand(0), Op.getOperand(1));
	return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
	}

	// With SSE4.1, give the dedicated helper a chance before the generic cases.
	if (Subtarget.hasSSE41())
	if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
	return Res;

	// TODO: We only extract a single element from v16i8, we can probably afford
	// to be more aggressive here before using the default approach of spilling to
	// stack.
	if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
	// Extract either the lowest i32 or any i16, and extract the sub-byte.
	int DWordIdx = IdxVal / 4;
	if (DWordIdx == 0) {
	// The byte lives in the lowest dword: extract that dword, shift the wanted
	// byte down to bit 0 and truncate.
	SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	DAG.getBitcast(MVT::v4i32, Vec),
	DAG.getIntPtrConstant(DWordIdx, dl));
	int ShiftVal = (IdxVal % 4) * 8;
	if (ShiftVal != 0)
	Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
	DAG.getConstant(ShiftVal, dl, MVT::i32));
	return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	}

	// Otherwise extract the containing i16 and shift down the high byte when
	// the index is odd.
	int WordIdx = IdxVal / 2;
	SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
	DAG.getBitcast(MVT::v8i16, Vec),
	DAG.getIntPtrConstant(WordIdx, dl));
	int ShiftVal = (IdxVal % 2) * 8;
	if (ShiftVal != 0)
	Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
	DAG.getConstant(ShiftVal, dl, MVT::i16));
	return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	}

	if (VT.getSizeInBits() == 32) {
	if (IdxVal == 0)
	return Op;

	// SHUFPS the element to the lowest double word, then movss.
	int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
	Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	DAG.getIntPtrConstant(0, dl));
	}

	if (VT.getSizeInBits() == 64) {
	// FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
	// FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
	// to match extract_elt for f64.
	if (IdxVal == 0)
	return Op;

	// UNPCKHPD the element to the lowest double word, then movsd.
	// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
	// to a f64mem, the whole operation is folded into a single MOVHPDmr.
	int Mask[2] = { 1, -1 };
	Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	DAG.getIntPtrConstant(0, dl));
	}

	return SDValue();
	}

	/// Insert one bit to mask vector, like v16i1 or v8i1.
	/// AVX-512 feature.
	///
	/// With a variable index the vector and the element are sign-extended to a
	/// wider integer vector type, the insert is done there and the result is
	/// truncated back. With a constant index the bit is placed using KSHIFTL /
	/// KSHIFTR combined with OR or XOR; mask vectors for which the body decides
	/// a promotion is needed are widened to v16i1, re-issued as an
	/// INSERT_VECTOR_ELT on that type, and narrowed again with
	/// EXTRACT_SUBVECTOR.
	static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	SDLoc dl(Op);
	SDValue Vec = Op.getOperand(0);
	SDValue Elt = Op.getOperand(1);
	SDValue Idx = Op.getOperand(2);
	MVT VecVT = Vec.getSimpleValueType();

	if (!isa<ConstantSDNode>(Idx)) {
	// Non constant index. Extend source and destination,
	// insert element and then truncate the result.
	unsigned NumElts = VecVT.getVectorNumElements();
	// Up to 8 elements: each element becomes 128 / NumElts bits wide.
	// More than 8 elements: each element becomes i8.
	MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
	MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
	SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
	DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
	DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
	return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
	}

	unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	unsigned NumElems = VecVT.getVectorNumElements();

	// If the kshift instructions of the correct width aren't natively supported
	// then we need to promote the vector to the native size to get the correct
	// zeroing behavior.
	if ((!Subtarget.hasDQI() && NumElems == 8) \|\| (NumElems < 8)) {
	// Need to promote to v16i1, do the insert, then extract back.
	Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
	DAG.getUNDEF(MVT::v16i1), Vec,
	DAG.getIntPtrConstant(0, dl));
	Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
	DAG.getIntPtrConstant(0, dl));
	}

	// Put the scalar bit into a mask vector of the destination type.
	SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);

	// Inserting into an undef vector: only the new bit matters, so just shift it
	// into position (no shift needed for index 0).
	if (Vec.isUndef()) {
	if (IdxVal)
	EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
	DAG.getConstant(IdxVal, dl, MVT::i8));
	return EltInVec;
	}

	// Insertion of one bit into first position
	if (IdxVal == 0 ) {
	// Clean top bits of vector.
	EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
	DAG.getConstant(NumElems - 1, dl, MVT::i8));
	EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
	DAG.getConstant(NumElems - 1, dl, MVT::i8));
	// Clean the first bit in source vector.
	Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	DAG.getConstant(1 , dl, MVT::i8));
	Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
	DAG.getConstant(1, dl, MVT::i8));

	return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
	}
	// Insertion of one bit into last position
	if (IdxVal == NumElems - 1) {
	// Move the bit to the last position inside the vector.
	EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
	DAG.getConstant(IdxVal, dl, MVT::i8));
	// Clean the last bit in the source vector.
	Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
	DAG.getConstant(1, dl, MVT::i8));
	Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	DAG.getConstant(1 , dl, MVT::i8));

	return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
	}

	// General case (index strictly between the first and last position).
	// Move the current value of the bit to be replaced to bit 0.
	SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	DAG.getConstant(IdxVal, dl, MVT::i8));
	// Xor with the new bit.
	Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);
	// Shift to MSB, filling bottom bits with 0.
	Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,
	DAG.getConstant(NumElems - 1, dl, MVT::i8));
	// Shift to the final position, filling upper bits with 0.
	Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,
	DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));
	// Xor with original vector to cancel out the original bit value that's still
	// present.
	return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);
	}

	/// Custom-lower INSERT_VECTOR_ELT. Operand 0 is the vector, operand 1 the
	/// scalar to insert and operand 2 the element index.
	///
	/// vXi1 vectors are handed to InsertBitToMaskVector. For everything else the
	/// index must be a constant; otherwise an empty SDValue is returned. An empty
	/// SDValue is also returned when none of the strategies below applies, and Op
	/// itself is returned for i32/i64 elements when SSE4.1 is available.
	SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
	SelectionDAG &DAG) const {
	MVT VT = Op.getSimpleValueType();
	MVT EltVT = VT.getVectorElementType();
	unsigned NumElts = VT.getVectorNumElements();

	if (EltVT == MVT::i1)
	return InsertBitToMaskVector(Op, DAG, Subtarget);

	SDLoc dl(Op);
	SDValue N0 = Op.getOperand(0);
	SDValue N1 = Op.getOperand(1);
	SDValue N2 = Op.getOperand(2);
	// Only constant insertion indices are handled from here on.
	if (!isa<ConstantSDNode>(N2))
	return SDValue();
	auto *N2C = cast<ConstantSDNode>(N2);
	unsigned IdxVal = N2C->getZExtValue();

	bool IsZeroElt = X86::isZeroNode(N1);
	bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

	// If we are inserting an all-zeros or all-ones element, see if we can do
	// this more efficiently with a blend shuffle with a rematerializable vector
	// than a costly integer insertion.
	if ((IsZeroElt \|\| IsAllOnesElt) && Subtarget.hasSSE41() &&
	16 <= EltVT.getSizeInBits()) {
	// Take every lane from N0 except lane IdxVal, which comes from the
	// constant vector (second shuffle input, hence the NumElts offset).
	SmallVector<int, 8> BlendMask;
	for (unsigned i = 0; i != NumElts; ++i)
	BlendMask.push_back(i == IdxVal ? i + NumElts : i);
	SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
	: getOnesVector(VT, DAG, dl);
	return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
	}

	// If the vector is wider than 128 bits, extract the 128-bit subvector, insert
	// into that, and then insert the subvector back into the result.
	if (VT.is256BitVector() \|\| VT.is512BitVector()) {
	// With a 256-bit vector, we can insert into the zero element efficiently
	// using a blend if we have AVX or AVX2 and the right data type.
	if (VT.is256BitVector() && IdxVal == 0) {
	// TODO: It is worthwhile to cast integer to floating point and back
	// and incur a domain crossing penalty if that's what we'll end up
	// doing anyway after extracting to a 128-bit vector.
	if ((Subtarget.hasAVX() && (EltVT == MVT::f64 \|\| EltVT == MVT::f32)) \|\|
	(Subtarget.hasAVX2() && EltVT == MVT::i32)) {
	SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
	// Blend immediate 1: only the lowest element is taken from N1Vec.
	N2 = DAG.getIntPtrConstant(1, dl);
	return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
	}
	}

	// Get the desired 128-bit vector chunk.
	SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

	// Insert the element into the desired chunk.
	unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
	assert(isPowerOf2_32(NumEltsIn128));
	// Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
	unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

	V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
	DAG.getConstant(IdxIn128, dl, MVT::i32));

	// Insert the changed part back into the bigger vector
	return insert128BitVector(N0, V, IdxVal, DAG, dl);
	}
	assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

	// Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
	// argument. SSE41 required for pinsrb.
	if (VT == MVT::v8i16 \|\| (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
	unsigned Opc;
	if (VT == MVT::v8i16) {
	assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
	Opc = X86ISD::PINSRW;
	} else {
	assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
	assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
	Opc = X86ISD::PINSRB;
	}

	// Widen the scalar to i32 and, if the index is not already i32, rebuild
	// it as a pointer-sized constant.
	if (N1.getValueType() != MVT::i32)
	N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
	if (N2.getValueType() != MVT::i32)
	N2 = DAG.getIntPtrConstant(IdxVal, dl);
	return DAG.getNode(Opc, dl, VT, N0, N1, N2);
	}

	if (Subtarget.hasSSE41()) {
	if (EltVT == MVT::f32) {
	// Bits [7:6] of the constant are the source select. This will always be
	// zero here. The DAG Combiner may combine an extract_elt index into
	// these bits. For example (insert (extract, 3), 2) could be matched by
	// putting the '3' into bits [7:6] of X86ISD::INSERTPS.
	// Bits [5:4] of the constant are the destination select. This is the
	// value of the incoming immediate.
	// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
	// combine either bitwise AND or insert of float 0.0 to set these bits.

	bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
	if (IdxVal == 0 && (!MinSize \|\| !MayFoldLoad(N1))) {
	// If this is an insertion of 32-bits into the low 32-bits of
	// a vector, we prefer to generate a blend with immediate rather
	// than an insertps. Blends are simpler operations in hardware and so
	// will always have equal or better performance than insertps.
	// But if optimizing for size and there's a load folding opportunity,
	// generate insertps because blendps does not have a 32-bit memory
	// operand form.
	N2 = DAG.getIntPtrConstant(1, dl);
	N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
	}
	// Destination select lives in bits [5:4] of the INSERTPS immediate.
	N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
	// Create this as a scalar to vector.
	N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
	}

	// PINSR* works with constant index.
	if (EltVT == MVT::i32 \|\| EltVT == MVT::i64)
	return Op;
	}

	return SDValue();
	}

	/// Custom-lower SCALAR_TO_VECTOR:
	///  - a zero scalar yields a zero vector of the result type;
	///  - a result that is not 128 bits wide is built as a 128-bit
	///    SCALAR_TO_VECTOR inserted at position 0 of an undef vector;
	///  - v4i32 is returned unchanged;
	///  - any other 128-bit type any-extends the scalar to i32, forms a v4i32
	///    and bitcasts it to the result type.
	static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  const MVT OpVT = Op.getSimpleValueType();
	  SDValue Scalar = Op.getOperand(0);

	  // Replacing xor+movd with xorps is always cheaper and helps later combines.
	  if (X86::isZeroNode(Scalar))
	    return getZeroVector(OpVT, Subtarget, DAG, DL);

	  if (!OpVT.is128BitVector()) {
	    // Build the 128-bit type with the same element type as the result.
	    const unsigned NumChunks = OpVT.getSizeInBits() / 128;
	    const MVT ChunkVT = MVT::getVectorVT(OpVT.getVectorElementType(),
	                                         OpVT.getVectorNumElements() / NumChunks);

	    // Materialize the scalar in a 128-bit vector first ...
	    SDValue LowChunk = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ChunkVT, Scalar);

	    // ... then place that vector at the bottom of an undef wide vector.
	    return insert128BitVector(DAG.getUNDEF(OpVT), LowChunk, 0, DAG, DL);
	  }
	  assert(OpVT.is128BitVector() && "Expected an SSE type!");

	  // v4i32 SCALAR_TO_VECTOR is what the tblgen patterns use; pass it through.
	  if (OpVT == MVT::v4i32)
	    return Op;

	  SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
	  SDValue AsV4I32 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Widened);
	  return DAG.getBitcast(OpVT, AsV4I32);
	}

	/// Custom lowering entry point for INSERT_SUBVECTOR. Only vectors of i1
	/// elements are expected here (asserted); the work is delegated to
	/// insert1BitVector.
	static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
	  return insert1BitVector(Op, DAG, Subtarget);
	}

	/// Custom-lower EXTRACT_SUBVECTOR on vXi1 vectors.
	///
	/// A non-constant index is not handled (empty SDValue). Index 0 is already
	/// legal and Op is returned as-is. Otherwise the source is shifted right by
	/// the index with KSHIFTR (after widening to a supported mask width if
	/// needed) and the result is extracted from position 0.
	static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Only vXi1 extract_subvectors need custom lowering");

	  SDValue Src = Op.getOperand(0);
	  auto *IdxC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!IdxC)
	    return SDValue();

	  const unsigned IdxVal = IdxC->getZExtValue();
	  if (IdxVal == 0) // the operation is legal
	    return Op;

	  SDLoc DL(Op);
	  const MVT SrcVT = Src.getSimpleValueType();
	  const unsigned NumElems = SrcVT.getVectorNumElements();

	  // Masks narrower than 8 bits always need widening; an 8-bit mask needs it
	  // only when DQI is unavailable.
	  bool NeedsWiden = NumElems < 8;
	  if (NumElems == 8)
	    NeedsWiden = !Subtarget.hasDQI();

	  MVT ShiftVT = SrcVT;
	  if (NeedsWiden) {
	    ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
	    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ShiftVT,
	                      DAG.getUNDEF(ShiftVT), Src,
	                      DAG.getIntPtrConstant(0, DL));
	  }

	  // Bring the requested elements down to the least significant bits.
	  SDValue Shifted = DAG.getNode(X86ISD::KSHIFTR, DL, ShiftVT, Src,
	                                DAG.getConstant(IdxVal, DL, MVT::i8));

	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op.getValueType(), Shifted,
	                     DAG.getIntPtrConstant(0, DL));
	}

	// Returns the appropriate wrapper opcode for a global reference.
	unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
	// References to absolute symbols are never PC-relative.
	if (GV && GV->isAbsoluteSymbolRef())
	return X86ISD::Wrapper;

	CodeModel::Model M = getTargetMachine().getCodeModel();
	if (Subtarget.isPICStyleRIPRel() &&
	(M == CodeModel::Small \|\| M == CodeModel::Kernel))
	return X86ISD::WrapperRIP;

	return X86ISD::Wrapper;
	}

	// ConstantPool, JumpTable, GlobalAddress and ExternalSymbol nodes are lowered
	// to their Target* counterpart wrapped in an X86ISD::Wrapper node. The
	// wrapping is required because Select(N) on the raw target node returns N
	// itself, so a bare TargetGlobalAddress etc. can only appear inside an
	// addressing mode. The wrapped form is what gets selected into MOV32ri.
	SDValue
	X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
	  auto *CPNode = cast<ConstantPoolSDNode>(Op);

	  // A non-zero flag means the reference is relative to the global base
	  // register (PIC mode other than RIP-relative).
	  const unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
	  const auto PtrVT = getPointerTy(DAG.getDataLayout());

	  SDValue Target =
	      DAG.getTargetConstantPool(CPNode->getConstVal(), PtrVT,
	                                CPNode->getAlignment(), CPNode->getOffset(),
	                                OpFlag);
	  SDLoc DL(CPNode);
	  SDValue Addr = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Target);
	  if (!OpFlag)
	    return Addr;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Addr);
	}

	/// Lower a JumpTable node to a wrapped TargetJumpTable, adding the global
	/// base register when the local-reference classification yields a flag.
	SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
	  auto *JTNode = cast<JumpTableSDNode>(Op);

	  // A non-zero flag means the reference is relative to the global base
	  // register (PIC mode other than RIP-relative).
	  const unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
	  const auto PtrVT = getPointerTy(DAG.getDataLayout());

	  SDValue Target = DAG.getTargetJumpTable(JTNode->getIndex(), PtrVT, OpFlag);
	  SDLoc DL(JTNode);
	  SDValue Addr = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Target);
	  if (!OpFlag)
	    return Addr;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Addr);
	}

	/// Lower an ExternalSymbol node to a wrapped TargetExternalSymbol. In 32-bit
	/// position-independent code the global base register is added, and when the
	/// reference flag denotes a stub reference the address is loaded from the GOT.
	SDValue
	X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
	  const char *SymName = cast<ExternalSymbolSDNode>(Op)->getSymbol();

	  // Classify the reference against the module containing the current
	  // function; no GlobalValue is available for an external symbol.
	  const Module *ParentMod =
	      DAG.getMachineFunction().getFunction().getParent();
	  const unsigned char OpFlag =
	      Subtarget.classifyGlobalReference(nullptr, *ParentMod);

	  const auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue Target = DAG.getTargetExternalSymbol(SymName, PtrVT, OpFlag);

	  SDLoc DL(Op);
	  SDValue Addr = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Target);

	  // With PIC, the address is actually $g + Offset.
	  if (isPositionIndependent() && !Subtarget.is64Bit()) {
	    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	    Addr = DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Addr);
	  }

	  if (!isGlobalStubReference(OpFlag))
	    return Addr;

	  // The symbol's address must be fetched from a stub: emit the load.
	  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Addr,
	                     MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	}

	/// Lower a BlockAddress node to a wrapped TargetBlockAddress, adding the
	/// global base register when the reference is relative to the PIC base.
	SDValue
	X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
	  const unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
	  auto *BANode = cast<BlockAddressSDNode>(Op);
	  const BlockAddress *BA = BANode->getBlockAddress();
	  const int64_t Offset = BANode->getOffset();

	  SDLoc DL(Op);
	  const auto PtrVT = getPointerTy(DAG.getDataLayout());

	  // Create the TargetBlockAddress node and wrap it.
	  SDValue Target = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
	  SDValue Addr = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Target);
	  if (!isGlobalRelativeToPICBase(OpFlags))
	    return Addr;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, DL, PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Addr);
	}

	/// Build the address of global \p GV plus \p Offset.
	///
	/// The constant offset is folded into the TargetGlobalAddress node only for
	/// a direct reference (no operand flags) whose offset suits the code model;
	/// otherwise it is applied with an explicit ADD at the end. PIC-base-relative
	/// references add the global base register, and stub references load the
	/// final address from the GOT.
	SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
	                                              const SDLoc &dl, int64_t Offset,
	                                              SelectionDAG &DAG) const {
	  const unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
	  const CodeModel::Model CM = DAG.getTarget().getCodeModel();
	  const auto PtrVT = getPointerTy(DAG.getDataLayout());

	  const bool FoldOffset = OpFlags == X86II::MO_NO_FLAG &&
	                          X86::isOffsetSuitableForCodeModel(Offset, CM);

	  // A direct static reference carries the offset itself; anything else
	  // carries the operand flags and a zero offset.
	  SDValue Addr = FoldOffset
	                     ? DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset)
	                     : DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
	  const int64_t PendingOffset = FoldOffset ? 0 : Offset;

	  Addr = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Addr);

	  // With PIC, the address is actually $g + Offset.
	  if (isGlobalRelativeToPICBase(OpFlags)) {
	    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
	    Addr = DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Addr);
	  }

	  // Globals reached through a stub need their address loaded first.
	  if (isGlobalStubReference(OpFlags))
	    Addr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Addr,
	                       MachinePointerInfo::getGOT(DAG.getMachineFunction()));

	  // Nothing left to add when the offset was folded or is zero.
	  if (PendingOffset == 0)
	    return Addr;

	  return DAG.getNode(ISD::ADD, dl, PtrVT, Addr,
	                     DAG.getConstant(PendingOffset, dl, PtrVT));
	}

	/// Lower a GlobalAddress node by forwarding its global and offset to the
	/// GlobalValue-based overload.
	SDValue
	X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
	  auto *GANode = cast<GlobalAddressSDNode>(Op);
	  const GlobalValue *GV = GANode->getGlobal();
	  int64_t Offset = GANode->getOffset();
	  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
	}

	/// Emit a TLSADDR (or TLSBASEADDR when \p LocalDynamic is set) node for
	/// \p GA on top of \p Chain and return the value copied out of \p ReturnReg.
	///
	/// \p InFlag, when non-null, points at a glue value appended to the node's
	/// operands. \p OperandFlags is attached to the TargetGlobalAddress operand.
	static SDValue
	GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
	           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
	           unsigned char OperandFlags, bool LocalDynamic = false) {
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDLoc DL(GA);
	  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
	                                           GA->getValueType(0),
	                                           GA->getOffset(), OperandFlags);

	  X86ISD::NodeType CallType =
	      LocalDynamic ? X86ISD::TLSBASEADDR : X86ISD::TLSADDR;

	  // Operands are {Chain, TGA} plus the incoming glue when one is supplied.
	  SmallVector<SDValue, 3> CallOps;
	  CallOps.push_back(Chain);
	  CallOps.push_back(TGA);
	  if (InFlag)
	    CallOps.push_back(*InFlag);
	  Chain = DAG.getNode(CallType, DL, NodeTys, CallOps);

	  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
	  MFI.setAdjustsStack(true);
	  MFI.setHasCalls(true);

	  SDValue OutGlue = Chain.getValue(1);
	  return DAG.getCopyFromReg(Chain, DL, ReturnReg, PtrVT, OutGlue);
	}

	// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
	// The global base register is copied into EBX ahead of the TLSADDR node and
	// the resulting address is read back from EAX.
	static SDValue
	LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDLoc DL(GA); // ? function entry point might be better
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);

	  // Passing an empty glue operand selects the glue-producing CopyToReg form.
	  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, X86::EBX, PICBase,
	                                   SDValue());
	  SDValue Glue = Chain.getValue(1);

	  return GetTLSADDR(DAG, Chain, GA, &Glue, PtrVT, X86::EAX, X86II::MO_TLSGD);
	}

	// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit.
	// No glue is passed in; the resulting address is read back from RAX.
	static SDValue
	LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDValue Entry = DAG.getEntryNode();
	  return GetTLSADDR(DAG, Entry, GA, /*InFlag=*/nullptr, PtrVT, X86::RAX,
	                    X86II::MO_TLSGD);
	}

	/// Lower ISD::GlobalTLSAddress using the "local dynamic" model.
	///
	/// The module's TLS block base is obtained through a TLSBASEADDR node
	/// (GetTLSADDR with LocalDynamic set) and the variable's x@dtpoff offset is
	/// added to it. In 32-bit mode the global base register is first copied into
	/// EBX and the base is returned in EAX; in 64-bit mode it is returned in RAX.
	/// Each call bumps the function's local-dynamic TLS access counter.
	static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
	                                           SelectionDAG &DAG,
	                                           const EVT PtrVT,
	                                           bool is64Bit) {
	  SDLoc dl(GA);

	  // Get the start address of the TLS block for this module.
	  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
	      .getInfo<X86MachineFunctionInfo>();
	  MFI->incNumLocalDynamicTLSAccesses();

	  SDValue Base;
	  if (is64Bit) {
	    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
	                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
	  } else {
	    SDValue InFlag;
	    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
	        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
	    InFlag = Chain.getValue(1);
	    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
	                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
	  }

	  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
	  // of Base.

	  // Build x@dtpoff.
	  unsigned char OperandFlags = X86II::MO_DTPOFF;
	  unsigned WrapperKind = X86ISD::Wrapper;
	  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	                                           GA->getValueType(0),
	                                           GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  // Add x@dtpoff with the base.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
	}

	// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
	//
	// The thread pointer is loaded from offset 0 of address space 257 (64-bit)
	// or 256 (32-bit) and added to the variable's offset. For local exec the
	// offset is the wrapped TargetGlobalAddress itself; for initial exec it is
	// loaded from the GOT (RIP-relative on 64-bit, relative to the global base
	// register in 32-bit PIC). Any other model is unreachable.
	static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                   const EVT PtrVT, TLSModel::Model model,
	                                   bool is64Bit, bool isPIC) {
	  SDLoc dl(GA);

	  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
	  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
	                                                         is64Bit ? 257 : 256));

	  SDValue ThreadPointer =
	      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
	                  MachinePointerInfo(Ptr));

	  unsigned char OperandFlags = 0;
	  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
	  // initialexec.
	  unsigned WrapperKind = X86ISD::Wrapper;
	  if (model == TLSModel::LocalExec) {
	    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
	  } else if (model == TLSModel::InitialExec) {
	    if (is64Bit) {
	      OperandFlags = X86II::MO_GOTTPOFF;
	      WrapperKind = X86ISD::WrapperRIP;
	    } else {
	      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
	    }
	  } else {
	    llvm_unreachable("Unexpected model");
	  }

	  // emit "addl x@ntpoff,%eax" (local exec)
	  // or "addl x@indntpoff,%eax" (initial exec)
	  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
	  SDValue TGA =
	      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
	                                 GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  if (model == TLSModel::InitialExec) {
	    // In 32-bit PIC the GOT slot is addressed relative to the global base
	    // register.
	    if (isPIC && !is64Bit) {
	      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
	                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                           Offset);
	    }

	    // Initial exec: the thread-pointer offset is read from the GOT.
	    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
	                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	  }

	  // The address of the thread local variable is the add of the thread
	  // pointer with the offset of the variable.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
	}

	/// Lower a GlobalTLSAddress node to the address of the thread-local variable.
	///
	/// Emulated TLS is handled first via LowerToTLSEmulatedModel. Otherwise the
	/// lowering is chosen by target: ELF dispatches on the TLS model, Darwin
	/// emits a TLSCALL inside a call sequence, and Windows (MSVC, Itanium, GNU)
	/// computes the address from the TLS array, _tls_index and a section-relative
	/// offset. Any other target is unreachable.
	SDValue
	X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

	GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

	if (DAG.getTarget().Options.EmulatedTLS)
	return LowerToTLSEmulatedModel(GA, DAG);

	const GlobalValue *GV = GA->getGlobal();
	auto PtrVT = getPointerTy(DAG.getDataLayout());
	bool PositionIndependent = isPositionIndependent();

	if (Subtarget.isTargetELF()) {
	// ELF: one helper per TLS model.
	TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
	switch (model) {
	case TLSModel::GeneralDynamic:
	if (Subtarget.is64Bit())
	return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
	return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
	case TLSModel::LocalDynamic:
	return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
	Subtarget.is64Bit());
	case TLSModel::InitialExec:
	case TLSModel::LocalExec:
	return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
	PositionIndependent);
	}
	llvm_unreachable("Unknown TLS model.");
	}

	if (Subtarget.isTargetDarwin()) {
	// Darwin only has one model of TLS. Lower to that.
	unsigned char OpFlag = 0;
	unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
	X86ISD::WrapperRIP : X86ISD::Wrapper;

	// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	// global base reg.
	bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
	if (PIC32)
	OpFlag = X86II::MO_TLVP_PIC_BASE;
	else
	OpFlag = X86II::MO_TLVP;
	SDLoc DL(Op);
	SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
	GA->getValueType(0),
	GA->getOffset(), OpFlag);
	SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

	// With PIC32, the address is actually $g + Offset.
	if (PIC32)
	Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
	DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	Offset);

	// Lowering the machine isd will make sure everything is in the right
	// location.
	// The TLSCALL is bracketed by a zero-sized CALLSEQ_START/CALLSEQ_END pair.
	SDValue Chain = DAG.getEntryNode();
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
	SDValue Args[] = { Chain, Offset };
	Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
	DAG.getIntPtrConstant(0, DL, true),
	Chain.getValue(1), DL);

	// TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	MFI.setAdjustsStack(true);

	// And our return value (tls address) is in the standard call return value
	// location.
	unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
	}

	if (Subtarget.isTargetKnownWindowsMSVC() \|\|
	Subtarget.isTargetWindowsItanium() \|\|
	Subtarget.isTargetWindowsGNU()) {
	// Just use the implicit TLS architecture
	// Need to generate something similar to:
	// mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
	// ; from TEB
	// mov ecx, dword [rel _tls_index]; Load index (from C runtime)
	// mov rcx, qword [rdx+rcx*8]
	// mov eax, .tls$:tlsvar
	// [rax+rcx] contains the address
	// Windows 64bit: gs:0x58
	// Windows 32bit: fs:__tls_array

	SDLoc dl(GA);
	SDValue Chain = DAG.getEntryNode();

	// Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
	// %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
	// use its literal value of 0x2C.
	Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
	? Type::getInt8PtrTy(*DAG.getContext(),
	256)
	: Type::getInt32PtrTy(*DAG.getContext(),
	257));

	SDValue TlsArray = Subtarget.is64Bit()
	? DAG.getIntPtrConstant(0x58, dl)
	: (Subtarget.isTargetWindowsGNU()
	? DAG.getIntPtrConstant(0x2C, dl)
	: DAG.getExternalSymbol("_tls_array", PtrVT));

	SDValue ThreadPointer =
	DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

	SDValue res;
	if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
	// Local exec: use the first slot of the TLS array, no index lookup.
	res = ThreadPointer;
	} else {
	// Load the _tls_index variable
	SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
	// _tls_index is read as 32 bits and zero-extended on 64-bit targets.
	if (Subtarget.is64Bit())
	IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
	MachinePointerInfo(), MVT::i32);
	else
	IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

	// Scale the index by the pointer size to get a byte offset into the
	// TLS array.
	auto &DL = DAG.getDataLayout();
	SDValue Scale =
	DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
	IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

	res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
	}

	// Load the pointer stored in the selected TLS array slot.
	res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

	// Get the offset of start of .tls section
	SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	GA->getValueType(0),
	GA->getOffset(), X86II::MO_SECREL);
	SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

	// The address of the thread local variable is the add of the thread
	// pointer with the offset of the variable.
	return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
	}

	llvm_unreachable("TLS not implemented for this target.");
	}

	/// Lower SHL_PARTS / SRA_PARTS / SRL_PARTS. The node takes a value split
	/// into two parts (low, high) plus a shift amount, and produces the two
	/// parts of the shifted value, merged as {Lo, Hi}.
	static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	  const MVT VT = Op.getSimpleValueType();
	  const unsigned VTBits = VT.getSizeInBits();
	  SDLoc DL(Op);
	  const bool IsSHL = Op.getOpcode() == ISD::SHL_PARTS;
	  const bool IsSRA = Op.getOpcode() == ISD::SRA_PARTS;
	  SDValue LoPart = Op.getOperand(0);
	  SDValue HiPart = Op.getOperand(1);
	  SDValue Amt = Op.getOperand(2);

	  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
	  // generic ISD shifts do not. Mask the amount to be safe; the AND is
	  // optimized away during isel.
	  SDValue BitsMinusOne = DAG.getConstant(VTBits - 1, DL, MVT::i8);
	  SDValue SafeAmt = DAG.getNode(ISD::AND, DL, MVT::i8, Amt, BitsMinusOne);

	  // Value of the part that is shifted out entirely for large amounts: sign
	  // bits of the high part for SRA, zero otherwise.
	  SDValue Fill;
	  if (IsSRA)
	    Fill = DAG.getNode(ISD::SRA, DL, VT, HiPart,
	                       DAG.getConstant(VTBits - 1, DL, MVT::i8));
	  else
	    Fill = DAG.getConstant(0, DL, VT);

	  // Funnel: double-precision shift across both parts.
	  // Single: ordinary shift of the part the bits come from.
	  SDValue Funnel, Single;
	  if (IsSHL) {
	    Funnel = DAG.getNode(X86ISD::SHLD, DL, VT, HiPart, LoPart, Amt);
	    Single = DAG.getNode(ISD::SHL, DL, VT, LoPart, SafeAmt);
	  } else {
	    Funnel = DAG.getNode(X86ISD::SHRD, DL, VT, LoPart, HiPart, Amt);
	    Single = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, DL, VT, HiPart, SafeAmt);
	  }

	  // When the shift amount is at least the width of a part, the shld/shrd
	  // result cannot be used. Test the VTBits bit of the amount and select the
	  // appropriate values for large shift amounts.
	  SDValue BigShiftBit = DAG.getNode(ISD::AND, DL, MVT::i8, Amt,
	                                    DAG.getConstant(VTBits, DL, MVT::i8));
	  SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, BigShiftBit,
	                              DAG.getConstant(0, DL, MVT::i8));
	  SDValue CondNE = DAG.getConstant(X86::COND_NE, DL, MVT::i8);

	  SDValue NearOps[4] = { Funnel, Single, CondNE, Flags };
	  SDValue FarOps[4] = { Single, Fill, CondNE, Flags };
	  SDValue Near = DAG.getNode(X86ISD::CMOV, DL, VT, NearOps);
	  SDValue Far = DAG.getNode(X86ISD::CMOV, DL, VT, FarOps);

	  // For a left shift the funnel result is the high part; for right shifts
	  // it is the low part.
	  SDValue Lo = IsSHL ? Far : Near;
	  SDValue Hi = IsSHL ? Near : Far;

	  SDValue Parts[2] = { Lo, Hi };
	  return DAG.getMergeValues(Parts, DL);
	}

	/// Custom-lower SINT_TO_FP.
	///
	/// Vector sources: v2i32 -> v2f64 and v2i1 -> v2f64 are widened and emitted
	/// as X86ISD::CVTSI2P; every other vector case returns an empty SDValue.
	/// Scalar sources: conversions that are effectively legal return Op as-is;
	/// the rest store the integer to a fresh stack slot and convert it with
	/// BuildFILD.
	SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue Src = Op.getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();
	  MVT VT = Op.getSimpleValueType();
	  SDLoc DL(Op);

	  if (SrcVT.isVector()) {
	    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
	      // Pad the source with undef up to v4i32.
	      SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
	                                 DAG.getUNDEF(SrcVT));
	      return DAG.getNode(X86ISD::CVTSI2P, DL, VT, Wide);
	    }
	    if (SrcVT == MVT::v2i1) {
	      // For v2i1, we need to widen to v4i1 first.
	      assert(VT == MVT::v2f64 && "Unexpected type");
	      SDValue WideMask = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i1, Src,
	                                     DAG.getUNDEF(MVT::v2i1));
	      SDValue Extended =
	          DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::v4i32, WideMask);
	      return DAG.getNode(X86ISD::CVTSI2P, DL, Op.getValueType(), Extended);
	    }
	    return SDValue();
	  }

	  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
	         "Unknown SINT_TO_FP to lower!");

	  // These are really Legal; return the operand so the caller accepts it as
	  // Legal.
	  const bool ResultInSSE = isScalarFPTypeInSSEReg(Op.getValueType());
	  if (ResultInSSE && SrcVT == MVT::i32)
	    return Op;
	  const bool I64ToSSE = ResultInSSE && SrcVT == MVT::i64;
	  if (I64ToSSE && Subtarget.is64Bit())
	    return Op;

	  // Reaching here with I64ToSSE set implies a 32-bit target. Bitcasting to
	  // f64 allows a single 64-bit store from an SSE register, avoiding the
	  // store forwarding penalty that would come with two 32-bit stores.
	  SDValue ValueToStore = Op.getOperand(0);
	  if (I64ToSSE)
	    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);

	  // Spill the integer to a stack slot sized and aligned to the source type.
	  const unsigned SlotBytes = SrcVT.getSizeInBits() / 8;
	  MachineFunction &MF = DAG.getMachineFunction();
	  const auto PtrVT = getPointerTy(MF.getDataLayout());
	  const int SlotFI =
	      MF.getFrameInfo().CreateStackObject(SlotBytes, SlotBytes, false);
	  SDValue StackSlot = DAG.getFrameIndex(SlotFI, PtrVT);
	  SDValue StoreChain = DAG.getStore(
	      DAG.getEntryNode(), DL, ValueToStore, StackSlot,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SlotFI));
	  return BuildFILD(Op, SrcVT, StoreChain, StackSlot, DAG);
	}

	/// Emit an FILD that loads an integer of type \p SrcVT from \p StackSlot.
	///
	/// \p StackSlot is either a frame index or a load node; for a load, its
	/// memory operand is reused and its operand 1 becomes the address. When the
	/// result type of \p Op lives in an SSE register, FILD_FLAG is emitted
	/// instead and the value is moved over memory: an FST to a new stack slot
	/// followed by a load of the result type.
	SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
	                                     SDValue StackSlot,
	                                     SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  const EVT DstVT = Op.getValueType();
	  const bool useSSE = isScalarFPTypeInSSEReg(DstVT);

	  // With SSE the FILD yields an f64 plus glue for the following FST.
	  SDVTList LoadTys = useSSE
	                         ? DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue)
	                         : DAG.getVTList(DstVT, MVT::Other);

	  const unsigned ByteSize = SrcVT.getSizeInBits() / 8;

	  MachineMemOperand *LoadMMO;
	  if (auto *FI = dyn_cast<FrameIndexSDNode>(StackSlot)) {
	    const int LoadFI = FI->getIndex();
	    LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), LoadFI),
	        MachineMemOperand::MOLoad, ByteSize, ByteSize);
	  } else {
	    LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
	    StackSlot = StackSlot.getOperand(1);
	  }

	  SDValue LoadOps[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
	  SDValue Result = DAG.getMemIntrinsicNode(
	      useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL, LoadTys, LoadOps, SrcVT,
	      LoadMMO);

	  if (!useSSE)
	    return Result;

	  SDValue FildChain = Result.getValue(1);
	  SDValue Glue = Result.getValue(2);

	  // FIXME: Currently the FST is flagged to the FILD_FLAG. This
	  // shouldn't be necessary except that RFP cannot be live across
	  // multiple blocks. When stackifier is fixed, they can be uncoupled.
	  MachineFunction &MF = DAG.getMachineFunction();
	  const unsigned SpillBytes = Op.getValueSizeInBits() / 8;
	  const int SpillFI =
	      MF.getFrameInfo().CreateStackObject(SpillBytes, SpillBytes, false);
	  const auto PtrVT = getPointerTy(MF.getDataLayout());
	  SDValue SpillSlot = DAG.getFrameIndex(SpillFI, PtrVT);
	  SDVTList StoreTys = DAG.getVTList(MVT::Other);
	  SDValue StoreOps[] = {
	    FildChain, Result, SpillSlot, DAG.getValueType(DstVT), Glue
	  };
	  MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SpillFI),
	      MachineMemOperand::MOStore, SpillBytes, SpillBytes);

	  SDValue StoreChain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, StoreTys,
	                                               StoreOps, DstVT, StoreMMO);

	  // Reload the converted value in its final type.
	  return DAG.getLoad(
	      DstVT, DL, StoreChain, SpillSlot,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SpillFI));
	}

	/// 64-bit unsigned integer to double expansion.
	///
	/// The 64-bit input is interleaved with the high words of two magic doubles
	/// (bit patterns 0x43300000'xxxxxxxx and 0x45300000'xxxxxxxx), the magic
	/// values themselves are subtracted, and the two resulting doubles are
	/// summed; element 0 of that sum is the result.
	static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  // This algorithm is not obvious. Here is what we're trying to output:
	  /*
	     movq       %rax,  %xmm0
	     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
	     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
	     #ifdef __SSE3__
	       haddpd   %xmm0, %xmm0
	     #else
	       pshufd   $0x4e, %xmm0, %xmm1
	       addpd    %xmm1, %xmm0
	     #endif
	  */

	  SDLoc dl(Op);
	  LLVMContext *Context = DAG.getContext();

	  // Build some magic constants.
	  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
	  Constant *C0 = ConstantDataVector::get(*Context, CV0);
	  auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
	  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

	  SmallVector<Constant*,2> CV1;
	  CV1.push_back(
	    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                      APInt(64, 0x4330000000000000ULL))));
	  CV1.push_back(
	    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                      APInt(64, 0x4530000000000000ULL))));
	  Constant *C1 = ConstantVector::get(CV1);
	  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

	  // Load the 64-bit value into an XMM register.
	  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	                            Op.getOperand(0));
	  SDValue CLod0 =
	      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  // Interleave the low 32-bit lanes of the input with the exponent words.
	  SDValue Unpck1 =
	      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

	  // The second constant load is chained after the first one.
	  SDValue CLod1 =
	      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
	  SDValue Result;

	  if (Subtarget.hasSSE3()) {
	    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
	    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
	  } else {
	    // Swap the two doubles (as v4i32 lanes {2,3,0,1}) and add.
	    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
	    SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
	    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
	                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
	  }

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// 32-bit unsigned integer to float expansion.
	///
	/// The integer is placed in the low bits of an f64 whose remaining bits are
	/// zero, OR'ed with a bias constant (bit pattern 0x4330000000000000), and the
	/// bias is then subtracted as a double. The difference is extended or
	/// rounded to the result type of Op.
	static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  SDLoc DL(Op);

	  // FP constant used to bias-correct the final result.
	  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL,
	                                   MVT::f64);

	  // Move the 32-bit value into an XMM register.
	  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
	                              Op.getOperand(0));

	  // Clear every lane above the inserted value.
	  SDValue ZExtVec = getShuffleVectorZeroOrUndef(InVec, 0, true, Subtarget, DAG);

	  // Reinterpret the low 64 bits as a double.
	  SDValue LowF64 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
	                               DAG.getBitcast(MVT::v2f64, ZExtVec),
	                               DAG.getIntPtrConstant(0, DL));

	  // OR the value with the bias; the OR is performed on v2i64.
	  SDValue OrVec = DAG.getNode(
	      ISD::OR, DL, MVT::v2i64,
	      DAG.getBitcast(MVT::v2i64,
	                     DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, LowF64)),
	      DAG.getBitcast(MVT::v2i64,
	                     DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, Bias)));
	  SDValue Biased =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
	                  DAG.getBitcast(MVT::v2f64, OrVec), DAG.getIntPtrConstant(0, DL));

	  // Remove the bias again.
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Unbiased = DAG.getNode(ISD::FSUB, DL, MVT::f64, Biased, Bias);

	  // Final rounding (or extension) to the requested type.
	  return DAG.getFPExtendOrRound(Unbiased, DL, Op.getSimpleValueType());
	}

	/// Lower UINT_TO_FP from v2i32 to v2f64; any other result type is declined
	/// with an empty SDValue.
	///
	/// With AVX512 a single X86ISD::CVTUI2P is emitted. Otherwise each element is
	/// split into 16-bit halves that are converted with the signed CVTSI2P and
	/// recombined as hi * 2^16 + lo.
	static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget, SDLoc &DL) {
	  if (Op.getSimpleValueType() != MVT::v2f64)
	    return SDValue();

	  SDValue N0 = Op.getOperand(0);
	  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

	  // Legalize to v4i32 by padding with undef.
	  SDValue Padding = DAG.getUNDEF(MVT::v2i32);
	  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, Padding);

	  if (Subtarget.hasAVX512())
	    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);

	  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
	  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
	  SDValue ShiftBy16 = DAG.getConstant(16, DL, MVT::v4i32);
	  SDValue Low16Mask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

	  // Two to the power of half-word-size.
	  SDValue TwoPow16 = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

	  // Upper halves moved down, lower halves masked in place.
	  SDValue HiBits = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, ShiftBy16);
	  SDValue LoBits = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, Low16Mask);

	  SDValue HiFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HiBits);
	  SDValue HiScaled = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, HiFP, TwoPow16);
	  SDValue LoFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LoBits);

	  // Add the two halves.
	  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, HiScaled, LoFP);
	}

	/// Lower a v4i32 -> v4f32 or v8i32 -> v8f32 unsigned integer to floating-point
	/// conversion using integer blends/logic plus two floating-point adds.
	///
	/// Returns an empty SDValue when unsafe-fp-math is enabled, or when the result
	/// type of \p Op is not the float vector type matching the integer source.
	static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // The algorithm is the following:
	  // #ifdef __SSE4_1__
	  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	  //                                 (uint4) 0x53000000, 0xaa);
	  // #else
	  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
	  // #endif
	  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  //     return (float4) lo + fhi;

	  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
	  // reassociate the two FADDs, and if we do that, the algorithm fails
	  // spectacularly (PR24512).
	  // FIXME: If we ever have some kind of Machine FMF, this should be marked
	  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
	  // there's also the MachineCombiner reassociations happening on Machine IR.
	  if (DAG.getTarget().Options.UnsafeFPMath)
	    return SDValue();

	  SDLoc DL(Op);
	  SDValue V = Op->getOperand(0);
	  MVT VecIntVT = V.getSimpleValueType();
	  bool Is128 = VecIntVT == MVT::v4i32;
	  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
	  // If we convert to something else than the supported type, e.g., to v4f64,
	  // abort early.
	  if (VecFloatVT != Op->getSimpleValueType(0))
	    return SDValue();

	  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
	         "Unsupported custom type");

	  // In the #ifdef/#else code, we have in common:
	  // - The vector of constants:
	  // -- 0x4b000000
	  // -- 0x53000000
	  // - A shift:
	  // -- v >> 16

	  // Create the splat vector for 0x4b000000.
	  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
	  // Create the splat vector for 0x53000000.
	  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

	  // Create the right shift.
	  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
	  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

	  SDValue Low, High;
	  if (Subtarget.hasSSE41()) {
	    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
	    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
	    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
	    // Low will be bitcasted right away, so do not bother bitcasting back to its
	    // original type.
	    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
	                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
	    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	    //                                 (uint4) 0x53000000, 0xaa);
	    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
	    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
	    // High will be bitcasted right away, so do not bother bitcasting back to
	    // its original type.
	    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
	                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
	  } else {
	    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
	    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
	    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

	    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
	    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
	  }

	  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
	  SDValue VecCstFAdd = DAG.getConstantFP(
	      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);

	  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue FHigh =
	      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
	  //     return (float4) lo + fhi;
	  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
	  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
	}

	/// Dispatch a vector unsigned integer to floating-point conversion to the
	/// helper that matches the source vector type.
	///
	/// v2i1 sources are handled inline; v2i32 goes to lowerUINT_TO_FP_v2i32 and
	/// v4i32/v8i32 go to lowerUINT_TO_FP_vXi32. Any other source type is
	/// unreachable.
	static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  SDValue Src = Op.getOperand(0);
	  const MVT SrcVT = Src.getSimpleValueType();
	  SDLoc Loc(Op);

	  if (SrcVT == MVT::v2i1) {
	    // A v2i1 source has to be widened to v4i1 before it can be extended.
	    assert(Op.getValueType() == MVT::v2f64 && "Unexpected type");
	    SDValue WideMask = DAG.getNode(ISD::CONCAT_VECTORS, Loc, MVT::v4i1, Src,
	                                   DAG.getUNDEF(MVT::v2i1));
	    SDValue WideInt = DAG.getNode(ISD::ZERO_EXTEND, Loc, MVT::v4i32, WideMask);
	    return DAG.getNode(X86ISD::CVTUI2P, Loc, MVT::v2f64, WideInt);
	  }

	  if (SrcVT == MVT::v2i32)
	    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, Loc);

	  if (SrcVT == MVT::v4i32 || SrcVT == MVT::v8i32) {
	    assert(!Subtarget.hasAVX512());
	    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
	  }

	  llvm_unreachable("Custom UINT_TO_FP is not supported!");
	}

	/// Lower an unsigned integer to floating-point conversion.
	///
	/// Vector results are forwarded to lowerUINT_TO_FP_vec. For scalars:
	///  - with AVX-512, i32 (and i64 in 64-bit mode) sources converting to a
	///    scalar FP type held in an SSE register are returned unchanged;
	///  - i64 -> f64 and i32 sources use the SSE expansions when f64 is held in
	///    SSE registers;
	///  - i64 -> f32 in 64-bit mode returns an empty SDValue;
	///  - everything else goes through a stack slot and an x87 FILD, with a
	///    fix-up addition for i64 inputs whose sign bit is set.
	SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue N0 = Op.getOperand(0);
	  SDLoc dl(Op);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  if (Op.getSimpleValueType().isVector())
	    return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);

	  MVT SrcVT = N0.getSimpleValueType();
	  MVT DstVT = Op.getSimpleValueType();

	  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
	      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
	    // Conversions from unsigned i32 to f32/f64 are legal,
	    // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
	    return Op;
	  }

	  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
	    return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
	  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
	    return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
	  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
	    return SDValue();

	  // Make a 64-bit buffer, and use it to build an FILD.
	  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
	  if (SrcVT == MVT::i32) {
	    // Store the i32 in the low half and zero in the high half, so the slot
	    // holds the zero-extended value as an i64.
	    SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
	    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, N0,
	                                  StackSlot, MachinePointerInfo());
	    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
	                                  OffsetSlot, MachinePointerInfo());
	    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
	    return Fild;
	  }

	  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
	  SDValue ValueToStore = N0;
	  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
	    // Bitcasting to f64 here allows us to do a single 64-bit store from
	    // an SSE register, avoiding the store forwarding penalty that would come
	    // with two 32-bit stores.
	    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
	  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
	                               MachinePointerInfo());
	  // For i64 source, we need to add the appropriate power of 2 if the input
	  // was negative.  This is the same as the optimization in
	  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
	  // we must be careful to do the computation in x87 extended precision, not
	  // in SSE. (The generic code can't know it's OK to do this, or how to.)
	  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
	  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
	      MachineMemOperand::MOLoad, 8, 8);

	  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
	  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
	  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
	                                         MVT::i64, MMO);

	  // Bit pattern of the f32 fudge value that is added when the sign bit is set.
	  APInt FF(32, 0x5F800000ULL);

	  // Check whether the sign bit is set.
	  SDValue SignSet = DAG.getSetCC(
	      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
	      N0, DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

	  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
	  SDValue FudgePtr = DAG.getConstantPool(
	      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);

	  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
	  SDValue Zero = DAG.getIntPtrConstant(0, dl);
	  SDValue Four = DAG.getIntPtrConstant(4, dl);
	  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
	  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

	  // Load the value out, extending it from f32 to f80.
	  // FIXME: Avoid the extend by constructing the right constant pool?
	  SDValue Fudge = DAG.getExtLoad(
	      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
	      /* Alignment = */ 4);
	  // Extend everything to 80 bits to force it to be done on x87.
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
	  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
	                     DAG.getIntPtrConstant(0, dl));
	}

	// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
	// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
	// just return an <SDValue(), SDValue()> pair.
	// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
	// to i16, i32 or i64, and we lower it to a legal sequence.
	// If lowered to the final integer result we return a <result, SDValue()> pair.
	// Otherwise we lower it to a sequence ending with a FIST, return a
	// <FIST, StackSlot> pair, and the caller is responsible for loading
	// the final integer result from StackSlot.
	//
	// IsReplace only matters for the unsigned-i64 fixup path on 32-bit targets:
	// when set, the two 32-bit halves are returned as a single BUILD_PAIR i64;
	// otherwise they are returned through getMergeValues.
	std::pair<SDValue,SDValue>
	X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
	                                   bool IsSigned, bool IsReplace) const {
	  SDLoc DL(Op);

	  EVT DstTy = Op.getValueType();
	  EVT TheVT = Op.getOperand(0).getValueType();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
	    // f16 must be promoted before using the lowering in this routine.
	    // fp128 does not use this lowering.
	    return std::make_pair(SDValue(), SDValue());
	  }

	  // If using FIST to compute an unsigned i64, we'll need some fixup
	  // to handle values above the maximum signed i64.  A FIST is always
	  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
	  bool UnsignedFixup = !IsSigned &&
	                       DstTy == MVT::i64 &&
	                       (!Subtarget.is64Bit() ||
	                        !isScalarFPTypeInSSEReg(TheVT));

	  if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
	    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
	    // The low 32 bits of the fist result will have the correct uint32 result.
	    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
	    DstTy = MVT::i64;
	  }

	  assert(DstTy.getSimpleVT() <= MVT::i64 &&
	         DstTy.getSimpleVT() >= MVT::i16 &&
	         "Unknown FP_TO_INT to lower!");

	  // These are really Legal.
	  if (DstTy == MVT::i32 &&
	      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
	    return std::make_pair(SDValue(), SDValue());
	  if (Subtarget.is64Bit() &&
	      DstTy == MVT::i64 &&
	      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
	    return std::make_pair(SDValue(), SDValue());

	  // We lower FP->int64 into FISTP64 followed by a load from a temporary
	  // stack slot.
	  MachineFunction &MF = DAG.getMachineFunction();
	  unsigned MemSize = DstTy.getSizeInBits()/8;
	  int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
	  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

	  unsigned Opc;
	  switch (DstTy.getSimpleVT().SimpleTy) {
	  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
	  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
	  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
	  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
	  }

	  SDValue Chain = DAG.getEntryNode();
	  SDValue Value = Op.getOperand(0);
	  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

	  if (UnsignedFixup) {
	    //
	    // Conversion to unsigned i64 is implemented with a select,
	    // depending on whether the source value fits in the range
	    // of a signed i64.  Let Thresh be the FP equivalent of
	    // 0x8000000000000000ULL.
	    //
	    //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
	    //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
	    //  Fist-to-mem64 FistSrc
	    //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
	    //  to XOR'ing the high 32 bits with Adjust.
	    //
	    // Being a power of 2, Thresh is exactly representable in all FP formats.
	    // For X87 we'd like to use the smallest FP type for this constant, but
	    // for DAG type consistency we have to match the FP operand type.

	    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
	    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
	    bool LosesInfo = false;
	    if (TheVT == MVT::f64)
	      // The rounding mode is irrelevant as the conversion should be exact.
	      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
	                              &LosesInfo);
	    else if (TheVT == MVT::f80)
	      Status = Thresh.convert(APFloat::x87DoubleExtended(),
	                              APFloat::rmNearestTiesToEven, &LosesInfo);

	    assert(Status == APFloat::opOK && !LosesInfo &&
	           "FP conversion should have been exact");

	    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

	    // A single (Value < Thresh) comparison drives both selects below.
	    SDValue Cmp = DAG.getSetCC(DL,
	                               getSetCCResultType(DAG.getDataLayout(),
	                                                  *DAG.getContext(), TheVT),
	                               Value, ThreshVal, ISD::SETLT);
	    Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
	                           DAG.getConstant(0, DL, MVT::i32),
	                           DAG.getConstant(0x80000000, DL, MVT::i32));
	    SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
	    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
	  }

	  // FIXME This causes a redundant load/store if the SSE-class value is already
	  // in memory, such as if it is on the callstack.
	  if (isScalarFPTypeInSSEReg(TheVT)) {
	    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
	    // Spill the SSE value to the stack slot and reload it with an x87 FLD.
	    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
	                         MachinePointerInfo::getFixedStack(MF, SSFI));
	    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
	    SDValue Ops[] = {
	      Chain, StackSlot, DAG.getValueType(TheVT)
	    };

	    MachineMemOperand *MMO =
	        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
	                                MachineMemOperand::MOLoad, MemSize, MemSize);
	    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
	    Chain = Value.getValue(1);
	    // Use a fresh stack slot for the integer result of the FIST.
	    SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
	    StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
	  }

	  MachineMemOperand *MMO =
	      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
	                              MachineMemOperand::MOStore, MemSize, MemSize);

	  if (UnsignedFixup) {

	    // Insert the FIST, load its result as two i32's,
	    // and XOR the high i32 with Adjust.

	    SDValue FistOps[] = { Chain, Value, StackSlot };
	    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
	                                           FistOps, DstTy, MMO);

	    SDValue Low32 =
	        DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
	    SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);

	    SDValue High32 =
	        DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
	    High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);

	    if (Subtarget.is64Bit()) {
	      // Join High32 and Low32 into a 64-bit result.
	      // (High32 << 32) | Low32
	      Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
	      High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
	      High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
	                           DAG.getConstant(32, DL, MVT::i8));
	      SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
	      return std::make_pair(Result, SDValue());
	    }

	    SDValue ResultOps[] = { Low32, High32 };

	    SDValue pair = IsReplace
	      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
	      : DAG.getMergeValues(ResultOps, DL);
	    return std::make_pair(pair, SDValue());
	  } else {
	    // Build the FP_TO_INT*_IN_MEM
	    SDValue Ops[] = { Chain, Value, StackSlot };
	    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
	                                           Ops, DstTy, MMO);
	    return std::make_pair(FIST, StackSlot);
	  }
	}

	/// Lower a vector integer extension for one of a fixed set of
	/// (result, source) type pairs.
	///
	/// Returns an empty SDValue for any other type pair. With Int256 support a
	/// single X86ISD::VZEXT is emitted; otherwise the source is unpacked low/high
	/// against a zero vector (for ISD::ZERO_EXTEND) or an undef vector (any other
	/// opcode) and the two halves are concatenated.
	static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
	                              const X86Subtarget &Subtarget) {
	  MVT VT = Op->getSimpleValueType(0);
	  SDValue In = Op->getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  SDLoc dl(Op);

	  // Bail out unless (VT, InVT) is one of the supported pairs.
	  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
	      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
	      (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
	      (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
	      (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
	      (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
	      (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
	      (VT != MVT::v32i16 || InVT != MVT::v32i8))
	    return SDValue();

	  if (Subtarget.hasInt256())
	    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);

	  // Optimize vectors in AVX mode:
	  //
	  //   v8i16 -> v8i32
	  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
	  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
	  //   Concat upper and lower parts.
	  //
	  //   v4i32 -> v4i64
	  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
	  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
	  //   Concat upper and lower parts.
	  //

	  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
	  SDValue Undef = DAG.getUNDEF(InVT);
	  // Only a zero extension needs the interleaved elements to be zero.
	  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
	  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
	  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);

	  // Half-width result type: same element type, half the element count.
	  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
	                             VT.getVectorNumElements()/2);

	  OpLo = DAG.getBitcast(HVT, OpLo);
	  OpHi = DAG.getBitcast(HVT, OpHi);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
	}

	/// Lower a zero extension whose source is a vector of i1 by selecting between
	/// splat 1 and splat 0 under the mask.
	///
	/// The select may be performed on a wider type than the result: elements of
	/// 16 bits or fewer use i32 when BWI is unavailable, and non-512-bit vectors
	/// are widened to 512 bits when VLX is unavailable. The selected value is then
	/// truncated and/or has its low subvector extracted to get back to the result
	/// type.
	static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
	                                     const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  const MVT ResVT = Op->getSimpleValueType(0);
	  SDValue Mask = Op->getOperand(0);
	  MVT MaskVT = Mask.getSimpleValueType();
	  assert(MaskVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
	  SDLoc Loc(Op);
	  unsigned EltCount = ResVT.getVectorNumElements();

	  // Use i32 elements if the result element type is i8/i16 and BWI is not
	  // supported.
	  const bool PromoteElts =
	      !Subtarget.hasBWI() &&
	      (ResVT.getVectorElementType().getSizeInBits() <= 16);
	  const MVT ExtVT = PromoteElts ? MVT::getVectorVT(MVT::i32, EltCount) : ResVT;

	  // Without VLX the select has to be done on a 512-bit vector.
	  MVT SelectVT = ExtVT;
	  if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
	    EltCount *= 512 / ExtVT.getSizeInBits();
	    MaskVT = MVT::getVectorVT(MVT::i1, EltCount);
	    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, Loc, MaskVT,
	                       DAG.getUNDEF(MaskVT), Mask,
	                       DAG.getIntPtrConstant(0, Loc));
	    SelectVT = MVT::getVectorVT(ExtVT.getVectorElementType(), EltCount);
	  }

	  SDValue Ones = DAG.getConstant(1, Loc, SelectVT);
	  SDValue Zeros = getZeroVector(SelectVT, Subtarget, DAG, Loc);

	  SDValue Result = DAG.getSelect(Loc, SelectVT, Mask, Ones, Zeros);

	  // Undo the element promotion, if any.
	  if (ResVT != ExtVT) {
	    SelectVT = MVT::getVectorVT(ResVT.getVectorElementType(), EltCount);
	    Result = DAG.getNode(ISD::TRUNCATE, Loc, SelectVT, Result);
	  }

	  // Undo the widening, if any, by taking the low subvector.
	  if (SelectVT != ResVT)
	    Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, Loc, ResVT, Result,
	                         DAG.getIntPtrConstant(0, Loc));

	  return Result;
	}

	/// Lower a vector zero extension.
	///
	/// Sources with i1 elements go to LowerZERO_EXTEND_Mask. Otherwise, when the
	/// subtarget has Fp256, LowerAVXExtend is tried. If neither produces a value,
	/// an empty SDValue is returned.
	static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  SDValue In = Op.getOperand(0);
	  MVT SVT = In.getSimpleValueType();

	  if (SVT.getVectorElementType() == MVT::i1)
	    return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);

	  if (Subtarget.hasFp256())
	    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
	      return Res;

	  // Reaching this point with a 128-bit source, a 256-bit result and equal
	  // element counts is not expected.
	  assert((!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() ||
	          Op.getSimpleValueType().getVectorNumElements() !=
	              SVT.getVectorNumElements()) &&
	         "Unhandled 128-bit to 256-bit zero extend with equal element count");
	  return SDValue();
	}

	/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
	/// It makes use of the fact that vectors with enough leading sign/zero bits
	/// prevent the PACKSS/PACKUS from saturating the results.
	/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
	/// within each 128-bit lane.
	///
	/// \p Opcode must be X86ISD::PACKSS or X86ISD::PACKUS. Returns \p In unchanged
	/// when it already has type \p DstVT, and an empty SDValue when the subtarget
	/// lacks SSE2 or has AVX512, or when the sizes are unsupported (destination
	/// not a multiple of 128 bits, or source not a multiple of 256 bits).
	static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
	                                      const SDLoc &DL, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
	         "Unexpected PACK opcode");

	  // Requires SSE2 but AVX512 has fast truncate.
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
	    return SDValue();

	  EVT SrcVT = In.getValueType();

	  // No truncation required, we might get here due to recursive calls.
	  if (SrcVT == DstVT)
	    return In;

	  // We only support vector truncation to 128bits or greater from a
	  // 256bits or greater source.
	  unsigned DstSizeInBits = DstVT.getSizeInBits();
	  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
	  if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
	    return SDValue();

	  LLVMContext &Ctx = *DAG.getContext();
	  unsigned NumElems = SrcVT.getVectorNumElements();
	  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
	  assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");

	  // Scalar type after one halving step.
	  EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);

	  // Extract lower/upper subvectors.
	  unsigned NumSubElts = NumElems / 2;
	  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
	  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

	  // Pack to the largest type possible:
	  // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
	  EVT InVT = MVT::i16, OutVT = MVT::i8;
	  if (DstVT.getScalarSizeInBits() > 8 &&
	      (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
	    InVT = MVT::i32;
	    OutVT = MVT::i16;
	  }

	  unsigned SubSizeInBits = SrcSizeInBits / 2;
	  InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
	  OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());

	  // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
	  if (SrcVT.is256BitVector()) {
	    Lo = DAG.getBitcast(InVT, Lo);
	    Hi = DAG.getBitcast(InVT, Hi);
	    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
	    return DAG.getBitcast(DstVT, Res);
	  }

	  // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
	  // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
	  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
	    Lo = DAG.getBitcast(InVT, Lo);
	    Hi = DAG.getBitcast(InVT, Hi);
	    SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);

	    // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
	    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
	    Res = DAG.getBitcast(MVT::v4i64, Res);
	    Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});

	    if (DstVT.is256BitVector())
	      return DAG.getBitcast(DstVT, Res);

	    // If 512bit -> 128bit truncate another stage.
	    EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
	    Res = DAG.getBitcast(PackedVT, Res);
	    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
	  }

	  // Recursively pack lower/upper subvectors, concat result and pack again.
	  assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
	  EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
	  Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
	  Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);

	  PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
	  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
	  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
	}

	/// Lower a truncation whose result is a vector of i1.
	///
	/// The least significant bit of each source element is moved into the sign
	/// position (unless every bit of each element is already known to be a copy
	/// of the sign bit) and then turned into a mask with X86ISD::CVT2MASK (i8/i16
	/// elements with BWI) or X86ISD::TESTM (all other cases, after sign-extending
	/// i8/i16 elements to a wider element type).
	static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {

	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  MVT InVT = In.getSimpleValueType();

	  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

	  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
	  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
	  if (InVT.getScalarSizeInBits() <= 16) {
	    if (Subtarget.hasBWI()) {
	      // legal, will go to VPMOVB2M, VPMOVW2M
	      if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
	        // We need to shift to get the lsb into sign position.
	        // Shift packed bytes not supported natively, bitcast to word
	        MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
	        In = DAG.getNode(ISD::SHL, DL, ExtVT,
	                         DAG.getBitcast(ExtVT, In),
	                         DAG.getConstant(ShiftInx, DL, ExtVT));
	        In = DAG.getBitcast(InVT, In);
	      }
	      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, In);
	    }
	    // Use TESTD/Q, extended vector to packed dword/qword.
	    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
	           "Unexpected vector type.");
	    unsigned NumElts = InVT.getVectorNumElements();
	    // With VLX extend to i32 elements; otherwise pick the element width that
	    // makes the extended vector 512 bits wide.
	    MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
	    MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
	    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
	    InVT = ExtVT;
	    ShiftInx = InVT.getScalarSizeInBits() - 1;
	  }

	  if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
	    // We need to shift to get the lsb into sign position.
	    In = DAG.getNode(ISD::SHL, DL, InVT, In,
	                     DAG.getConstant(ShiftInx, DL, InVT));
	  }
	  return DAG.getNode(X86ISD::TESTM, DL, VT, In, In);
	}

	/// Lower a vector truncation.
	///
	/// Strategies are tried in this order:
	///  1. i1 result elements: LowerTruncateVecI1.
	///  2. AVX-512: X86ISD::VTRUNC (v16i16 sources are first sign-extended to
	///     v16i32 when BWI is unavailable).
	///  3. PACKSS / PACKUS via truncateVectorWithPACK, when enough sign bits or
	///     leading zero bits are known.
	///  4. Dedicated shuffle sequences for v4i64 -> v4i32 and v8i32 -> v8i16.
	///  5. A generic shuffle for any other 256-bit to 128-bit truncation.
	/// An empty SDValue is returned if none applies.
	SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  unsigned InNumEltBits = InVT.getScalarSizeInBits();

	  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
	         "Invalid TRUNCATE operation");

	  if (VT.getVectorElementType() == MVT::i1)
	    return LowerTruncateVecI1(Op, DAG, Subtarget);

	  // vpmovqb/w/d, vpmovdb/w, vpmovwb
	  if (Subtarget.hasAVX512()) {
	    // word to byte only under BWI
	    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
	      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
	                         getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
	    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
	  }

	  // Truncate with PACKSS if we are truncating a vector with sign-bits that
	  // extend all the way to the packed/truncated value.
	  unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
	  if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
	    if (SDValue V =
	            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
	      return V;

	  // Truncate with PACKUS if we are truncating a vector with leading zero bits
	  // that extend all the way to the packed/truncated value.
	  // Pre-SSE41 we can only use PACKUSWB.
	  KnownBits Known;
	  DAG.computeKnownBits(In, Known);
	  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
	  if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
	    if (SDValue V =
	            truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
	      return V;

	  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
	    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
	    if (Subtarget.hasInt256()) {
	      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
	      In = DAG.getBitcast(MVT::v8i32, In);
	      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
	                         DAG.getIntPtrConstant(0, DL));
	    }

	    // Otherwise split into two v2i64 halves and pick the even i32 elements of
	    // each with a two-input shuffle.
	    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                               DAG.getIntPtrConstant(0, DL));
	    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                               DAG.getIntPtrConstant(2, DL));
	    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
	    static const int ShufMask[] = {0, 2, 4, 6};
	    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
	  }

	  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
	    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
	    if (Subtarget.hasInt256()) {
	      In = DAG.getBitcast(MVT::v32i8, In);

	      // The PSHUFB mask:
	      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
	                                      -1, -1, -1, -1, -1, -1, -1, -1,
	                                      16, 17, 20, 21, 24, 25, 28, 29,
	                                      -1, -1, -1, -1, -1, -1, -1, -1 };
	      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
	      In = DAG.getBitcast(MVT::v4i64, In);

	      // Gather the two meaningful 64-bit chunks into the low 128 bits.
	      static const int ShufMask2[] = {0,  2,  -1,  -1};
	      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
	      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                       DAG.getIntPtrConstant(0, DL));
	      return DAG.getBitcast(VT, In);
	    }

	    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	                               DAG.getIntPtrConstant(0, DL));

	    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	                               DAG.getIntPtrConstant(4, DL));

	    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
	    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);

	    // The PSHUFB mask:
	    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
	                                   -1, -1, -1, -1, -1, -1, -1, -1};

	    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
	    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

	    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

	    // The MOVLHPS Mask:
	    static const int ShufMask2[] = {0, 1, 4, 5};
	    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
	    return DAG.getBitcast(MVT::v8i16, res);
	  }

	  // Handle truncation of V256 to V128 using shuffles.
	  if (!VT.is128BitVector() || !InVT.is256BitVector())
	    return SDValue();

	  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");

	  unsigned NumElems = VT.getVectorNumElements();
	  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

	  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
	  // Prepare truncation shuffle mask
	  for (unsigned i = 0; i != NumElems; ++i)
	    MaskVec[i] = i * 2;
	  In = DAG.getBitcast(NVT, In);
	  SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
	                     DAG.getIntPtrConstant(0, DL));
	}

	/// Lower a floating-point to integer conversion; the conversion is treated as
	/// signed when the opcode of \p Op is ISD::FP_TO_SINT and unsigned otherwise.
	///
	/// Vector results: v2f64 -> v2i1 is converted on a wider integer vector, then
	/// truncated and has its low v2i1 extracted; v2f32 -> v2i64 widens the source
	/// to v4f32; any other vector case returns an empty SDValue.
	/// Scalar results are delegated to FP_TO_INTHelper: if it produces nothing,
	/// \p Op is returned unchanged; if it returns a stack slot, the result is
	/// loaded from it; otherwise the value it produced is returned directly.
	SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
	  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
	  MVT VT = Op.getSimpleValueType();

	  if (VT.isVector()) {
	    SDValue Src = Op.getOperand(0);
	    SDLoc dl(Op);

	    if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
	      MVT ResVT = MVT::v4i32;
	      MVT TruncVT = MVT::v4i1;
	      unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
	      if (!IsSigned && !Subtarget.hasVLX()) {
	        // Widen to 512-bits.
	        ResVT = MVT::v8i32;
	        TruncVT = MVT::v8i1;
	        Opc = ISD::FP_TO_UINT;
	        Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
	                          DAG.getUNDEF(MVT::v8f64),
	                          Src, DAG.getIntPtrConstant(0, dl));
	      }
	      SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
	      Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
	                         DAG.getIntPtrConstant(0, dl));
	    }

	    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
	    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
	      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
	                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	                                     DAG.getUNDEF(MVT::v2f32)));
	    }

	    return SDValue();
	  }

	  assert(!VT.isVector());

	  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
	    IsSigned, /*IsReplace=*/ false);
	  SDValue FIST = Vals.first, StackSlot = Vals.second;
	  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
	  if (!FIST.getNode())
	    return Op;

	  if (StackSlot.getNode())
	    // Load the result.
	    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());

	  // The node is the result.
	  return FIST;
	}

	/// Lower a floating-point extension whose source is v2f32.
	///
	/// The source is widened to v4f32 (upper two lanes undefined) and fed to
	/// X86ISD::VFPEXT, which produces the result type of \p Op.
	static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
	  SDLoc Loc(Op);
	  const MVT ResVT = Op.getSimpleValueType();
	  SDValue Src = Op.getOperand(0);
	  const MVT SrcVT = Src.getSimpleValueType();

	  assert(SrcVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

	  // Pad the two source lanes out to a full v4f32.
	  SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, Loc, MVT::v4f32, Src,
	                                DAG.getUNDEF(SrcVT));
	  return DAG.getNode(X86ISD::VFPEXT, Loc, ResVT, Widened);
	}

	/// Lower ISD::FABS or ISD::FNEG to an X86 FP logic node with a constant mask.
	///
	/// The only differences between FABS and FNEG are the mask and the logic op:
	/// FABS ANDs with 0x7f..., FNEG XORs with 0x80.... FNEG also has a folding
	/// opportunity for FNEG(FABS(x)), which becomes an OR with the sign mask.
	/// Vector and f128 operands are operated on directly; other scalars are moved
	/// into a 128-bit vector, operated on, and extracted again.
	static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
	  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
	         "Wrong opcode for lowering FABS or FNEG.");

	  bool IsFABS = (Op.getOpcode() == ISD::FABS);

	  // If this is a FABS and it has an FNEG user, bail out to fold the combination
	  // into an FNABS. We'll lower the FABS after that if it is still in use.
	  if (IsFABS) {
	    for (SDNode *User : Op->uses())
	      if (User->getOpcode() == ISD::FNEG)
	        return Op;
	  }

	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  bool IsF128 = (VT == MVT::f128);

	  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
	  // decide if we should generate a 16-byte constant mask when we only need 4 or
	  // 8 bytes for the scalar case.

	  MVT LogicVT;
	  MVT EltVT;

	  if (VT.isVector()) {
	    LogicVT = VT;
	    EltVT = VT.getVectorElementType();
	  } else if (IsF128) {
	    // SSE instructions are used for optimized f128 logical operations.
	    LogicVT = MVT::f128;
	    EltVT = VT;
	  } else {
	    // There are no scalar bitwise logical SSE/AVX instructions, so we
	    // generate a 16-byte vector constant and logic op even for the scalar case.
	    // Using a 16-byte mask allows folding the load of the mask with
	    // the logic op, so it can save (~4 bytes) on code size.
	    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
	    EltVT = VT;
	  }

	  unsigned EltBits = EltVT.getSizeInBits();
	  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
	  APInt MaskElt =
	      IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
	  const fltSemantics &Sem =
	      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
	      (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
	  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

	  // FNEG(FABS(x)) is folded into a single OR on x.
	  SDValue Op0 = Op.getOperand(0);
	  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
	  unsigned LogicOp =
	      IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
	  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

	  if (VT.isVector() || IsF128)
	    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

	  // For the scalar case extend to a 128-bit vector, perform the logic op,
	  // and extract the scalar result back out.
	  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
	  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Lower FCOPYSIGN as (Mag & ~SignMask) | (Sign & SignMask) using X86 FP logic
	/// nodes.
	///
	/// The sign operand is first extended or rounded to the result type. Scalar
	/// f32/f64 values are processed as 16-byte vectors and the low element is
	/// extracted at the end; f128 and real vector types are processed directly.
	/// A constant magnitude has its sign cleared at compile time instead of
	/// emitting an AND.
	static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
	  SDValue Mag = Op.getOperand(0);
	  SDValue Sign = Op.getOperand(1);
	  SDLoc dl(Op);

	  // If the sign operand is smaller, extend it first.
	  MVT VT = Op.getSimpleValueType();
	  if (Sign.getSimpleValueType().bitsLT(VT))
	    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

	  // And if it is bigger, shrink it first.
	  if (Sign.getSimpleValueType().bitsGT(VT))
	    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));

	  // At this point the operands and the result should have the same
	  // type, and that won't be f80 since that is not custom lowered.
	  bool IsF128 = (VT == MVT::f128);
	  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
	          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
	          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
	         "Unexpected type in LowerFCOPYSIGN");

	  MVT EltVT = VT.getScalarType();
	  const fltSemantics &Sem =
	      EltVT == MVT::f64 ? APFloat::IEEEdouble()
	                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());

	  // Perform all scalar logic operations as 16-byte vectors because there are no
	  // scalar FP logic instructions in SSE.
	  // TODO: This isn't necessary. If we used scalar types, we might avoid some
	  // unnecessary splats, but we might miss load folding opportunities. Should
	  // this decision be based on OptimizeForSize?
	  bool IsFakeVector = !VT.isVector() && !IsF128;
	  MVT LogicVT = VT;
	  if (IsFakeVector)
	    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

	  // The mask constants are automatically splatted for vector types.
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  SDValue SignMask = DAG.getConstantFP(
	      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
	  SDValue MagMask = DAG.getConstantFP(
	      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);

	  // First, clear all bits but the sign bit from the second operand (sign).
	  if (IsFakeVector)
	    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
	  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

	  // Next, clear the sign bit from the first operand (magnitude).
	  // TODO: If we had general constant folding for FP logic ops, this check
	  // wouldn't be necessary.
	  SDValue MagBits;
	  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
	    APFloat APF = Op0CN->getValueAPF();
	    APF.clearSign();
	    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
	  } else {
	    // If the magnitude operand wasn't a constant, we need to AND out the sign.
	    if (IsFakeVector)
	      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
	    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
	  }

	  // OR the magnitude value with the sign bit.
	  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
	  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
	                                          DAG.getIntPtrConstant(0, dl));
	}

	/// Lower ISD::FGETSIGN of an f32/f64 value to (AND (X86ISD::MOVMSK vec), 1):
	/// the scalar is placed in a 128-bit vector, MOVMSK produces an i32, that
	/// value is zero-extended or truncated to the result type, and bit 0 is kept.
	static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
	  SDValue N0 = Op.getOperand(0);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  MVT OpVT = N0.getSimpleValueType();
	  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
	         "Unexpected type for FGETSIGN");

	  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
	  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
	  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
	  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
	  Res = DAG.getZExtOrTrunc(Res, dl, VT);
	  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
	  return Res;
	}

	// Check whether an OR'd tree is PTEST-able.
	//
	// Walks the tree of ISD::OR nodes rooted at Op. Every leaf must be an
	// EXTRACT_VECTOR_ELT with a constant index from a 128- or 256-bit vector, all
	// source vectors must share one type, and every element of every source
	// vector must be extracted. On success the source vectors are bitcast to
	// v2i64/v4i64, OR'd together and fed to X86ISD::PTEST; otherwise an empty
	// SDValue is returned. Requires SSE4.1 and a single use of Op.
	static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  if (!Op->hasOneUse())
	    return SDValue();

	  SDNode *N = Op.getNode();
	  SDLoc DL(N);

	  SmallVector<SDValue, 8> Opnds;
	  // Maps each source vector to a bitmask of the element indices seen so far.
	  // A 64-bit mask is used so that 32-element vectors (e.g. v32i8) do not
	  // require an out-of-range shift.
	  DenseMap<SDValue, uint64_t> VecInMap;
	  SmallVector<SDValue, 8> VecIns;
	  EVT VT = MVT::Other;

	  // Recognize a special case where a vector is casted into wide integer to
	  // test all 0s.
	  Opnds.push_back(N->getOperand(0));
	  Opnds.push_back(N->getOperand(1));

	  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
	    // Copy the operand out by value: push_back below may reallocate Opnds,
	    // which would invalidate an iterator or reference into it.
	    SDValue Cur = Opnds[Slot];
	    // BFS traverse all OR'd operands.
	    if (Cur.getOpcode() == ISD::OR) {
	      Opnds.push_back(Cur.getOperand(0));
	      Opnds.push_back(Cur.getOperand(1));
	      // Re-evaluate the number of nodes to be traversed.
	      e += 2; // 2 more nodes (LHS and RHS) are pushed.
	      continue;
	    }

	    // Quit if a non-EXTRACT_VECTOR_ELT
	    if (Cur.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // Quit if without a constant index.
	    SDValue Idx = Cur.getOperand(1);
	    if (!isa<ConstantSDNode>(Idx))
	      return SDValue();

	    SDValue ExtractedFromVec = Cur.getOperand(0);
	    DenseMap<SDValue, uint64_t>::iterator M = VecInMap.find(ExtractedFromVec);
	    if (M == VecInMap.end()) {
	      VT = ExtractedFromVec.getValueType();
	      // Quit if not 128/256-bit vector.
	      if (!VT.is128BitVector() && !VT.is256BitVector())
	        return SDValue();
	      // Quit if the element count does not fit the 64-bit tracking mask.
	      if (VT.getVectorNumElements() >= 64)
	        return SDValue();
	      // Quit if not the same type.
	      if (VecInMap.begin() != VecInMap.end() &&
	          VT != VecInMap.begin()->first.getValueType())
	        return SDValue();
	      M = VecInMap.insert(std::make_pair(ExtractedFromVec, uint64_t(0))).first;
	      VecIns.push_back(ExtractedFromVec);
	    }

	    // All tracked vectors share type VT at this point. Quit on an
	    // out-of-range index rather than shifting by it.
	    uint64_t EltIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
	    if (EltIdx >= VT.getVectorNumElements())
	      return SDValue();
	    M->second |= uint64_t(1) << EltIdx;
	  }

	  assert((VT.is128BitVector() || VT.is256BitVector()) &&
	         "Not extracted from 128-/256-bit vector.");

	  // The element count is known to be below 64 here, so the shift is defined.
	  uint64_t FullMask = (uint64_t(1) << VT.getVectorNumElements()) - 1;

	  for (DenseMap<SDValue, uint64_t>::const_iterator
	           I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
	    // Quit if not all elements are used.
	    if (I->second != FullMask)
	      return SDValue();
	  }

	  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

	  // Cast all vectors into TestVT for PTEST.
	  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
	    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);

	  // If more than one full vector is evaluated, OR them first before PTEST.
	  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
	    // Each iteration will OR 2 nodes and append the result until there is only
	    // 1 node left, i.e. the final OR'd value of all vectors.
	    SDValue LHS = VecIns[Slot];
	    SDValue RHS = VecIns[Slot + 1];
	    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
	  }

	  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
	}

	/// \brief return true if \c Op has a use that doesn't just read flags.
	///
	/// A use counts as flags-only when the user is a BRCOND, a SETCC, or a SELECT
	/// that consumes the value as operand 0. A single-use TRUNCATE user is looked
	/// through, and its own user is classified instead.
	static bool hasNonFlagsUse(SDValue Op) {
	  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
	       ++UI) {
	    // The pointer declarator and dereference had been lost in extraction;
	    // User must be a pointer for the -> accesses below.
	    SDNode *User = *UI;
	    unsigned UOpNo = UI.getOperandNo();
	    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
	      // Look past truncate.
	      UOpNo = User->use_begin().getOperandNo();
	      User = *User->use_begin();
	    }

	    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
	        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
	      return true;
	  }
	  return false;
	}

	-// Emit a KTEST node for bit vectors on AVX-512: when Op is a BITCAST of an i1-element vector whose size is 8/16 bits (with DQI) or 32/64 bits (with BWI), return X86ISD::KTEST on that vector; otherwise return an empty SDValue.
	-static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
	- const X86Subtarget &Subtarget) {
	- if (Op.getOpcode() == ISD::BITCAST) {
	- auto hasKTEST = [&](MVT VT) {
	- unsigned SizeInBits = VT.getSizeInBits();
	- return (Subtarget.hasDQI() && (SizeInBits == 8 \|\| SizeInBits == 16)) \|\|
	- (Subtarget.hasBWI() && (SizeInBits == 32 \|\| SizeInBits == 64));
	- };
	- SDValue Op0 = Op.getOperand(0);
	- MVT Op0VT = Op0.getValueType().getSimpleVT();
	- if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
	- hasKTEST(Op0VT))
	- return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
	- }
	- return SDValue();
	-}
	-
	/// Emit nodes that will be selected as "test Op0,Op0", or something
	/// equivalent.
	///
	/// \p Op is the value to compare against zero and \p X86CC is the condition
	/// code that will consume the resulting flags. The function returns a value
	/// carrying EFLAGS (MVT::i32): either an X86ISD::CMP of \p Op with 0, the
	/// flag result (result #1) of an existing or newly built flag-setting
	/// X86ISD arithmetic node, or the node produced by LowerVectorAllZeroTest.
	/// When a new flag-setting node is built, all uses of \p Op are replaced
	/// with it.
	SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
	SelectionDAG &DAG) const {
	// An i1 operand is zero-extended to i8 and compared with 0 directly.
	if (Op.getValueType() == MVT::i1) {
	SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
	DAG.getConstant(0, dl, MVT::i8));
	}
	// CF and OF aren't always set the way we want. Determine which
	// of these we need.
	bool NeedCF = false;
	bool NeedOF = false;
	switch (X86CC) {
	default: break;
	case X86::COND_A: case X86::COND_AE:
	case X86::COND_B: case X86::COND_BE:
	NeedCF = true;
	break;
	case X86::COND_G: case X86::COND_GE:
	case X86::COND_L: case X86::COND_LE:
	case X86::COND_O: case X86::COND_NO: {
	// Check if we really need to set the
	// Overflow flag. If NoSignedWrap is present
	// that is not actually needed.
	switch (Op->getOpcode()) {
	case ISD::ADD:
	case ISD::SUB:
	case ISD::MUL:
	case ISD::SHL:
	if (Op.getNode()->getFlags().hasNoSignedWrap())
	break;
	LLVM_FALLTHROUGH;
	default:
	NeedOF = true;
	break;
	}
	break;
	}
	}
	// See if we can use the EFLAGS value from the operand instead of
	// doing a separate TEST. TEST always sets OF and CF to 0, so unless
	// we prove that the arithmetic won't overflow, we can't use OF or CF.
	if (Op.getResNo() != 0 \|\| NeedOF \|\| NeedCF) {
	- // Emit KTEST for bit vectors
	- if (auto Node = EmitKTEST(Op, DAG, Subtarget))
	- return Node;
	// Emit a CMP with 0, which is the TEST pattern.
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, Op.getValueType()));
	}
	// Opcode stays 0 unless a flag-setting X86ISD replacement is chosen below;
	// NumOperands is how many operands of Op are copied into that replacement.
	unsigned Opcode = 0;
	unsigned NumOperands = 0;

	// Truncate operations may prevent the merge of the SETCC instruction
	// and the arithmetic instruction before it. Attempt to truncate the operands
	// of the arithmetic instruction and use a reduced bit-width instruction.
	bool NeedTruncation = false;
	SDValue ArithOp = Op;
	if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
	SDValue Arith = Op->getOperand(0);
	// Both the trunc and the arithmetic op need to have one user each.
	if (Arith->hasOneUse())
	switch (Arith.getOpcode()) {
	default: break;
	case ISD::ADD:
	case ISD::SUB:
	case ISD::AND:
	case ISD::OR:
	case ISD::XOR: {
	NeedTruncation = true;
	ArithOp = Arith;
	}
	}
	}

	// Sometimes flags can be set either with an AND or with an SRL/SHL
	// instruction. SRL/SHL variant should be preferred for masks longer than this
	// number of bits.
	const int ShiftToAndMaxMaskWidth = 32;
	// True when only the zero flag matters (equal / not-equal conditions).
	const bool ZeroCheck = (X86CC == X86::COND_E \|\| X86CC == X86::COND_NE);

	// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
	// which may be the result of a CAST. We use the variable 'Op', which is the
	// non-casted variable when we check for possible users.
	switch (ArithOp.getOpcode()) {
	case ISD::ADD:
	// We only want to rewrite this as a target-specific node with attached
	// flags if there is a reasonable chance of either using that to do custom
	// instructions selection that can fold some of the memory operands, or if
	// only the flags are used. If there are other uses, leave the node alone
	// and emit a test instruction.
	for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
	UE = Op.getNode()->use_end(); UI != UE; ++UI)
	if (UI->getOpcode() != ISD::CopyToReg &&
	UI->getOpcode() != ISD::SETCC &&
	UI->getOpcode() != ISD::STORE)
	goto default_case;

	if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
	// An add of one will be selected as an INC.
	if (C->isOne() &&
	(!Subtarget.slowIncDec() \|\|
	DAG.getMachineFunction().getFunction().optForSize())) {
	Opcode = X86ISD::INC;
	NumOperands = 1;
	break;
	}

	// An add of negative one (subtract of one) will be selected as a DEC.
	if (C->isAllOnesValue() &&
	(!Subtarget.slowIncDec() \|\|
	DAG.getMachineFunction().getFunction().optForSize())) {
	Opcode = X86ISD::DEC;
	NumOperands = 1;
	break;
	}
	}

	// Otherwise use a regular EFLAGS-setting add.
	Opcode = X86ISD::ADD;
	NumOperands = 2;
	break;
	case ISD::SHL:
	case ISD::SRL:
	// If we have a constant logical shift that's only used in a comparison
	// against zero turn it into an equivalent AND. This allows turning it into
	// a TEST instruction later.
	if (ZeroCheck && Op->hasOneUse() &&
	isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
	EVT VT = Op.getValueType();
	unsigned BitWidth = VT.getSizeInBits();
	unsigned ShAmt = Op->getConstantOperandVal(1);
	if (ShAmt >= BitWidth) // Avoid undefined shifts.
	break;
	// The mask keeps exactly the source bits that survive the shift.
	APInt Mask = ArithOp.getOpcode() == ISD::SRL
	? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
	: APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
	if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
	break;
	Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
	DAG.getConstant(Mask, dl, VT));
	}
	break;

	case ISD::AND:
	// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
	// because a TEST instruction will be better. However, AND should be
	// preferred if the instruction can be combined into ANDN.
	if (!hasNonFlagsUse(Op)) {
	SDValue Op0 = ArithOp->getOperand(0);
	SDValue Op1 = ArithOp->getOperand(1);
	EVT VT = ArithOp.getValueType();
	bool isAndn = isBitwiseNot(Op0) \|\| isBitwiseNot(Op1);
	bool isLegalAndnType = VT == MVT::i32 \|\| VT == MVT::i64;
	bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();

	// If we cannot select an ANDN instruction, check if we can replace
	// AND+IMM64 with a shift before giving up. This is possible for masks
	// like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
	if (!isProperAndn) {
	if (!ZeroCheck)
	break;

	assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
	auto *CN = dyn_cast<ConstantSDNode>(Op1);
	if (!CN)
	break;

	const APInt &Mask = CN->getAPIntValue();
	if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
	break; // Prefer TEST instruction.

	unsigned BitWidth = Mask.getBitWidth();
	unsigned LeadingOnes = Mask.countLeadingOnes();
	unsigned TrailingZeros = Mask.countTrailingZeros();

	// Mask is a run of high ones: test the high bits with a right shift.
	if (LeadingOnes + TrailingZeros == BitWidth) {
	assert(TrailingZeros < VT.getSizeInBits() &&
	"Shift amount should be less than the type width");
	MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
	Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
	break;
	}

	unsigned LeadingZeros = Mask.countLeadingZeros();
	unsigned TrailingOnes = Mask.countTrailingOnes();

	// Mask is a run of low ones: test the low bits with a left shift.
	if (LeadingZeros + TrailingOnes == BitWidth) {
	assert(LeadingZeros < VT.getSizeInBits() &&
	"Shift amount should be less than the type width");
	MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
	Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
	break;
	}

	break;
	}
	}
	LLVM_FALLTHROUGH;
	case ISD::SUB:
	case ISD::OR:
	case ISD::XOR:
	// Similar to ISD::ADD above, check if the uses will preclude useful
	// lowering of the target-specific node.
	for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
	UE = Op.getNode()->use_end(); UI != UE; ++UI)
	if (UI->getOpcode() != ISD::CopyToReg &&
	UI->getOpcode() != ISD::SETCC &&
	UI->getOpcode() != ISD::STORE)
	goto default_case;

	// Otherwise use a regular EFLAGS-setting instruction.
	switch (ArithOp.getOpcode()) {
	default: llvm_unreachable("unexpected operator!");
	case ISD::SUB: Opcode = X86ISD::SUB; break;
	case ISD::XOR: Opcode = X86ISD::XOR; break;
	case ISD::AND: Opcode = X86ISD::AND; break;
	case ISD::OR: {
	// An OR tree compared against zero may be lowered as a vector all-zero
	// test instead.
	if (!NeedTruncation && ZeroCheck) {
	if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
	return EFLAGS;
	}
	Opcode = X86ISD::OR;
	break;
	}
	}

	NumOperands = 2;
	break;
	case X86ISD::ADD:
	case X86ISD::SUB:
	case X86ISD::INC:
	case X86ISD::DEC:
	case X86ISD::OR:
	case X86ISD::XOR:
	case X86ISD::AND:
	// Already a flag-setting node: reuse its flag result (result #1).
	return SDValue(Op.getNode(), 1);
	default:
	default_case:
	break;
	}

	// If we found that truncation is beneficial, perform the truncation and
	// update 'Op'.
	if (NeedTruncation) {
	EVT VT = Op.getValueType();
	SDValue WideVal = Op->getOperand(0);
	EVT WideVT = WideVal.getValueType();
	unsigned ConvertedOp = 0;
	// Use a target machine opcode to prevent further DAGCombine
	// optimizations that may separate the arithmetic operations
	// from the setcc node.
	switch (WideVal.getOpcode()) {
	default: break;
	case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
	case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
	case ISD::AND: ConvertedOp = X86ISD::AND; break;
	case ISD::OR: ConvertedOp = X86ISD::OR; break;
	case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
	}

	if (ConvertedOp) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
	SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
	SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
	Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
	}
	}
	}

	// No flag-setting replacement was selected: compare Op with 0 explicitly.
	if (Opcode == 0) {
	- // Emit KTEST for bit vectors
	- if (auto Node = EmitKTEST(Op, DAG, Subtarget))
	- return Node;
	-
	// Emit a CMP with 0, which is the TEST pattern.
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, Op.getValueType()));
	}
	// Build the flag-setting node with two results (value, EFLAGS) from the
	// first NumOperands operands of Op, redirect the users of Op to it, and
	// hand back its EFLAGS result.
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

	SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
	DAG.ReplaceAllUsesWith(Op, New);
	return SDValue(New.getNode(), 1);
	}

	/// Emit nodes that will be selected as "cmp Op0,Op1", or something
	/// equivalent.
	///
	/// A comparison against a null constant is forwarded to EmitTest. For
	/// i8/i16/i32/i64 operands an X86ISD::SUB is built and its flag result
	/// (result #1) is returned; all other types use X86ISD::CMP. \p X86CC selects
	/// zero- versus sign-extension when a 16-bit compare is promoted to 32 bits.
	SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
	                                   const SDLoc &dl, SelectionDAG &DAG) const {
	  if (isNullConstant(Op1))
	    return EmitTest(Op0, X86CC, dl, DAG);

	  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
	         "Unexpected comparison operation for MVT::i1 operands");

	  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
	       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
	    // Only promote the compare up to I32 if it is a 16 bit operation
	    // with an immediate. 16 bit immediates are to be avoided.
	    if ((Op0.getValueType() == MVT::i16 &&
	         (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
	        !DAG.getMachineFunction().getFunction().optForMinSize() &&
	        !Subtarget.isAtom()) {
	      unsigned ExtendOp =
	          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
	      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
	      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
	    }
	    // Use SUB instead of CMP to enable CSE between SUB and CMP.
	    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
	    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
	    return SDValue(Sub.getNode(), 1);
	  }
	  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
	}

	/// Convert a comparison if required by the subtarget.
	///
	/// \p Cmp is returned unchanged when the subtarget has CMOV, when it is not
	/// an X86ISD::CMP, or when either operand is not floating point. Otherwise
	/// the FPSW result is moved into EFLAGS through FNSTSW, a shift by 8, and
	/// SAHF.
	SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
	                                                 SelectionDAG &DAG) const {
	  // If the subtarget does not support the FUCOMI instruction, floating-point
	  // comparisons have to be converted.
	  if (Subtarget.hasCMov() ||
	      Cmp.getOpcode() != X86ISD::CMP ||
	      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
	      !Cmp.getOperand(1).getValueType().isFloatingPoint())
	    return Cmp;

	  // The instruction selector will select an FUCOM instruction instead of
	  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
	  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
	  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
	  SDLoc dl(Cmp);
	  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
	  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
	  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
	                            DAG.getConstant(8, dl, MVT::i8));
	  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);

	  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
	  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
	  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
	}

	/// Decide whether replacing SQRT with an RSQRT-based sequence should be
	/// disabled for \p Op. Returns false when an FRSQRT node for the same operand
	/// already exists in the DAG; otherwise reports the subtarget's fast-sqrt
	/// capability for the vector or scalar case.
	bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
	  const EVT OpVT = Op.getValueType();

	  // We never want to use both SQRT and RSQRT instructions for the same input.
	  const bool HasRsqrtForOp =
	      DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(OpVT), Op) != nullptr;
	  if (HasRsqrtForOp)
	    return false;

	  return OpVT.isVector() ? Subtarget.hasFastVectorFSQRT()
	                         : Subtarget.hasFastScalarFSQRT();
	}

	/// Return an X86ISD::FRSQRT estimate node for \p Op, or an empty SDValue when
	/// no estimate is available for its type on this subtarget.
	///
	/// The minimum architected relative accuracy is 2^-12. We need one
	/// Newton-Raphson step to have a good float result (24 bits of precision), so
	/// \p RefinementSteps is set to 1 when the caller left it unspecified.
	/// \p UseOneConstNR is set to false whenever an estimate is returned.
	SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
	                                           SelectionDAG &DAG, int Enabled,
	                                           int &RefinementSteps,
	                                           bool &UseOneConstNR,
	                                           bool Reciprocal) const {
	  EVT VT = Op.getValueType();

	  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
	  // TODO: Add support for AVX512 (v16f32).
	  // It is likely not profitable to do this for f64 because a double-precision
	  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
	  // instructions: convert to single, rsqrtss, convert back to double, refine
	  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
	  // along with FMA, this could be a throughput win.
	  // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
	  // after legalize types.
	  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
	      (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
	      (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
	      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
	    if (RefinementSteps == ReciprocalEstimate::Unspecified)
	      RefinementSteps = 1;

	    UseOneConstNR = false;
	    return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
	  }
	  return SDValue();
	}

	/// Return an X86ISD::FRCP estimate node for \p Op, or an empty SDValue when
	/// no estimate should be used.
	///
	/// The minimum architected relative accuracy is 2^-12. We need one
	/// Newton-Raphson step to have a good float result (24 bits of precision), so
	/// \p RefinementSteps is set to 1 when the caller left it unspecified. Scalar
	/// f32 estimates are only produced when \p Enabled is not Unspecified.
	SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
	                                            int Enabled,
	                                            int &RefinementSteps) const {
	  EVT VT = Op.getValueType();

	  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
	  // TODO: Add support for AVX512 (v16f32).
	  // It is likely not profitable to do this for f64 because a double-precision
	  // reciprocal estimate with refinement on x86 prior to FMA requires
	  // 15 instructions: convert to single, rcpss, convert back to double, refine
	  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
	  // along with FMA, this could be a throughput win.

	  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
	      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
	      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
	    // Enable estimate codegen with 1 refinement step for vector division.
	    // Scalar division estimates are disabled because they break too much
	    // real-world code. These defaults are intended to match GCC behavior.
	    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
	      return SDValue();

	    if (RefinementSteps == ReciprocalEstimate::Unspecified)
	      RefinementSteps = 1;

	    return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
	  }
	  return SDValue();
	}

	/// Minimum number of divisions sharing one divisor before they are rewritten
	/// as multiplications by a reciprocal. The threshold is two: one division is
	/// still needed to compute the reciprocal, and two multiplies then replace the
	/// original divisions, so this only pays off when a division costs at least
	/// twice as much as a multiplication. It may need adjusting for a given CPU.
	unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
	  const unsigned MinDivisorUses = 2;
	  return MinDivisorUses;
	}

	/// Build an i8 X86ISD::SETCC node whose operands are the condition code
	/// \p Cond (as an i8 constant) and the flags value \p EFLAGS.
	static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
	                        SelectionDAG &DAG) {
	  SDValue CondConst = DAG.getConstant(Cond, dl, MVT::i8);
	  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CondConst, EFLAGS);
	}

	/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
	/// according to equal/not-equal condition code \p CC.
	///
	/// Returns a SETCC on the BT flags: COND_AE when \p CC is ISD::SETEQ, COND_B
	/// otherwise.
	static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
	                                   const SDLoc &dl, SelectionDAG &DAG) {
	  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
	  // instruction. Since the shift amount is in-range-or-undefined, we know
	  // that doing a bittest on the i32 value is ok. We extend to i32 because
	  // the encoding for the i16 version is larger than the i32 version.
	  // Also promote i16 to i32 for performance / code size reason.
	  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
	    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

	  // See if we can use the 32-bit instruction instead of the 64-bit one for a
	  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
	  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
	  // known to be zero.
	  if (Src.getValueType() == MVT::i64 &&
	      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
	    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

	  // If the operand types disagree, extend the shift amount to match. Since
	  // BT ignores high bits (like shifts) we can use anyextend.
	  if (Src.getValueType() != BitNo.getValueType())
	    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

	  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
	  X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
	  return getSETCC(Cond, BT, dl, DAG);
	}

	/// The result of an 'and' is being compared against zero; rewrite it as a BT
	/// (bit test) when the AND has one of these shapes (after looking through a
	/// TRUNCATE on either operand):
	///   - and (shl 1, n), x        -> bt x, n
	///   - and (srl x, n), 1        -> bt x, n
	///   - and x, C  with C a power of two that does not fit in 32 bits
	///                              -> bt x, log2(C)
	/// Returns an empty SDValue when no pattern matches.
	static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
	                            const SDLoc &dl, SelectionDAG &DAG) {
	  assert(And.getOpcode() == ISD::AND && "Expected AND node!");

	  auto PeekThroughTrunc = [](SDValue V) {
	    return V.getOpcode() == ISD::TRUNCATE ? V.getOperand(0) : V;
	  };
	  SDValue A = PeekThroughTrunc(And.getOperand(0));
	  SDValue B = PeekThroughTrunc(And.getOperand(1));

	  // Put a shift-left, if there is one, into A.
	  if (B.getOpcode() == ISD::SHL)
	    std::swap(A, B);

	  SDValue Src, BitNo;
	  if (A.getOpcode() == ISD::SHL) {
	    if (isOneConstant(A.getOperand(0))) {
	      // If we looked past a truncate, check that it's only truncating away
	      // known zeros.
	      unsigned ShlWidth = A.getValueSizeInBits();
	      unsigned AndWidth = And.getValueSizeInBits();
	      if (ShlWidth > AndWidth) {
	        KnownBits Known;
	        DAG.computeKnownBits(A, Known);
	        if (Known.countMinLeadingZeros() < ShlWidth - AndWidth)
	          return SDValue();
	      }
	      Src = B;
	      BitNo = A.getOperand(1);
	    }
	  } else if (B.getOpcode() == ISD::Constant) {
	    uint64_t MaskVal = cast<ConstantSDNode>(B)->getZExtValue();

	    if (MaskVal == 1 && A.getOpcode() == ISD::SRL) {
	      Src = A.getOperand(0);
	      BitNo = A.getOperand(1);
	    }

	    // Use BT if the immediate can't be encoded in a TEST instruction.
	    if (!isUInt<32>(MaskVal) && isPowerOf2_64(MaskVal)) {
	      Src = A;
	      BitNo = DAG.getConstant(Log2_64_Ceil(MaskVal), dl, Src.getValueType());
	    }
	  }

	  if (!Src.getNode())
	    return SDValue();

	  return getBitTestCondition(Src, BitNo, CC, dl, DAG);
	}

	/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
	/// CMPs.
	///
	/// \p Op0 and \p Op1 are taken by reference and are swapped in place when the
	/// requested predicate is only expressible with the operands reversed (for
	/// example GT is emitted as LT with swapped operands). The returned value is
	/// the numeric condition code; note that SETUEQ and SETONE return 8 and 12,
	/// which lie outside the 0-7 table documented below.
	static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
	SDValue &Op1) {
	unsigned SSECC;
	bool Swap = false;

	// SSE Condition code mapping:
	// 0 - EQ
	// 1 - LT
	// 2 - LE
	// 3 - UNORD
	// 4 - NEQ
	// 5 - NLT
	// 6 - NLE
	// 7 - ORD
	// Cases that set Swap fall through to the predicate with the opposite
	// operand order.
	switch (SetCCOpcode) {
	default: llvm_unreachable("Unexpected SETCC condition");
	case ISD::SETOEQ:
	case ISD::SETEQ: SSECC = 0; break;
	case ISD::SETOGT:
	case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETLT:
	case ISD::SETOLT: SSECC = 1; break;
	case ISD::SETOGE:
	case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETLE:
	case ISD::SETOLE: SSECC = 2; break;
	case ISD::SETUO: SSECC = 3; break;
	case ISD::SETUNE:
	case ISD::SETNE: SSECC = 4; break;
	case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETUGE: SSECC = 5; break;
	case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETUGT: SSECC = 6; break;
	case ISD::SETO: SSECC = 7; break;
	// NOTE(review): 8 and 12 are not in the table above; presumably callers
	// treat values >= 8 specially - confirm against the call sites.
	case ISD::SETUEQ: SSECC = 8; break;
	case ISD::SETONE: SSECC = 12; break;
	}
	if (Swap)
	std::swap(Op0, Op1);

	return SSECC;
	}

	/// Split a 256-bit integer vector SETCC into two 128-bit SETCCs on the low and
	/// high halves of each operand, then concatenate the two results back into
	/// the original 256-bit type.
	static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
	         "Unsupported value type for operation");

	  SDLoc dl(Op);
	  const unsigned NumElems = VT.getVectorNumElements();
	  const unsigned HalfElems = NumElems / 2;
	  SDValue CC = Op.getOperand(2);

	  // Low and high halves of the left-hand operand.
	  SDValue LHS = Op.getOperand(0);
	  SDValue LHSLo = extract128BitVector(LHS, 0, DAG, dl);
	  SDValue LHSHi = extract128BitVector(LHS, HalfElems, DAG, dl);

	  // Low and high halves of the right-hand operand.
	  SDValue RHS = Op.getOperand(1);
	  SDValue RHSLo = extract128BitVector(RHS, 0, DAG, dl);
	  SDValue RHSHi = extract128BitVector(RHS, HalfElems, DAG, dl);

	  // Compare each half in the half-width type and glue the results together.
	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfElems);
	  SDValue Lo = DAG.getNode(Op.getOpcode(), dl, HalfVT, LHSLo, RHSLo, CC);
	  SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HalfVT, LHSHi, RHSHi, CC);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	}

	/// Lower a SETCC whose operands are vectors of i1 into plain bitwise logic on
	/// the operands, according to the condition code:
	///   ==  -> ~(x ^ y)      !=  -> x ^ y
	///   >   -> x & ~y        <   -> ~x & y
	///   <=  -> ~x | y        >=  -> x | ~y
	/// Signed and unsigned orderings are lowered identically.
	static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
	  SDValue X = Op.getOperand(0);
	  SDValue Y = Op.getOperand(1);
	  SDValue CC = Op.getOperand(2);
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  assert(X.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Unexpected type for boolean compare operation");
	  ISD::CondCode Pred = cast<CondCodeSDNode>(CC)->get();

	  // Both complements are built up front, whichever predicate is requested.
	  SDValue AllOnes = DAG.getConstant(-1, dl, VT);
	  SDValue NotX = DAG.getNode(ISD::XOR, dl, VT, X, AllOnes);
	  SDValue NotY = DAG.getNode(ISD::XOR, dl, VT, Y, AllOnes);

	  switch (Pred) {
	  default:
	    llvm_unreachable("Unexpected SETCC condition");
	  case ISD::SETEQ: {
	    // (x == y) -> ~(x ^ y)
	    SDValue Diff = DAG.getNode(ISD::XOR, dl, VT, X, Y);
	    return DAG.getNode(ISD::XOR, dl, VT, Diff, AllOnes);
	  }
	  case ISD::SETNE:
	    // (x != y) -> (x ^ y)
	    return DAG.getNode(ISD::XOR, dl, VT, X, Y);
	  case ISD::SETUGT:
	  case ISD::SETGT:
	    // (x > y) -> (x & ~y)
	    return DAG.getNode(ISD::AND, dl, VT, X, NotY);
	  case ISD::SETULT:
	  case ISD::SETLT:
	    // (x < y) -> (~x & y)
	    return DAG.getNode(ISD::AND, dl, VT, NotX, Y);
	  case ISD::SETULE:
	  case ISD::SETLE:
	    // (x <= y) -> (~x | y)
	    return DAG.getNode(ISD::OR, dl, VT, NotX, Y);
	  case ISD::SETUGE:
	  case ISD::SETGE:
	    // (x >= y) -> (x | ~y)
	    return DAG.getNode(ISD::OR, dl, VT, X, NotY);
	  }
	}

	/// Lower an integer vector SETCC whose result type is a vector of i1 into an
	/// AVX-512 mask-producing node: PCMPEQM / PCMPGTM, CMPM / CMPMU carrying an
	/// immediate predicate, or TESTM / TESTNM when an AND is compared against an
	/// all-zeros vector.
	static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue CC = Op.getOperand(2);
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);

	assert(VT.getVectorElementType() == MVT::i1 &&
	"Cannot set masked compare for this operation");

	ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
	// Opc stays 0 when the compare has to go through CMPM/CMPMU with SSECC as the
	// immediate; otherwise it names a dedicated compare node.
	unsigned Opc = 0;
	bool Unsigned = false;
	bool Swap = false;
	// Left unset on the paths that pick Opc; it is only read when Opc is 0.
	unsigned SSECC;
	switch (SetCCOpcode) {
	default: llvm_unreachable("Unexpected SETCC condition");
	case ISD::SETNE: SSECC = 4; break;
	case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
	case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
	case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
	case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
	case ISD::SETULT: SSECC = 1; Unsigned = true; break;
	case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
	case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
	case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
	case ISD::SETLE: SSECC = 2; break;
	}

	if (Swap)
	std::swap(Op0, Op1);

	// See if it is the case of CMP(EQ\|NEQ,AND(A,B),ZERO) and change it to TESTM\|NM.
	// The equality form (PCMPEQM) becomes TESTNM, the inequality form becomes TESTM.
	if ((!Opc && SSECC == 4) \|\| Opc == X86ISD::PCMPEQM) {
	SDValue A = peekThroughBitcasts(Op0);
	if ((A.getOpcode() == ISD::AND \|\| A.getOpcode() == X86ISD::FAND) &&
	ISD::isBuildVectorAllZeros(Op1.getNode())) {
	MVT VT0 = Op0.getSimpleValueType();
	SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
	SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
	return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
	dl, VT, RHS, LHS);
	}
	}

	// A dedicated compare node was selected above; emit it directly.
	if (Opc)
	return DAG.getNode(Opc, dl, VT, Op0, Op1);
	// Otherwise emit the generic mask compare with the predicate as an i8 immediate.
	Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
	return DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(SSECC, dl, MVT::i8));
	}

	/// \brief Rewrite the right-hand operand of a vector SETULT so that the
	/// comparison can be expressed as SETULE instead.
	///
	/// \p Op1 must be a build vector whose elements are all non-opaque constants
	/// of the vector's element type and all non-zero; the result is a build vector
	/// with every element decremented by one. In any other case an empty SDValue
	/// is returned.
	static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
	                                      SelectionDAG &DAG) {
	  auto *BuildVec = dyn_cast<BuildVectorSDNode>(Op1.getNode());
	  if (!BuildVec)
	    return SDValue();

	  MVT VecVT = Op1.getSimpleValueType();
	  MVT EltVT = VecVT.getVectorElementType();
	  SmallVector<SDValue, 8> Decremented;

	  for (unsigned Idx = 0, NumElts = VecVT.getVectorNumElements();
	       Idx != NumElts; ++Idx) {
	    auto *C = dyn_cast<ConstantSDNode>(BuildVec->getOperand(Idx));
	    if (!C)
	      return SDValue();
	    if (C->isOpaque())
	      return SDValue();
	    if (C->getSimpleValueType(0) != EltVT)
	      return SDValue();

	    // A zero element cannot be decremented without wrapping around.
	    APInt Elt = C->getAPIntValue();
	    if (Elt == 0)
	      return SDValue();

	    Decremented.push_back(DAG.getConstant(Elt - 1, dl, EltVT));
	  }

	  return DAG.getBuildVector(VecVT, dl, Decremented);
	}

	/// Lower a vector SETCC.
	///
	/// Floating-point compares become X86ISD::CMPP (or X86ISD::CMPM when AVX-512
	/// is available and the result is a vector of i1). Integer compares are, in
	/// order: re-issued on the operand type when a 128-bit result has 256-bit
	/// operands, split in two when 256-bit without Int256, handed to the AVX-512
	/// boolean / mask helpers, lowered to XOP VPCOM / VPCOMU, or finally built
	/// from PCMPEQ / PCMPGT with an optional operand swap, sign-bit flip, result
	/// inversion, UMIN / UMAX or SUBUS.
	static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue CC = Op.getOperand(2);
	MVT VT = Op.getSimpleValueType();
	ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
	bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
	SDLoc dl(Op);

	if (isFP) {
	#ifndef NDEBUG
	MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
	assert(EltVT == MVT::f32 \|\| EltVT == MVT::f64);
	#endif

	unsigned Opc;
	if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
	assert(VT.getVectorNumElements() <= 16);
	Opc = X86ISD::CMPM;
	} else {
	Opc = X86ISD::CMPP;
	// The SSE/AVX packed FP comparison nodes are defined with a
	// floating-point vector result that matches the operand type. This allows
	// them to work with an SSE1 target (integer vector types are not legal).
	VT = Op0.getSimpleValueType();
	}

	// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
	// emit two comparisons and a logic op to tie them together.
	SDValue Cmp;
	// translateX86FSETCC receives Op0/Op1 by name and may swap them.
	unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
	if (SSECC >= 8 && !Subtarget.hasAVX()) {
	// LLVM predicate is SETUEQ or SETONE.
	unsigned CC0, CC1;
	unsigned CombineOpc;
	if (Cond == ISD::SETUEQ) {
	CC0 = 3; // UNORD
	CC1 = 0; // EQ
	CombineOpc = X86ISD::FOR;
	} else {
	assert(Cond == ISD::SETONE);
	CC0 = 7; // ORD
	CC1 = 4; // NEQ
	CombineOpc = X86ISD::FAND;
	}

	SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CC0, dl, MVT::i8));
	SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CC1, dl, MVT::i8));
	Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
	} else {
	// Handle all other FP comparisons here.
	Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(SSECC, dl, MVT::i8));
	}

	// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
	// result type of SETCC. The bitcast is expected to be optimized away
	// during combining/isel.
	if (Opc == X86ISD::CMPP)
	Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

	return Cmp;
	}

	// Everything below handles integer operands only.
	MVT VTOp0 = Op0.getSimpleValueType();
	assert(VTOp0 == Op1.getSimpleValueType() &&
	"Expected operands with same type!");
	assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
	"Invalid number of packed elements for source and destination!");

	if (VT.is128BitVector() && VTOp0.is256BitVector()) {
	// On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
	// legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
	// legalizer firstly checks if the first operand in input to the setcc has
	// a legal type. If so, then it promotes the return type to that same type.
	// Otherwise, the return type is promoted to the 'next legal type' which,
	// for a vector of MVT::i1 is always a 128-bit integer vector type.
	//
	// We reach this code only if the following two conditions are met:
	// 1. Both return type and operand type have been promoted to wider types
	// by the type legalizer.
	// 2. The original operand type has been promoted to a 256-bit vector.
	//
	// Note that condition 2. only applies for AVX targets.
	SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
	return DAG.getZExtOrTrunc(NewOp, dl, VT);
	}

	// The non-AVX512 code below works under the assumption that source and
	// destination types are the same.
	assert((Subtarget.hasAVX512() \|\| (VT == VTOp0)) &&
	"Value types for source and destination must be the same!");

	// Break 256-bit integer vector compare into smaller ones.
	if (VT.is256BitVector() && !Subtarget.hasInt256())
	return Lower256IntVSETCC(Op, DAG);

	// Operands are boolean (vectors of i1)
	MVT OpVT = Op1.getSimpleValueType();
	if (OpVT.getVectorElementType() == MVT::i1)
	return LowerBoolVSETCC_AVX512(Op, DAG);

	// The result is boolean, but operands are int/float
	if (VT.getVectorElementType() == MVT::i1) {
	// In AVX-512 architecture setcc returns mask with i1 elements,
	// But there is no compare instruction for i8 and i16 elements in KNL.
	// In this case use SSE compare
	bool UseAVX512Inst =
	(OpVT.is512BitVector() \|\|
	OpVT.getScalarSizeInBits() >= 32 \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX()));

	if (UseAVX512Inst)
	return LowerIntVSETCC_AVX512(Op, DAG);

	// Otherwise compare on the operand type and truncate the result to the mask.
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
	}

	// Lower using XOP integer comparisons.
	if ((VT == MVT::v16i8 \|\| VT == MVT::v8i16 \|\|
	VT == MVT::v4i32 \|\| VT == MVT::v2i64) && Subtarget.hasXOP()) {
	// Translate compare code to XOP PCOM compare mode.
	unsigned CmpMode = 0;
	switch (Cond) {
	default: llvm_unreachable("Unexpected SETCC condition");
	case ISD::SETULT:
	case ISD::SETLT: CmpMode = 0x00; break;
	case ISD::SETULE:
	case ISD::SETLE: CmpMode = 0x01; break;
	case ISD::SETUGT:
	case ISD::SETGT: CmpMode = 0x02; break;
	case ISD::SETUGE:
	case ISD::SETGE: CmpMode = 0x03; break;
	case ISD::SETEQ: CmpMode = 0x04; break;
	case ISD::SETNE: CmpMode = 0x05; break;
	}

	// Are we comparing unsigned or signed integers?
	unsigned Opc =
	ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

	return DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CmpMode, dl, MVT::i8));
	}

	// (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
	// Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
	if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
	SDValue BC0 = peekThroughBitcasts(Op0);
	if (BC0.getOpcode() == ISD::AND) {
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (getTargetConstantBitsFromNode(BC0.getOperand(1),
	VT.getScalarSizeInBits(), UndefElts,
	EltBits, false, false)) {
	if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
	Cond = ISD::SETEQ;
	Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
	}
	}
	}
	}

	// We are handling one of the integer comparisons here. Since SSE only has
	// GT and EQ comparisons for integer, swapping operands and multiple
	// operations may be required for some comparisons.
	unsigned Opc = (Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) ? X86ISD::PCMPEQ
	: X86ISD::PCMPGT;
	// Swap turns the LT / GE forms into GT / LE; Invert then covers NE and the
	// remaining predicates that are true when the operands are equal.
	bool Swap = Cond == ISD::SETLT \|\| Cond == ISD::SETULT \|\|
	Cond == ISD::SETGE \|\| Cond == ISD::SETUGE;
	bool Invert = Cond == ISD::SETNE \|\|
	(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

	// If both operands are known non-negative, then an unsigned compare is the
	// same as a signed compare and there's no need to flip signbits.
	// TODO: We could check for more general simplifications here since we're
	// computing known bits.
	bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
	!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

	// Special case: Use min/max operations for SETULE/SETUGE
	MVT VET = VT.getVectorElementType();
	bool HasMinMax =
	(Subtarget.hasAVX512() && VET == MVT::i64) \|\|
	(Subtarget.hasSSE41() && (VET == MVT::i16 \|\| VET == MVT::i32)) \|\|
	(Subtarget.hasSSE2() && (VET == MVT::i8));
	bool MinMax = false;
	if (HasMinMax) {
	switch (Cond) {
	default: break;
	case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
	case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
	}

	// The min/max form needs none of the generic fix-ups.
	if (MinMax)
	Swap = Invert = FlipSigns = false;
	}

	bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 \|\| VET == MVT::i16);
	bool Subus = false;
	if (!MinMax && HasSubus) {
	// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
	// Op0 u<= Op1:
	// t = psubus Op0, Op1
	// pcmpeq t, <0..0>
	switch (Cond) {
	default: break;
	case ISD::SETULT: {
	// If the comparison is against a constant we can turn this into a
	// setule. With psubus, setule does not require a swap. This is
	// beneficial because the constant in the register is no longer
	// destructed as the destination so it can be hoisted out of a loop.
	// Only do this pre-AVX since vpcmp* is no longer destructive.
	if (Subtarget.hasAVX())
	break;
	if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
	Op1 = ULEOp1;
	Subus = true; Invert = false; Swap = false;
	}
	break;
	}
	// Psubus is better than flip-sign because it requires no inversion.
	case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
	case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
	}

	if (Subus) {
	Opc = X86ISD::SUBUS;
	FlipSigns = false;
	}
	}

	if (Swap)
	std::swap(Op0, Op1);

	// Check that the operation in question is available (most are plain SSE2,
	// but PCMPGTQ and PCMPEQQ have different requirements).
	if (VT == MVT::v2i64) {
	if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
	assert(Subtarget.hasSSE2() && "Don't know how to lower!");

	// First cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations. The lower
	// compare is always unsigned.
	SDValue SB;
	if (FlipSigns) {
	SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
	} else {
	SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
	SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
	SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
	}
	Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
	Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

	// Emulate PCMPGTQ with (hi1 > hi2) \| ((hi1 == hi2) & (lo1 > lo2))
	SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
	SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

	// Create masks for only the low parts/high parts of the 64 bit integers.
	static const int MaskHi[] = { 1, 1, 3, 3 };
	static const int MaskLo[] = { 0, 0, 2, 2 };
	SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
	SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
	SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

	SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
	Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}

	if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
	// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
	// pcmpeqd + pshufd + pand.
	assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

	// First cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Do the compare.
	SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

	// Make sure the lower and upper halves are both all-ones.
	static const int Mask[] = { 1, 0, 3, 2 };
	SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
	Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}
	}

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations.
	if (FlipSigns) {
	MVT EltVT = VT.getVectorElementType();
	SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
	VT);
	Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
	Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
	}

	// Emit the selected node: PCMPEQ, PCMPGT, UMIN, UMAX or SUBUS.
	SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

	// If the logical-not of the result is required, perform that now.
	if (Invert)
	Result = DAG.getNOT(dl, Result, VT);

	// x u<= y is (x == umin(x, y)); x u>= y is (x == umax(x, y)).
	if (MinMax)
	Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

	// The SUBUS form compares the saturating difference against zero.
	if (Subus)
	Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
	getZeroVector(VT, Subtarget, DAG, dl));

	return Result;
	}

	+// Attempt to lower an equality compare of a bitcasted mask value against zero
	+// as KTEST followed by SETCC. An empty SDValue is returned when the pattern
	+// does not apply.
	+static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
	+                         const SDLoc &dl, SelectionDAG &DAG,
	+                         const X86Subtarget &Subtarget) {
	+  // Anything other than EQ / NE is out of scope.
	+  const bool IsEq = CC == ISD::SETEQ;
	+  if (!IsEq && CC != ISD::SETNE)
	+    return SDValue();
	+
	+  // The compared value has to be a bitcast (from a vXi1 mask).
	+  if (Op0.getOpcode() != ISD::BITCAST)
	+    return SDValue();
	+
	+  SDValue Mask = Op0.getOperand(0);
	+  MVT MaskVT = Mask.getSimpleValueType();
	+
	+  // v8i1 / v16i1 are accepted with DQI, v32i1 / v64i1 with BWI.
	+  const bool OkWithDQI =
	+      Subtarget.hasDQI() && (MaskVT == MVT::v8i1 \|\| MaskVT == MVT::v16i1);
	+  const bool OkWithBWI =
	+      Subtarget.hasBWI() && (MaskVT == MVT::v32i1 \|\| MaskVT == MVT::v64i1);
	+  if (!OkWithDQI && !OkWithBWI)
	+    return SDValue();
	+
	+  // Only a comparison against zero is handled.
	+  if (!isNullConstant(Op1))
	+    return SDValue();
	+
	+  X86::CondCode X86CC = IsEq ? X86::COND_E : X86::COND_NE;
	+  SDValue KTEST = DAG.getNode(X86ISD::KTEST, dl, MVT::i32, Mask, Mask);
	+  return getSETCC(X86CC, KTEST, dl, DAG);
	+}
	+
	/// Lower a SETCC node. Vector results are forwarded to LowerVSETCC. Scalar
	/// (i8) results are tried, in order, against the BT pattern, the KTEST
	/// pattern, and reuse / inversion of an existing X86ISD::SETCC input; the
	/// general path emits a compare followed by X86 SETCC. Returns an empty
	/// SDValue when the condition cannot be translated to an X86 condition code.
	SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

	MVT VT = Op.getSimpleValueType();

	if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

	assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDLoc dl(Op);
	ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

	// Optimize to BT if possible.
	// Lower (X & (1 << N)) == 0 to BT(X, N).
	// Lower ((X >>u N) & 1) != 0 to BT(X, N).
	// Lower ((X >>s N) & 1) != 0 to BT(X, N).
	if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
	(CC == ISD::SETEQ \|\| CC == ISD::SETNE)) {
	if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
	return NewSetCC;
	}

	+ // Try to lower using KTEST.
	+ if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
	+ return NewSetCC;
	+
	// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
	// these.
	if ((isOneConstant(Op1) \|\| isNullConstant(Op1)) &&
	(CC == ISD::SETEQ \|\| CC == ISD::SETNE)) {

	// If the input is a setcc, then reuse the input setcc or use a new one with
	// the inverted condition.
	if (Op0.getOpcode() == X86ISD::SETCC) {
	X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
	// "X != 0" and "X == 1" keep the input's sense; "X == 0" and "X != 1" invert it.
	bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
	if (!Invert)
	return Op0;

	CCode = X86::GetOppositeBranchCondition(CCode);
	return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
	}
	}

	// General case: translate the condition, emit the compare, then SETCC on its
	// flags.
	bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
	X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
	if (X86CC == X86::COND_INVALID)
	return SDValue();

	SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
	EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
	return getSETCC(X86CC, EFLAGS, dl, DAG);
	}

	/// Lower SETCCCARRY (operands: LHS, RHS, incoming carry, condition code) to an
	/// X86 SBB whose flags result drives a SETCC.
	SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue Lhs = Op.getOperand(0);
	  SDValue Rhs = Op.getOperand(1);
	  SDValue CarryIn = Op.getOperand(2);
	  SDValue CondNode = Op.getOperand(3);

	  assert(Lhs.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
	  X86::CondCode X86Cond =
	      TranslateIntegerX86CC(cast<CondCodeSDNode>(CondNode)->get());

	  // Regenerate the carry flag from the carry value: adding all-ones carries
	  // out exactly when the value is non-zero.
	  EVT CarryVT = CarryIn.getValueType();
	  APInt AllOnes = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
	  SDVTList AddVTs = DAG.getVTList(CarryVT, MVT::i32);
	  SDValue AllOnesC = DAG.getConstant(AllOnes, DL, CarryVT);
	  SDValue CarryAdd = DAG.getNode(X86ISD::ADD, DL, AddVTs, CarryIn, AllOnesC);

	  // Subtract with borrow; only the flags result (value #1) is consumed.
	  SDVTList SbbVTs = DAG.getVTList(Lhs.getValueType(), MVT::i32);
	  SDValue Sbb =
	      DAG.getNode(X86ISD::SBB, DL, SbbVTs, Lhs, Rhs, CarryAdd.getValue(1));
	  return getSETCC(X86Cond, Sbb.getValue(1), DL, DAG);
	}

	/// Return true if \p Op refers to the flags-producing result of an X86
	/// comparison or flag-setting arithmetic/logic node.
	static bool isX86LogicalCmp(SDValue Op) {
	  switch (Op.getOpcode()) {
	  // Pure compare nodes qualify regardless of the result number.
	  case X86ISD::CMP:
	  case X86ISD::COMI:
	  case X86ISD::UCOMI:
	  case X86ISD::SAHF:
	    return true;

	  // For these nodes only result #1 qualifies.
	  case X86ISD::ADD:
	  case X86ISD::SUB:
	  case X86ISD::ADC:
	  case X86ISD::SBB:
	  case X86ISD::SMUL:
	  case X86ISD::INC:
	  case X86ISD::DEC:
	  case X86ISD::OR:
	  case X86ISD::XOR:
	  case X86ISD::AND:
	    return Op.getResNo() == 1;

	  // For UMUL only result #2 qualifies.
	  case X86ISD::UMUL:
	    return Op.getResNo() == 2;

	  default:
	    return false;
	  }
	}

	/// Return true if \p V is a TRUNCATE whose source is known to be zero in all
	/// of the high bits that the truncation discards.
	static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
	  if (V.getOpcode() == ISD::TRUNCATE) {
	    SDValue Src = V.getOperand(0);
	    unsigned SrcBits = Src.getValueSizeInBits();
	    unsigned DstBits = V.getValueSizeInBits();
	    // Mask covering exactly the bits removed by the truncate.
	    APInt DroppedBits = APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
	    return DAG.MaskedValueIsZero(Src, DroppedBits);
	  }
	  return false;
	}

	/// Lower a SELECT node (operands: condition, true value, false value).
	///
	/// Special cases are tried first: scalar FP selects on an FP SETCC (AVX-512
	/// FSETCCM + SELECTS, AVX vector select, or FAND / FANDN / FOR), scalar FP
	/// selects under AVX-512, i1-vector selects (split, performed on a scalar
	/// integer, or widened to v8i1), and several integer idioms built on
	/// SETCC_CARRY. Anything left is emitted as X86ISD::CMOV on a flag-producing
	/// node, adding a test of the condition against zero when needed.
	SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
	// Cleared once Cond is already a flag-producing node paired with CC.
	bool AddTest = true;
	SDValue Cond = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue Op2 = Op.getOperand(2);
	SDLoc DL(Op);
	MVT VT = Op1.getSimpleValueType();
	SDValue CC;

	// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
	// are available or VBLENDV if AVX is available.
	// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
	if (Cond.getOpcode() == ISD::SETCC &&
	((Subtarget.hasSSE2() && (VT == MVT::f32 \|\| VT == MVT::f64)) \|\|
	(Subtarget.hasSSE1() && VT == MVT::f32)) &&
	VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
	SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
	unsigned SSECC = translateX86FSETCC(
	cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

	if (Subtarget.hasAVX512()) {
	SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
	CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
	assert(!VT.isVector() && "Not a scalar type?");
	return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
	}

	if (SSECC < 8 \|\| Subtarget.hasAVX()) {
	SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
	DAG.getConstant(SSECC, DL, MVT::i8));

	// If we have AVX, we can use a variable vector select (VBLENDV) instead
	// of 3 logic instructions for size savings and potentially speed.
	// Unfortunately, there is no scalar form of VBLENDV.

	// If either operand is a constant, don't try this. We can expect to
	// optimize away at least one of the logic instructions later in that
	// case, so that sequence would be faster than a variable blend.

	// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
	// uses XMM0 as the selection register. That may need just as many
	// instructions as the AND/ANDN/OR sequence due to register moves, so
	// don't bother.

	if (Subtarget.hasAVX() &&
	!isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {

	// Convert to vectors, do a VSELECT, and convert back to scalar.
	// All of the conversions should be optimized away.

	MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
	SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
	SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
	SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

	MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
	VCmp = DAG.getBitcast(VCmpVT, VCmp);

	SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
	VSel, DAG.getIntPtrConstant(0, DL));
	}
	// Logic-op form: FANDN(Cmp, Op2) combined with FAND(Cmp, Op1) through FOR.
	SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
	SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
	return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
	}
	}

	// AVX512 fallback is to lower selects of scalar floats to masked moves.
	if ((VT == MVT::f64 \|\| VT == MVT::f32) && Subtarget.hasAVX512()) {
	SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
	return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
	}

	// For v64i1 without 64-bit support we need to split and rejoin.
	if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
	assert(Subtarget.hasBWI() && "Expected BWI to be legal");
	SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
	SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
	SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
	SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
	SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
	SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	}

	// For i1 vectors whose operands are constant build vectors or bitcasts,
	// perform the select on the scalar form and convert the result back.
	if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
	SDValue Op1Scalar;
	if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
	Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
	else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
	Op1Scalar = Op1.getOperand(0);
	SDValue Op2Scalar;
	if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
	Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
	else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
	Op2Scalar = Op2.getOperand(0);
	if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
	SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
	Op1Scalar, Op2Scalar);
	if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
	return DAG.getBitcast(VT, newSelect);
	// Sizes differ: go through v8i1 and extract the leading subvector.
	SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
	DAG.getIntPtrConstant(0, DL));
	}
	}

	// v2i1 / v4i1: widen both values into v8i1, select, then extract the result.
	if (VT == MVT::v4i1 \|\| VT == MVT::v2i1) {
	SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
	Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
	DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
	Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
	DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
	SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
	}

	if (Cond.getOpcode() == ISD::SETCC) {
	if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
	Cond = NewCond;
	// If the condition was updated, it's possible that the operands of the
	// select were also updated (for example, EmitTest has a RAUW). Refresh
	// the local references to the select operands in case they got stale.
	Op1 = Op.getOperand(1);
	Op2 = Op.getOperand(2);
	}
	}

	// (select (x == 0), -1, y) -> (sign_bit (x - 1)) \| y
	// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) \| y
	// (select (x != 0), y, -1) -> (sign_bit (x - 1)) \| y
	// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) \| y
	// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
	// (select (and (x , 0x1) == 0), y, (z \| y) ) -> (-(and (x , 0x1)) & z ) \| y
	if (Cond.getOpcode() == X86ISD::SETCC &&
	Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
	isNullConstant(Cond.getOperand(1).getOperand(1))) {
	SDValue Cmp = Cond.getOperand(1);
	unsigned CondCode =
	cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

	if ((isAllOnesConstant(Op1) \|\| isAllOnesConstant(Op2)) &&
	(CondCode == X86::COND_E \|\| CondCode == X86::COND_NE)) {
	// Y is whichever select operand is not taken as the all-ones constant.
	SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
	SDValue CmpOp0 = Cmp.getOperand(0);

	// Apply further optimizations for special cases
	// (select (x != 0), -1, 0) -> neg & sbb
	// (select (x == 0), 0, -1) -> neg & sbb
	if (isNullConstant(Y) &&
	(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
	SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
	SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
	SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
	SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	SDValue(Neg.getNode(), 1));
	return Res;
	}

	// Compare x against 1 and materialize the COND_B result via SETCC_CARRY.
	Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
	CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
	Cmp = ConvertCmpIfNecessary(Cmp, DAG);

	SDValue Res = // Res = 0 or -1.
	DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);

	if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
	Res = DAG.getNOT(DL, Res, Res.getValueType());

	if (!isNullConstant(Op2))
	Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
	return Res;
	} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
	Cmp.getOperand(0).getOpcode() == ISD::AND &&
	isOneConstant(Cmp.getOperand(0).getOperand(1))) {
	SDValue CmpOp0 = Cmp.getOperand(0);
	SDValue Src1, Src2;
	// true if Op2 is XOR or OR operator and one of its operands
	// is equal to Op1
	// ( a , a op b) \|\| ( b , a op b)
	auto isOrXorPattern = [&]() {
	if ((Op2.getOpcode() == ISD::XOR \|\| Op2.getOpcode() == ISD::OR) &&
	(Op2.getOperand(0) == Op1 \|\| Op2.getOperand(1) == Op1)) {
	Src1 =
	Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
	Src2 = Op1;
	return true;
	}
	return false;
	};

	if (isOrXorPattern()) {
	SDValue Neg;
	unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
	// we need mask of all zeros or ones with same size of the other
	// operands.
	if (CmpSz > VT.getSizeInBits())
	Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
	else if (CmpSz < VT.getSizeInBits())
	Neg = DAG.getNode(ISD::AND, DL, VT,
	DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
	DAG.getConstant(1, DL, VT));
	else
	Neg = CmpOp0;
	SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	Neg); // -(and (x, 0x1))
	SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
	return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
	}
	}
	}

	// Look past (and (setcc_carry (cmp ...)), 1).
	if (Cond.getOpcode() == ISD::AND &&
	Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
	isOneConstant(Cond.getOperand(1)))
	Cond = Cond.getOperand(0);

	// If condition flag is set by a X86ISD::CMP, then use it as the condition
	// setting operand in place of the X86ISD::SETCC.
	unsigned CondOpcode = Cond.getOpcode();
	if (CondOpcode == X86ISD::SETCC \|\|
	CondOpcode == X86ISD::SETCC_CARRY) {
	CC = Cond.getOperand(0);

	SDValue Cmp = Cond.getOperand(1);
	unsigned Opc = Cmp.getOpcode();
	// Note: this shadows the outer VT (the type of Op1) with the select's own
	// result type for the rest of this scope.
	MVT VT = Op.getSimpleValueType();

	bool IllegalFPCMov = false;
	if (VT.isFloatingPoint() && !VT.isVector() &&
	!isScalarFPTypeInSSEReg(VT)) // FPStack?
	IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

	if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) \|\|
	Opc == X86ISD::BT) { // FIXME
	Cond = Cmp;
	AddTest = false;
	}
	} else if (CondOpcode == ISD::USUBO \|\| CondOpcode == ISD::SSUBO \|\|
	CondOpcode == ISD::UADDO \|\| CondOpcode == ISD::SADDO \|\|
	((CondOpcode == ISD::UMULO \|\| CondOpcode == ISD::SMULO) &&
	Cond.getOperand(0).getValueType() != MVT::i8)) {
	// The condition is the overflow bit of an arithmetic-with-overflow node:
	// emit the X86 flag-setting form and select directly on its flags result.
	SDValue LHS = Cond.getOperand(0);
	SDValue RHS = Cond.getOperand(1);
	unsigned X86Opcode;
	unsigned X86Cond;
	SDVTList VTs;
	switch (CondOpcode) {
	case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
	case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
	case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
	case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
	case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
	case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
	default: llvm_unreachable("unexpected overflowing operator");
	}
	// UMUL is created with two value results, so its flags are result #2.
	if (CondOpcode == ISD::UMULO)
	VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
	MVT::i32);
	else
	VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

	SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

	if (CondOpcode == ISD::UMULO)
	Cond = X86Op.getValue(2);
	else
	Cond = X86Op.getValue(1);

	CC = DAG.getConstant(X86Cond, DL, MVT::i8);
	AddTest = false;
	}

	if (AddTest) {
	// Look past the truncate if the high bits are known zero.
	if (isTruncWithZeroHighBitsInput(Cond, DAG))
	Cond = Cond.getOperand(0);

	// We know the result of AND is compared against zero. Try to match
	// it to BT.
	if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
	if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
	CC = NewSetCC.getOperand(0);
	Cond = NewSetCC.getOperand(1);
	AddTest = false;
	}
	}
	}

	// No flag-producing node was found: test the condition value with COND_NE.
	if (AddTest) {
	CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
	Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
	}

	// a < b ? -1 : 0 -> RES = ~setcc_carry
	// a < b ? 0 : -1 -> RES = setcc_carry
	// a >= b ? -1 : 0 -> RES = setcc_carry
	// a >= b ? 0 : -1 -> RES = ~setcc_carry
	if (Cond.getOpcode() == X86ISD::SUB) {
	Cond = ConvertCmpIfNecessary(Cond, DAG);
	unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

	if ((CondCode == X86::COND_AE \|\| CondCode == X86::COND_B) &&
	(isAllOnesConstant(Op1) \|\| isAllOnesConstant(Op2)) &&
	(isNullConstant(Op1) \|\| isNullConstant(Op2))) {
	SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	Cond);
	if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
	return DAG.getNOT(DL, Res, Res.getValueType());
	return Res;
	}
	}

	// X86 doesn't have an i8 cmov. If both operands are the result of a truncate
	// widen the cmov and push the truncate through. This avoids introducing a new
	// branch during isel and doesn't add any extensions.
	if (Op.getValueType() == MVT::i8 &&
	Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
	SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
	if (T1.getValueType() == T2.getValueType() &&
	// Blacklist CopyFromReg to avoid partial register stalls.
	T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
	SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
	CC, Cond);
	return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
	}
	}

	// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
	// condition is true.
	SDValue Ops[] = { Op2, Op1, CC, Cond };
	return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
	}

	// Lower an extension whose input is a vector of i1 (mask) elements to the
	// result type of Op. The value is produced either by an X86ISD::VSEXT node or
	// by selecting, per mask lane, between the vectors built by getOnesVector and
	// getZeroVector. This routine is also the target of the i1-element cases in
	// LowerANY_EXTEND and LowerSIGN_EXTEND.
	//
	// The work may be carried out in a wider type than requested: elements are
	// promoted to i32 when BWI is unavailable, and the vector is widened to 512
	// bits when VLX is unavailable. The result is then truncated and/or extracted
	// back down to the requested type before being returned.
	static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = Op->getSimpleValueType(0);
	SDValue In = Op->getOperand(0);
	MVT InVT = In.getSimpleValueType();
	assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
	MVT VTElt = VT.getVectorElementType();
	SDLoc dl(Op);

	unsigned NumElts = VT.getVectorNumElements();

	// Promote the elements to i32 if the requested element type is 16 bits or
	// narrower and BWI is not supported.
	MVT ExtVT = VT;
	if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)
	ExtVT = MVT::getVectorVT(MVT::i32, NumElts);

	// Widen to 512-bits if VLX is not supported. The mask input is widened to the
	// same element count by inserting it at index 0 of an undef vector, so the
	// extra lanes are undefined.
	MVT WideVT = ExtVT;
	if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
	NumElts *= 512 / ExtVT.getSizeInBits();
	InVT = MVT::getVectorVT(MVT::i1, NumElts);
	In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
	In, DAG.getIntPtrConstant(0, dl));
	WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
	}

	// Use VSEXT when the feature matching the element width is present (DQI for
	// elements of 32 bits or more, BWI for 16 bits or fewer); otherwise fall back
	// to a select between the ones vector and the zero vector.
	SDValue V;
	MVT WideEltVT = WideVT.getVectorElementType();
	if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) \|\|
	(Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
	V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG);
	} else {
	SDValue NegOne = getOnesVector(WideVT, DAG, dl);
	SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
	V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
	}

	// Truncate if we had to extend i16/i8 above. NumElts still holds the
	// (possibly widened) element count, so WideVT keeps that count with the
	// originally requested element type.
	if (VT != ExtVT) {
	WideVT = MVT::getVectorVT(VTElt, NumElts);
	V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
	}

	// Extract back to 128/256-bit if we widened.
	if (WideVT != VT)
	V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
	DAG.getIntPtrConstant(0, dl));

	return V;
	}

	// Custom lowering for ANY_EXTEND of vectors.
	//
	// Inputs with i1 elements are delegated to LowerSIGN_EXTEND_Mask. Everything
	// else is offered to LowerAVXExtend when the subtarget reports hasFp256().
	// Returns an empty SDValue when no custom lowering was produced.
	static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  MVT SrcVT = Op->getOperand(0).getSimpleValueType();

	  // Mask vectors take the dedicated i1 path.
	  if (SrcVT.getVectorElementType() == MVT::i1)
	    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

	  // Without 256-bit FP support there is nothing to try.
	  if (!Subtarget.hasFp256())
	    return SDValue();

	  SDValue Lowered = LowerAVXExtend(Op, DAG, Subtarget);
	  if (Lowered)
	    return Lowered;
	  return SDValue();
	}

	// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
	// For sign extend this needs to handle all vector sizes and SSE4.1 and
	// non-SSE4.1 targets. For zero extend this should only handle inputs of
	// MVT::v64i8 when BWI is not supported, but AVX512 is.
	//
	// Input and result have the same total bit width; the result has wider
	// elements. Returns the lowered value, or an empty SDValue when the element
	// types, vector width or subtarget features fall outside the cases handled
	// below.
	static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDValue In = Op->getOperand(0);
	MVT VT = Op->getSimpleValueType(0);
	MVT InVT = In.getSimpleValueType();
	assert(VT.getSizeInBits() == InVT.getSizeInBits());

	MVT SVT = VT.getVectorElementType();
	MVT InSVT = InVT.getVectorElementType();
	assert(SVT.getSizeInBits() > InSVT.getSizeInBits());

	// Only i16/i32/i64 result elements and i8/i16/i32 source elements are
	// handled.
	if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
	return SDValue();
	if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
	return SDValue();
	// The result width must be backed by the matching feature: SSE2 for 128-bit,
	// Int256 for 256-bit, AVX512 for 512-bit.
	if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
	!(VT.is256BitVector() && Subtarget.hasInt256()) &&
	!(VT.is512BitVector() && Subtarget.hasAVX512()))
	return SDValue();

	SDLoc dl(Op);

	// For 256-bit vectors, we only need the lower (128-bit) half of the input.
	// For 512-bit vectors, we need 128-bits or 256-bits.
	if (VT.getSizeInBits() > 128) {
	// Input needs to be at least the same number of elements as output, and
	// at least 128-bits.
	int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
	In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
	}

	// Note: InVT still describes the original (un-narrowed) input type here.
	assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG \|\|
	InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");

	// SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
	// so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
	// need to be handled here for 256/512-bit results.
	if (Subtarget.hasInt256()) {
	assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
	unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
	X86ISD::VSEXT : X86ISD::VZEXT;
	return DAG.getNode(ExtOpc, dl, VT, In);
	}

	// We should only get here for sign extend.
	assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
	"Unexpected opcode!");

	// pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
	SDValue Curr = In;
	MVT CurrVT = InVT;

	// As SRAI is only available on i16/i32 types, we expand only up to i32
	// and handle i64 separately.
	// Each iteration unpacks undef with Curr and reinterprets the result with
	// elements twice as wide and half as many.
	// NOTE(review): this relies on UNPCKL placing Curr's elements in the high
	// half of each widened element so the arithmetic shift below can bring them
	// back down — confirm against the X86ISD::UNPCKL definition.
	while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
	Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
	MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
	CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
	Curr = DAG.getBitcast(CurrVT, Curr);
	}

	// If any widening happened, shift right arithmetically by the number of bits
	// that were added to each element.
	SDValue SignExt = Curr;
	if (CurrVT != InVT) {
	unsigned SignExtShift =
	CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
	SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
	DAG.getConstant(SignExtShift, dl, MVT::i8));
	}

	if (CurrVT == VT)
	return SignExt;

	// i64 results: build each 64-bit element from a pair of 32-bit elements by
	// interleaving SignExt with a second vector obtained by shifting Curr right
	// arithmetically by 31, then reinterpret the v4i32 shuffle as v2i64.
	if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
	SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
	DAG.getConstant(31, dl, MVT::i8));
	SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
	return DAG.getBitcast(VT, Ext);
	}

	return SDValue();
	}

	// Custom lowering for vector SIGN_EXTEND.
	//
	// i1-element inputs are delegated to LowerSIGN_EXTEND_Mask. Otherwise only a
	// fixed list of (result, input) type pairs is handled; any other pair yields
	// an empty SDValue. With Int256 a single X86ISD::VSEXT node is emitted.
	// Without it, the input is split into two halves, each half is extended with
	// a sign-extend-vector-inreg node, and the halves are concatenated.
	static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op->getSimpleValueType(0);
	  SDValue In = Op->getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  SDLoc dl(Op);

	  if (InVT.getVectorElementType() == MVT::i1)
	    return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);

	  // The (result, input) combinations this routine knows how to lower.
	  auto IsExtension = [&](MVT To, MVT From) {
	    return VT == To && InVT == From;
	  };
	  bool Handled = IsExtension(MVT::v4i64, MVT::v4i32) ||
	                 IsExtension(MVT::v8i32, MVT::v8i16) ||
	                 IsExtension(MVT::v16i16, MVT::v16i8) ||
	                 IsExtension(MVT::v8i64, MVT::v8i32) ||
	                 IsExtension(MVT::v8i64, MVT::v8i16) ||
	                 IsExtension(MVT::v16i32, MVT::v16i16) ||
	                 IsExtension(MVT::v16i32, MVT::v16i8) ||
	                 IsExtension(MVT::v32i16, MVT::v32i8);
	  if (!Handled)
	    return SDValue();

	  if (Subtarget.hasInt256())
	    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

	  // AVX without Int256, e.g. v8i16 -> v8i32 or v4i32 -> v4i64:
	  //   1. Move each half of the input into the low lanes of its own vector.
	  //      For v4i32 the masks are {0, 1, -1, -1} and {2, 3, -1, -1}.
	  //   2. Sign extend the low lanes of each (v4i32 -> v2i64, v8i16 -> v4i32).
	  //   3. Concatenate the two results back into the requested type.
	  unsigned NumElems = InVT.getVectorNumElements();
	  unsigned HalfElems = NumElems / 2;
	  SDValue Undef = DAG.getUNDEF(InVT);

	  SmallVector<int,8> LoMask(NumElems, -1);
	  SmallVector<int,8> HiMask(NumElems, -1);
	  for (unsigned Idx = 0; Idx != HalfElems; ++Idx) {
	    LoMask[Idx] = Idx;
	    HiMask[Idx] = Idx + HalfElems;
	  }

	  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, LoMask);
	  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, HiMask);

	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
	                                VT.getVectorNumElements() / 2);

	  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
	  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
	}

	// Lower a truncating store whose memory type is a vector of i1 (vXi1).
	//
	// Three strategies are used, depending on subtarget features and the number
	// of elements of the stored value:
	//  - truncate directly to the memory type and store it (padding to v8i1 when
	//    the memory type is narrower than 8 bits);
	//  - for at most 8 elements, pad to 8 elements, truncate to v8i1, bitcast to
	//    i8 and store that;
	//  - otherwise (asserted to be v32i8) store the two 16-element halves
	//    separately and join their chains with a TokenFactor.
	// Returns the store node, or the TokenFactor of both stores in the split case.
	static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
	SDLoc dl(St);
	EVT MemVT = St->getMemoryVT();
	assert(St->isTruncatingStore() && "We only custom truncating store.");
	assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
	"Expected truncstore of i1 vector");

	SDValue Op = St->getValue();
	MVT OpVT = Op.getValueType().getSimpleVT();
	unsigned NumElts = OpVT.getVectorNumElements();
	if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) \|\|
	NumElts == 16) {
	// Truncate and store - everything is legal
	Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
	// Mask vectors narrower than 8 bits are placed in the low lanes of a v8i1
	// whose remaining lanes are undef.
	if (MemVT.getSizeInBits() < 8)
	Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
	DAG.getUNDEF(MVT::v8i1), Op,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
	St->getMemOperand());
	}

	// A subset, assume that we have only AVX-512F
	if (NumElts <= 8) {
	if (NumElts < 8) {
	// Extend to 8-elts vector
	MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
	Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
	DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
	}
	// Truncate to an 8-bit mask and store it as a plain i8.
	Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
	Op = DAG.getBitcast(MVT::i8, Op);
	return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
	St->getMemOperand());
	}
	// v32i8
	assert(OpVT == MVT::v32i8 && "Unexpected operand type");
	// Divide the vector into 2 parts and store each part separately
	SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
	DAG.getIntPtrConstant(0, dl));
	Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
	SDValue BasePtr = St->getBasePtr();
	SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
	St->getMemOperand());
	SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
	DAG.getIntPtrConstant(16, dl));
	Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);

	// The high v16i1 half goes 2 bytes past the base; its pointer info and
	// alignment are adjusted for that offset.
	SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);

	SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
	BasePtrHi, St->getPointerInfo().getWithOffset(2),
	MinAlign(St->getAlignment(), 2U),
	St->getMemOperand()->getFlags());
	return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
	}

	// Lower an extending load whose memory type is a vector of i1.
	//
	// The extension applied to the loaded mask is ISD::ZERO_EXTEND for a
	// ZEXTLOAD and ISD::SIGN_EXTEND for every other extension type. Depending on
	// subtarget features and element count the mask is loaded directly as a vXi1
	// vector, loaded as an i8 and bitcast to v8i1, or (asserted v32i8 result)
	// loaded as two v16i1 halves that are extended and concatenated.
	//
	// In every path the users of the original load's chain (value #1) are
	// redirected to the chain of the replacement load(s); the returned value is
	// the extended vector.
	static SDValue LowerExtended1BitVectorLoad(SDValue Op,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {

	LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
	SDLoc dl(Ld);
	EVT MemVT = Ld->getMemoryVT();
	assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
	"Expected i1 vector load");
	unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
	ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
	MVT VT = Op.getValueType().getSimpleVT();
	unsigned NumElts = VT.getVectorNumElements();

	if ((Subtarget.hasBWI() && NumElts >= 32) \|\|
	(Subtarget.hasDQI() && NumElts < 16) \|\|
	NumElts == 16) {
	// Load and extend - everything is legal
	if (NumElts < 8) {
	// Fewer than 8 elements: load a full v8i1 and use only its low lanes.
	SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());
	// Replace chain users with the new chain.
	assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
	if (Subtarget.hasVLX()) {
	// Extract to v4i1/v2i1.
	SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Load,
	DAG.getIntPtrConstant(0, dl));
	// Finally, do a normal extend (per ExtOpcode) to the desired register.
	return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
	}

	// Without VLX: extend all 8 lanes first, then take the low subvector.
	MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
	SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);

	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	DAG.getIntPtrConstant(0, dl));
	}
	SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());
	// Replace chain users with the new chain.
	assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	// Finally, do a normal extend (per ExtOpcode) to the desired register.
	return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
	}

	if (NumElts <= 8) {
	// A subset, assume that we have only AVX-512F
	// Load the mask as a scalar i8 and reinterpret it as v8i1.
	SDValue Load = DAG.getLoad(MVT::i8, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());
	// Replace chain users with the new chain.
	assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	SDValue BitVec = DAG.getBitcast(MVT::v8i1, Load);

	if (NumElts == 8)
	return DAG.getNode(ExtOpcode, dl, VT, BitVec);

	if (Subtarget.hasVLX()) {
	// Extract to v4i1/v2i1.
	SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, BitVec,
	DAG.getIntPtrConstant(0, dl));
	// Finally, do a normal extend (per ExtOpcode) to the desired register.
	return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Extract);
	}

	// Without VLX: extend all 8 lanes first, then take the low subvector.
	MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
	SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	DAG.getIntPtrConstant(0, dl));
	}

	assert(VT == MVT::v32i8 && "Unexpected extload type");

	// Split into two v16i1 loads. The second one reads 2 bytes past the base and
	// has its pointer info and alignment adjusted for that offset.
	SDValue BasePtr = Ld->getBasePtr();
	SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());

	SDValue BasePtrHi = DAG.getMemBasePlusOffset(BasePtr, 2, dl);

	SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), BasePtrHi,
	Ld->getPointerInfo().getWithOffset(2),
	MinAlign(Ld->getAlignment(), 2U),
	Ld->getMemOperand()->getFlags());

	// Chain users must now depend on both replacement loads.
	SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	LoadLo.getValue(1), LoadHi.getValue(1));
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);

	SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
	SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
	return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
	}

	// Lower vector extended loads using a shuffle. If SSSE3 is not available we
	// may emit an illegal shuffle but the expansion is still better than scalar
	// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
	// we'll emit a shuffle and an arithmetic shift.
	// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
	// TODO: It is possible to support ZExt by zeroing the undef values during
	// the shuffle phase or after the shuffle.
	//
	// Only EXTLOAD and SEXTLOAD of integer vectors are handled (asserted). Memory
	// types with i1 elements are handed off to LowerExtended1BitVectorLoad.
	// In every path the users of the original load's chain (value #1) are
	// redirected to the chain(s) of the replacement load(s); the returned value is
	// the extended vector that replaces value #0.
	static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  MVT RegVT = Op.getSimpleValueType();
	  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
	  assert(RegVT.isInteger() &&
	         "We only custom lower integer vector sext loads.");

	  // Nothing useful we can do without SSE2 shuffles.
	  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");

	  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
	  SDLoc dl(Ld);
	  EVT MemVT = Ld->getMemoryVT();
	  if (MemVT.getScalarType() == MVT::i1)
	    return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  unsigned RegSz = RegVT.getSizeInBits();

	  ISD::LoadExtType Ext = Ld->getExtensionType();

	  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
	         && "Only anyext and sext are currently implemented.");
	  assert(MemVT != RegVT && "Cannot extend to the same type");
	  assert(MemVT.isVector() && "Must load a vector from memory");

	  unsigned NumElems = RegVT.getVectorNumElements();
	  unsigned MemSz = MemVT.getSizeInBits();
	  assert(RegSz > MemSz && "Register size must be greater than the mem size");

	  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
	    // The only way in which we have a legal 256-bit vector result but not the
	    // integer 256-bit operations needed to directly lower a sextload is if we
	    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
	    // a 128-bit vector and a normal sign_extend to 256-bits that should get
	    // correctly legalized. We do this late to allow the canonical form of
	    // sextload to persist throughout the rest of the DAG combiner -- it wants
	    // to fold together any extensions it can, and so will fuse a sign_extend
	    // of an sextload into a sextload targeting a wider value.
	    SDValue Load;
	    if (MemSz == 128) {
	      // Just switch this to a normal load.
	      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
	                                       "it must be a legal 128-bit vector "
	                                       "type!");
	      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
	                         Ld->getPointerInfo(), Ld->getAlignment(),
	                         Ld->getMemOperand()->getFlags());
	    } else {
	      assert(MemSz < 128 &&
	             "Can't extend a type wider than 128 bits to a 256 bit vector!");
	      // Do an sext load to a 128-bit vector type. We want to use the same
	      // number of elements, but elements half as wide. This will end up being
	      // recursively lowered by this routine, but will succeed as we definitely
	      // have all the necessary features if we're using AVX1.
	      EVT HalfEltVT =
	          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
	      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
	      Load =
	          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
	                         Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
	                         Ld->getMemOperand()->getFlags());
	    }

	    // Replace chain users with the new chain.
	    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	    // Finally, do a normal sign-extend to the desired register.
	    return DAG.getSExtOrTrunc(Load, dl, RegVT);
	  }

	  // All sizes must be a power of two.
	  assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
	         "Non-power-of-two elements are not custom lowered!");

	  // Attempt to load the original value using scalar loads.
	  // Find the largest scalar type that divides the total loaded size.
	  MVT SclrLoadTy = MVT::i8;
	  for (MVT Tp : MVT::integer_valuetypes()) {
	    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
	      SclrLoadTy = Tp;
	    }
	  }

	  // On 32bit systems, we can't load 64bit integers. Try bitcasting to F64.
	  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
	      (64 <= MemSz))
	    SclrLoadTy = MVT::f64;

	  // Calculate the number of scalar loads that we need to perform
	  // in order to load our vector from memory.
	  unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();

	  assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
	         "Can only lower sext loads with a single scalar load!");

	  // Size in bits of the intermediate vector the scalar loads are inserted
	  // into. Sign-extending loads to 256 bits or more go through a 128-bit
	  // vector.
	  unsigned LoadRegSize = RegSz;
	  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
	    LoadRegSize = 128;

	  // If we don't have BWI we won't be able to create the shuffle needed for
	  // v8i8->v8i64.
	  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
	      MemVT == MVT::v8i8)
	    LoadRegSize = 128;

	  // Represent our vector as a sequence of elements which are the
	  // largest scalar that we can load.
	  EVT LoadUnitVecVT = EVT::getVectorVT(
	      *DAG.getContext(), SclrLoadTy, LoadRegSize / SclrLoadTy.getSizeInBits());

	  // Represent the data using the same element type that is stored in
	  // memory. In practice, we ''widen'' MemVT.
	  EVT WideVecVT =
	      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	                       LoadRegSize / MemVT.getScalarSizeInBits());

	  assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
	         "Invalid vector type");

	  // We can't shuffle using an illegal type.
	  assert(TLI.isTypeLegal(WideVecVT) &&
	         "We only lower types that form legal widened vector types");

	  SmallVector<SDValue, 8> Chains;
	  SDValue Ptr = Ld->getBasePtr();
	  unsigned OffsetInc = SclrLoadTy.getSizeInBits() / 8;
	  SDValue Increment = DAG.getConstant(OffsetInc, dl,
	                                      TLI.getPointerTy(DAG.getDataLayout()));
	  SDValue Res = DAG.getUNDEF(LoadUnitVecVT);

	  // Byte offset of the current scalar load from the original base pointer.
	  unsigned Offset = 0;
	  for (unsigned i = 0; i < NumLoads; ++i) {
	    // Each scalar load reads from Ptr = base + Offset, so its pointer info
	    // and alignment must describe that address rather than the base. Reusing
	    // the base values would over-state the alignment and mis-describe the
	    // accessed location for every load after the first.
	    unsigned NewAlign = MinAlign(Ld->getAlignment(), Offset);

	    // Perform a single load.
	    SDValue ScalarLoad =
	        DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr,
	                    Ld->getPointerInfo().getWithOffset(Offset),
	                    NewAlign, Ld->getMemOperand()->getFlags());
	    Chains.push_back(ScalarLoad.getValue(1));
	    // Create the first element type using SCALAR_TO_VECTOR in order to avoid
	    // another round of DAGCombining.
	    if (i == 0)
	      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
	    else
	      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
	                        ScalarLoad, DAG.getIntPtrConstant(i, dl));

	    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
	    Offset += OffsetInc;
	  }

	  // Join the chains of all scalar loads; this replaces the original chain.
	  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

	  // Bitcast the loaded value to a vector of the original element type, in
	  // the size of the target vector type.
	  SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
	  unsigned SizeRatio = RegSz / MemSz;

	  if (Ext == ISD::SEXTLOAD) {
	    // If we have SSE4.1, we can directly emit a VSEXT node.
	    if (Subtarget.hasSSE41()) {
	      SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
	      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	      return Sext;
	    }

	    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
	    // lanes.
	    assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
	           "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");

	    SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	    return Shuff;
	  }

	  // v8i8 -> v8i64 any-extend without BWI: use VZEXT instead of a shuffle (see
	  // the LoadRegSize adjustment above).
	  if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
	      MemVT == MVT::v8i8) {
	    SDValue ZExt = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	    return ZExt;
	  }

	  // Redistribute the loaded elements into the different locations: element i
	  // moves to lane i * SizeRatio, all other lanes stay undef (-1).
	  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	  for (unsigned i = 0; i != NumElems; ++i)
	    ShuffleVec[i * SizeRatio] = i;

	  SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
	                                       DAG.getUNDEF(WideVecVT), ShuffleVec);

	  // Bitcast to the requested type.
	  Shuff = DAG.getBitcast(RegVT, Shuff);
	  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	  return Shuff;
	}

	/// Check whether \p Op is an ISD::AND or ISD::OR whose two operands are both
	/// X86ISD::SETCC nodes with exactly one use each.
	///
	/// \p Opc is always overwritten with the opcode of \p Op, even when the
	/// function returns false.
	static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
	  Opc = Op.getOpcode();
	  switch (Opc) {
	  case ISD::AND:
	  case ISD::OR:
	    break;
	  default:
	    return false;
	  }

	  auto IsSingleUseSetCC = [](SDValue V) {
	    return V.getOpcode() == X86ISD::SETCC && V.hasOneUse();
	  };
	  return IsSingleUseSetCC(Op.getOperand(0)) &&
	         IsSingleUseSetCC(Op.getOperand(1));
	}

	/// Check whether \p Op is an ISD::XOR whose second operand is the constant 1
	/// and whose first operand is an X86ISD::SETCC node with a single use.
	static bool isXor1OfSetCC(SDValue Op) {
	  if (Op.getOpcode() != ISD::XOR || !isOneConstant(Op.getOperand(1)))
	    return false;

	  SDValue SetCC = Op.getOperand(0);
	  return SetCC.getOpcode() == X86ISD::SETCC && SetCC.hasOneUse();
	}

	/// Lower an ISD::BRCOND node (operands: chain, condition, destination) to an
	/// X86ISD::BRCOND node carrying an explicit X86 condition code and the value
	/// that produces the flags.
	///
	/// The condition is matched against several shapes so that an existing
	/// flag-producing node can be reused instead of emitting a fresh test:
	/// arithmetic-with-overflow nodes, X86ISD::SETCC / SETCC_CARRY, AND/OR of two
	/// SETCCs on the same compare, XOR of a SETCC with 1, and floating point
	/// SETOEQ / SETUNE compares (which are split into two conditional branches).
	/// If nothing matches, the condition is tested against zero with EmitTest.
	///
	/// Some paths emit an additional X86ISD::BRCOND into the chain and may rewrite
	/// the operands of a following ISD::BR user to swap the branch targets.
	SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
	  bool addTest = true;
	  SDValue Chain = Op.getOperand(0);
	  SDValue Cond = Op.getOperand(1);
	  SDValue Dest = Op.getOperand(2);
	  SDLoc dl(Op);
	  SDValue CC;
	  bool Inverted = false;

	  if (Cond.getOpcode() == ISD::SETCC) {
	    // Check for setcc([su]{add,sub,mul}o == 0).
	    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
	        isNullConstant(Cond.getOperand(1)) &&
	        Cond.getOperand(0).getResNo() == 1 &&
	        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
	         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
	         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
	         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
	         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
	         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
	      // Branching on "overflow flag == 0": remember to invert the condition
	      // and look directly at the overflow-producing node.
	      Inverted = true;
	      Cond = Cond.getOperand(0);
	    } else {
	      if (SDValue NewCond = LowerSETCC(Cond, DAG))
	        Cond = NewCond;
	    }
	  }
	#if 0
	  // FIXME: LowerXALUO doesn't handle these!!
	  else if (Cond.getOpcode() == X86ISD::ADD  ||
	           Cond.getOpcode() == X86ISD::SUB  ||
	           Cond.getOpcode() == X86ISD::SMUL ||
	           Cond.getOpcode() == X86ISD::UMUL)
	    Cond = LowerXALUO(Cond, DAG);
	#endif

	  // Look past (and (setcc_carry (cmp ...)), 1).
	  if (Cond.getOpcode() == ISD::AND &&
	      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
	      isOneConstant(Cond.getOperand(1)))
	    Cond = Cond.getOperand(0);

	  // If condition flag is set by a X86ISD::CMP, then use it as the condition
	  // setting operand in place of the X86ISD::SETCC.
	  unsigned CondOpcode = Cond.getOpcode();
	  if (CondOpcode == X86ISD::SETCC ||
	      CondOpcode == X86ISD::SETCC_CARRY) {
	    CC = Cond.getOperand(0);

	    SDValue Cmp = Cond.getOperand(1);
	    unsigned Opc = Cmp.getOpcode();
	    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
	    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
	      Cond = Cmp;
	      addTest = false;
	    } else {
	      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
	      default: break;
	      case X86::COND_O:
	      case X86::COND_B:
	        // These can only come from an arithmetic instruction with overflow,
	        // e.g. SADDO, UADDO.
	        Cond = Cond.getOperand(1);
	        addTest = false;
	        break;
	      }
	    }
	  }
	  CondOpcode = Cond.getOpcode();
	  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
	      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
	      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
	       Cond.getOperand(0).getValueType() != MVT::i8)) {
	    SDValue LHS = Cond.getOperand(0);
	    SDValue RHS = Cond.getOperand(1);
	    unsigned X86Opcode;
	    unsigned X86Cond;
	    SDVTList VTs;
	    // Keep this in sync with LowerXALUO, otherwise we might create redundant
	    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
	    // X86ISD::INC).
	    switch (CondOpcode) {
	    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
	    case ISD::SADDO:
	      if (isOneConstant(RHS)) {
	        X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
	        break;
	      }
	      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
	    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
	    case ISD::SSUBO:
	      if (isOneConstant(RHS)) {
	        X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
	        break;
	      }
	      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
	    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
	    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
	    default: llvm_unreachable("unexpected overflowing operator");
	    }
	    if (Inverted)
	      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
	    // UMUL produces two values of the operand type before the i32 flags
	    // value; every other opcode here produces one.
	    if (CondOpcode == ISD::UMULO)
	      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
	                          MVT::i32);
	    else
	      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

	    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);

	    // The flags are the last result of the node built above.
	    if (CondOpcode == ISD::UMULO)
	      Cond = X86Op.getValue(2);
	    else
	      Cond = X86Op.getValue(1);

	    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
	    addTest = false;
	  } else {
	    unsigned CondOpc;
	    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
	      SDValue Cmp = Cond.getOperand(0).getOperand(1);
	      if (CondOpc == ISD::OR) {
	        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
	        // two branches instead of an explicit OR instruction with a
	        // separate test.
	        if (Cmp == Cond.getOperand(1).getOperand(1) &&
	            isX86LogicalCmp(Cmp)) {
	          CC = Cond.getOperand(0).getOperand(0);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = Cond.getOperand(1).getOperand(0);
	          Cond = Cmp;
	          addTest = false;
	        }
	      } else { // ISD::AND
	        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
	        // two branches instead of an explicit AND instruction with a
	        // separate test. However, we only do this if this block doesn't
	        // have a fall-through edge, because this requires an explicit
	        // jmp when the condition is false.
	        if (Cmp == Cond.getOperand(1).getOperand(1) &&
	            isX86LogicalCmp(Cmp) &&
	            Op.getNode()->hasOneUse()) {
	          X86::CondCode CCode =
	            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
	          CCode = X86::GetOppositeBranchCondition(CCode);
	          CC = DAG.getConstant(CCode, dl, MVT::i8);
	          // The single user of this branch; dereference the use iterator to
	          // obtain the node itself.
	          SDNode *User = *Op.getNode()->use_begin();
	          // Look for an unconditional branch following this conditional branch.
	          // We need this because we need to reverse the successors in order
	          // to implement FCMP_OEQ.
	          if (User->getOpcode() == ISD::BR) {
	            SDValue FalseBB = User->getOperand(1);
	            SDNode *NewBR =
	              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	            assert(NewBR == User);
	            (void)NewBR;
	            Dest = FalseBB;

	            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                                Chain, Dest, CC, Cmp);
	            CCode =
	              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
	            CCode = X86::GetOppositeBranchCondition(CCode);
	            CC = DAG.getConstant(CCode, dl, MVT::i8);
	            Cond = Cmp;
	            addTest = false;
	          }
	        }
	      }
	    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
	      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
	      // It should be transformed during dag combiner except when the condition
	      // is set by a arithmetics with overflow node.
	      X86::CondCode CCode =
	        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
	      CCode = X86::GetOppositeBranchCondition(CCode);
	      CC = DAG.getConstant(CCode, dl, MVT::i8);
	      Cond = Cond.getOperand(0).getOperand(1);
	      addTest = false;
	    } else if (Cond.getOpcode() == ISD::SETCC &&
	               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
	      // For FCMP_OEQ, we can emit
	      // two branches instead of an explicit AND instruction with a
	      // separate test. However, we only do this if this block doesn't
	      // have a fall-through edge, because this requires an explicit
	      // jmp when the condition is false.
	      if (Op.getNode()->hasOneUse()) {
	        SDNode *User = *Op.getNode()->use_begin();
	        // Look for an unconditional branch following this conditional branch.
	        // We need this because we need to reverse the successors in order
	        // to implement FCMP_OEQ.
	        if (User->getOpcode() == ISD::BR) {
	          SDValue FalseBB = User->getOperand(1);
	          SDNode *NewBR =
	            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	          assert(NewBR == User);
	          (void)NewBR;
	          Dest = FalseBB;

	          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
	                                    Cond.getOperand(0), Cond.getOperand(1));
	          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
	          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
	          Cond = Cmp;
	          addTest = false;
	        }
	      }
	    } else if (Cond.getOpcode() == ISD::SETCC &&
	               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
	      // For FCMP_UNE, we can emit
	      // two branches instead of an explicit OR instruction with a
	      // separate test. However, we only do this if this block doesn't
	      // have a fall-through edge, because this requires an explicit
	      // jmp when the condition is false.
	      if (Op.getNode()->hasOneUse()) {
	        SDNode *User = *Op.getNode()->use_begin();
	        // Look for an unconditional branch following this conditional branch.
	        // We need this because we need to reverse the successors in order
	        // to implement FCMP_UNE.
	        if (User->getOpcode() == ISD::BR) {
	          SDValue FalseBB = User->getOperand(1);
	          SDNode *NewBR =
	            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	          assert(NewBR == User);
	          (void)NewBR;

	          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
	                                    Cond.getOperand(0), Cond.getOperand(1));
	          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
	          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
	          // The first branch still targets the original destination; only the
	          // final branch (built at the end) goes to the old BR target.
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
	          Cond = Cmp;
	          addTest = false;
	          Dest = FalseBB;
	        }
	      }
	    }
	  }

	  if (addTest) {
	    // Look past the truncate if the high bits are known zero.
	    if (isTruncWithZeroHighBitsInput(Cond, DAG))
	      Cond = Cond.getOperand(0);

	    // We know the result of AND is compared against zero. Try to match
	    // it to BT.
	    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
	      if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
	        CC = NewSetCC.getOperand(0);
	        Cond = NewSetCC.getOperand(1);
	        addTest = false;
	      }
	    }
	  }

	  if (addTest) {
	    // Nothing matched: test the condition value against zero, honouring the
	    // inversion recorded for the "overflow == 0" form.
	    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
	    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
	    Cond = EmitTest(Cond, X86Cond, dl, DAG);
	  }
	  Cond = ConvertCmpIfNecessary(Cond, DAG);
	  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                     Chain, Dest, CC, Cond);
	}

	// Lower ISD::DYNAMIC_STACKALLOC.
	//
	// Operand 0 of Op is the chain, operand 1 the allocation size and operand 2
	// the requested alignment (a constant). The result is the merged pair
	// {pointer to the allocated space, output chain}.
	//
	// One of three strategies is chosen:
	//  - !Lower:     adjust the stack pointer register directly (SUB, plus an
	//                AND when the requested alignment exceeds the stack
	//                alignment).
	//  - SplitStack: emit an X86ISD::SEG_ALLOCA node.
	//  - otherwise:  emit an X86ISD::WIN_ALLOCA node and read the stack pointer
	//                back afterwards.
	//
	// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
	// Calls to _alloca are needed to probe the stack when allocating more than 4k
	// bytes in one go. Touching the stack at 4K increments is necessary to ensure
	// that the guard pages used by the OS virtual memory manager are allocated in
	// correct sequence.
	SDValue
	X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
	SelectionDAG &DAG) const {
	MachineFunction &MF = DAG.getMachineFunction();
	bool SplitStack = MF.shouldSplitStack();
	bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
	// Special lowering is required on Windows (non-MachO) targets, with split
	// stacks, or whenever a stack probe symbol name is configured.
	bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) \|\|
	SplitStack \|\| EmitStackProbe;
	SDLoc dl(Op);

	// Get the inputs.
	SDNode *Node = Op.getNode();
	SDValue Chain = Op.getOperand(0);
	SDValue Size = Op.getOperand(1);
	unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	EVT VT = Node->getValueType(0);

	// Chain the dynamic stack allocation so that it doesn't modify the stack
	// pointer when other instructions are using the stack.
	Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

	bool Is64Bit = Subtarget.is64Bit();
	MVT SPTy = getPointerTy(DAG.getDataLayout());

	SDValue Result;
	if (!Lower) {
	// Default path: compute SP - Size, realign the result if the request is
	// stricter than the stack alignment, and write it back to SP.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
	assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
	" not tell us which reg is the stack pointer!");

	SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
	Chain = SP.getValue(1);
	const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	unsigned StackAlign = TFI.getStackAlignment();
	Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
	if (Align > StackAlign)
	Result = DAG.getNode(ISD::AND, dl, VT, Result,
	DAG.getConstant(-(uint64_t)Align, dl, VT));
	Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
	} else if (SplitStack) {
	// Segmented-stack path: the size is copied into a fresh virtual register
	// that is handed to the SEG_ALLOCA node.
	MachineRegisterInfo &MRI = MF.getRegInfo();

	if (Is64Bit) {
	// The 64 bit implementation of segmented stacks needs to clobber both r10
	// r11. This makes it impossible to use it along with nested parameters.
	const Function &F = MF.getFunction();
	for (const auto &A : F.args()) {
	if (A.hasNestAttr())
	report_fatal_error("Cannot use segmented stacks with functions that "
	"have nested arguments.");
	}
	}

	const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
	Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
	Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
	DAG.getRegister(Vreg, SPTy));
	} else {
	// WIN_ALLOCA path: emit the allocation node, record its presence on the
	// function info, then read the stack pointer back as the result.
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
	MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	unsigned SPReg = RegInfo->getStackRegister();
	SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
	Chain = SP.getValue(1);

	// A non-zero alignment request: mask the stack pointer down to that
	// alignment and write the adjusted value back to the register.
	if (Align) {
	SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
	DAG.getConstant(-(uint64_t)Align, dl, VT));
	Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
	}

	Result = SP;
	}

	// Close the call sequence opened above.
	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
	DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

	// Return both the allocated address and the updated chain.
	SDValue Ops[2] = {Result, Chain};
	return DAG.getMergeValues(Ops, dl);
	}

	// Lower ISD::VASTART.
	//
	// Operand 0 of Op is the chain, operand 1 the address of the va_list object
	// and operand 2 the SrcValue describing that memory.
	//
	// On non-64-bit targets and for the Win64 calling convention a single store
	// of the VarArgsFrameIndex address is emitted. Otherwise the four fields of
	// the __va_list_tag structure are stored individually and the stores are
	// joined with a TokenFactor.
	SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
	MachineFunction &MF = DAG.getMachineFunction();
	auto PtrVT = getPointerTy(MF.getDataLayout());
	X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	SDLoc DL(Op);

	if (!Subtarget.is64Bit() \|\|
	Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
	// vastart just stores the address of the VarArgsFrameIndex slot into the
	// memory location argument.
	SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
	MachinePointerInfo(SV));
	}

	// __va_list_tag:
	// gp_offset (0 - 6 * 8)
	// fp_offset (48 - 48 + 8 * 16)
	// overflow_arg_area (point to parameters coming in memory).
	// reg_save_area
	//
	// FIN walks through the structure; every store below uses the incoming
	// chain (operand 0), so the stores are independent of one another.
	SmallVector<SDValue, 8> MemOps;
	SDValue FIN = Op.getOperand(1);
	// Store gp_offset
	// (i32 at offset 0 of the va_list)
	SDValue Store = DAG.getStore(
	Op.getOperand(0), DL,
	DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
	MachinePointerInfo(SV));
	MemOps.push_back(Store);

	// Store fp_offset
	// (i32 at offset 4)
	FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
	Store = DAG.getStore(
	Op.getOperand(0), DL,
	DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
	MachinePointerInfo(SV, 4));
	MemOps.push_back(Store);

	// Store ptr to overflow_arg_area
	// (pointer at offset 8; its value is the VarArgsFrameIndex address)
	FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
	SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	Store =
	DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
	MemOps.push_back(Store);

	// Store ptr to reg_save_area.
	// The previous field is 8 bytes wide when isTarget64BitLP64() holds and
	// 4 bytes wide otherwise, which puts this field at offset 16 or 12.
	FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
	Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
	SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
	Store = DAG.getStore(
	Op.getOperand(0), DL, RSFIN, FIN,
	MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
	MemOps.push_back(Store);
	// Merge the four independent stores into a single output chain.
	return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
	}

	// Lower ISD::VAARG on 64-bit targets.
	//
	// Op carries four operands: the chain, the va_list pointer, the SrcValue
	// describing the va_list memory and the alignment constant. Functions using
	// the Win64 calling convention take the generic expansion. Everything else
	// goes through an X86ISD::VAARG_64 node, which yields the address of the
	// next argument; the argument itself is then loaded from that address.
	SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.is64Bit() &&
	         "LowerVAARG only handles 64-bit va_arg!");
	  assert(Op.getNumOperands() == 4);

	  MachineFunction &MF = DAG.getMachineFunction();
	  if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
	    // The Win64 ABI uses char* instead of a structure.
	    return DAG.expandVAArg(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue SrcPtr = Op.getOperand(1);
	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  unsigned Align = Op.getConstantOperandVal(3);
	  SDLoc dl(Op);

	  // Size of the requested argument as laid out in memory. getTypeForEVT
	  // yields a pointer to the IR type and takes the LLVMContext by reference.
	  EVT ArgVT = Op.getNode()->getValueType(0);
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
	  uint8_t ArgMode;

	  // Decide which area this value should be read from.
	  // TODO: Implement the AMD64 ABI in its entirety. This simple
	  // selection mechanism works only for the basic types.
	  if (ArgVT == MVT::f80) {
	    llvm_unreachable("va_arg for f80 not yet implemented");
	  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
	    ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
	  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
	    ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
	  } else {
	    llvm_unreachable("Unhandled argument type in LowerVAARG");
	  }

	  if (ArgMode == 2) {
	    // Sanity Check: Make sure using fp_offset makes sense.
	    assert(!Subtarget.useSoftFloat() &&
	           !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
	           Subtarget.hasSSE1());
	  }

	  // Insert VAARG_64 node into the DAG
	  // VAARG_64 returns two values: Variable Argument Address, Chain
	  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
	                       DAG.getConstant(ArgMode, dl, MVT::i8),
	                       DAG.getConstant(Align, dl, MVT::i32)};
	  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
	  // The node both reads and updates the va_list, hence the load|store flags.
	  SDValue VAARG = DAG.getMemIntrinsicNode(
	      X86ISD::VAARG_64, dl,
	      VTs, InstOps, MVT::i64,
	      MachinePointerInfo(SV),
	      /*Align=*/0,
	      MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
	  Chain = VAARG.getValue(1);

	  // Load the next argument and return it
	  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
	}

	/// Lower ISD::VACOPY on 64-bit targets.
	///
	/// Operands of \p Op: chain, destination va_list pointer, source va_list
	/// pointer, destination SrcValue, source SrcValue. For the Win64 calling
	/// convention the generic expansion is used; otherwise the va_list structure
	/// is copied with a single memcpy node.
	static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
	  // where a va_list is still an i8*.
	  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
	  if (Subtarget.isCallingConvWin64(
	          DAG.getMachineFunction().getFunction().getCallingConv()))
	    // Probably a Win64 va_copy.
	    return DAG.expandVACopy(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue DstPtr = Op.getOperand(1);
	  SDValue SrcPtr = Op.getOperand(2);
	  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
	  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  SDLoc DL(Op);

	  // Copy the whole structure described above: 4 + 4 + 8 + 8 = 24 bytes,
	  // passed together with an alignment of 8.
	  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
	                       DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
	                       false, false,
	                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
	}

	/// Handle vector element shifts where the shift amount is a constant.
	/// Takes immediate version of shift as input.
	///
	/// \p Opc must be one of X86ISD::VSHLI, X86ISD::VSRLI or X86ISD::VSRAI.
	/// \p SrcOp is bitcast to \p VT when its type differs. A zero shift returns
	/// the (bitcast) source unchanged. A shift amount of at least the element
	/// width yields a zero vector for the logical shifts and is clamped to
	/// width - 1 for the arithmetic shift. When \p SrcOp is a build vector of
	/// constants/undefs the shift is folded element by element; otherwise a node
	/// with an i8 immediate shift amount is created.
	static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
	                                          SDValue SrcOp, uint64_t ShiftAmt,
	                                          SelectionDAG &DAG) {
	  MVT ElementType = VT.getVectorElementType();

	  // Bitcast the source vector to the output type, this is mainly necessary for
	  // vXi8/vXi64 shifts.
	  if (VT != SrcOp.getSimpleValueType())
	    SrcOp = DAG.getBitcast(VT, SrcOp);

	  // Fold this packed shift into its first operand if ShiftAmt is 0.
	  if (ShiftAmt == 0)
	    return SrcOp;

	  // Check for ShiftAmt >= element width
	  if (ShiftAmt >= ElementType.getSizeInBits()) {
	    if (Opc == X86ISD::VSRAI)
	      ShiftAmt = ElementType.getSizeInBits() - 1;
	    else
	      return DAG.getConstant(0, dl, VT);
	  }

	  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
	         && "Unknown target vector shift-by-constant node");

	  // Fold this packed vector shift into a build vector if SrcOp is a
	  // vector of Constants or UNDEFs.
	  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
	    // Apply the shift selected by Opc to a single constant element.
	    auto ShiftElt = [Opc, ShiftAmt](const APInt &C) -> APInt {
	      switch (Opc) {
	      default: llvm_unreachable("Unknown opcode!");
	      case X86ISD::VSHLI: return C.shl(ShiftAmt);
	      case X86ISD::VSRLI: return C.lshr(ShiftAmt);
	      case X86ISD::VSRAI: return C.ashr(ShiftAmt);
	      }
	    };

	    SmallVector<SDValue, 8> Elts;
	    unsigned NumElts = SrcOp->getNumOperands();
	    for (unsigned i = 0; i != NumElts; ++i) {
	      SDValue CurrentOp = SrcOp->getOperand(i);
	      // Undef elements are passed through untouched.
	      if (CurrentOp->isUndef()) {
	        Elts.push_back(CurrentOp);
	        continue;
	      }
	      const APInt &C = cast<ConstantSDNode>(CurrentOp)->getAPIntValue();
	      Elts.push_back(DAG.getConstant(ShiftElt(C), dl, ElementType));
	    }

	    return DAG.getBuildVector(VT, dl, Elts);
	  }

	  return DAG.getNode(Opc, dl, VT, SrcOp,
	                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
	}

	/// Handle vector element shifts where the shift amount may or may not be a
	/// constant. Takes immediate version of shift as input.
	///
	/// A constant \p ShAmt is forwarded to getTargetVShiftByConstNode. Otherwise
	/// the immediate opcode \p Opc is replaced by its variable-amount counterpart
	/// and \p ShAmt (which must be i32 or i64) is placed into a 128-bit vector
	/// that is bitcast to a vector with the element type of \p VT and passed as
	/// the second operand of the shift node.
	static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
	SDValue SrcOp, SDValue ShAmt,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT SVT = ShAmt.getSimpleValueType();
	assert((SVT == MVT::i32 \|\| SVT == MVT::i64) && "Unexpected value type!");

	// Catch shift-by-constant.
	if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
	return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
	CShAmt->getZExtValue(), DAG);

	// Change opcode to non-immediate version
	switch (Opc) {
	default: llvm_unreachable("Unknown target vector shift node");
	case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
	case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
	case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
	}

	// Need to build a vector containing shift amount.
	// SSE/AVX packed shifts only use the lower 64-bit of the shift count.
	// +=================+============+=======================================+
	// | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
	// +=================+============+=======================================+
	// | i64             | Yes, No    | Use ShAmt as lowest elt               |
	// | i32             | Yes        | zero-extend in-reg                    |
	// | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
	// | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
	// +=================+============+=======================================+

	if (SVT == MVT::i64)
	ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
	else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
	ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
	// Strip the scalar zero-extend from i16 and redo the extension on the
	// vector instead.
	ShAmt = ShAmt.getOperand(0);
	ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
	ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
	} else if (Subtarget.hasSSE41() &&
	ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	// With SSE4.1, an amount that was extracted from a vector is moved into
	// lane 0 of a v4i32 and zero-extended in-register to v2i64.
	ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
	ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
	} else {
	// Fallback: build (ShAmt, 0, undef, undef) so that the low 64 bits hold
	// the zero-extended amount.
	SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
	DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
	ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
	}

	// The return type has to be a 128-bit type with the same element
	// type as the input type.
	MVT EltVT = VT.getVectorElementType();
	MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());

	ShAmt = DAG.getBitcast(ShVT, ShAmt);
	return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
	}

	/// \brief Convert the scalar mask operand \p Mask into a value of the i1
	/// vector type \p MaskVT, extending, truncating or splitting it as needed
	/// when lowering masking intrinsics.
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
	                           const SDLoc &dl) {
	  // Constant all-ones and zero masks map directly to constants of MaskVT.
	  if (isAllOnesConstant(Mask))
	    return DAG.getConstant(1, dl, MaskVT);
	  if (X86::isZeroNode(Mask))
	    return DAG.getConstant(0, dl, MaskVT);

	  // Mask should be extended when it is narrower than MaskVT.
	  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
	    MVT WideVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
	    Mask = DAG.getNode(ISD::ANY_EXTEND, dl, WideVT, Mask);
	  }

	  // Common case: anything but an i64 mask on a 32-bit target.
	  if (Mask.getSimpleValueType() != MVT::i64 || !Subtarget.is32Bit()) {
	    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	                                     Mask.getSimpleValueType().getSizeInBits());
	    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
	    // are extracted by EXTRACT_SUBVECTOR.
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
	                       DAG.getBitcast(BitcastVT, Mask),
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  // From here on: i64 mask in 32-bit mode.
	  if (MaskVT != MVT::v64i1) {
	    // MaskVT require < 64bit. Truncate mask (should succeed in any case),
	    // and bitcast.
	    MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
	    return DAG.getBitcast(MaskVT,
	                          DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
	  }

	  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
	  // In case 32bit mode, bitcast i64 is illegal, extend/split it.
	  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(0, dl, MVT::i32));
	  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(1, dl, MVT::i32));

	  LoHalf = DAG.getBitcast(MVT::v32i1, LoHalf);
	  HiHalf = DAG.getBitcast(MVT::v32i1, HiHalf);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoHalf, HiHalf);
	}

	/// \brief Apply the mask \p Mask to the vector operation \p Op when lowering
	/// masking intrinsics.
	///
	/// An all-ones constant mask returns \p Op unchanged. The compare-style
	/// opcodes listed below are combined with the converted mask using AND,
	/// VFPCLASS using OR. Every other opcode becomes a select between \p Op and
	/// \p PreservedSrc, where an undef \p PreservedSrc is replaced by a zero
	/// vector.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  const MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	  SDLoc dl(Op);

	  // Nothing to mask off.
	  if (isAllOnesConstant(Mask))
	    return Op;

	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  bool UseX86Select = false;
	  switch (Op.getOpcode()) {
	  case X86ISD::CMPM:
	  case X86ISD::CMPM_RND:
	  case X86ISD::CMPMU:
	  case X86ISD::VPSHUFBITQMB:
	    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
	  case X86ISD::VFPCLASS:
	    return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
	  case X86ISD::VTRUNC:
	  case X86ISD::VTRUNCS:
	  case X86ISD::VTRUNCUS:
	  case X86ISD::CVTPS2PH:
	    // We can't use ISD::VSELECT here because it is not always "Legal"
	    // for the destination type. For example vpmovqb require only AVX512
	    // and vselect that can operate on byte element type require BWI
	    UseX86Select = true;
	    break;
	  default:
	    break;
	  }

	  const unsigned SelectOpc =
	      UseX86Select ? unsigned(X86ISD::SELECT) : unsigned(ISD::VSELECT);
	  if (PreservedSrc.isUndef())
	    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
	  return DAG.getNode(SelectOpc, dl, VT, VMask, Op, PreservedSrc);
	}

	/// \brief Build the masked form of the scalar operation \p Op.
	/// \returns (X86selects \p Mask, \p Op, \p PreservedSrc) in the general case.
	/// \p Mask is wrapped into an MVT::v1i1 value via SCALAR_TO_VECTOR while
	/// lowering masking intrinsics. Unlike getVectorMaskingNode this uses the
	/// X86-specific scalar select node, because a "vselect" node cannot be
	/// created for a scalar instruction.
	static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {

	  // A constant mask with bit 0 set leaves the operation unmasked.
	  auto *ConstMask = dyn_cast<ConstantSDNode>(Mask);
	  if (ConstMask && (ConstMask->getZExtValue() & 0x1))
	    return Op;

	  const MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);

	  switch (Op.getOpcode()) {
	  case X86ISD::FSETCCM:
	  case X86ISD::FSETCCM_RND:
	    // Compare results are combined with the mask using AND.
	    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
	  case X86ISD::VFPCLASSS:
	    return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
	  default:
	    break;
	  }

	  // An undef pass-through value is replaced by zero.
	  if (PreservedSrc.isUndef())
	    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
	  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
	}

	static int getSEHRegistrationNodeSize(const Function *Fn) {
	if (!Fn->hasPersonalityFn())
	report_fatal_error(
	"querying registration node size for function without personality");
	// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
	// WinEHStatePass for the full struct definition.
	switch (classifyEHPersonality(Fn->getPersonalityFn())) {
	case EHPersonality::MSVC_X86SEH: return 24;
	case EHPersonality::MSVC_CXX: return 16;
	default: break;
	}
	report_fatal_error(
	"can only recover FP for 32-bit MSVC EH personality functions");
	}

	/// When the MSVC runtime transfers control to us, either to an outlined
	/// function or when returning to a parent frame after catching an exception,
	/// the parent frame pointer is recovered by arithmetic on the incoming EBP:
	///   RegNodeBase = EntryEBP - RegNodeSize
	///   ParentFP = RegNodeBase - ParentFrameOffset
	/// Subtracting RegNodeSize takes us to the offset of the registration node, and
	/// subtracting the offset (negative on x86) takes us back to the parent FP.
	/// On 64-bit targets the result is simply EntryEBP + ParentFrameOffset, and
	/// a function without a personality gets \p EntryEBP back unchanged.
	static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
	                                   SDValue EntryEBP) {
	  // It's possible that the parent function no longer has a personality function
	  // if the exceptional code was optimized away, in which case we just return
	  // the incoming EBP.
	  if (!Fn->hasPersonalityFn())
	    return EntryEBP;

	  SDLoc dl;
	  MachineFunction &MF = DAG.getMachineFunction();
	  const MVT PtrVT =
	      DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

	  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
	  // registration, or the .set_setframe offset.
	  StringRef FnName = GlobalValue::dropLLVMManglingEscape(Fn->getName());
	  MCSymbol *OffsetSym =
	      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(FnName);
	  SDValue SymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
	  SDValue ParentFrameOffset =
	      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, SymVal);

	  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
	  // prologue to RBP in the parent function.
	  const auto &ST = static_cast<const X86Subtarget &>(DAG.getSubtarget());
	  if (ST.is64Bit())
	    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

	  // 32-bit: step back over the registration node first, then over the
	  // parent frame offset.
	  const int NodeSize = getSEHRegistrationNodeSize(Fn);
	  SDValue NodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
	                                 DAG.getConstant(NodeSize, dl, PtrVT));
	  return DAG.getNode(ISD::SUB, dl, PtrVT, NodeBase, ParentFrameOffset);
	}

	SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	SelectionDAG &DAG) const {
	// Helper to detect if the operand is CUR_DIRECTION rounding mode.
	auto isRoundModeCurDirection = [](SDValue Rnd) {
	if (!isa<ConstantSDNode>(Rnd))
	return false;

	unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
	return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
	};

	SDLoc dl(Op);
	unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	MVT VT = Op.getSimpleValueType();
	const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
	if (IntrData) {
	switch(IntrData->Type) {
	case INTR_TYPE_1OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
	case INTR_TYPE_2OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2));
	case INTR_TYPE_3OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3));
	case INTR_TYPE_4OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
	case INTR_TYPE_1OP_MASK_RM: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue RoundingMode;
	// We always add rounding mode to the Node.
	// If the rounding mode is not specified, we add the
	// "current direction" mode.
	if (Op.getNumOperands() == 4)
	RoundingMode =
	DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	else
	RoundingMode = Op.getOperand(4);
	assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
	RoundingMode),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_1OP_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	// We add rounding mode to the Node when
	// - RM Opcode is specified and
	// - RM is not "current direction".
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue passThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	// There are 2 kinds of intrinsics in this group:
	// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
	// (2) With rounding mode and sae - 7 operands.
	bool HasRounding = IntrWithRoundingModeOpcode != 0;
	if (Op.getNumOperands() == (5U + HasRounding)) {
	if (HasRounding) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, VT, Src1, Src2, Rnd),
	Mask, passThru, Subtarget, DAG);
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2),
	Mask, passThru, Subtarget, DAG);
	}

	assert(Op.getNumOperands() == (6U + HasRounding) &&
	"Unexpected intrinsic form");
	SDValue RoundingMode = Op.getOperand(5);
	if (HasRounding) {
	SDValue Sae = Op.getOperand(6);
	if (!isRoundModeCurDirection(Sae))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, VT, Src1, Src2,
	RoundingMode, Sae),
	Mask, passThru, Subtarget, DAG);
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2, RoundingMode),
	Mask, passThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src0 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	// There are 2 kinds of intrinsics in this group:
	// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
	// (2) With rounding mode and sae - 7 operands.
	if (Op.getNumOperands() == 6) {
	SDValue Sae = Op.getOperand(5);
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
	Sae),
	Mask, Src0, Subtarget, DAG);
	}
	assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
	SDValue RoundingMode = Op.getOperand(5);
	SDValue Sae = Op.getOperand(6);
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
	RoundingMode, Sae),
	Mask, Src0, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK:
	case INTR_TYPE_2OP_IMM8_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
	Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	// TODO: Intrinsics should have fast-math-flags to propagate.
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	// We specify 2 possible modes for intrinsics, with/without rounding
	// modes.
	// First, we check if the intrinsic have rounding mode (6 operands),
	// if not, we set rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 6)
	Rnd = Op.getOperand(5);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_SCALAR_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);

	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(6);
	if (!isRoundModeCurDirection(Rnd))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, VT, Src1, Src2, Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Imm = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	// We specify 2 possible modes for intrinsics, with/without rounding
	// modes.
	// First, we check if the intrinsic have rounding mode (7 operands),
	// if not, we set rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 7)
	Rnd = Op.getOperand(6);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Imm, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_IMM8_MASK:
	case INTR_TYPE_3OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);

	if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
	Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(6);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case VPERM_2OP_MASK : {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	// Swap Src1 and Src2 in the node creation
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
	Mask, PassThru, Subtarget, DAG);
	}
	case VPERM_3OP_MASKZ:
	case VPERM_3OP_MASK:{
	MVT VT = Op.getSimpleValueType();
	// Src2 is the PassThru
	SDValue Src1 = Op.getOperand(1);
	// PassThru needs to be the same type as the destination in order
	// to pattern match correctly.
	SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == VPERM_3OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else
	PassThru = Src2;

	// Swap Src1 and Src2 in the node creation
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
	dl, Op.getValueType(),
	Src2, Src1, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case FMA_OP_MASK3:
	case FMA_OP_MASKZ:
	case FMA_OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == FMA_OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else if (IntrData->Type == FMA_OP_MASK3)
	PassThru = Src3;
	else
	PassThru = Src1;

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
	dl, Op.getValueType(),
	Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case FMA_OP_SCALAR_MASK:
	case FMA_OP_SCALAR_MASK3:
	case FMA_OP_SCALAR_MASKZ: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
	PassThru = Src3;
	else
	PassThru = Src1;

	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
	Op.getValueType(), Src1, Src2,
	Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}

	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
	Op.getValueType(), Src1, Src2,
	Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case IFMA_OP_MASKZ:
	case IFMA_OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = Src1;

	// set PassThru element
	if (IntrData->Type == IFMA_OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);

	// Note: we need to swizzle the operands to pass the multiply operands
	// first.
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
	dl, Op.getValueType(),
	Src2, Src3, Src1),
	Mask, PassThru, Subtarget, DAG);
	}
	case TERLOG_OP_MASK:
	case TERLOG_OP_MASKZ: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
	SDValue Mask = Op.getOperand(5);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = Src1;
	// Set PassThru element.
	if (IntrData->Type == TERLOG_OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);

	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Src4),
	Mask, PassThru, Subtarget, DAG);
	}
	case CVTPD2PS:
	// ISD::FP_ROUND has a second argument that indicates if the truncation
	// does not change the value. Set it to 0 since it can change.
	return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
	DAG.getIntPtrConstant(0, dl));
	case CVTPD2PS_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	// We add rounding mode to the Node when
	// - RM Opcode is specified and
	// - RM is not "current direction".
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
	// ISD::FP_ROUND has a second argument that indicates if the truncation
	// does not change the value. Set it to 0 since it can change.
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
	DAG.getIntPtrConstant(0, dl)),
	Mask, PassThru, Subtarget, DAG);
	}
	case FPCLASS: {
	// FPclass intrinsics with mask
	SDValue Src1 = Op.getOperand(1);
	MVT VT = Src1.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	SDValue Imm = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
	SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
	Subtarget, DAG);
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), FPclassMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case FPCLASSS: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Imm = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
	SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
	Subtarget, DAG);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
	DAG.getIntPtrConstant(0, dl));
	}
	case CMP_MASK:
	case CMP_MASK_CC: {
	// Comparison intrinsics with masks.
	// Example of transformation:
	// (i8 (int_x86_avx512_mask_pcmpeq_q_128
	// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
	// (i8 (bitcast
	// (v8i1 (insert_subvector undef,
	// (v2i1 (and (PCMPEQM %a, %b),
	// (extract_subvector
	// (v8i1 (bitcast %mask)), 0))), 0))))
	MVT VT = Op.getOperand(1).getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
	MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	SDValue Cmp;
	if (IntrData->Type == CMP_MASK_CC) {
	SDValue CC = Op.getOperand(3);
	CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	if (IntrData->Opc1 != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2), CC, Rnd);
	}
	//default rounding mode
	if(!Cmp.getNode())
	Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2), CC);

	} else {
	assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
	Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2));
	}
	SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
	Subtarget, DAG);
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), CmpMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case CMP_MASK_SCALAR_CC: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
	SDValue Mask = Op.getOperand(4);

	SDValue Cmp;
	if (IntrData->Opc1 != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
	}
	//default rounding mode
	if(!Cmp.getNode())
	Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

	SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
	Subtarget, DAG);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
	DAG.getIntPtrConstant(0, dl));
	}
	case COMI: { // Comparison intrinsics
	ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
	SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
	SDValue SetCC;
	switch (CC) {
	case ISD::SETEQ: { // (ZF = 0 and PF = 0)
	SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
	SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
	break;
	}
	case ISD::SETNE: { // (ZF = 1 or PF = 1)
	SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
	SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
	break;
	}
	case ISD::SETGT: // (CF = 0 and ZF = 0)
	SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
	break;
	case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
	SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
	break;
	}
	case ISD::SETGE: // CF = 0
	SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
	break;
	case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
	SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
	break;
	default:
	llvm_unreachable("Unexpected illegal condition!");
	}
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}
	case COMI_RM: { // Comparison intrinsics with Sae
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	SDValue Sae = Op.getOperand(4);

	SDValue FCmp;
	if (isRoundModeCurDirection(Sae))
	FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
	DAG.getConstant(CondVal, dl, MVT::i8));
	else
	FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
	DAG.getConstant(CondVal, dl, MVT::i8), Sae);
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
	DAG.getIntPtrConstant(0, dl));
	}
	case VSHIFT:
	return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
	Op.getOperand(1), Op.getOperand(2), Subtarget,
	DAG);
	case COMPRESS_EXPAND_IN_REG: {
	SDValue Mask = Op.getOperand(3);
	SDValue DataToCompress = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	if (isAllOnesConstant(Mask)) // return data as is
	return Op.getOperand(1);

	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	DataToCompress),
	Mask, PassThru, Subtarget, DAG);
	}
	case BROADCASTM: {
	SDValue Mask = Op.getOperand(1);
	MVT MaskVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	Mask = DAG.getBitcast(MaskVT, Mask);
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
	}
	+ case KUNPCK: {
	+ MVT VT = Op.getSimpleValueType();
	+ MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
	+
	+ SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
	+ SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
	+ // Arguments should be swapped.
	+ SDValue Res = DAG.getNode(IntrData->Opc0, dl,
	+ MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
	+ Src2, Src1);
	+ return DAG.getBitcast(VT, Res);
	+ }
	case MASK_BINOP: {
	MVT VT = Op.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

	SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
	SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
	SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
	return DAG.getBitcast(VT, Res);
	}
	case FIXUPIMMS:
	case FIXUPIMMS_MASKZ:
	case FIXUPIMM:
	case FIXUPIMM_MASKZ:{
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Imm = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Passthru = (IntrData->Type == FIXUPIMM \|\| IntrData->Type == FIXUPIMMS ) ?
	Src1 : getZeroVector(VT, Subtarget, DAG, dl);
	// We specify 2 possible modes for intrinsics: with and without a rounding
	// mode.
	// First, we check whether the intrinsic has a rounding mode (7 operands);
	// if not, we set the rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 7)
	Rnd = Op.getOperand(6);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	if (IntrData->Type == FIXUPIMM \|\| IntrData->Type == FIXUPIMM_MASKZ)
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Imm, Rnd),
	Mask, Passthru, Subtarget, DAG);
	else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Imm, Rnd),
	Mask, Passthru, Subtarget, DAG);
	}
	case CONVERT_TO_MASK: {
	MVT SrcVT = Op.getOperand(1).getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
	MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

	SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
	Op.getOperand(1));
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), CvtMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case ROUNDP: {
	assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
	// Clear the upper bits of the rounding immediate so that the legacy
	// intrinsic can't trigger the scaling behavior of VRNDSCALE.
	SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
	Op.getOperand(2),
	DAG.getConstant(0xf, dl, MVT::i32));
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), RoundingMode);
	}
	case ROUNDS: {
	assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
	// Clear the upper bits of the rounding immediate so that the legacy
	// intrinsic can't trigger the scaling behavior of VRNDSCALE.
	SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
	Op.getOperand(3),
	DAG.getConstant(0xf, dl, MVT::i32));
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
	Op.getOperand(1), Op.getOperand(2), RoundingMode);
	}
	default:
	break;
	}
	}

	switch (IntNo) {
	default: return SDValue(); // Don't custom lower most intrinsics.

	case Intrinsic::x86_avx2_permd:
	case Intrinsic::x86_avx2_permps:
	// Operands intentionally swapped. Mask is last operand to intrinsic,
	// but second operand for node/instruction.
	return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
	Op.getOperand(2), Op.getOperand(1));

	// ptest and testp intrinsics. The intrinsic these come from are designed to
	// return an integer value, not just an instruction so lower it to the ptest
	// or testp pattern and a setcc for the result.
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestz_256:
	case Intrinsic::x86_avx_ptestc_256:
	case Intrinsic::x86_avx_ptestnzc_256:
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256: {
	bool IsTestPacked = false;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_avx_ptestz_256:
	// ZF = 1
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_avx_ptestc_256:
	// CF = 1
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestnzc_256:
	// ZF and CF = 0
	X86CC = X86::COND_A;
	break;
	}

	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
	SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
	SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}
	case Intrinsic::x86_avx512_kortestz_w:
	case Intrinsic::x86_avx512_kortestc_w: {
	X86::CondCode X86CC =
	(IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
	SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	case Intrinsic::x86_avx512_knot_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
	SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_avx512_kandn_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	// Invert LHS for the not.
	LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
	DAG.getConstant(1, dl, MVT::v16i1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_avx512_kxnor_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
	// Invert result for the not.
	Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
	DAG.getConstant(1, dl, MVT::v16i1));
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_sse42_pcmpistria128:
	case Intrinsic::x86_sse42_pcmpestria128:
	case Intrinsic::x86_sse42_pcmpistric128:
	case Intrinsic::x86_sse42_pcmpestric128:
	case Intrinsic::x86_sse42_pcmpistrio128:
	case Intrinsic::x86_sse42_pcmpestrio128:
	case Intrinsic::x86_sse42_pcmpistris128:
	case Intrinsic::x86_sse42_pcmpestris128:
	case Intrinsic::x86_sse42_pcmpistriz128:
	case Intrinsic::x86_sse42_pcmpestriz128: {
	unsigned Opcode;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::x86_sse42_pcmpistria128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpestria128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpistric128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpestric128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpistrio128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpestrio128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpistris128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpestris128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpistriz128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_sse42_pcmpestriz128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_E;
	break;
	}
	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
	SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	case Intrinsic::x86_sse42_pcmpistri128:
	case Intrinsic::x86_sse42_pcmpestri128: {
	unsigned Opcode;
	if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
	Opcode = X86ISD::PCMPISTRI;
	else
	Opcode = X86ISD::PCMPESTRI;

	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	return DAG.getNode(Opcode, dl, VTs, NewOps);
	}

	case Intrinsic::eh_sjlj_lsda: {
	MachineFunction &MF = DAG.getMachineFunction();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	auto &Context = MF.getMMI().getContext();
	MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
	Twine(MF.getFunctionNumber()));
	return DAG.getNode(getGlobalWrapperKind(), dl, VT,
	DAG.getMCSymbol(S, PtrVT));
	}

	case Intrinsic::x86_seh_lsda: {
	// Compute the symbol for the LSDA. We know it'll get emitted later.
	MachineFunction &MF = DAG.getMachineFunction();
	SDValue Op1 = Op.getOperand(1);
	auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
	MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
	GlobalValue::dropLLVMManglingEscape(Fn->getName()));

	// Generate a simple absolute symbol reference. This intrinsic is only
	// supported on 32-bit Windows, which isn't PIC.
	SDValue Result = DAG.getMCSymbol(LSDASym, VT);
	return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
	}

	case Intrinsic::x86_seh_recoverfp: {
	SDValue FnOp = Op.getOperand(1);
	SDValue IncomingFPOp = Op.getOperand(2);
	GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
	auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
	if (!Fn)
	report_fatal_error(
	"llvm.x86.seh.recoverfp must take a function as the first argument");
	return recoverFramePointer(DAG, Fn, IncomingFPOp);
	}

	case Intrinsic::localaddress: {
	// Returns one of the stack, base, or frame pointer registers, depending on
	// which is used to reference local variables.
	MachineFunction &MF = DAG.getMachineFunction();
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	unsigned Reg;
	if (RegInfo->hasBasePointer(MF))
	Reg = RegInfo->getBaseRegister();
	else // This function handles the SP or FP case.
	Reg = RegInfo->getPtrSizedFrameRegister(MF);
	return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
	}
	}
	}

	/// Build the machine node \p Opc for an AVX2 gather intrinsic.
	///
	/// \p Mask is passed to the node unchanged and its type is also used as the
	/// type of the node's second result. Returns an empty SDValue when
	/// \p ScaleOp is not a constant; otherwise returns the merge of results 0
	/// (typed like \p Op) and 2 (MVT::Other) of the new node.
	static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                                 SDValue Src, SDValue Mask, SDValue Base,
	                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
	                                 const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);

	  // Only a constant scale can be turned into a target constant.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);
	  SDVTList ResultTys =
	      DAG.getVTList(Op.getValueType(), Mask.getValueType(), MVT::Other);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);

	  // When the source is undef, or the mask is all ones so the source is never
	  // used, substitute a zero vector to break the register dependency.
	  // TODO: use undef instead and let ExecutionDepsFix deal with it?
	  bool SrcIsUnused = Src.isUndef();
	  if (!SrcIsUnused)
	    SrcIsUnused = ISD::isBuildVectorAllOnes(Mask.getNode());
	  if (SrcIsUnused)
	    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);

	  SDValue NodeOps[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
	  SDNode *Gather = DAG.getMachineNode(Opc, dl, ResultTys, NodeOps);

	  // Result 1 (the mask-typed value) is not forwarded.
	  SDValue Merged[] = {SDValue(Gather, 0), SDValue(Gather, 2)};
	  return DAG.getMergeValues(Merged, dl);
	}

	/// Build the machine node \p Opc for a gather whose mask is a vXi1 value.
	///
	/// \p Mask is converted with getMaskNode to an i1 vector that has one
	/// element per element of \p Index. Returns an empty SDValue when
	/// \p ScaleOp is not a constant; otherwise returns the merge of results 0
	/// (typed like \p Op) and 2 (MVT::Other) of the new node.
	static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                             SDValue Src, SDValue Mask, SDValue Base,
	                             SDValue Index, SDValue ScaleOp, SDValue Chain,
	                             const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);

	  // Only a constant scale can be turned into a target constant.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);

	  const unsigned NumIndexElts =
	      Index.getSimpleValueType().getVectorNumElements();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  SDVTList ResultTys = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);

	  // When the source is undef, or the mask is all ones so the source is never
	  // used, substitute a zero vector to break the register dependency.
	  // TODO: use undef instead and let ExecutionDepsFix deal with it?
	  bool SrcIsUnused = Src.isUndef();
	  if (!SrcIsUnused)
	    SrcIsUnused = ISD::isBuildVectorAllOnes(VMask.getNode());
	  if (SrcIsUnused)
	    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);

	  SDValue NodeOps[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
	  SDNode *Gather = DAG.getMachineNode(Opc, dl, ResultTys, NodeOps);

	  // Result 1 (the mask-typed value) is not forwarded.
	  SDValue Merged[] = {SDValue(Gather, 0), SDValue(Gather, 2)};
	  return DAG.getMergeValues(Merged, dl);
	}

	/// Build the machine node \p Opc for a scatter.
	///
	/// \p Mask is converted with getMaskNode to an i1 vector that has one
	/// element per element of \p Index. Returns an empty SDValue when
	/// \p ScaleOp is not a constant; otherwise returns result 1 (MVT::Other) of
	/// the new node.
	static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                              SDValue Src, SDValue Mask, SDValue Base,
	                              SDValue Index, SDValue ScaleOp, SDValue Chain,
	                              const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);

	  // Only a constant scale can be turned into a target constant.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);

	  const unsigned NumIndexElts =
	      Index.getSimpleValueType().getVectorNumElements();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  SDVTList ResultTys = DAG.getVTList(MaskVT, MVT::Other);
	  SDValue NodeOps[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
	  SDNode *Scatter = DAG.getMachineNode(Opc, dl, ResultTys, NodeOps);

	  // Only the MVT::Other result is handed back; the mask result is dropped.
	  return SDValue(Scatter, 1);
	}

	/// Build the machine node \p Opc for a masked gather/scatter prefetch.
	///
	/// \p Mask is converted with getMaskNode to an i1 vector that has one
	/// element per element of \p Index. Returns an empty SDValue when
	/// \p ScaleOp is not a constant; otherwise returns the node's single
	/// MVT::Other result.
	static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                               SDValue Mask, SDValue Base, SDValue Index,
	                               SDValue ScaleOp, SDValue Chain,
	                               const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);

	  // Only a constant scale can be turned into a target constant.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  SDValue Scale = DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);

	  const unsigned NumIndexElts =
	      Index.getSimpleValueType().getVectorNumElements();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  SDValue NodeOps[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
	  SDNode *Prefetch = DAG.getMachineNode(Opc, dl, MVT::Other, NodeOps);
	  return SDValue(Prefetch, 0);
	}

	/// Lower the builtin intrinsic that returns the value of an extended control
	/// register.
	///
	/// Operand 2 of \p N is copied into ECX, an XGETBV machine node is emitted,
	/// and the result is read back from EDX:EAX. Two values are appended to
	/// \p Results: the combined 64-bit value, then the output chain.
	static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget,
	                                       SmallVectorImpl<SDValue> &Results) {
	  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

	  // On 64-bit targets the halves are copied out of the full-width registers.
	  const bool Is64Bit = Subtarget.is64Bit();
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT RegVT = Is64Bit ? MVT::i64 : MVT::i32;

	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	  // ECX selects the index of the XCR register to return.
	  SDValue Chain =
	      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
	  SDNode *XGetBV = DAG.getMachineNode(X86::XGETBV, DL, NodeTys, Chain);
	  Chain = SDValue(XGetBV, 0);

	  // The content of the XCR is returned in registers EDX:EAX.
	  SDValue LO = DAG.getCopyFromReg(Chain, DL, LoReg, RegVT, SDValue(XGetBV, 1));
	  SDValue HI =
	      DAG.getCopyFromReg(LO.getValue(1), DL, HiReg, RegVT, LO.getValue(2));
	  Chain = HI.getValue(1);

	  SDValue Combined;
	  if (Is64Bit) {
	    // Merge the two 32-bit values into a 64-bit one: LO | (HI << 32).
	    SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, ShiftAmt);
	    Combined = DAG.getNode(ISD::OR, DL, MVT::i64, LO, HiShifted);
	  } else {
	    // Use a BUILD_PAIR to merge the two 32-bit values into a 64-bit one.
	    SDValue Halves[] = {LO, HI};
	    Combined = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Combined);
	  Results.push_back(Chain);
	}

	/// Lower the builtin intrinsics that read performance monitor counters
	/// (x86_rdpmc).
	///
	/// Operand 2 of \p N is copied into ECX, an X86ISD::RDPMC_DAG node is
	/// emitted, and the counter is read back from EDX:EAX. Two values are
	/// appended to \p Results: the combined 64-bit value, then the output chain.
	static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
	                                      SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget,
	                                      SmallVectorImpl<SDValue> &Results) {
	  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

	  // On 64-bit targets the halves are copied out of the full-width registers.
	  const bool Is64Bit = Subtarget.is64Bit();
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT RegVT = Is64Bit ? MVT::i64 : MVT::i32;

	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	  // ECX selects the index of the performance counter to read.
	  SDValue Chain =
	      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
	  SDValue Read = DAG.getNode(X86ISD::RDPMC_DAG, DL, NodeTys, Chain);

	  // The 64-bit performance counter is returned in registers EDX:EAX.
	  SDValue LO = DAG.getCopyFromReg(Read, DL, LoReg, RegVT, Read.getValue(1));
	  SDValue HI =
	      DAG.getCopyFromReg(LO.getValue(1), DL, HiReg, RegVT, LO.getValue(2));
	  Chain = HI.getValue(1);

	  SDValue Combined;
	  if (Is64Bit) {
	    // EAX holds the low-order 32 bits and EDX the supported high-order bits
	    // of the counter, so the result is LO | (HI << 32).
	    SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, ShiftAmt);
	    Combined = DAG.getNode(ISD::OR, DL, MVT::i64, LO, HiShifted);
	  } else {
	    // Use a BUILD_PAIR to merge the two 32-bit values into a 64-bit one.
	    SDValue Halves[] = {LO, HI};
	    Combined = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Combined);
	  Results.push_back(Chain);
	}

	/// Lower the builtin intrinsics that read the time stamp counter (x86_rdtsc
	/// and x86_rdtscp). Also used to custom lower READCYCLECOUNTER nodes.
	///
	/// A node with opcode \p Opcode is emitted on the chain in operand 0 of
	/// \p N and the counter is read back from EDX:EAX. For X86ISD::RDTSCP_DAG,
	/// ECX is additionally stored to the address in operand 2 of \p N. Two
	/// values are appended to \p Results: the combined 64-bit value, then the
	/// output chain.
	static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
	                                    SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget,
	                                    SmallVectorImpl<SDValue> &Results) {
	  // On 64-bit targets the halves are copied out of the full-width registers.
	  const bool Is64Bit = Subtarget.is64Bit();
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT RegVT = Is64Bit ? MVT::i64 : MVT::i32;

	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue Read = DAG.getNode(Opcode, DL, NodeTys, N->getOperand(0));

	  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
	  // EDX:EAX registers: EDX receives the high-order 32 bits of the MSR and
	  // EAX the low-order 32 bits.
	  SDValue LO = DAG.getCopyFromReg(Read, DL, LoReg, RegVT, Read.getValue(1));
	  SDValue HI =
	      DAG.getCopyFromReg(LO.getValue(1), DL, HiReg, RegVT, LO.getValue(2));
	  SDValue Chain = HI.getValue(1);

	  if (Opcode == X86ISD::RDTSCP_DAG) {
	    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

	    // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
	    // the ECX register. Add 'ecx' explicitly to the chain.
	    SDValue AuxValue =
	        DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, HI.getValue(2));
	    // Explicitly store the content of ECX at the location passed in input
	    // to the 'rdtscp' intrinsic.
	    Chain = DAG.getStore(AuxValue.getValue(1), DL, AuxValue, N->getOperand(2),
	                         MachinePointerInfo());
	  }

	  SDValue Combined;
	  if (Is64Bit) {
	    // EDX holds the high-order 32 bits of the MSR and EAX the low-order
	    // 32 bits, so the result is LO | (HI << 32).
	    SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, ShiftAmt);
	    Combined = DAG.getNode(ISD::OR, DL, MVT::i64, LO, HiShifted);
	  } else {
	    // Use a BUILD_PAIR to merge the two 32-bit values into a 64-bit one.
	    SDValue Halves[] = {LO, HI};
	    Combined = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Combined);
	  Results.push_back(Chain);
	}

	/// Custom lower a READCYCLECOUNTER node by emitting an X86ISD::RDTSC_DAG read
	/// through getReadTimeStampCounter and merging the values it produces.
	static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  SmallVector<SDValue, 2> CounterAndChain;
	  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
	                          CounterAndChain);
	  return DAG.getMergeValues(CounterAndChain, DL);
	}

	/// Record the frame index carried by operand 2 of \p Op as the function's
	/// EH registration node in its WinEHFuncInfo.
	///
	/// Reports a fatal error if the function has no WinEHFuncInfo or if the
	/// operand is not a FrameIndexSDNode. Returns operand 0 (the chain) of
	/// \p Op unchanged.
	static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
	  SDValue InChain = Op.getOperand(0);
	  SDValue RegNodeOp = Op.getOperand(2);

	  WinEHFuncInfo *FuncInfo = DAG.getMachineFunction().getWinEHFuncInfo();
	  if (FuncInfo == nullptr)
	    report_fatal_error("EH registrations only live in functions using WinEH");

	  // The operand must be a frame index so that its index can be remembered.
	  auto *FrameIdxNode = dyn_cast<FrameIndexSDNode>(RegNodeOp);
	  if (FrameIdxNode == nullptr)
	    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
	  FuncInfo->EHRegNodeFrameIndex = FrameIdxNode->getIndex();

	  // No DAG nodes are created; the incoming chain is simply forwarded.
	  return InChain;
	}

	/// Record the frame index carried by operand 2 of \p Op as the function's
	/// EH guard slot in its WinEHFuncInfo.
	///
	/// Reports a fatal error if the function has no WinEHFuncInfo or if the
	/// operand is not a FrameIndexSDNode. Returns operand 0 (the chain) of
	/// \p Op unchanged.
	static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
	  SDValue InChain = Op.getOperand(0);
	  SDValue GuardOp = Op.getOperand(2);

	  WinEHFuncInfo *FuncInfo = DAG.getMachineFunction().getWinEHFuncInfo();
	  if (FuncInfo == nullptr)
	    report_fatal_error("EHGuard only live in functions using WinEH");

	  // The operand must be a frame index so that its index can be remembered.
	  auto *FrameIdxNode = dyn_cast<FrameIndexSDNode>(GuardOp);
	  if (FrameIdxNode == nullptr)
	    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
	  FuncInfo->EHGuardFrameIndex = FrameIdxNode->getIndex();

	  // No DAG nodes are created; the incoming chain is simply forwarded.
	  return InChain;
	}

	/// Emit a truncating store with saturation: a TruncSStoreSDNode when
	/// \p SignedSat is true, a TruncUSStoreSDNode otherwise. The node's operands
	/// are the chain, the value, the pointer, and an undef of the pointer's type.
	static SDValue
	EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
	                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
	                SelectionDAG &DAG) {
	  SDVTList ResultTys = DAG.getVTList(MVT::Other);
	  SDValue UndefOp = DAG.getUNDEF(Ptr.getValueType());
	  SDValue StoreOps[] = {Chain, Val, Ptr, UndefOp};

	  if (SignedSat)
	    return DAG.getTargetMemSDNode<TruncSStoreSDNode>(ResultTys, StoreOps, Dl,
	                                                     MemVT, MMO);
	  return DAG.getTargetMemSDNode<TruncUSStoreSDNode>(ResultTys, StoreOps, Dl,
	                                                    MemVT, MMO);
	}

	/// Build a masked saturating truncating store node.
	///
	/// \p SignedSat selects between MaskedTruncSStoreSDNode (signed saturation)
	/// and MaskedTruncUSStoreSDNode (unsigned saturation). The node produces only
	/// a chain and takes { Chain, Ptr, Mask, Val } as operands, in that order.
	static SDValue
	EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
	                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
	                      MachineMemOperand *MMO, SelectionDAG &DAG) {
	  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
	  SDValue StoreOps[] = {Chain, Ptr, Mask, Val};

	  if (SignedSat)
	    return DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(ChainOnly, StoreOps,
	                                                           Dl, MemVT, MMO);
	  return DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(ChainOnly, StoreOps,
	                                                          Dl, MemVT, MMO);
	}

	/// Custom lowering for X86 intrinsics that carry a chain operand.
	///
	/// Operand 0 of \p Op is the chain and operand 1 is the intrinsic ID. When
	/// getIntrinsicWithChain() has no entry for the ID, only the few intrinsics
	/// listed in the first switch are handled and everything else yields an empty
	/// SDValue. Otherwise the lowering is selected by IntrinsicData::Type.
	static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

	  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
	  if (!IntrData) {
	    switch (IntNo) {
	    case llvm::Intrinsic::x86_seh_ehregnode:
	      return MarkEHRegistrationNode(Op, DAG);
	    case llvm::Intrinsic::x86_seh_ehguard:
	      return MarkEHGuard(Op, DAG);
	    case llvm::Intrinsic::x86_flags_read_u32:
	    case llvm::Intrinsic::x86_flags_read_u64:
	    case llvm::Intrinsic::x86_flags_write_u32:
	    case llvm::Intrinsic::x86_flags_write_u64: {
	      // We need a frame pointer because this will get lowered to a PUSH/POP
	      // sequence.
	      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	      MFI.setHasCopyImplyingStackAdjustment(true);
	      // Don't do anything here, we will expand these intrinsics out later
	      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
	      return SDValue();
	    }
	    case Intrinsic::x86_lwpins32:
	    case Intrinsic::x86_lwpins64: {
	      SDLoc dl(Op);
	      SDValue Chain = Op->getOperand(0);
	      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
	      SDValue LwpIns =
	          DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
	                      Op->getOperand(3), Op->getOperand(4));
	      // The intrinsic's result is the COND_B condition of the flags value
	      // produced by LWPINS, zero-extended to i8.
	      SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
	      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
	      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
	                         LwpIns.getValue(1));
	    }
	    }
	    return SDValue();
	  }

	  SDLoc dl(Op);
	  switch(IntrData->Type) {
	  default: llvm_unreachable("Unknown Intrinsic Type");
	  case RDSEED:
	  case RDRAND: {
	    // Emit the node with the right value type.
	    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
	    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
	    // Otherwise return the value from Rand, which is always 0, casted to i32.
	    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
	                      DAG.getConstant(1, dl, Op->getValueType(1)),
	                      DAG.getConstant(X86::COND_B, dl, MVT::i8),
	                      SDValue(Result.getNode(), 1) };
	    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

	    // Return { result, isValid, chain }.
	    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
	                       SDValue(Result.getNode(), 2));
	  }
	  case GATHER_AVX2: {
	    SDValue Chain = Op.getOperand(0);
	    SDValue Src   = Op.getOperand(2);
	    SDValue Base  = Op.getOperand(3);
	    SDValue Index = Op.getOperand(4);
	    SDValue Mask  = Op.getOperand(5);
	    SDValue Scale = Op.getOperand(6);
	    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	                             Scale, Chain, Subtarget);
	  }
	  case GATHER: {
	    //gather(v1, mask, index, base, scale);
	    SDValue Chain = Op.getOperand(0);
	    SDValue Src   = Op.getOperand(2);
	    SDValue Base  = Op.getOperand(3);
	    SDValue Index = Op.getOperand(4);
	    SDValue Mask  = Op.getOperand(5);
	    SDValue Scale = Op.getOperand(6);
	    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
	                         Chain, Subtarget);
	  }
	  case SCATTER: {
	    //scatter(base, mask, index, v1, scale);
	    SDValue Chain = Op.getOperand(0);
	    SDValue Base  = Op.getOperand(2);
	    SDValue Mask  = Op.getOperand(3);
	    SDValue Index = Op.getOperand(4);
	    SDValue Src   = Op.getOperand(5);
	    SDValue Scale = Op.getOperand(6);
	    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	                          Scale, Chain, Subtarget);
	  }
	  case PREFETCH: {
	    SDValue Hint = Op.getOperand(6);
	    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
	    assert((HintVal == 2 || HintVal == 3) &&
	           "Wrong prefetch hint in intrinsic: should be 2 or 3");
	    // Hint 2 selects Opc1, hint 3 selects Opc0.
	    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
	    SDValue Chain = Op.getOperand(0);
	    SDValue Mask  = Op.getOperand(2);
	    SDValue Index = Op.getOperand(3);
	    SDValue Base  = Op.getOperand(4);
	    SDValue Scale = Op.getOperand(5);
	    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
	                           Subtarget);
	  }
	  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
	  case RDTSC: {
	    SmallVector<SDValue, 2> Results;
	    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
	                            Results);
	    return DAG.getMergeValues(Results, dl);
	  }
	  // Read Performance Monitoring Counters.
	  case RDPMC: {
	    SmallVector<SDValue, 2> Results;
	    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
	    return DAG.getMergeValues(Results, dl);
	  }
	  // Get Extended Control Register.
	  case XGETBV: {
	    SmallVector<SDValue, 2> Results;
	    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
	    return DAG.getMergeValues(Results, dl);
	  }
	  // XTEST intrinsics.
	  case XTEST: {
	    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
	    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
	    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
	    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
	                       Ret, SDValue(InTrans.getNode(), 1));
	  }
	  // ADC/ADCX/SBB
	  case ADX: {
	    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
	    SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
	    // Adding -1 (i8) to operand 2 produces the flags value consumed by the
	    // arithmetic node below.
	    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
	                                DAG.getConstant(-1, dl, MVT::i8));
	    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
	                              Op.getOperand(4), GenCF.getValue(1));
	    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
	                                 Op.getOperand(5), MachinePointerInfo());
	    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
	    SDValue Results[] = { SetCC, Store };
	    return DAG.getMergeValues(Results, dl);
	  }
	  case COMPRESS_TO_MEM: {
	    SDValue Mask = Op.getOperand(4);
	    SDValue DataToCompress = Op.getOperand(3);
	    SDValue Addr = Op.getOperand(2);
	    SDValue Chain = Op.getOperand(0);
	    MVT VT = DataToCompress.getSimpleValueType();

	    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	    assert(MemIntr && "Expected MemIntrinsicSDNode!");

	    if (isAllOnesConstant(Mask)) // return just a store
	      return DAG.getStore(Chain, dl, DataToCompress, Addr,
	                          MemIntr->getMemOperand());

	    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	    // Both trailing flags must be passed explicitly: the store is not
	    // truncating, but it is compressing.
	    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
	                              MemIntr->getMemOperand(),
	                              false /* truncating */, true /* compressing */);
	  }
	  case TRUNCATE_TO_MEM_VI8:
	  case TRUNCATE_TO_MEM_VI16:
	  case TRUNCATE_TO_MEM_VI32: {
	    SDValue Mask = Op.getOperand(4);
	    SDValue DataToTruncate = Op.getOperand(3);
	    SDValue Addr = Op.getOperand(2);
	    SDValue Chain = Op.getOperand(0);

	    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	    assert(MemIntr && "Expected MemIntrinsicSDNode!");

	    EVT MemVT = MemIntr->getMemoryVT();

	    uint16_t TruncationOp = IntrData->Opc0;
	    switch (TruncationOp) {
	    case X86ISD::VTRUNC: {
	      if (isAllOnesConstant(Mask)) // return just a truncate store
	        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
	                                 MemIntr->getMemOperand());

	      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
	                                MemIntr->getMemOperand(), true /* truncating */);
	    }
	    case X86ISD::VTRUNCUS:
	    case X86ISD::VTRUNCS: {
	      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
	      if (isAllOnesConstant(Mask))
	        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
	                               MemIntr->getMemOperand(), DAG);

	      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
	                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
	    }
	    default:
	      llvm_unreachable("Unsupported truncstore intrinsic");
	    }
	  }

	  case EXPAND_FROM_MEM: {
	    SDValue Mask = Op.getOperand(4);
	    SDValue PassThru = Op.getOperand(3);
	    SDValue Addr = Op.getOperand(2);
	    SDValue Chain = Op.getOperand(0);
	    MVT VT = Op.getSimpleValueType();

	    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	    assert(MemIntr && "Expected MemIntrinsicSDNode!");

	    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
	      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
	    if (X86::isZeroNode(Mask))
	      return DAG.getUNDEF(VT);

	    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
	                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
	                             true /* expanding */);
	  }
	  }
	}

	/// Lower a RETURNADDR node.
	///
	/// Marks the return address as taken, then bails out with an empty SDValue
	/// if verifyReturnAddressArgumentIsConstant() reports a problem. For depth 0
	/// the value is loaded from the return-address frame index. For a non-zero
	/// depth it is loaded from one slot above the frame address produced by
	/// LowerFRAMEADDR().
	SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);

	  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
	    return SDValue();

	  const unsigned Depth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  SDLoc Loc(Op);
	  EVT PtrTy = getPointerTy(DAG.getDataLayout());

	  if (Depth == 0) {
	    // Current frame: load straight from the return-address slot.
	    SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);
	    return DAG.getLoad(PtrTy, Loc, DAG.getEntryNode(), RetAddrSlot,
	                       MachinePointerInfo());
	  }

	  // Outer frame: the value sits one slot past that frame's address.
	  SDValue OuterFrame = LowerFRAMEADDR(Op, DAG);
	  const X86RegisterInfo *RI = Subtarget.getRegisterInfo();
	  SDValue SlotBytes = DAG.getConstant(RI->getSlotSize(), Loc, PtrTy);
	  SDValue RetAddrPtr =
	      DAG.getNode(ISD::ADD, Loc, PtrTy, OuterFrame, SlotBytes);
	  return DAG.getLoad(PtrTy, Loc, DAG.getEntryNode(), RetAddrPtr,
	                     MachinePointerInfo());
	}

	/// Lower ADDROFRETURNADDR: flag the return address as taken and return the
	/// return-address frame index.
	SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  FrameInfo.setReturnAddressIsTaken(true);

	  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);
	  return RetAddrSlot;
	}

	/// Lower a FRAMEADDR node.
	///
	/// Marks the frame address as taken. On targets that use Windows CFI the
	/// result is a frame index for a fixed object that is created on first use
	/// and cached in X86MachineFunctionInfo; the depth operand is not consulted
	/// there. Otherwise the frame register is copied and then dereferenced
	/// `Depth` times to walk up the chain of saved frame pointers.
	SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  EVT VT = Op.getValueType();

	  MFI.setFrameAddressIsTaken(true);

	  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
	    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
	    // is not possible to crawl up the stack without looking at the unwind codes
	    // simultaneously.
	    int FrameAddrIndex = FuncInfo->getFAIndex();
	    if (!FrameAddrIndex) {
	      // Create the fixed frame object on first use and cache its index.
	      unsigned SlotSize = RegInfo->getSlotSize();
	      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
	          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
	      FuncInfo->setFAIndex(FrameAddrIndex);
	    }
	    return DAG.getFrameIndex(FrameAddrIndex, VT);
	  }

	  unsigned FrameReg =
	      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
	  SDLoc dl(Op);  // FIXME probably not meaningful
	  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
	          (FrameReg == X86::EBP && VT == MVT::i32)) &&
	         "Invalid Frame Register!");
	  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
	  // Each load follows one link to the caller's saved frame pointer.
	  while (Depth--)
	    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
	                            MachinePointerInfo());
	  return FrameAddr;
	}

	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	//
	/// Translate a register name into an X86 register number.
	///
	/// Only "esp", "rsp", "ebp" and "rbp" are recognised. Requesting "ebp" or
	/// "rbp" in a function without a frame pointer is a fatal error, as is any
	/// unrecognised name.
	unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
	                                              SelectionDAG &DAG) const {
	  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	  const MachineFunction &MF = DAG.getMachineFunction();

	  unsigned Reg = StringSwitch<unsigned>(RegName)
	                       .Case("esp", X86::ESP)
	                       .Case("rsp", X86::RSP)
	                       .Case("ebp", X86::EBP)
	                       .Case("rbp", X86::RBP)
	                       .Default(0);

	  if (Reg == X86::EBP || Reg == X86::RBP) {
	    if (!TFI.hasFP(MF))
	      report_fatal_error("register " + StringRef(RegName) +
	                         " is allocatable: function has no frame pointer");
	#ifndef NDEBUG
	    else {
	      // Debug-only sanity check that the frame register is EBP or RBP.
	      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	      unsigned FrameReg =
	          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
	      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
	             "Invalid Frame Register!");
	    }
	#endif
	  }

	  if (Reg)
	    return Reg;

	  report_fatal_error("Invalid register name global variable");
	}

	/// Lower FRAME_TO_ARGS_OFFSET to a pointer-sized constant equal to twice the
	/// register slot size.
	SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  const unsigned SlotBytes = Subtarget.getRegisterInfo()->getSlotSize();
	  SDLoc Loc(Op);
	  return DAG.getIntPtrConstant(2 * SlotBytes, Loc);
	}

	/// Return the register that carries the exception pointer.
	///
	/// The CoreCLR personality uses RDX/EDX; every other personality uses
	/// RAX/EAX. The 64-bit name is chosen when the target is 64-bit LP64.
	unsigned X86TargetLowering::getExceptionPointerRegister(
	    const Constant *PersonalityFn) const {
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();

	  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) {
	    if (IsLP64)
	      return X86::RDX;
	    return X86::EDX;
	  }

	  if (IsLP64)
	    return X86::RAX;
	  return X86::EAX;
	}

	/// Return the register that carries the exception selector: RDX on 64-bit
	/// LP64 targets, EDX otherwise. Must not be called for funclet-based
	/// personalities (asserted).
	unsigned X86TargetLowering::getExceptionSelectorRegister(
	    const Constant *PersonalityFn) const {
	  // Funclet personalities have no selector; the runtime picks the handler.
	  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));

	  if (Subtarget.isTarget64BitLP64())
	    return X86::RDX;
	  return X86::EDX;
	}

	/// Fixed catch objects are needed exactly when the target is Win64.
	bool X86TargetLowering::needsFixedCatchObjects() const {
	  const bool IsWin64 = Subtarget.isTargetWin64();
	  return IsWin64;
	}

	/// Lower an EH_RETURN node.
	///
	/// Operands are (chain, offset, handler). The handler is stored at
	/// frame-register + slot-size + offset, that address is copied into RCX/ECX
	/// (chosen by pointer width), and an X86ISD::EH_RETURN node referring to that
	/// register is returned.
	SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain     = Op.getOperand(0);
	  SDValue Offset    = Op.getOperand(1);
	  SDValue Handler   = Op.getOperand(2);
	  SDLoc dl(Op);

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
	  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
	          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
	         "Invalid Frame Register!");
	  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
	  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

	  // StoreAddr = Frame + SlotSize + Offset.
	  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
	                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
	                                                        dl));
	  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
	  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
	  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

	  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
	                     DAG.getRegister(StoreAddrReg, PtrVT));
	}

	/// Lower EH_SJLJ_SETJMP to X86ISD::EH_SJLJ_SETJMP, which yields an i32 and a
	/// chain and forwards operands 0 and 1 of \p Op.
	SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  // On non-64-bit subtargets the global base register may be needed after the
	  // isel pseudo expansion, i.e. after the CGBR pass has run. Request it now
	  // so that pass emits the defining code; otherwise a virtual register that
	  // is never defined could end up being referenced.
	  if (!Subtarget.is64Bit())
	    (void)Subtarget.getInstrInfo()->getGlobalBaseReg(&DAG.getMachineFunction());

	  SDLoc Loc(Op);
	  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, Loc, ResultTys, Op.getOperand(0),
	                     Op.getOperand(1));
	}

	/// Lower EH_SJLJ_LONGJMP to X86ISD::EH_SJLJ_LONGJMP, a chain-only node that
	/// forwards operands 0 and 1 of \p Op.
	SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDValue First = Op.getOperand(0);
	  SDValue Second = Op.getOperand(1);
	  SDLoc Loc(Op);
	  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, Loc, MVT::Other, First, Second);
	}

	/// Lower EH_SJLJ_SETUP_DISPATCH to X86ISD::EH_SJLJ_SETUP_DISPATCH, a
	/// chain-only node that forwards operand 0 of \p Op.
	SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
	                                                       SelectionDAG &DAG) const {
	  SDValue First = Op.getOperand(0);
	  SDLoc Loc(Op);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, Loc, MVT::Other, First);
	}

	/// Lower ADJUST_TRAMPOLINE: no adjustment is made, operand 0 is returned
	/// unchanged.
	static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
	  SDValue Unadjusted = Op.getOperand(0);
	  return Unadjusted;
	}

	/// Lower INIT_TRAMPOLINE by emitting the stores that write the trampoline's
	/// machine code into memory.
	///
	/// Operands: 0 = chain, 1 = trampoline address, 2 = nested function,
	/// 3 = 'nest' parameter value, 4 = source value for the trampoline address,
	/// 5 = source value for the nested function (only read on 32-bit targets).
	/// The individual stores are joined with a TokenFactor.
	///
	/// 64-bit layout (byte offsets):
	///    0: movabsq <FPtr>, %r11   (2 opcode bytes + 8 immediate bytes)
	///   10: movabsq <Nest>, %r10   (2 opcode bytes + 8 immediate bytes)
	///   20: jmpq *%r11             (2 opcode bytes + 1 ModRM byte)
	///
	/// 32-bit layout (byte offsets):
	///    0: movl <Nest>, %ecx or %eax   (1 opcode byte + 4 immediate bytes)
	///    5: jmp <FPtr - (Trmp + 10)>    (1 opcode byte + 4 displacement bytes)
	SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDValue Root = Op.getOperand(0);
	  SDValue Trmp = Op.getOperand(1); // trampoline
	  SDValue FPtr = Op.getOperand(2); // nested function
	  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
	  SDLoc dl(Op);

	  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	  if (Subtarget.is64Bit()) {
	    SDValue OutChains[6];

	    // Large code-model.
	    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
	    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

	    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
	    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

	    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

	    // Load the pointer to the nested function into R11.
	    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
	    SDValue Addr = Trmp;
	    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(2, dl, MVT::i64));
	    OutChains[1] =
	        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
	                     /* Alignment = */ 2);

	    // Load the 'nest' parameter value into R10.
	    // R10 is specified in X86CallingConv.td
	    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(10, dl, MVT::i64));
	    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr, 10));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(12, dl, MVT::i64));
	    OutChains[3] =
	        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
	                     /* Alignment = */ 2);

	    // Jump to the nested function.
	    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(20, dl, MVT::i64));
	    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr, 20));

	    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(22, dl, MVT::i64));
	    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
	                                Addr, MachinePointerInfo(TrmpAddr, 22));

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	  } else {
	    const Function *Func =
	      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
	    CallingConv::ID CC = Func->getCallingConv();
	    unsigned NestReg;

	    switch (CC) {
	    default:
	      llvm_unreachable("Unsupported calling convention");
	    case CallingConv::C:
	    case CallingConv::X86_StdCall: {
	      // Pass 'nest' parameter in ECX.
	      // Must be kept in sync with X86CallingConv.td
	      NestReg = X86::ECX;

	      // Check that ECX wasn't needed by an 'inreg' parameter.
	      FunctionType *FTy = Func->getFunctionType();
	      const AttributeList &Attrs = Func->getAttributes();

	      if (!Attrs.isEmpty() && !Func->isVarArg()) {
	        unsigned InRegCount = 0;
	        unsigned Idx = 1;

	        for (FunctionType::param_iterator I = FTy->param_begin(),
	             E = FTy->param_end(); I != E; ++I, ++Idx)
	          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
	            auto &DL = DAG.getDataLayout();
	            // FIXME: should only count parameters that are lowered to integers.
	            // Each 'inreg' parameter counts as one register per 32 bits,
	            // rounded up.
	            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
	          }

	        if (InRegCount > 2) {
	          report_fatal_error("Nest register in use - reduce number of inreg"
	                             " parameters!");
	        }
	      }
	      break;
	    }
	    case CallingConv::X86_FastCall:
	    case CallingConv::X86_ThisCall:
	    case CallingConv::Fast:
	      // Pass 'nest' parameter in EAX.
	      // Must be kept in sync with X86CallingConv.td
	      NestReg = X86::EAX;
	      break;
	    }

	    SDValue OutChains[4];
	    SDValue Addr, Disp;

	    // The jmp displacement is relative to the end of the trampoline
	    // (Trmp + 10).
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(10, dl, MVT::i32));
	    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

	    // This is storing the opcode for MOV32ri.
	    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
	    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
	    OutChains[0] =
	        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
	                     Trmp, MachinePointerInfo(TrmpAddr));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(1, dl, MVT::i32));
	    OutChains[1] =
	        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
	                     /* Alignment = */ 1);

	    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(5, dl, MVT::i32));
	    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
	                                Addr, MachinePointerInfo(TrmpAddr, 5),
	                                /* Alignment = */ 1);

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(6, dl, MVT::i32));
	    OutChains[3] =
	        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
	                     /* Alignment = */ 1);

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	  }
	}

	/// Lower FLT_ROUNDS_ by storing the FP control word to a stack slot, loading
	/// it back, and remapping its rounding-control field to the FLT_ROUNDS
	/// encoding.
	SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  // Rounding control occupies bits 11:10 of the control word:
	  //   00  round to nearest
	  //   01  round to -inf
	  //   10  round to +inf
	  //   11  round to 0
	  //
	  // FLT_ROUNDS uses a different numbering:
	  //   -1  undefined
	  //    0  round to 0
	  //    1  round to nearest
	  //    2  round to +inf
	  //    3  round to -inf
	  //
	  // The mapping between the two is
	  //   (((CW & 0x800) >> 11) | ((CW & 0x400) >> 9)) + 1) & 3

	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  MachineFunction &MF = DAG.getMachineFunction();
	  unsigned StackAlignment = Subtarget.getFrameLowering()->getStackAlignment();

	  // Spill the control word into a fresh 2-byte stack object.
	  int SlotFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
	  SDValue Slot = DAG.getFrameIndex(SlotFI, getPointerTy(DAG.getDataLayout()));
	  MachineMemOperand *StoreMMO =
	      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SlotFI),
	                              MachineMemOperand::MOStore, 2, 2);
	  SDValue StoreOps[] = {DAG.getEntryNode(), Slot};
	  SDValue StoreChain =
	      DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other),
	                              StoreOps, MVT::i16, StoreMMO);

	  // Read it back as an i16.
	  SDValue CtrlWord =
	      DAG.getLoad(MVT::i16, DL, StoreChain, Slot, MachinePointerInfo());

	  // Bit 11 moves to bit 0, bit 10 moves to bit 1.
	  SDValue Bit11 = DAG.getNode(ISD::AND, DL, MVT::i16, CtrlWord,
	                              DAG.getConstant(0x800, DL, MVT::i16));
	  SDValue LowBit = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit11,
	                               DAG.getConstant(11, DL, MVT::i8));
	  SDValue Bit10 = DAG.getNode(ISD::AND, DL, MVT::i16, CtrlWord,
	                              DAG.getConstant(0x400, DL, MVT::i16));
	  SDValue HighBit = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit10,
	                                DAG.getConstant(9, DL, MVT::i8));

	  // ((LowBit | HighBit) + 1) & 3
	  SDValue Swapped = DAG.getNode(ISD::OR, DL, MVT::i16, LowBit, HighBit);
	  SDValue Bumped = DAG.getNode(ISD::ADD, DL, MVT::i16, Swapped,
	                               DAG.getConstant(1, DL, MVT::i16));
	  SDValue Mode = DAG.getNode(ISD::AND, DL, MVT::i16, Bumped,
	                             DAG.getConstant(3, DL, MVT::i16));

	  // Convert the i16 result to the requested type.
	  unsigned ConvOpc = ISD::ZERO_EXTEND;
	  if (VT.getSizeInBits() < 16)
	    ConvOpc = ISD::TRUNCATE;
	  return DAG.getNode(ConvOpc, DL, VT, Mode);
	}

	// Split a unary vector op in two: extract the low and high halves of the
	// operand, apply the same opcode to each half, and concatenate the results.
	static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  const unsigned HalfElts = VT.getVectorNumElements() / 2;
	  const unsigned HalfBits = VT.getSizeInBits() / 2;
	  const unsigned Opc = Op.getOpcode();
	  SDLoc dl(Op);

	  // Low half starts at element 0, high half at the midpoint.
	  SDValue In = Op.getOperand(0);
	  SDValue LoIn = extractSubVector(In, 0, DAG, dl, HalfBits);
	  SDValue HiIn = extractSubVector(In, HalfElts, DAG, dl, HalfBits);

	  const MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfElts);
	  SDValue LoOut = DAG.getNode(Opc, dl, HalfVT, LoIn);
	  SDValue HiOut = DAG.getNode(Opc, dl, HalfVT, HiIn);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LoOut, HiOut);
	}

	// Split a 256-bit integer vector unary op into two 128-bit ops via
	// LowerVectorIntUnary.
	static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  (void)VT; // Only read by the assertion below.
	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return LowerVectorIntUnary(Op, DAG);
	}

	// Split a 512-bit integer vector unary op into two 256-bit ops via
	// LowerVectorIntUnary.
	static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  (void)VT; // Only read by the assertion below.
	  assert(VT.is512BitVector() && VT.isInteger() &&
	         "Only handle AVX 512-bit vector integer operation");
	  return LowerVectorIntUnary(Op, DAG);
	}

	/// \brief Lower a vector CTLZ using the natively supported vector CTLZ
	/// instruction.
	//
	// i8/i16 vectors are implemented with the dword LZCNT vector instruction:
	//   sub(trunc(lzcnt(zext32(x))), 32 - eltbits)
	// Vectors with more than 16 elements are split first; the Lo and Hi parts
	// are lowered separately and the results concatenated.
	static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::CTLZ);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned NumElems = VT.getVectorNumElements();

	  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
	    "Unsupported element type");

	  // Split the vector; its Lo and Hi parts will be handled in the next
	  // iteration.
	  if (16 < NumElems)
	    return LowerVectorIntUnary(Op, DAG);

	  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
	  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
	    "Unsupported value type for operation");

	  // Use native supported vector instruction vplzcntd.
	  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
	  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
	  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
	  // Zero-extending to 32 bits added (32 - element width) leading zeros to
	  // every element; subtract them again.
	  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

	  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
	}

	// Lower CTLZ using a PSHUFB lookup table implementation.
	//
	// The input is viewed as a vector of bytes. A 16-entry table gives the
	// leading-zero count of each nibble; per-byte counts are formed from the two
	// nibble counts, and neighbouring elements are then merged pairwise until the
	// element width of the requested type is reached.
	static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  int NumElts = VT.getVectorNumElements();
	  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
	  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

	  // Per-nibble leading zero PSHUFB lookup table.
	  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
	                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
	                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

	  // Repeat the 16-entry table across the whole byte vector.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumBytes; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

	  // Begin by bitcasting the input to byte vector, then split those bytes
	  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
	  // If the hi input nibble is zero then we add both results together, otherwise
	  // we just take the hi result (by masking the lo result to zero before the
	  // add).
	  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
	  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);

	  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
	  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
	  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
	  SDValue HiZ;
	  if (CurrVT.is512BitVector()) {
	    // 512-bit compares produce an i1 mask vector, which is sign-extended
	    // back to CurrVT.
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
	    HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
	    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
	  } else {
	    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
	  }

	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
	  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
	  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

	  // Merge result back from vXi8 back to VT, working on the lo/hi halves
	  // of the current vector width in the same way we did for the nibbles.
	  // If the upper half of the input element is zero then add the halves'
	  // leading zero counts together, otherwise just use the upper half's.
	  // Double the width of the result until we are at target width.
	  while (CurrVT != VT) {
	    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
	    int CurrNumElts = CurrVT.getVectorNumElements();
	    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
	    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
	    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

	    // Check if the upper half of the input element is zero.
	    if (CurrVT.is512BitVector()) {
	      MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
	      HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
	                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
	      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
	    } else {
	      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
	                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
	    }
	    HiZ = DAG.getBitcast(NextVT, HiZ);

	    // Move the upper/lower halves to the lower bits as we'll be extending to
	    // NextVT. Mask the lower result to zero if HiZ is true and add the results
	    // together.
	    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
	    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
	    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
	    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
	    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
	    CurrVT = NextVT;
	  }

	  return Res;
	}

	/// Choose a lowering strategy for a vector CTLZ.
	///
	/// With CDI the AVX512CDI lowering is used. Otherwise 256-bit vectors without
	/// Int256 support and 512-bit vectors without BWI support are split in half,
	/// and all remaining cases use the PSHUFB lookup-table lowering.
	static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
	                               const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  if (Subtarget.hasCDI())
	    return LowerVectorCTLZ_AVX512CDI(Op, DAG);

	  const MVT VT = Op.getSimpleValueType();

	  // 256-bit op that has to be broken into 128-bit pieces.
	  const bool Split256 = VT.is256BitVector() && !Subtarget.hasInt256();
	  if (Split256)
	    return Lower256IntUnary(Op, DAG);

	  // 512-bit op that has to be broken into 256-bit pieces.
	  const bool Split512 = VT.is512BitVector() && !Subtarget.hasBWI();
	  if (Split512)
	    return Lower512IntUnary(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
	  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
	}

	/// Lower CTLZ-family nodes.
	///
	/// Vector types are forwarded to LowerVectorCTLZ. Scalars are lowered to a
	/// BSR followed by an XOR with NumBits-1; when the opcode is ISD::CTLZ a CMOV
	/// first substitutes 2*NumBits-1 if BSR reported a zero input. i8 inputs are
	/// widened to i32 for the BSR and the result truncated back.
	static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  if (VT.isVector())
	    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

	  const unsigned Opc = Op.getOpcode();
	  const unsigned NumBits = VT.getSizeInBits();
	  const bool IsI8 = (VT == MVT::i8);

	  // There is no i8 bsr, so i8 sources are zero-extended to i32.
	  MVT OpVT = VT;
	  SDValue Src = Op.getOperand(0);
	  if (IsI8) {
	    OpVT = MVT::i32;
	    Src = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Src);
	  }

	  // bsr (scan bits in reverse) yields the bit index and also sets EFLAGS.
	  SDVTList ScanVTs = DAG.getVTList(OpVT, MVT::i32);
	  SDValue Res = DAG.getNode(X86ISD::BSR, dl, ScanVTs, Src);

	  if (Opc == ISD::CTLZ) {
	    // For a zero source (bsr sets ZF) pick the constant that the XOR below
	    // turns into NumBits.
	    SDValue SelectOps[] = {Res,
	                           DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
	                           DAG.getConstant(X86::COND_E, dl, MVT::i8),
	                           Res.getValue(1)};
	    Res = DAG.getNode(X86ISD::CMOV, dl, OpVT, SelectOps);
	  }

	  // Convert the bit index into a leading-zero count.
	  SDValue FlipMask = DAG.getConstant(NumBits - 1, dl, OpVT);
	  Res = DAG.getNode(ISD::XOR, dl, OpVT, Res, FlipMask);

	  if (IsI8)
	    Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Res);
	  return Res;
	}

	/// Lower CTTZ-family nodes.
	///
	/// Scalars (ISD::CTTZ only) become a BSF plus a CMOV that yields NumBits for
	/// a zero source. Vectors isolate the lowest set bit and then use either CTLZ
	/// (for CTTZ_ZERO_UNDEF) or CTPOP.
	static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  const unsigned NumBits = VT.getScalarSizeInBits();
	  SDLoc dl(Op);

	  if (!VT.isVector()) {
	    assert(Op.getOpcode() == ISD::CTTZ &&
	           "Only scalar CTTZ requires custom lowering");

	    // bsf (scan bits forward) yields the bit index and also sets EFLAGS.
	    SDVTList ScanVTs = DAG.getVTList(VT, MVT::i32);
	    SDValue Scan = DAG.getNode(X86ISD::BSF, dl, ScanVTs, Op.getOperand(0));

	    // A zero source (bsf sets ZF) selects NumBits instead.
	    SDValue SelectOps[] = {Scan, DAG.getConstant(NumBits, dl, VT),
	                           DAG.getConstant(X86::COND_E, dl, MVT::i8),
	                           Scan.getValue(1)};
	    return DAG.getNode(X86ISD::CMOV, dl, VT, SelectOps);
	  }

	  SDValue Src = Op.getOperand(0);
	  SDValue AllZero = DAG.getConstant(0, dl, VT);

	  // lsb(x) = x & (0 - x)
	  SDValue Negated = DAG.getNode(ISD::SUB, dl, VT, AllZero, Src);
	  SDValue LowestBit = DAG.getNode(ISD::AND, dl, VT, Src, Negated);

	  if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
	    // cttz_undef(x) = (width - 1) - ctlz(lsb)
	    SDValue TopIndex = DAG.getConstant(NumBits - 1, dl, VT);
	    SDValue Leading = DAG.getNode(ISD::CTLZ, dl, VT, LowestBit);
	    return DAG.getNode(ISD::SUB, dl, VT, TopIndex, Leading);
	  }

	  // cttz(x) = ctpop(lsb - 1)
	  SDValue Unit = DAG.getConstant(1, dl, VT);
	  SDValue BelowLsb = DAG.getNode(ISD::SUB, dl, VT, LowestBit, Unit);
	  return DAG.getNode(ISD::CTPOP, dl, VT, BelowLsb);
	}

	/// Split a two-operand 256-bit integer vector operation into the same
	/// operation on the low and high 128-bit halves, then concatenate the two
	/// half-width results back into a 256-bit value.
	static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Unsupported value type for operation");

	  SDLoc dl(Op);
	  const unsigned Opcode = Op.getOpcode();
	  const unsigned HalfElts = VT.getVectorNumElements() / 2;

	  // Low/high halves of the first operand.
	  SDValue A = Op.getOperand(0);
	  SDValue ALo = extract128BitVector(A, 0, DAG, dl);
	  SDValue AHi = extract128BitVector(A, HalfElts, DAG, dl);

	  // Low/high halves of the second operand.
	  SDValue B = Op.getOperand(1);
	  SDValue BLo = extract128BitVector(B, 0, DAG, dl);
	  SDValue BHi = extract128BitVector(B, HalfElts, DAG, dl);

	  // Same element type, half as many elements.
	  const MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfElts);

	  SDValue Lo = DAG.getNode(Opcode, dl, HalfVT, ALo, BLo);
	  SDValue Hi = DAG.getNode(Opcode, dl, HalfVT, AHi, BHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	}

	/// Split a two-operand 512-bit integer vector operation into the same
	/// operation on the low and high 256-bit halves, then concatenate the two
	/// half-width results back into a 512-bit value.
	static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  assert(VT.is512BitVector() && VT.isInteger() &&
	         "Unsupported value type for operation");

	  SDLoc dl(Op);
	  const unsigned Opcode = Op.getOpcode();
	  const unsigned HalfElts = VT.getVectorNumElements() / 2;

	  // Low/high halves of the first operand.
	  SDValue A = Op.getOperand(0);
	  SDValue ALo = extract256BitVector(A, 0, DAG, dl);
	  SDValue AHi = extract256BitVector(A, HalfElts, DAG, dl);

	  // Low/high halves of the second operand.
	  SDValue B = Op.getOperand(1);
	  SDValue BLo = extract256BitVector(B, 0, DAG, dl);
	  SDValue BHi = extract256BitVector(B, HalfElts, DAG, dl);

	  // Same element type, half as many elements.
	  const MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfElts);

	  SDValue Lo = DAG.getNode(Opcode, dl, HalfVT, ALo, BLo);
	  SDValue Hi = DAG.getNode(Opcode, dl, HalfVT, AHi, BHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
	}

	/// Custom lowering for ADD/SUB.
	///
	/// Types whose scalar type is i1 are rewritten as XOR of the two operands.
	/// Everything else is asserted to be a 256-bit integer vector and is split
	/// into two 128-bit operations by Lower256IntArith.
	static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();

	  if (VT.getScalarType() != MVT::i1) {
	    assert(VT.is256BitVector() &&
	           VT.isInteger() &&
	           "Only handle AVX 256-bit vector integer operation");
	    return Lower256IntArith(Op, DAG);
	  }

	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  return DAG.getNode(ISD::XOR, SDLoc(Op), VT, LHS, RHS);
	}

	/// Custom lowering for ISD::ABS.
	///
	/// Scalar i16/i32/i64 are lowered to a flag-producing X86ISD::SUB computing
	/// (0 - x), followed by an X86ISD::CMOV on COND_GE that chooses between x and
	/// its negation using the SUB's flag result. Every other type is asserted to
	/// be a 256-bit integer vector and is split by Lower256IntUnary.
	static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
	    // Since X86 does not have CMOV for 8-bit integer, we don't convert
	    // 8-bit integer abs to NEG and CMOV.
	    SDLoc DL(Op);
	    SDValue N0 = Op.getOperand(0);
	    // Neg = 0 - N0; result #1 of the node is the EFLAGS value.
	    SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
	                              DAG.getConstant(0, DL, VT), N0);
	    SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
	                     SDValue(Neg.getNode(), 1)};
	    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
	  }

	  assert(Op.getSimpleValueType().is256BitVector() &&
	         Op.getSimpleValueType().isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return Lower256IntUnary(Op, DAG);
	}

	/// Custom lowering for integer min/max: the operation is asserted to be on a
	/// 256-bit integer vector and is split into two 128-bit operations.
	static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
	  const MVT VT = Op.getSimpleValueType();
	  (void)VT; // Only referenced by the assert below.
	  assert(VT.is256BitVector() &&
	         VT.isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return Lower256IntArith(Op, DAG);
	}

	/// Custom lowering for vector ISD::MUL.
	///
	/// Cases handled, in order:
	///  - i1 element types: rewritten as AND.
	///  - 256-bit vectors without Int256 support: split into 128-bit halves.
	///  - v16i8/v32i8/v64i8: sign-extend to i16 elements, multiply, then either
	///    truncate (Int256 available) or mask the low 8 bits and PACKUS (v16i8
	///    only, no Int256).
	///  - v4i32 (asserted SSE2 without SSE4.1): VPMADDWD when the upper 17 bits of
	///    both operands are known zero, otherwise two PMULUDQ plus shuffles.
	///  - v2i64/v4i64/v8i64: PMULDQ when both operands have more than 32 sign
	///    bits, returned unchanged when DQI is available and the high halves are
	///    not both known zero, otherwise a sum of up to three PMULUDQ partial
	///    products.
	static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
	                        SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  if (VT.getScalarType() == MVT::i1)
	    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntArith(Op, DAG);

	  SDValue A = Op.getOperand(0);
	  SDValue B = Op.getOperand(1);

	  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
	  // vector pairs, multiply and truncate.
	  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
	    if (Subtarget.hasInt256()) {
	      // For 512-bit vectors, split into 256-bit vectors to allow the
	      // sign-extension to occur.
	      if (VT == MVT::v64i8)
	        return Lower512IntArith(Op, DAG);

	      // For 256-bit vectors, split into 128-bit vectors to allow the
	      // sign-extension to occur. We don't need this on AVX512BW as we can
	      // safely sign-extend to v32i16.
	      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
	        return Lower256IntArith(Op, DAG);

	      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
	      return DAG.getNode(
	          ISD::TRUNCATE, dl, VT,
	          DAG.getNode(ISD::MUL, dl, ExVT,
	                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
	                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
	    }

	    assert(VT == MVT::v16i8 &&
	           "Pre-AVX2 support only supports v16i8 multiplication");
	    MVT ExVT = MVT::v8i16;

	    // Extract the lo parts and sign extend to i16
	    SDValue ALo, BLo;
	    if (Subtarget.hasSSE41()) {
	      ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
	      BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
	    } else {
	      // Place each source byte in the high byte of an i16 lane, then shift
	      // arithmetically right by 8 to sign extend it.
	      const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
	                              -1, 4, -1, 5, -1, 6, -1, 7};
	      ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	      BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	      ALo = DAG.getBitcast(ExVT, ALo);
	      BLo = DAG.getBitcast(ExVT, BLo);
	      ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
	      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
	    }

	    // Extract the hi parts and sign extend to i16
	    SDValue AHi, BHi;
	    if (Subtarget.hasSSE41()) {
	      // Move the upper eight bytes down so the in-register extend sees them.
	      const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
	                              -1, -1, -1, -1, -1, -1, -1, -1};
	      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	      AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
	      BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
	    } else {
	      const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
	                              -1, 12, -1, 13, -1, 14, -1, 15};
	      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	      AHi = DAG.getBitcast(ExVT, AHi);
	      BHi = DAG.getBitcast(ExVT, BHi);
	      AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
	      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
	    }

	    // Multiply, mask the lower 8bits of the lo/hi results and pack
	    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
	    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
	    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	  }

	  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
	  if (VT == MVT::v4i32) {
	    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
	           "Should not custom lower when pmulld is available!");

	    // If the upper 17 bits of each element are zero then we can use PMADD.
	    APInt Mask17 = APInt::getHighBitsSet(32, 17);
	    if (DAG.MaskedValueIsZero(A, Mask17) && DAG.MaskedValueIsZero(B, Mask17))
	      return DAG.getNode(X86ISD::VPMADDWD, dl, VT,
	                         DAG.getBitcast(MVT::v8i16, A),
	                         DAG.getBitcast(MVT::v8i16, B));

	    // Extract the odd parts.
	    static const int UnpackMask[] = { 1, -1, 3, -1 };
	    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
	    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

	    // Multiply the even parts.
	    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
	    // Now multiply odd parts.
	    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);

	    Evens = DAG.getBitcast(VT, Evens);
	    Odds = DAG.getBitcast(VT, Odds);

	    // Merge the two vectors back together with a shuffle. This expands into 2
	    // shuffles.
	    static const int ShufMask[] = { 0, 4, 2, 6 };
	    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
	  }

	  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
	         "Only know how to lower V2I64/V4I64/V8I64 multiply");

	  // 32-bit vector types used for MULDQ/MULUDQ.
	  MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);

	  // MULDQ returns the 64-bit result of the signed multiplication of the lower
	  // 32-bits. We can lower with this if the sign bits stretch that far.
	  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
	      DAG.ComputeNumSignBits(B) > 32) {
	    return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
	                       DAG.getBitcast(MulVT, B));
	  }

	  //  Ahi = psrlqi(a, 32);
	  //  Bhi = psrlqi(b, 32);
	  //
	  //  AloBlo = pmuludq(a, b);
	  //  AloBhi = pmuludq(a, Bhi);
	  //  AhiBlo = pmuludq(Ahi, b);
	  //
	  //  Hi = psllqi(AloBhi + AhiBlo, 32);
	  //  return AloBlo + Hi;
	  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
	  bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
	  bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);

	  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
	  bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
	  bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);

	  // If DQI is supported we can use MULLQ, but MULUDQ is still better if the
	  // high bits are known to be zero.
	  if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
	    return Op;

	  // Bit cast to 32-bit vectors for MULUDQ.
	  SDValue Alo = DAG.getBitcast(MulVT, A);
	  SDValue Blo = DAG.getBitcast(MulVT, B);

	  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

	  // Only multiply lo/hi halves that aren't known to be zero.
	  SDValue AloBlo = Zero;
	  if (!ALoIsZero && !BLoIsZero)
	    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);

	  SDValue AloBhi = Zero;
	  if (!ALoIsZero && !BHiIsZero) {
	    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
	    Bhi = DAG.getBitcast(MulVT, Bhi);
	    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
	  }

	  SDValue AhiBlo = Zero;
	  if (!AHiIsZero && !BLoIsZero) {
	    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
	    Ahi = DAG.getBitcast(MulVT, Ahi);
	    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
	  }

	  // Sum the cross products, move them to the upper 32 bits, and add the low
	  // product.
	  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
	  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

	  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
	}

	/// Custom lowering for vector ISD::MULHS / ISD::MULHU.
	///
	/// 256-bit vectors without Int256 support are split into 128-bit halves and
	/// v64i8 is split into 256-bit halves. The remaining i8 vector types are
	/// extended to i16 elements (zero-extended for MULHU, sign-extended otherwise),
	/// multiplied, logically shifted right by 8 to bring the high byte down, and
	/// narrowed back to i8 with TRUNCATE or PACKUS.
	static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntArith(Op, DAG);

	  // Only i8 vectors should need custom lowering after this.
	  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
	          (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
	         "Unsupported vector type");

	  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
	  // logical shift down the upper half and pack back to i8.
	  SDValue A = Op.getOperand(0);
	  SDValue B = Op.getOperand(1);

	  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
	  // and then ashr/lshr the upper bits down to the lower bits before multiply.
	  unsigned Opcode = Op.getOpcode();
	  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
	  unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);

	  // For 512-bit vectors, split into 256-bit vectors to allow the
	  // sign-extension to occur.
	  if (VT == MVT::v64i8)
	    return Lower512IntArith(Op, DAG);

	  // AVX2 implementations - extend xmm subvectors to ymm.
	  if (Subtarget.hasInt256()) {
	    unsigned NumElems = VT.getVectorNumElements();
	    // Lo/Hi start out as subvector indices and are reused below to hold the
	    // half-results.
	    SDValue Lo = DAG.getIntPtrConstant(0, dl);
	    SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);

	    if (VT == MVT::v32i8) {
	      if (Subtarget.hasBWI()) {
	        SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
	        SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
	        SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
	        Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
	                          DAG.getConstant(8, dl, MVT::v32i16));
	        return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
	      }
	      SDValue ALo = extract128BitVector(A, 0, DAG, dl);
	      SDValue BLo = extract128BitVector(B, 0, DAG, dl);
	      SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
	      SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
	      ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
	      BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
	      AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
	      BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
	      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
	                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
	                       DAG.getConstant(8, dl, MVT::v16i16));
	      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
	                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
	                       DAG.getConstant(8, dl, MVT::v16i16));
	      // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
	      // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
	      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
	                            16, 17, 18, 19, 20, 21, 22, 23};
	      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
	                            24, 25, 26, 27, 28, 29, 30, 31};
	      return DAG.getNode(X86ISD::PACKUS, dl, VT,
	                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
	                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
	    }

	    assert(VT == MVT::v16i8 && "Unexpected VT");

	    SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
	    SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
	    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
	    Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
	                      DAG.getConstant(8, dl, MVT::v16i16));
	    // If we have BWI we can use truncate instruction.
	    if (Subtarget.hasBWI())
	      return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
	    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
	    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
	    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
	  }

	  assert(VT == MVT::v16i8 &&
	         "Pre-AVX2 support only supports v16i8 multiplication");
	  MVT ExVT = MVT::v8i16;
	  unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);

	  // Extract the lo parts and zero/sign extend to i16.
	  SDValue ALo, BLo;
	  if (Subtarget.hasSSE41()) {
	    ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
	    BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
	  } else {
	    // Place each source byte in the high byte of an i16 lane, then shift it
	    // down by 8 (logical for MULHU, arithmetic otherwise).
	    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
	                            -1, 4, -1, 5, -1, 6, -1, 7};
	    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	    ALo = DAG.getBitcast(ExVT, ALo);
	    BLo = DAG.getBitcast(ExVT, BLo);
	    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
	    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
	  }

	  // Extract the hi parts and zero/sign extend to i16.
	  SDValue AHi, BHi;
	  if (Subtarget.hasSSE41()) {
	    const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
	                            -1, -1, -1, -1, -1, -1, -1, -1};
	    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	    AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
	    BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
	  } else {
	    const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
	                            -1, 12, -1, 13, -1, 14, -1, 15};
	    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	    AHi = DAG.getBitcast(ExVT, AHi);
	    BHi = DAG.getBitcast(ExVT, BHi);
	    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
	    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
	  }

	  // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
	  // pack back to v16i8.
	  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
	  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
	  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}

	/// Lower a 128-bit integer division/remainder node to a runtime library call
	/// on Win64 targets.
	///
	/// The opcode selects the libcall (SDIV/UDIV/SREM/UREM/SDIVREM/UDIVREM on
	/// i128). Each 128-bit operand is stored to its own 16-byte-aligned stack
	/// temporary and the call receives a pointer to that slot. The call is set up
	/// with a v2i64 return type, and the returned value is bitcast back to the
	/// node's value type.
	SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.isTargetWin64() && "Unexpected target");
	  EVT VT = Op.getValueType();
	  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
	         "Unexpected return type for lowering");

	  RTLIB::Libcall LC;
	  bool isSigned;
	  switch (Op->getOpcode()) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
	  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
	  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
	  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
	  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
	  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
	  }

	  SDLoc dl(Op);
	  SDValue InChain = DAG.getEntryNode();

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
	    EVT ArgVT = Op->getOperand(i).getValueType();
	    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
	           "Unexpected argument type for lowering");
	    // Spill the operand to a 16-byte-aligned stack slot and pass its address.
	    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
	    Entry.Node = StackPtr;
	    // Thread each store through InChain so the call is chained after them.
	    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
	                           MachinePointerInfo(), /* Alignment = */ 16);
	    // getTypeForEVT takes the context by reference and returns a Type pointer.
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Ty = PointerType::get(ArgTy,0);
	    Entry.IsSExt = false;
	    Entry.IsZExt = false;
	    Args.push_back(Entry);
	  }

	  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
	                                         getPointerTy(DAG.getDataLayout()));

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(InChain)
	      .setLibCallee(
	          getLibcallCallingConv(LC),
	          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
	          std::move(Args))
	      .setInRegister()
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned);

	  // CallInfo.first is the call's result value; reinterpret it as VT.
	  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
	  return DAG.getBitcast(VT, CallInfo.first);
	}

	/// Custom lowering for vector ISD::SMUL_LOHI / ISD::UMUL_LOHI.
	///
	/// 256-bit vectors without Int256 support are split into two 128-bit
	/// MUL_LOHI nodes whose low and high results are concatenated separately.
	/// Otherwise (v4i32/v8i32/v16i32) the even and odd elements are multiplied
	/// with two PMULUDQ/PMULDQ nodes, and the low and high 32-bit halves of the
	/// 64-bit products are shuffled back into element order. For a signed
	/// multiply without SSE4.1 the unsigned high halves are corrected afterwards.
	/// Returns the merged pair {low, high}.
	static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG) {
	  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
	  MVT VT = Op0.getSimpleValueType();
	  SDLoc dl(Op);

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
	    unsigned Opcode = Op.getOpcode();
	    unsigned NumElems = VT.getVectorNumElements();
	    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
	    SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
	    SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
	    SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
	    SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
	    SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
	    SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
	    SDValue Ops[] = {
	      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
	      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
	    };
	    return DAG.getMergeValues(Ops, dl);
	  }

	  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
	         (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
	         (VT == MVT::v16i32 && Subtarget.hasAVX512()));

	  int NumElts = VT.getVectorNumElements();

	  // PMULxD operations multiply each even value (starting at 0) of LHS with
	  // the related value of RHS and produce a widened result.
	  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
	  // => <2 x i64> <ae|cg>
	  //
	  // In other words, to have all the results, we need to perform two PMULxD:
	  // 1. one with the even values.
	  // 2. one with the odd values.
	  // To achieve #2, we need to place the odd values at an even position.
	  //
	  // Place the odd value at an even position (basically, shift all values 1
	  // step to the left):
	  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
	  // <a|b|c|d> => <b|undef|d|undef>
	  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
	                                      makeArrayRef(&Mask[0], NumElts));
	  // <e|f|g|h> => <f|undef|h|undef>
	  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
	                                      makeArrayRef(&Mask[0], NumElts));

	  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
	  // ints.
	  MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
	  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
	  // The signed multiply PMULDQ is only used when SSE4.1 is available.
	  unsigned Opcode =
	      (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
	  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
	  // => <2 x i64> <ae|cg>
	  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
	  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
	  // => <2 x i64> <bf|dh>
	  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));

	  // Shuffle it back into the right order.
	  // Even result lanes come from Mul1, odd lanes from Mul2 (indices offset by
	  // NumElts). The "+ 1" selects the upper i32 of each i64 product.
	  SmallVector<int, 16> HighMask(NumElts);
	  SmallVector<int, 16> LowMask(NumElts);
	  for (int i = 0; i != NumElts; ++i) {
	    HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
	    LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
	  }

	  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
	  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);

	  // If we have a signed multiply but no PMULDQ fix up the high parts of a
	  // unsigned multiply.
	  if (IsSigned && !Subtarget.hasSSE41()) {
	    SDValue ShAmt = DAG.getConstant(
	        31, dl,
	        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
	    // T1 = (Op0 >>s 31) & Op1, T2 = (Op1 >>s 31) & Op0; their sum is
	    // subtracted from the unsigned high halves.
	    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
	                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
	    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
	                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

	    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
	    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
	  }

	  // The first result of MUL_LOHI is actually the low value, followed by the
	  // high value.
	  SDValue Ops[] = {Lows, Highs};
	  return DAG.getMergeValues(Ops, dl);
	}

	// Return true if the required (according to Opcode) shift-imm form is natively
	// supported by the Subtarget.
	//
	// Element types narrower than 16 bits are never supported. 512-bit vectors
	// need AVX512, plus BWI when the elements are exactly 16 bits. 128-bit vectors
	// need SSE2 and 256-bit vectors need Int256; for ISD::SRA, v2i64/v4i64
	// additionally require AVX512.
	static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
	                                        unsigned Opcode) {
	  if (VT.getScalarSizeInBits() < 16)
	    return false;

	  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
	      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
	    return true;

	  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
	                (VT.is256BitVector() && Subtarget.hasInt256());

	  bool AShift = LShift && (Subtarget.hasAVX512() ||
	                           (VT != MVT::v2i64 && VT != MVT::v4i64));
	  return (Opcode == ISD::SRA) ? AShift : LShift;
	}

	// Shifts whose amount is variable but identical for every vector lane.
	// These instructions are defined together with the shift-immediate forms, so
	// the answer is whatever SupportedVectorShiftWithImm reports.
	static
	bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
	                                      unsigned Opcode) {
	  const bool HasImmForm = SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
	  return HasImmForm;
	}

	// Return true if the required (according to Opcode) variable-shift form is
	// natively supported by the Subtarget.
	//
	// Requires Int256 and elements of at least 16 bits; 16-bit elements also
	// require BWI. With AVX512 every remaining case is supported. Otherwise only
	// 128/256-bit vectors qualify, and ISD::SRA excludes v2i64/v4i64.
	static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
	                                    unsigned Opcode) {

	  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
	    return false;

	  // vXi16 supported only on AVX-512, BWI
	  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
	    return false;

	  if (Subtarget.hasAVX512())
	    return true;

	  bool LShift = VT.is128BitVector() || VT.is256BitVector();
	  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
	  return (Opcode == ISD::SRA) ? AShift : LShift;
	}

	/// Try to lower a vector shift (SHL/SRL/SRA) whose amount is the same
	/// constant in every lane.
	///
	/// Two shapes of amount operand are recognised:
	///  1. a BUILD_VECTOR constant splat, and
	///  2. (for v2i64, and v4i64/v8i64 with Int256/AVX512, when XOP is absent) a
	///     BITCAST of a BUILD_VECTOR of narrower constants, optionally behind an
	///     EXTRACT_SUBVECTOR and/or a splat shuffle, as produced when i64 is
	///     expanded into high and low parts.
	/// Returns an empty SDValue when no lowering applies.
	static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);
	  SDValue R = Op.getOperand(0);
	  SDValue Amt = Op.getOperand(1);

	  unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
	    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

	  // Emulate a 64-bit-element arithmetic right shift with 32-bit shifts.
	  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
	    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
	    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
	    SDValue Ex = DAG.getBitcast(ExVT, R);

	    // ashr(R, 63) === cmp_slt(R, 0)
	    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
	      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
	             "Unsupported PCMPGT op");
	      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
	                         getZeroVector(VT, Subtarget, DAG, dl), R);
	    }

	    if (ShiftAmt >= 32) {
	      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
	      SDValue Upper =
	          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
	      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	                                                 ShiftAmt - 32, DAG);
	      if (VT == MVT::v2i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
	      if (VT == MVT::v4i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	                                  {9, 1, 11, 3, 13, 5, 15, 7});
	    } else {
	      // SRA upper i32, SHL whole i64 and select lower i32.
	      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	                                                 ShiftAmt, DAG);
	      SDValue Lower =
	          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
	      Lower = DAG.getBitcast(ExVT, Lower);
	      if (VT == MVT::v2i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
	      if (VT == MVT::v4i64)
	        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	                                  {8, 1, 10, 3, 12, 5, 14, 7});
	    }
	    return DAG.getBitcast(VT, Ex);
	  };

	  // Optimize shl/srl/sra with constant shift amount.
	  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
	    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
	      uint64_t ShiftAmt = ShiftConst->getZExtValue();

	      if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
	        return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

	      // i64 SRA needs to be performed as partial shifts.
	      if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
	           (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
	          Op.getOpcode() == ISD::SRA)
	        return ArithmeticShiftRight64(ShiftAmt);

	      if (VT == MVT::v16i8 ||
	          (Subtarget.hasInt256() && VT == MVT::v32i8) ||
	          VT == MVT::v64i8) {
	        unsigned NumElts = VT.getVectorNumElements();
	        // i8 shifts are performed as i16 shifts followed by a fix-up mask.
	        MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	        // Simple i8 add case
	        if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
	          return DAG.getNode(ISD::ADD, dl, VT, R, R);

	        // ashr(R, 7)  === cmp_slt(R, 0)
	        if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
	          SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
	          if (VT.is512BitVector()) {
	            assert(VT == MVT::v64i8 && "Unexpected element type!");
	            SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
	            return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
	          }
	          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
	        }

	        // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
	        if (VT == MVT::v16i8 && Subtarget.hasXOP())
	          return SDValue();

	        if (Op.getOpcode() == ISD::SHL) {
	          // Make a large shift.
	          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
	                                                   R, ShiftAmt, DAG);
	          SHL = DAG.getBitcast(VT, SHL);
	          // Zero out the rightmost bits.
	          return DAG.getNode(ISD::AND, dl, VT, SHL,
	                             DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
	        }
	        if (Op.getOpcode() == ISD::SRL) {
	          // Make a large shift.
	          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
	                                                   R, ShiftAmt, DAG);
	          SRL = DAG.getBitcast(VT, SRL);
	          // Zero out the leftmost bits.
	          return DAG.getNode(ISD::AND, dl, VT, SRL,
	                             DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
	        }
	        if (Op.getOpcode() == ISD::SRA) {
	          // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
	          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

	          SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
	          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
	          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
	          return Res;
	        }
	        llvm_unreachable("Unknown shift opcode.");
	      }
	    }
	  }

	  // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
	  // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
	  if (!Subtarget.hasXOP() &&
	      (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
	       (Subtarget.hasAVX512() && VT == MVT::v8i64))) {

	    // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant.
	    unsigned SubVectorScale = 1;
	    if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	      SubVectorScale =
	          Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
	      Amt = Amt.getOperand(0);
	    }

	    // Peek through any splat that was introduced for i64 shift vectorization.
	    int SplatIndex = -1;
	    if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
	      if (SVN->isSplat()) {
	        SplatIndex = SVN->getSplatIndex();
	        Amt = Amt.getOperand(0);
	        assert(SplatIndex < (int)VT.getVectorNumElements() &&
	               "Splat shuffle referencing second operand");
	      }

	    if (Amt.getOpcode() != ISD::BITCAST ||
	        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
	      return SDValue();

	    Amt = Amt.getOperand(0);
	    // Ratio = number of narrow build_vector elements per 64-bit lane.
	    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
	                     (SubVectorScale * VT.getVectorNumElements());
	    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
	    uint64_t ShiftAmt = 0;
	    unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
	    // Reassemble the 64-bit amount of the first (or splatted) lane from its
	    // narrow constant pieces.
	    for (unsigned i = 0; i != Ratio; ++i) {
	      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
	      if (!C)
	        return SDValue();
	      // 6 == Log2(64)
	      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
	    }

	    // Check remaining shift amounts (if not a splat).
	    if (SplatIndex < 0) {
	      for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
	        uint64_t ShAmt = 0;
	        for (unsigned j = 0; j != Ratio; ++j) {
	          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
	          if (!C)
	            return SDValue();
	          // 6 == Log2(64)
	          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
	        }
	        // Every lane must shift by the same amount.
	        if (ShAmt != ShiftAmt)
	          return SDValue();
	      }
	    }

	    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
	      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

	    if (Op.getOpcode() == ISD::SRA)
	      return ArithmeticShiftRight64(ShiftAmt);
	  }

	  return SDValue();
	}

	/// Try to lower a vector shift (SHL/SRL/SRA) whose amount is not an immediate
	/// but is the same value in every lane.
	///
	/// When the shift-with-base-amount form is supported for VT, the common
	/// per-lane amount is recovered from a BUILD_VECTOR splat or from a splat
	/// shuffle (optionally behind an EXTRACT_SUBVECTOR) and passed to
	/// getTargetVShiftNode. As a second option, a v2i64 shift whose amount is a
	/// BITCAST of a BUILD_VECTOR with identical per-lane groups is emitted as the
	/// variable-shift node. Returns an empty SDValue when neither applies.
	static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);

	// X86OpcI is handed to getTargetVShiftNode together with the scalar amount.
	unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
	(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

	// X86OpcV is used for the v2i64 path at the bottom, which keeps the vector
	// amount operand.
	unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
	(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;

	if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
	SDValue BaseShAmt;
	MVT EltVT = VT.getVectorElementType();

	if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
	// Check if this build_vector node is doing a splat.
	// If so, then set BaseShAmt equal to the splat value.
	BaseShAmt = BV->getSplatValue();
	// An undef splat value is treated as "no common amount found".
	if (BaseShAmt && BaseShAmt.isUndef())
	BaseShAmt = SDValue();
	} else {
	// Look through an EXTRACT_SUBVECTOR to the vector it reads from.
	if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
	Amt = Amt.getOperand(0);

	ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
	if (SVN && SVN->isSplat()) {
	unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
	SDValue InVec = Amt.getOperand(0);
	if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
	assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
	"Unexpected shuffle index found!");
	// Take the splatted element directly from the build_vector operands.
	BaseShAmt = InVec.getOperand(SplatIdx);
	} else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
	// If the splatted lane is the one that was just inserted, use the
	// inserted scalar.
	if (ConstantSDNode *C =
	dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
	if (C->getZExtValue() == SplatIdx)
	BaseShAmt = InVec.getOperand(1);
	}
	}

	if (!BaseShAmt)
	// Avoid introducing an extract element from a shuffle.
	BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
	DAG.getIntPtrConstant(SplatIdx, dl));
	}
	}

	if (BaseShAmt.getNode()) {
	assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
	// Widen the scalar amount: element types strictly between i32 and i64 go
	// to i64, element types narrower than i32 go to i32.
	if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
	BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
	else if (EltVT.bitsLT(MVT::i32))
	BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

	return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
	}
	}

	// Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
	if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
	Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
	Amt = Amt.getOperand(0);
	// Ratio = number of build_vector operands that make up one result lane.
	unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
	VT.getVectorNumElements();
	// Remember the operands of the first lane ...
	std::vector<SDValue> Vals(Ratio);
	for (unsigned i = 0; i != Ratio; ++i)
	Vals[i] = Amt.getOperand(i);
	// ... and require every later lane to use exactly the same operands.
	for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
	for (unsigned j = 0; j != Ratio; ++j)
	if (Vals[j] != Amt.getOperand(i + j))
	return SDValue();
	}

	// The node is built with the original amount operand (Op.getOperand(1)),
	// not the build_vector that Amt now refers to.
	if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
	return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
	}
	return SDValue();
	}

	// Custom-lower a vector shift node (ISD::SHL, ISD::SRL or ISD::SRA).
	//
	// The function is a cascade of strategies tried in order; each `if` block
	// below either returns a replacement node or falls through to the next one.
	// The vector type (VT), the shift opcode, the available subtarget features
	// and whether the amount is a build_vector of constants select the strategy.
	//
	// Op        - the shift node; operand 0 is the value, operand 1 the amount.
	// Subtarget - feature queries (SSE2 is asserted; SSE4.1, AVX2, AVX512, BWI
	//             and XOP are consulted below).
	// DAG       - the SelectionDAG used to create replacement nodes.
	//
	// Returns the lowered node, Op itself when SupportedVectorVarShift accepts
	// it, or an empty SDValue when no strategy applies.
	static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);
	// Several strategies below only apply to a build_vector of constants.
	bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	assert(VT.isVector() && "Custom lowering only for vector shifts!");
	assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

	// Give the two uniform-amount helpers the first chance.
	if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
	return V;

	if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
	return V;

	// Hand the node back unchanged when the target supports this variable
	// shift directly.
	if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
	return Op;

	// XOP has 128-bit variable logical/arithmetic shifts.
	// +ve/-ve Amt = shift left/right.
	if (Subtarget.hasXOP() &&
	(VT == MVT::v2i64 \|\| VT == MVT::v4i32 \|\|
	VT == MVT::v8i16 \|\| VT == MVT::v16i8)) {
	// Right shifts are expressed as a shift by the negated amount (0 - Amt).
	if (Op.getOpcode() == ISD::SRL \|\| Op.getOpcode() == ISD::SRA) {
	SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
	Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
	}
	if (Op.getOpcode() == ISD::SHL \|\| Op.getOpcode() == ISD::SRL)
	return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
	if (Op.getOpcode() == ISD::SRA)
	return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
	}

	// 2i64 vector logical shifts can efficiently avoid scalarization - do the
	// shifts per-lane and then shuffle the partial results back together.
	if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
	// Splat the shift amounts so the scalar shifts above will catch it.
	SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
	SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
	SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
	SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
	// Take lane 0 from R0 and lane 1 from R1 (mask index 3 is R1's lane 1).
	return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
	}

	// i64 vector arithmetic shift can be emulated with the transform:
	// M = lshr(SIGN_MASK, Amt)
	// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
	if ((VT == MVT::v2i64 \|\| (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
	Op.getOpcode() == ISD::SRA) {
	SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
	SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
	R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
	R = DAG.getNode(ISD::XOR, dl, VT, R, M);
	R = DAG.getNode(ISD::SUB, dl, VT, R, M);
	return R;
	}

	// If possible, lower this packed shift into a vector multiply instead of
	// expanding it into a sequence of scalar shifts.
	// Do this only if the vector shift count is a constant build_vector.
	if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
	(VT == MVT::v8i16 \|\| VT == MVT::v4i32 \|\|
	(Subtarget.hasInt256() && VT == MVT::v16i16))) {
	// Elts collects the per-lane multipliers (1 << amount).
	SmallVector<SDValue, 8> Elts;
	MVT SVT = VT.getVectorElementType();
	unsigned SVTBits = SVT.getSizeInBits();
	APInt One(SVTBits, 1);
	unsigned NumElems = VT.getVectorNumElements();

	for (unsigned i=0; i !=NumElems; ++i) {
	// Note: this local Op shadows the function parameter inside the loop.
	SDValue Op = Amt->getOperand(i);
	// Undef amounts are forwarded as undef multipliers.
	if (Op->isUndef()) {
	Elts.push_back(Op);
	continue;
	}

	ConstantSDNode *ND = cast<ConstantSDNode>(Op);
	APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
	uint64_t ShAmt = C.getZExtValue();
	// Amounts of at least the element width also yield an undef multiplier.
	if (ShAmt >= SVTBits) {
	Elts.push_back(DAG.getUNDEF(SVT));
	continue;
	}
	Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
	}
	SDValue BV = DAG.getBuildVector(VT, dl, Elts);
	return DAG.getNode(ISD::MUL, dl, VT, R, BV);
	}

	// Lower SHL with variable shift amount.
	// Amt is shifted to bit 23 and added to 0x3f800000 (the IEEE-754 single
	// precision bit pattern of 1.0f), the result is reinterpreted as v4f32 and
	// converted back to integer, and R is multiplied by that value.
	if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
	Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));

	Op = DAG.getNode(ISD::ADD, dl, VT, Op,
	DAG.getConstant(0x3f800000U, dl, VT));
	Op = DAG.getBitcast(MVT::v4f32, Op);
	Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
	return DAG.getNode(ISD::MUL, dl, VT, Op, R);
	}

	// If possible, lower this shift as a sequence of two shifts by
	// constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
	// Example:
	// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
	//
	// Could be rewritten as:
	// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
	//
	// The advantage is that the two shifts from the example would be
	// lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
	// the vector shift into four scalar shifts plus four pairs of vector
	// insert/extract.
	if (ConstantAmt && (VT == MVT::v8i16 \|\| VT == MVT::v4i32)) {
	bool UseMOVSD = false;
	bool CanBeSimplified;
	// The splat value for the first packed shift (the 'X' from the example).
	SDValue Amt1 = Amt->getOperand(0);
	// The splat value for the second packed shift (the 'Y' from the example).
	SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);

	// See if it is possible to replace this node with a sequence of
	// two shifts followed by a MOVSS/MOVSD/PBLEND.
	if (VT == MVT::v4i32) {
	// Check if it is legal to use a MOVSS.
	CanBeSimplified = Amt2 == Amt->getOperand(2) &&
	Amt2 == Amt->getOperand(3);
	if (!CanBeSimplified) {
	// Otherwise, check if we can still simplify this node using a MOVSD.
	CanBeSimplified = Amt1 == Amt->getOperand(1) &&
	Amt->getOperand(2) == Amt->getOperand(3);
	UseMOVSD = true;
	Amt2 = Amt->getOperand(2);
	}
	} else {
	// Do similar checks for the case where the machine value type
	// is MVT::v8i16.
	// MOVSS-like pattern: operands 0-1 equal Amt1 and operands 2-7 equal Amt2.
	CanBeSimplified = Amt1 == Amt->getOperand(1);
	for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
	CanBeSimplified = Amt2 == Amt->getOperand(i);

	if (!CanBeSimplified) {
	// MOVSD-like pattern: operands 0-3 equal Amt1 and operands 4-7 equal Amt2.
	UseMOVSD = true;
	CanBeSimplified = true;
	Amt2 = Amt->getOperand(4);
	for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
	CanBeSimplified = Amt1 == Amt->getOperand(i);
	for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
	CanBeSimplified = Amt2 == Amt->getOperand(j);
	}
	}

	if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
	isa<ConstantSDNode>(Amt2)) {
	// Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
	SDValue Splat1 =
	DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
	SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
	SDValue Splat2 =
	DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
	SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
	// Both results are viewed as v4i32 so one shuffle shape serves v8i16 too.
	SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
	SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
	if (UseMOVSD)
	return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
	BitCast2, {0, 1, 6, 7}));
	return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
	BitCast2, {0, 5, 6, 7}));
	}
	}

	// v4i32 Non Uniform Shifts.
	// If the shift amount is constant we can shift each lane using the SSE2
	// immediate shifts, else we need to zero-extend each lane to the lower i64
	// and shift using the SSE2 variable shifts.
	// The separate results can then be blended together.
	if (VT == MVT::v4i32) {
	unsigned Opc = Op.getOpcode();
	SDValue Amt0, Amt1, Amt2, Amt3;
	if (ConstantAmt) {
	// Splat each lane's amount across a whole vector; Opc stays the generic
	// ISD shift opcode in this branch.
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
	} else {
	// ISD::SHL is handled above but we include it here for completeness.
	switch (Opc) {
	default:
	llvm_unreachable("Unknown target vector shift node");
	case ISD::SHL:
	Opc = X86ISD::VSHL;
	break;
	case ISD::SRL:
	Opc = X86ISD::VSRL;
	break;
	case ISD::SRA:
	Opc = X86ISD::VSRA;
	break;
	}
	// The SSE2 shifts use the lower i64 as the same shift amount for
	// all lanes and the upper i64 is ignored. These shuffle masks
	// optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
	SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
	}

	// Shift the whole vector once per lane amount, then pick lane i from Ri.
	SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
	SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
	SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
	SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
	SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
	SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
	return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
	}

	// It's worth extending once and using the vXi16/vXi32 shifts for smaller
	// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
	// make the existing SSE solution better.
	if ((Subtarget.hasInt256() && VT == MVT::v8i16) \|\|
	(Subtarget.hasAVX512() && VT == MVT::v16i16) \|\|
	(Subtarget.hasAVX512() && VT == MVT::v16i8) \|\|
	(Subtarget.hasBWI() && VT == MVT::v32i8)) {
	assert((!Subtarget.hasBWI() \|\| VT == MVT::v32i8 \|\| VT == MVT::v16i8) &&
	"Unexpected vector type");
	// Extend to i16 elements with BWI, otherwise to i32 elements.
	MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
	MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
	// The value is sign-extended for SRA and zero-extended otherwise; the
	// amount is always zero-extended.
	unsigned ExtOpc =
	Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	R = DAG.getNode(ExtOpc, dl, ExtVT, R);
	Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
	}

	if (VT == MVT::v16i8 \|\|
	(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) \|\|
	(VT == MVT::v64i8 && Subtarget.hasBWI())) {
	// ExtVT views the byte vector as half as many i16 elements.
	MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
	unsigned ShiftOpcode = Op->getOpcode();

	// Select between V0 and V1 per element according to the sign bit of Sel;
	// the three branches differ only in how the target performs the select.
	auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
	if (VT.is512BitVector()) {
	// On AVX512BW targets we make use of the fact that VSELECT lowers
	// to a masked blend which selects bytes based just on the sign bit
	// extracted to a mask.
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
	return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
	} else if (Subtarget.hasSSE41()) {
	// On SSE41 targets we make use of the fact that VSELECT lowers
	// to PBLENDVB which selects bytes based just on the sign bit.
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we test for the sign bit by comparing to
	// zero - a negative value will set all bits of the lanes to true
	// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
	SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
	return DAG.getSelect(dl, SelVT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
	// We can safely do this using i16 shifts as we're only interested in
	// the 3 lower bits of each byte.
	Amt = DAG.getBitcast(ExtVT, Amt);
	Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
	Amt = DAG.getBitcast(VT, Amt);

	// Each step below conditionally applies a shift by 4, 2, then 1, keyed on
	// the current sign bit of Amt; doubling Amt exposes the next lower bit.
	if (Op->getOpcode() == ISD::SHL \|\| Op->getOpcode() == ISD::SRL) {
	// r = VSELECT(r, shift(r, 4), a);
	SDValue M =
	DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);
	return R;
	}

	if (Op->getOpcode() == ISD::SRA) {
	// For SRA we need to unpack each byte to the higher byte of a i16 vector
	// so we can correctly sign extend. We don't care what happens to the
	// lower byte.
	SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
	SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);

	// r = VSELECT(r, shift(r, 4), a);
	SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(4, dl, ExtVT));
	SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(4, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 2), a);
	MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(2, dl, ExtVT));
	MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(2, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 1), a);
	MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(1, dl, ExtVT));
	MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(1, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// Logical shift the result back to the lower byte, leaving a zero upper
	// byte
	// meaning that we can safely pack with PACKUSWB.
	RLo =
	DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
	RHi =
	DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
	return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}
	}

	// v16i16 on AVX2 without XOP: unpack to two v8i32 halves, shift there, and
	// pack back. The amount is interleaved as (Amt, Z) and the value as (Z, R),
	// and each i32 result is logically shifted right by 16 before PACKUS.
	if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
	MVT ExtVT = MVT::v8i32;
	SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
	SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
	SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
	SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
	SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);
	SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
	SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
	Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
	Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
	return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
	}

	if (VT == MVT::v8i16) {
	unsigned ShiftOpcode = Op->getOpcode();

	// If we have a constant shift amount, the non-SSE41 path is best as
	// avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
	bool UseSSE41 = Subtarget.hasSSE41() &&
	!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	// Select between V0 and V1 per element according to the sign bit of Sel.
	auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
	// On SSE41 targets we make use of the fact that VSELECT lowers
	// to PBLENDVB which selects bytes based just on the sign bit.
	if (UseSSE41) {
	MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
	V0 = DAG.getBitcast(ExtVT, V0);
	V1 = DAG.getBitcast(ExtVT, V1);
	Sel = DAG.getBitcast(ExtVT, Sel);
	return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we splat the sign bit - a negative value will
	// set all bits of the lanes to true and VSELECT uses that in
	// its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue C =
	DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
	return DAG.getSelect(dl, VT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
	if (UseSSE41) {
	// On SSE41 targets we need to replicate the shift mask in both
	// bytes for PBLENDVB.
	Amt = DAG.getNode(
	ISD::OR, dl, VT,
	DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
	DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
	} else {
	Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
	}

	// Conditionally apply shifts by 8, 4, 2, then 1, each keyed on the current
	// sign bit of Amt; doubling Amt exposes the next lower amount bit.
	// r = VSELECT(r, shift(r, 8), a);
	SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 4), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
	R = SignBitSelect(Amt, M, R);
	return R;
	}

	// Decompose 256-bit shifts into smaller 128-bit shifts.
	if (VT.is256BitVector())
	return Lower256IntArith(Op, DAG);

	// No strategy applied.
	return SDValue();
	}

	/// Custom-lower a vector rotate.
	///
	/// With AVX512 a uniform constant amount becomes VROTLI/VROTRI (amount taken
	/// modulo the element width); anything else is returned unchanged. Without
	/// AVX512 the node must be an XOP ISD::ROTL: 256-bit vectors are split, a
	/// constant splat amount becomes VROTLI, and everything else is returned
	/// unchanged.
	static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc DL(Op);
	  SDValue Src = Op.getOperand(0);
	  SDValue Amt = Op.getOperand(1);
	  unsigned Opcode = Op.getOpcode();
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();

	  if (Subtarget.hasAVX512()) {
	    // Try to express the rotate with an immediate amount.
	    APInt UndefElts;
	    SmallVector<APInt, 16> EltBits;
	    if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
	      // Every element must be defined and equal to the first one.
	      auto SameAsFirst = [&EltBits](const APInt &V) { return EltBits[0] == V; };
	      if (!UndefElts && llvm::all_of(EltBits, SameAsFirst)) {
	        unsigned RotOpc = X86ISD::VROTRI;
	        if (Opcode == ISD::ROTL)
	          RotOpc = X86ISD::VROTLI;
	        uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
	        SDValue Imm = DAG.getConstant(RotateAmt, DL, MVT::i8);
	        return DAG.getNode(RotOpc, DL, VT, Src, Imm);
	      }
	    }

	    // Otherwise keep the node and rely on VPROLV/VPRORV.
	    return Op;
	  }

	  assert(VT.isVector() && "Custom lowering only for vector rotates!");
	  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
	  assert((Opcode == ISD::ROTL) && "Only ROTL supported");

	  // XOP provides 128-bit variable and immediate rotates. A positive amount
	  // rotates left and a negative one rotates right, so only ISD::ROTL needs
	  // handling here.

	  // 256-bit integer vectors are split into halves.
	  if (VT.is256BitVector())
	    return Lower256IntArith(Op, DAG);

	  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

	  // Try to express the rotate with an immediate amount.
	  auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt);
	  if (BVAmt) {
	    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
	      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
	      assert(RotateAmt < EltSizeInBits && "Rotation out of range");
	      SDValue Imm = DAG.getConstant(RotateAmt, DL, MVT::i8);
	      return DAG.getNode(X86ISD::VROTLI, DL, VT, Src, Imm);
	    }
	  }

	  // Fall back to the general per-element variable rotate.
	  return Op;
	}

	/// Lower an "add/sub/mul with overflow" node into the flag-producing X86
	/// arithmetic node plus a SETCC that reads the overflow/carry condition.
	/// The "brcond" lowering looks for this pair and may remove the SETCC when
	/// it has only one use.
	static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
	  SDNode *N = Op.getNode();
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  SDLoc DL(Op);

	  unsigned BaseOp = 0;
	  X86::CondCode Cond;
	  // Set for the non-i8 UMULO form, whose node produces two value results
	  // followed by the flags result.
	  bool TwoValueResults = false;

	  switch (Op.getOpcode()) {
	  case ISD::SADDO:
	    // An add of one will be selected as an INC. INC doesn't set CF, so this
	    // is not done for UADDO.
	    BaseOp = isOneConstant(RHS) ? X86ISD::INC : X86ISD::ADD;
	    Cond = X86::COND_O;
	    break;
	  case ISD::UADDO:
	    BaseOp = X86ISD::ADD;
	    Cond = X86::COND_B;
	    break;
	  case ISD::SSUBO:
	    // A subtract of one will be selected as a DEC. DEC doesn't set CF, so
	    // this is not done for USUBO.
	    BaseOp = isOneConstant(RHS) ? X86ISD::DEC : X86ISD::SUB;
	    Cond = X86::COND_O;
	    break;
	  case ISD::USUBO:
	    BaseOp = X86ISD::SUB;
	    Cond = X86::COND_B;
	    break;
	  case ISD::SMULO:
	    if (N->getValueType(0) == MVT::i8)
	      BaseOp = X86ISD::SMUL8;
	    else
	      BaseOp = X86ISD::SMUL;
	    Cond = X86::COND_O;
	    break;
	  case ISD::UMULO:
	    // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
	    if (N->getValueType(0) == MVT::i8) {
	      BaseOp = X86ISD::UMUL8;
	    } else {
	      BaseOp = X86ISD::UMUL;
	      TwoValueResults = true;
	    }
	    Cond = X86::COND_O;
	    break;
	  default:
	    llvm_unreachable("Unknown ovf instruction!");
	  }

	  // The arithmetic node also produces EFLAGS (as an i32) as its last result.
	  SDVTList VTs =
	      TwoValueResults
	          ? DAG.getVTList(N->getValueType(0), N->getValueType(0), MVT::i32)
	          : DAG.getVTList(N->getValueType(0), MVT::i32);
	  unsigned FlagsResNo = TwoValueResults ? 2 : 1;

	  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
	  SDValue Flags(Sum.getNode(), FlagsResNo);
	  SDValue SetCC = getSETCC(Cond, Flags, DL, DAG);

	  if (N->getValueType(1) == MVT::i1)
	    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
	}

	/// Decide whether an atomic operation on \p MemType has to go through
	/// cmpxchg8b/cmpxchg16b: true for 64-bit types when not in 64-bit mode, and
	/// for 128-bit types when cmpxchg16b is available. Used when expanding
	/// atomic operations; other widths are left alone (to become
	/// __sync_fetch_and_... calls).
	bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
	  unsigned OpWidth = MemType->getPrimitiveSizeInBits();

	  switch (OpWidth) {
	  case 64:
	    // FIXME this should be Subtarget.hasCmpxchg8b
	    return !Subtarget.is64Bit();
	  case 128:
	    return Subtarget.hasCmpxchg16b();
	  default:
	    return false;
	  }
	}

	/// An atomic store is expanded in IR exactly when the stored value's type
	/// needs cmpxchg8b/16b (see needsCmpXchgNb).
	bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  Type *StoredTy = SI->getValueOperand()->getType();
	  return needsCmpXchgNb(StoredTy);
	}

	// Large atomic loads (those whose type needs cmpxchg8b/16b) are expanded
	// into lock cmpxchg8b/16b; all others are left untouched.
	// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  auto PTy = cast<PointerType>(LI->getPointerOperandType());
	  if (needsCmpXchgNb(PTy->getElementType()))
	    return AtomicExpansionKind::CmpXChg;
	  return AtomicExpansionKind::None;
	}

	/// Choose how an atomicrmw instruction is expanded in IR: either not at all
	/// (AtomicExpansionKind::None) or into a cmpxchg loop
	/// (AtomicExpansionKind::CmpXChg).
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  Type *MemType = AI->getType();
	  const unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;

	  // Operands wider than the native width use cmpxchg8/16b when it is
	  // available and default to library calls otherwise.
	  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
	    if (needsCmpXchgNb(MemType))
	      return AtomicExpansionKind::CmpXChg;
	    return AtomicExpansionKind::None;
	  }

	  switch (AI->getOperation()) {
	  case AtomicRMWInst::Xchg:
	  case AtomicRMWInst::Add:
	  case AtomicRMWInst::Sub:
	    // It's better to use xadd, xsub or xchg for these in all cases.
	    return AtomicExpansionKind::None;

	  case AtomicRMWInst::Or:
	  case AtomicRMWInst::And:
	  case AtomicRMWInst::Xor:
	    // When the result is unused, a "lock" prefix on the normal instruction
	    // is enough; otherwise a cmpxchg loop is needed.
	    if (AI->use_empty())
	      return AtomicExpansionKind::None;
	    return AtomicExpansionKind::CmpXChg;

	  case AtomicRMWInst::Nand:
	  case AtomicRMWInst::Max:
	  case AtomicRMWInst::Min:
	  case AtomicRMWInst::UMax:
	  case AtomicRMWInst::UMin:
	    // These always require a non-trivial set of data operations on x86, so
	    // a cmpxchg loop is mandatory.
	    return AtomicExpansionKind::CmpXChg;

	  default:
	    llvm_unreachable("Unknown atomic operation");
	  }
	}

	/// Replace an idempotent atomicrmw (one that does not change the stored
	/// value) with an mfence followed by an atomic load.
	///
	/// \param AI the atomicrmw instruction; on success it is erased and all of
	///        its uses are redirected to the new load.
	/// \returns the new atomic load, or nullptr when the transformation is not
	///          performed (operand wider than the native width, single-thread
	///          sync scope, or a subtarget without mfence). In the nullptr case
	///          the IR is left untouched.
	LoadInst *
	X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
	  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
	  Type *MemType = AI->getType();
	  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
	  // there is no benefit in turning such RMWs into loads, and it is actually
	  // harmful as it introduces a mfence.
	  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
	    return nullptr;

	  IRBuilder<> Builder(AI);
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  auto SSID = AI->getSyncScopeID();
	  // We must restrict the ordering to avoid generating loads with Release or
	  // ReleaseAcquire orderings.
	  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
	  auto Ptr = AI->getPointerOperand();

	  // Before the load we need a fence. Here is an example lifted from
	  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
	  // is required:
	  // Thread 0:
	  //   x.store(1, relaxed);
	  //   r1 = y.fetch_add(0, release);
	  // Thread 1:
	  //   y.fetch_add(42, acquire);
	  //   r2 = x.load(relaxed);
	  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
	  // lowered to just a load without a fence. A mfence flushes the store buffer,
	  // making the optimization clearly correct.
	  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
	  // otherwise, we might be able to be more aggressive on relaxed idempotent
	  // rmw. In practice, they do not look useful, so we don't try to be
	  // especially clever.
	  if (SSID == SyncScope::SingleThread)
	    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
	    // the IR level, so we must wrap it in an intrinsic.
	    return nullptr;

	  if (!Subtarget.hasMFence())
	    // FIXME: it might make sense to use a locked operation here but on a
	    // different cache-line to prevent cache-line bouncing. In practice it
	    // is probably a small win, and x86 processors without mfence are rare
	    // enough that we do not bother.
	    return nullptr;

	  Function *MFence =
	      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
	  Builder.CreateCall(MFence, {});

	  // Finally we can emit the atomic load. CreateAlignedLoad expects the
	  // alignment in bytes, so convert the bit width: passing the width in bits
	  // would claim an alignment eight times larger than the natural alignment
	  // of the accessed type.
	  const unsigned AlignInBytes = AI->getType()->getPrimitiveSizeInBits() / 8;
	  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AlignInBytes);
	  Loaded->setAtomic(Order, SSID);
	  AI->replaceAllUsesWith(Loaded);
	  AI->eraseFromParent();
	  return Loaded;
	}

	/// Lower an atomic fence node. Operand 0 is the chain, operand 1 the
	/// ordering and operand 2 the synchronization scope (both constants).
	static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  auto FenceOrdering = static_cast<AtomicOrdering>(
	      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
	  auto FenceSSID = static_cast<SyncScope::ID>(
	      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

	  // Only a sequentially-consistent, cross-thread fence needs a real
	  // instruction.
	  const bool NeedsInstruction =
	      FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
	      FenceSSID == SyncScope::System;

	  // Everything else becomes MEMBARRIER, a compiler barrier that codegens to
	  // a no-op.
	  if (!NeedsInstruction)
	    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));

	  if (Subtarget.hasMFence())
	    return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

	  // Without mfence, emit a locked OR of zero into the memory addressed by
	  // ESP (scale 1, no index, displacement 0, no segment).
	  SDValue Chain = Op.getOperand(0);
	  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
	  SDValue Ops[] = {
	    DAG.getRegister(X86::ESP, MVT::i32),     // Base
	    DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
	    DAG.getRegister(0, MVT::i32),            // Index
	    DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
	    DAG.getRegister(0, MVT::i32),            // Segment.
	    Zero,
	    Chain
	  };
	  SDNode *LockedOr =
	      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
	  return SDValue(LockedOr, 0);
	}

	// Lower a compare-and-swap node to X86ISD::LCMPXCHG_DAG.
	//
	// Operand 2 of Op is copied into the accumulator register matching the
	// value type (AL/AX/EAX/RAX); operands 1 and 3 are passed to the
	// LCMPXCHG_DAG node together with the access size in bytes. The three
	// results of Op are then replaced in place by: the value read back from the
	// accumulator, a SETCC on COND_E of EFLAGS, and the output chain.
	//
	// Always returns an empty SDValue: all uses of Op's results have already
	// been rewritten through ReplaceAllUsesOfValueWith.
	static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT T = Op.getSimpleValueType();
	SDLoc DL(Op);
	// Accumulator register for this width, and the width in bytes.
	unsigned Reg = 0;
	unsigned size = 0;
	switch(T.SimpleTy) {
	default: llvm_unreachable("Invalid value type!");
	case MVT::i8: Reg = X86::AL; size = 1; break;
	case MVT::i16: Reg = X86::AX; size = 2; break;
	case MVT::i32: Reg = X86::EAX; size = 4; break;
	case MVT::i64:
	assert(Subtarget.is64Bit() && "Node not type legal!");
	Reg = X86::RAX; size = 8;
	break;
	}
	// Copy operand 2 into the accumulator; the copy produces a chain (value 0)
	// and glue (value 1) that tie it to the LCMPXCHG_DAG node below.
	SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
	Op.getOperand(2), SDValue());
	SDValue Ops[] = { cpIn.getValue(0),
	Op.getOperand(1),
	Op.getOperand(3),
	DAG.getTargetConstant(size, DL, MVT::i8),
	cpIn.getValue(1) };
	SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
	// Reuse the memory operand of the original atomic node.
	MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
	SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
	Ops, T, MMO);

	// Read the accumulator back, then EFLAGS, each glued to its predecessor.
	SDValue cpOut =
	DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
	SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
	MVT::i32, cpOut.getValue(2));
	SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

	// Redirect the original node's value, success flag and chain results.
	DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
	DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
	DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
	return SDValue();
	}

	/// Custom-lower an ISD::BITCAST node.
	///
	/// Two situations are handled:
	///  * The source is v2i32, v4i16, v8i8 or i64 (SSE2 is asserted). Only a
	///    destination of f64 is lowered here: the source elements are placed in
	///    the low half of a vector with twice as many elements (upper half
	///    undef), that vector is bitcast to v2f64, and element 0 is extracted.
	///    Any other destination returns an empty SDValue so that the conversion
	///    is expanded.
	///  * Otherwise the node is a 64-bit MMX-related bitcast (asserted); i64 <->
	///    vector and vector <-> vector conversions are returned unchanged and
	///    everything else returns an empty SDValue.
	static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
	  MVT DstVT = Op.getSimpleValueType();

	  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
	      SrcVT == MVT::i64) {
	    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	    if (DstVT != MVT::f64)
	      // This conversion needs to be expanded.
	      return SDValue();

	    SDValue Op0 = Op->getOperand(0);
	    SmallVector<SDValue, 16> Elts;
	    SDLoc dl(Op);
	    unsigned NumElts;
	    MVT SVT;
	    if (SrcVT.isVector()) {
	      NumElts = SrcVT.getVectorNumElements();
	      SVT = SrcVT.getVectorElementType();

	      // Widen the vector in input in the case of MVT::v2i32.
	      // Example: from MVT::v2i32 to MVT::v4i32.
	      for (unsigned i = 0, e = NumElts; i != e; ++i)
	        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
	                                   DAG.getIntPtrConstant(i, dl)));
	    } else {
	      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
	             "Unexpected source type in LowerBITCAST");
	      // Split the i64 into its two i32 halves (element 0, then element 1).
	      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
	                                 DAG.getIntPtrConstant(0, dl)));
	      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
	                                 DAG.getIntPtrConstant(1, dl)));
	      NumElts = 2;
	      SVT = MVT::i32;
	    }
	    // Explicitly mark the extra elements as Undef.
	    Elts.append(NumElts, DAG.getUNDEF(SVT));

	    // Elts now holds NumElts real elements followed by NumElts undefs, so the
	    // widened vector type has twice as many elements as the source.
	    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
	    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
	    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
	         Subtarget.hasMMX() && "Unexpected custom BITCAST");
	  assert((DstVT == MVT::i64 ||
	          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
	         "Unexpected custom BITCAST");
	  // i64 <=> MMX conversions are Legal.
	  if (SrcVT==MVT::i64 && DstVT.isVector())
	    return Op;
	  if (DstVT==MVT::i64 && SrcVT.isVector())
	    return Op;
	  // MMX <=> MMX conversions are Legal.
	  if (SrcVT.isVector() && DstVT.isVector())
	    return Op;
	  // All other conversions need to be expanded.
	  return SDValue();
	}

	/// Horizontally add the bytes of V to form the elements of VT.
	///
	/// V must be a vector of i8 and VT an integer vector type of the same total
	/// size whose elements are wider than i8. The element width of VT decides how
	/// many adjacent bytes of V are summed into each result element.
	static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  SDLoc DL(V);
	  MVT BytesVT = V.getSimpleValueType();
	  MVT ElemVT = VT.getVectorElementType();
	  assert(BytesVT.getVectorElementType() == MVT::i8 &&
	         "Expected value to have byte element type.");
	  assert(ElemVT != MVT::i8 &&
	         "Horizontal byte sum only makes sense for wider elements!");
	  unsigned TotalBits = VT.getSizeInBits();
	  assert(BytesVT.getSizeInBits() == TotalBits && "Cannot change vector size!");

	  // i64 elements: one PSADBW against zero sums every group of eight bytes into
	  // an i64 lane, which is already the requested result.
	  if (ElemVT == MVT::i64) {
	    SDValue ZeroBytes = getZeroVector(BytesVT, Subtarget, DAG, DL);
	    MVT QuadVT = MVT::getVectorVT(MVT::i64, TotalBits / 64);
	    SDValue Sums = DAG.getNode(X86ISD::PSADBW, DL, QuadVT, V, ZeroBytes);
	    return DAG.getBitcast(VT, Sums);
	  }

	  // i32 elements: interleave the low and high halves with zeros so that each
	  // i32 sits alone in an i64 lane, PSADBW each half, then pack the two
	  // results back together.
	  if (ElemVT == MVT::i32) {
	    SDValue ZeroElts = getZeroVector(VT, Subtarget, DAG, DL);
	    SDValue AsElts = DAG.getBitcast(VT, V);
	    SDValue LoHalf = DAG.getNode(X86ISD::UNPCKL, DL, VT, AsElts, ZeroElts);
	    SDValue HiHalf = DAG.getNode(X86ISD::UNPCKH, DL, VT, AsElts, ZeroElts);

	    // Horizontal sums of each half, produced as vectors of i64.
	    SDValue ZeroBytes = getZeroVector(BytesVT, Subtarget, DAG, DL);
	    MVT QuadVT = MVT::getVectorVT(MVT::i64, TotalBits / 64);
	    SDValue LoSum = DAG.getNode(X86ISD::PSADBW, DL, QuadVT,
	                                DAG.getBitcast(BytesVT, LoHalf), ZeroBytes);
	    SDValue HiSum = DAG.getNode(X86ISD::PSADBW, DL, QuadVT,
	                                DAG.getBitcast(BytesVT, HiHalf), ZeroBytes);

	    // Shrink and concatenate both halves with PACKUS.
	    MVT WordVT = MVT::getVectorVT(MVT::i16, TotalBits / 16);
	    SDValue Packed = DAG.getNode(X86ISD::PACKUS, DL, BytesVT,
	                                 DAG.getBitcast(WordVT, LoSum),
	                                 DAG.getBitcast(WordVT, HiSum));
	    return DAG.getBitcast(VT, Packed);
	  }

	  // The only element type left is i16.
	  assert(ElemVT == MVT::i16 && "Unknown how to handle type");

	  // Shift each i16 left by 8, add to the original as bytes, then shift the
	  // i16s right by 8. The shifts are done on i16 lanes because i8 vector
	  // shifts are not directly supported.
	  SDValue Eight = DAG.getConstant(8, DL, VT);
	  SDValue Shifted =
	      DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Eight);
	  SDValue ByteSum =
	      DAG.getNode(ISD::ADD, DL, BytesVT, DAG.getBitcast(BytesVT, Shifted),
	                  DAG.getBitcast(BytesVT, V));
	  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, ByteSum), Eight);
	}

	/// Lower a vector CTPOP using an in-register nibble lookup table.
	///
	/// Op is bitcast to a byte vector; the high and low nibble of every byte are
	/// used as PSHUFB indices into a 16-entry table of nibble pop counts, and the
	/// two lookups are added to give the per-byte pop count. For element types
	/// wider than i8 the byte counts are then combined by LowerHorizontalByteSum.
	static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned VecSize = VT.getSizeInBits();

	  // Implement a lookup table in register by using an algorithm based on:
	  // http://wm.ite.pl/articles/sse-popcount.html
	  //
	  // The general idea is that every lower byte nibble in the input vector is an
	  // index into a in-register pre-computed pop count table. We then split up the
	  // input vector in two new ones: (1) a vector with only the shifted-right
	  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
	  // masked out higher ones) for each byte. PSHUFB is used separately with both
	  // to index the in-register table. Next, both are added and the result is a
	  // i8 vector where each element contains the pop count for input byte.
	  //
	  // To obtain the pop count for elements != i8, we follow up with the same
	  // approach and use additional tricks as described below.
	  //
	  // LUT[n] is the number of set bits in the 4-bit value n.
	  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
	                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
	                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

	  int NumByteElts = VecSize / 8;
	  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
	  SDValue In = DAG.getBitcast(ByteVecVT, Op);
	  // Repeat the 16-entry table across the whole byte vector.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumByteElts; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
	  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

	  // High nibbles
	  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
	  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

	  // Low nibbles
	  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);

	  // The input vector is used as the shuffle mask that index elements into the
	  // LUT. After counting low and high nibbles, add the vector to obtain the
	  // final pop count per i8 element.
	  SDValue HighPopCnt =
	      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
	  SDValue LowPopCnt =
	      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
	  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);

	  if (EltVT == MVT::i8)
	    return PopCnt;

	  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
	}

	/// Lower a 128-bit vector CTPOP using only shifts, masks, subtracts and adds.
	///
	/// The per-byte population count is computed with the parallel bit-counting
	/// scheme; element types wider than i8 are finished by
	/// LowerHorizontalByteSum.
	static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert(VT.is128BitVector() &&
	         "Only 128-bit vector bitmath lowering supported.");

	  int TotalBits = VT.getSizeInBits();
	  MVT ElemVT = VT.getVectorElementType();
	  int ElemBits = ElemVT.getSizeInBits();

	  // Vectorized form of the "best" algorithm from
	  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
	  // using adds and shifts in place of vector multiplications. It works for
	  // every integer vector type and is only used without SSSE3, where the much
	  // faster LUT-based lowering is unavailable.

	  // Shifting vNi8 right on x86 needs implicit masks, so perform the shifts on
	  // elements that are at least i16 wide. Every shift below is immediately
	  // followed by a mask that clears bits leaking in from the neighbouring
	  // byte, so this is correct.
	  MVT ShiftVT =
	      ElemBits > 8 ? VT : MVT::getVectorVT(MVT::i16, TotalBits / 16);

	  // Logical right shift of X by Amt, performed in ShiftVT and cast back to VT.
	  auto ShiftRight = [&](SDValue X, int Amt) {
	    SDValue Wide = DAG.getBitcast(ShiftVT, X);
	    MVT WideVT = Wide.getSimpleValueType();
	    SDValue AmtV = DAG.getConstant(Amt, DL, WideVT);
	    return DAG.getBitcast(VT, DAG.getNode(ISD::SRL, DL, WideVT, Wide, AmtV));
	  };
	  // AND X with the byte pattern Byte splatted across each element.
	  auto MaskBytes = [&](SDValue X, unsigned Byte) {
	    MVT XVT = X.getSimpleValueType();
	    SDValue Pattern =
	        DAG.getConstant(APInt::getSplat(ElemBits, APInt(8, Byte)), DL, XVT);
	    return DAG.getNode(ISD::AND, DL, XVT, X, Pattern);
	  };

	  SDValue Cnt = Op;

	  // v = v - ((v >> 1) & 0x55555555...)
	  SDValue OddBits = MaskBytes(ShiftRight(Cnt, 1), 0x55);
	  Cnt = DAG.getNode(ISD::SUB, DL, VT, Cnt, OddBits);

	  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
	  SDValue LowPairs = MaskBytes(Cnt, 0x33);
	  SDValue HighPairs = MaskBytes(ShiftRight(Cnt, 2), 0x33);
	  Cnt = DAG.getNode(ISD::ADD, DL, VT, LowPairs, HighPairs);

	  // v = (v + (v >> 4)) & 0x0F0F0F0F...
	  SDValue HighNibble = ShiftRight(Cnt, 4);
	  SDValue NibbleSum = DAG.getNode(ISD::ADD, DL, VT, Cnt, HighNibble);
	  Cnt = MaskBytes(NibbleSum, 0x0F);

	  // Cnt now holds the byte-wise population count; wider elements only need a
	  // horizontal sum of their bytes.
	  if (ElemVT == MVT::i8)
	    return Cnt;

	  MVT BytesVT = MVT::getVectorVT(MVT::i8, TotalBits / 8);
	  return LowerHorizontalByteSum(DAG.getBitcast(BytesVT, Cnt), VT, Subtarget,
	                                DAG);
	}

	// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
	// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
	//
	// Picks a lowering for a 128/256/512-bit vector CTPOP:
	//  - with VPOPCNTDQ and at most 16 elements: zero-extend to i32 elements,
	//    CTPOP, truncate;
	//  - without SSSE3: the shift/mask bitmath lowering (128-bit only);
	//  - 256-bit without Int256 / 512-bit without BWI: split into halves;
	//  - otherwise: the in-register LUT lowering.
	static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
	         "Unknown CTPOP type to handle");
	  SDLoc DL(Op.getNode());
	  SDValue Op0 = Op.getOperand(0);

	  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
	  if (Subtarget.hasVPOPCNTDQ()) {
	    unsigned NumElems = VT.getVectorNumElements();
	    assert((VT.getVectorElementType() == MVT::i8 ||
	            VT.getVectorElementType() == MVT::i16) && "Unexpected type");
	    if (NumElems <= 16) {
	      MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
	      Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
	      Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
	      return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
	    }
	  }

	  if (!Subtarget.hasSSSE3()) {
	    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
	    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
	    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
	  }

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntUnary(Op, DAG);

	  // Decompose 512-bit ops into smaller 256-bit ops.
	  if (VT.is512BitVector() && !Subtarget.hasBWI())
	    return Lower512IntUnary(Op, DAG);

	  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
	}

	/// Custom lowering entry point for ISD::CTPOP; only vector types are expected
	/// here and they are forwarded to LowerVectorCTPOP.
	static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {
	  const bool IsVector = Op.getSimpleValueType().isVector();
	  assert(IsVector &&
	         "We only do custom lowering for vector population count.");
	  (void)IsVector; // Only read by the assert above.
	  return LowerVectorCTPOP(Op, Subtarget, DAG);
	}

	/// Lower BITREVERSE with the XOP VPPERM instruction.
	///
	/// Scalars are moved into a 128-bit vector, reversed there and extracted
	/// again. 256-bit vectors are split into 128-bit halves. For 128-bit vectors
	/// a single VPPERM both reverses the bits of each byte and reverses the byte
	/// order within each element.
	static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  // For scalars, its still beneficial to transfer to/from the SIMD unit to
	  // perform the BITREVERSE.
	  if (!VT.isVector()) {
	    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
	    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
	    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  int NumElts = VT.getVectorNumElements();
	  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector())
	    return Lower256IntUnary(Op, DAG);

	  assert(VT.is128BitVector() &&
	         "Only 128-bit vector bitreverse lowering supported.");

	  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
	  // perform the BSWAP in the shuffle.
	  // Its best to shuffle using the second operand as this will implicitly allow
	  // memory folding for multiple vectors.
	  SmallVector<SDValue, 16> MaskElts;
	  for (int i = 0; i != NumElts; ++i) {
	    // Walk the bytes of each element from last to first (the byte swap).
	    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
	      // Offset 16 selects from the second source operand.
	      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
	      int PermuteByte = SourceByte | (2 << 5);
	      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
	    }
	  }

	  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
	  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
	  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
	                    Res, Mask);
	  return DAG.getBitcast(VT, Res);
	}

	/// Custom lowering for ISD::BITREVERSE.
	///
	/// With XOP (and a non-512-bit type) the work is delegated to
	/// LowerBITREVERSE_XOP. Otherwise SSSE3 is required and only byte vectors are
	/// handled: each byte is split into nibbles, each nibble is looked up with
	/// PSHUFB in a table holding its bit-reversed value already placed in the
	/// opposite nibble, and the two lookups are ORed together.
	static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  if (Subtarget.hasXOP() && !VT.is512BitVector())
	    return LowerBITREVERSE_XOP(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  unsigned NumElts = VT.getVectorNumElements();
	  assert(VT.getScalarType() == MVT::i8 &&
	         "Only byte vector BITREVERSE supported");

	  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntUnary(Op, DAG);

	  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
	  // two nibbles and a PSHUFB lookup to find the bitreverse of each
	  // 0-15 value (moved to the other nibble).
	  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
	  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

	  // LoLUT[n]: bit-reversed low nibble n, placed in the high nibble.
	  const int LoLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
	      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
	      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
	      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
	  // HiLUT[n]: bit-reversed high nibble n, placed in the low nibble.
	  const int HiLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
	      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
	      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
	      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

	  // Repeat both 16-entry tables across the full vector width.
	  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
	    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
	  }

	  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
	  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
	  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
	}

	/// Build the LOCK-prefixed X86ISD memory node for an ATOMIC_LOAD_* node.
	///
	/// ATOMIC_LOAD_ADD/SUB/OR/XOR/AND map to LADD/LSUB/LOR/LXOR/LAND. When
	/// AllowIncDec is set and inc/dec is not slow on the subtarget (or the
	/// function is optimized for size), an add/sub of constant 1 or -1 becomes
	/// LINC or LDEC instead. The returned node produces (i32, chain).
	static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget,
	                                        bool AllowIncDec = true) {
	  unsigned NewOpc = 0;
	  switch (N->getOpcode()) {
	  case ISD::ATOMIC_LOAD_ADD:
	    NewOpc = X86ISD::LADD;
	    break;
	  case ISD::ATOMIC_LOAD_SUB:
	    NewOpc = X86ISD::LSUB;
	    break;
	  case ISD::ATOMIC_LOAD_OR:
	    NewOpc = X86ISD::LOR;
	    break;
	  case ISD::ATOMIC_LOAD_XOR:
	    NewOpc = X86ISD::LXOR;
	    break;
	  case ISD::ATOMIC_LOAD_AND:
	    NewOpc = X86ISD::LAND;
	    break;
	  default:
	    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
	  }

	  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();

	  if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
	    // Convert to inc/dec if they aren't slow or we are optimizing for size.
	    if (AllowIncDec && (!Subtarget.slowIncDec() ||
	                        DAG.getMachineFunction().getFunction().optForSize())) {
	      // add 1 / sub -1 -> inc
	      if ((NewOpc == X86ISD::LADD && C->isOne()) ||
	          (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
	        return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
	                                       DAG.getVTList(MVT::i32, MVT::Other),
	                                       {N->getOperand(0), N->getOperand(1)},
	                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
	      // sub 1 / add -1 -> dec
	      if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
	          (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
	        return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
	                                       DAG.getVTList(MVT::i32, MVT::Other),
	                                       {N->getOperand(0), N->getOperand(1)},
	                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
	    }
	  }

	  return DAG.getMemIntrinsicNode(
	      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
	      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
	      /*MemVT=*/N->getSimpleValueType(0), MMO);
	}

	/// Lower atomic_load_ops into LOCK-prefixed operations.
	///
	/// If the loaded value is unused, the node is replaced (chain only) by the
	/// LOCK-prefixed form and an empty SDValue is returned. If it is used, only
	/// add is kept as-is, and sub is rewritten as an add of the negated operand;
	/// any other opcode is asserted not to reach here.
	static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {
	  SDValue InChain = N->getOperand(0);
	  SDValue Ptr = N->getOperand(1);
	  SDValue Val = N->getOperand(2);
	  unsigned Opcode = N->getOpcode();
	  MVT VT = N->getSimpleValueType(0);
	  SDLoc DL(N);

	  if (!N->hasAnyUseOfValue(0)) {
	    SDValue Locked = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
	    // RAUW the chain, but don't worry about the result, as it's unused.
	    assert(!N->hasAnyUseOfValue(0));
	    DAG.ReplaceAllUsesOfValueWith(N.getValue(1), Locked.getValue(1));
	    return SDValue();
	  }

	  // The result is used. Only atomic_load_add can be lowered (into LXADD) in
	  // that case; every other atomicrmw op should already have been turned into
	  // a cmpxchg loop by AtomicExpand.
	  if (Opcode == ISD::ATOMIC_LOAD_SUB) {
	    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
	    // select LXADD if LOCK_SUB can't be selected.
	    AtomicSDNode *Atomic = cast<AtomicSDNode>(N.getNode());
	    SDValue Negated =
	        DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Val);
	    return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, InChain, Ptr, Negated,
	                         Atomic->getMemOperand());
	  }

	  assert(Opcode == ISD::ATOMIC_LOAD_ADD &&
	         "Used AtomicRMW ops other than Add should have been expanded!");
	  return N;
	}

	/// Custom lowering for ISD::ATOMIC_STORE.
	///
	/// A sequentially-consistent store, or a store whose memory type is not
	/// legal, is rewritten as an ATOMIC_SWAP and the swap's chain is returned.
	/// All other atomic stores are returned unchanged.
	static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
	  // Cast once; every query below needs the AtomicSDNode view of the node.
	  AtomicSDNode *Node = cast<AtomicSDNode>(Op.getNode());
	  SDLoc dl(Node);
	  EVT VT = Node->getMemoryVT();

	  // Convert seq_cst store -> xchg
	  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
	  // FIXME: On 32-bit, store -> fist or movq would be more efficient
	  //        (The only way to get a 16-byte store is cmpxchg16b)
	  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
	  if (Node->getOrdering() == AtomicOrdering::SequentiallyConsistent ||
	      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
	    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, VT,
	                                 Node->getOperand(0),
	                                 Node->getOperand(1), Node->getOperand(2),
	                                 Node->getMemOperand());
	    // Only the chain result of the swap is needed.
	    return Swap.getValue(1);
	  }
	  // Other atomic stores have a simple pattern.
	  return Op;
	}

	/// Custom lowering for ISD::ADDCARRY / ISD::SUBCARRY.
	///
	/// The incoming carry is turned into the flag result of an X86ISD::ADD by
	/// adding all-ones to it; that flag feeds an X86ISD::ADC (for ADDCARRY) or
	/// X86ISD::SBB (otherwise). The outgoing carry is read back with SETCC on
	/// COND_B. Returns an empty SDValue while the value type is not yet legal.
	static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
	  SDNode *Node = Op.getNode();
	  MVT ResVT = Node->getSimpleValueType(0);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResVT))
	    return SDValue();

	  SDVTList ResAndFlags = DAG.getVTList(ResVT, MVT::i32);
	  SDLoc DL(Node);

	  // Set the carry flag.
	  SDValue CarryIn = Op.getOperand(2);
	  EVT CarryVT = CarryIn.getValueType();
	  APInt AllOnes = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
	  SDValue CarryFlags =
	      DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), CarryIn,
	                  DAG.getConstant(AllOnes, DL, CarryVT));

	  unsigned ArithOpc =
	      Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
	  SDValue Result = DAG.getNode(ArithOpc, DL, ResAndFlags, Op.getOperand(0),
	                               Op.getOperand(1), CarryFlags.getValue(1));

	  // Materialize the outgoing carry, truncated if the node wants an i1.
	  SDValue CarryOut = getSETCC(X86::COND_B, Result.getValue(1), DL, DAG);
	  if (Node->getValueType(1) == MVT::i1)
	    CarryOut = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CarryOut);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, Node->getVTList(), Result,
	                     CarryOut);
	}

	/// Lower FSINCOS on 64-bit Darwin to a call to the __sincos_stret libcall.
	///
	/// For f64 the call returns a { double, double } struct and the call result
	/// is returned directly. For f32 the call is typed as returning a 4-element
	/// vector; elements 0 and 1 are extracted as sin and cos and merged.
	static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

	  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
	  // which returns the values as { float, float } (in XMM0) or
	  // { double, double } (which is returned in XMM0, XMM1).
	  SDLoc dl(Op);
	  SDValue Arg = Op.getOperand(0);
	  EVT ArgVT = Arg.getValueType();
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;

	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  bool isF64 = ArgVT == MVT::f64;
	  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
	  // the small struct {f32, f32} is returned in (eax, edx). For f64,
	  // the results are returned via SRet in memory.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
	  const char *LibcallName = TLI.getLibcallName(LC);
	  SDValue Callee =
	      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

	  // f64: two-member struct; f32: modelled as a 4-element vector of ArgTy.
	  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
	                      : (Type *)VectorType::get(ArgTy, 4);

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

	  if (isF64)
	    // Returned in xmm0 and xmm1.
	    return CallResult.first;

	  // Returned in bits 0:31 and 32:63 of xmm0.
	  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(0, dl));
	  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(1, dl));
	  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
	  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
	}

	/// Widen a vector input to a vector of NVT. The
	/// input vector must have the same element type as NVT.
	///
	/// The extra elements are zero when FillWithZeroes is true and undef
	/// otherwise. Returns InOp unchanged if it already has type NVT, and an undef
	/// of NVT if InOp is undef. The element count of NVT is asserted to be a
	/// larger multiple of the input's element count.
	static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
	                            bool FillWithZeroes = false) {
	  // Check if InOp already has the right width.
	  MVT InVT = InOp.getSimpleValueType();
	  if (InVT == NVT)
	    return InOp;

	  if (InOp.isUndef())
	    return DAG.getUNDEF(NVT);

	  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
	         "input and widen element type must match");

	  unsigned InNumElts = InVT.getVectorNumElements();
	  unsigned WidenNumElts = NVT.getVectorNumElements();
	  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
	         "Unexpected request for vector widening");

	  SDLoc dl(InOp);
	  // A two-operand concat whose upper half is already the requested filler
	  // (undef, or zeros when zero-filling) can be widened from its lower half.
	  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
	      InOp.getNumOperands() == 2) {
	    SDValue N1 = InOp.getOperand(1);
	    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
	        N1.isUndef()) {
	      InOp = InOp.getOperand(0);
	      InVT = InOp.getSimpleValueType();
	      InNumElts = InVT.getVectorNumElements();
	    }
	  }
	  // Constant build vectors are rebuilt directly at the wider type.
	  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
	      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
	    SmallVector<SDValue, 16> Ops;
	    for (unsigned i = 0; i < InNumElts; ++i)
	      Ops.push_back(InOp.getOperand(i));

	    EVT EltVT = InOp.getOperand(0).getValueType();

	    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
	      DAG.getUNDEF(EltVT);
	    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
	      Ops.push_back(FillVal);
	    return DAG.getBuildVector(NVT, dl, Ops);
	  }
	  // General case: insert the input at index 0 of a zero/undef wide vector.
	  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
	    DAG.getUNDEF(NVT);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
	                     InOp, DAG.getIntPtrConstant(0, dl));
	}

	/// Custom lowering for a masked scatter into an X86MaskedScatterSDNode.
	///
	/// A v2i32 value that was promoted to v2i64 is first converted back to a
	/// widened v4i32 (with index and mask widened to match). Without VLX, when
	/// neither data nor index is 512 bits wide, the operands are widened to 8
	/// elements (a v8i32 index is simply sign-extended). The mask is truncated
	/// to an i1 vector, the target node is built, and all uses of Op are replaced
	/// by the new node's chain result.
	static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "MGATHER/MSCATTER are supported on AVX-512 arch only");

	  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
	  SDValue Src = N->getValue();
	  MVT VT = Src.getSimpleValueType();
	  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
	  SDLoc dl(Op);

	  SDValue Index = N->getIndex();
	  SDValue Mask = N->getMask();
	  SDValue Chain = N->getChain();
	  SDValue BasePtr = N->getBasePtr();
	  MVT MemVT = N->getMemoryVT().getSimpleVT();
	  MVT IndexVT = Index.getSimpleValueType();
	  MVT MaskVT = Mask.getSimpleValueType();

	  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
	    // The v2i32 value was promoted to v2i64.
	    // Now we "redo" the type legalizer's work and widen the original
	    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
	    // with a shuffle.
	    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
	           "Unexpected memory type");
	    int ShuffleMask[] = {0, 2, -1, -1};
	    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
	                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
	    // Now we have 4 elements instead of 2.
	    // Expand the index.
	    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
	    Index = ExtendToType(Index, NewIndexVT, DAG);

	    // Expand the mask with zeroes
	    // Mask may be <2 x i64> or <2 x i1> at this moment
	    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
	           "Unexpected mask type");
	    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
	    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
	    VT = MVT::v4i32;
	  }

	  unsigned NumElts = VT.getVectorNumElements();
	  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
	      !Index.getSimpleValueType().is512BitVector()) {
	    // AVX512F supports only 512-bit vectors: either the data or the index
	    // must be 512 bits wide. If both index and data are 256-bit but the
	    // vector contains 8 elements, we just sign-extend the index.
	    if (IndexVT == MVT::v8i32)
	      // Just extend index
	      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
	    else {
	      // The minimal number of elts in scatter is 8
	      NumElts = 8;
	      // Index
	      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
	      // Use original index here, do not modify the index twice
	      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
	      if (IndexVT.getScalarType() == MVT::i32)
	        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

	      // Mask
	      // At this point we have promoted mask operand
	      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
	      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
	      // Use the original mask here, do not modify the mask twice
	      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

	      // The value that should be stored
	      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
	      Src = ExtendToType(Src, NewVT, DAG);
	    }
	  }
	  // If the mask is "wide" at this point - truncate it to i1 vector
	  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
	  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

	  // The mask is killed by scatter, add it to the values
	  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
	  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
	  SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
	      VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
	  // Result 1 of the new node is its chain.
	  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
	  return SDValue(NewScatter.getNode(), 1);
	}

	/// Custom lowering for a masked load.
	///
	/// Non-expanding loads with at most 4 elements are returned unchanged.
	/// Everything else is asserted to be an AVX-512 target without VLX and a
	/// non-512-bit type: pass-through value and mask are widened to 512 bits
	/// (the mask zero-filled and truncated to i1 if needed), a wide masked load
	/// is emitted, and the original-width subvector at index 0 is extracted.
	/// Returns the merged (value, chain) pair.
	static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {

	  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
	  MVT VT = Op.getSimpleValueType();
	  MVT ScalarVT = VT.getScalarType();
	  SDValue Mask = N->getMask();
	  SDLoc dl(Op);

	  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
	         "Expanding masked load is supported on AVX-512 target only!");

	  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
	         "Expanding masked load is supported for 32 and 64-bit types only!");

	  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
	  // VLX. These types for exp-loads are handled here.
	  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
	    return Op;

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked load op.");

	  assert((ScalarVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
	         "Unsupported masked load op.");

	  // This operation is legal for targets with VLX, but without
	  // VLX the vector should be widened to 512 bit
	  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
	  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
	  SDValue Src0 = N->getSrc0();
	  Src0 = ExtendToType(Src0, WideDataVT, DAG);

	  // Mask element has to be i1.
	  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
	  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
	         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

	  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

	  // Zero-fill the added mask lanes so the extra elements are never loaded.
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  if (MaskEltTy != MVT::i1)
	    Mask = DAG.getNode(ISD::TRUNCATE, dl,
	                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
	  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
	                                      N->getBasePtr(), Mask, Src0,
	                                      N->getMemoryVT(), N->getMemOperand(),
	                                      N->getExtensionType(),
	                                      N->isExpandingLoad());

	  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
	                                NewLoad.getValue(0),
	                                DAG.getIntPtrConstant(0, dl));
	  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Custom lowering for a masked store.
	///
	/// Non-compressing stores with at most 4 elements are returned unchanged.
	/// Everything else is asserted to be an AVX-512 target without VLX and a
	/// non-512-bit type: data and mask are widened to 512 bits (the mask
	/// zero-filled and truncated to i1 if needed) and a wide masked store is
	/// emitted.
	static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
	  SDValue DataToStore = N->getValue();
	  MVT VT = DataToStore.getSimpleValueType();
	  MVT ScalarVT = VT.getScalarType();
	  SDValue Mask = N->getMask();
	  SDLoc dl(Op);

	  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
	         "Compressing masked store is supported on AVX-512 target only!");

	  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
	         "Compressing masked store is supported for 32 and 64-bit types only!");

	  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX.
	  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
	    return Op;

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked store op.");

	  assert((ScalarVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
	         "Unsupported masked store op.");

	  // This operation is legal for targets with VLX, but without
	  // VLX the vector should be widened to 512 bit
	  unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
	  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

	  // Mask element has to be i1.
	  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
	  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
	         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

	  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

	  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
	  // Zero-fill the added mask lanes so the extra elements are never stored.
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  if (MaskEltTy != MVT::i1)
	    Mask = DAG.getNode(ISD::TRUNCATE, dl,
	                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
	  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
	                            Mask, N->getMemoryVT(), N->getMemOperand(),
	                            N->isTruncatingStore(), N->isCompressingStore());
	}

	/// Custom lowering for a masked gather into an X86MaskedGatherSDNode.
	///
	/// Returns an empty SDValue when the index is v2i32. On AVX-512 without VLX,
	/// when neither data nor index is 512 bits wide, an 8-element gather only has
	/// its index sign-extended to v8i64, while narrower gathers have index, mask
	/// and pass-through widened to 8 elements and the original-width result
	/// extracted afterwards. In every path the returned value merges the gathered
	/// data with result 2 of the new node.
	static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(Subtarget.hasAVX2() &&
	"MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

	MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
	SDLoc dl(Op);
	MVT VT = Op.getSimpleValueType();
	SDValue Index = N->getIndex();
	SDValue Mask = N->getMask();
	// Pass-through value operand of the gather.
	SDValue Src0 = N->getValue();
	MVT IndexVT = Index.getSimpleValueType();
	MVT MaskVT = Mask.getSimpleValueType();

	unsigned NumElts = VT.getVectorNumElements();
	assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

	// If the index is v2i32, we're being called by type legalization.
	if (IndexVT == MVT::v2i32)
	return SDValue();

	if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	!Index.getSimpleValueType().is512BitVector()) {
	// AVX512F supports only 512-bit vectors. Or data or index should
	// be 512 bit wide. If now the both index and data are 256-bit, but
	// the vector contains 8 elements, we just sign-extend the index
	if (NumElts == 8) {
	Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
	SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
	SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
	N->getMemOperand());
	// Result 2 is the MVT::Other entry of the VT list built above.
	return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
	}

	// Minimal number of elements in Gather
	NumElts = 8;
	// Index
	MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
	Index = ExtendToType(Index, NewIndexVT, DAG);
	if (IndexVT.getScalarType() == MVT::i32)
	Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

	// Mask
	MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
	// At this point we have promoted mask operand
	assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
	MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
	// Widen with zero fill (FillWithZeroes = true), then narrow to i1 lanes.
	Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
	Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);

	// The pass-through value
	MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
	Src0 = ExtendToType(Src0, NewVT, DAG);

	SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
	SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl, N->getMemoryVT(),
	N->getMemOperand());
	// Take the original-width subvector starting at index 0 of the wide result.
	SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
	NewGather.getValue(0),
	DAG.getIntPtrConstant(0, dl));
	SDValue RetOps[] = {Extract, NewGather.getValue(2)};
	return DAG.getMergeValues(RetOps, dl);
	}

	// No widening required: build the target gather with the operands as-is.
	SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
	SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
	N->getMemOperand());
	return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
	}

	/// Lower ISD::GC_TRANSITION_START to an X86::NOOP machine node.
	///
	/// The NOOP receives operand 0 of \p Op and, when \p Op has a glued node,
	/// the last operand of \p Op as well. It produces (MVT::Other, MVT::Glue).
	SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  // TODO: Eventually, the lowering of these nodes should be informed by or
	  // deferred to the GC strategy for the function in which they appear. For
	  // now, however, they must be lowered to something. Since they are logically
	  // no-ops in the case of a null GC strategy (or a GC strategy which does not
	  // require special handling for these nodes), lower them as literal NOOPs for
	  // the time being.
	  SmallVector<SDValue, 2> Ops;

	  Ops.push_back(Op.getOperand(0));
	  if (Op->getGluedNode())
	    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

	  // Use the named location instead of building a second SDLoc(Op) inline.
	  SDLoc OpDL(Op);
	  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

	  return NOOP;
	}

	/// Lower ISD::GC_TRANSITION_END to an X86::NOOP machine node.
	///
	/// The NOOP receives operand 0 of \p Op and, when \p Op has a glued node,
	/// the last operand of \p Op as well. It produces (MVT::Other, MVT::Glue).
	SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  // TODO: Eventually, the lowering of these nodes should be informed by or
	  // deferred to the GC strategy for the function in which they appear. For
	  // now, however, they must be lowered to something. Since they are logically
	  // no-ops in the case of a null GC strategy (or a GC strategy which does not
	  // require special handling for these nodes), lower them as literal NOOPs for
	  // the time being.
	  SmallVector<SDValue, 2> Ops;

	  Ops.push_back(Op.getOperand(0));
	  if (Op->getGluedNode())
	    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

	  // Use the named location instead of building a second SDLoc(Op) inline.
	  SDLoc OpDL(Op);
	  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

	  return NOOP;
	}

	/// Provide custom lowering hooks for some operations.
	///
	/// Pure dispatch: switches on the opcode of \p Op and forwards to the
	/// matching Lower* helper, returning whatever that helper returns. Some
	/// helpers additionally receive the Subtarget. An opcode without a case here
	/// reaches the llvm_unreachable in the default label.
	SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	switch (Op.getOpcode()) {
	default: llvm_unreachable("Should not custom lower this!");
	// Atomics and bit counting.
	case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
	return LowerCMP_SWAP(Op, Subtarget, DAG);
	case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
	case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
	case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
	// Vector construction, shuffling and element/subvector access.
	case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
	case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
	case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
	case ISD::VSELECT: return LowerVSELECT(Op, DAG);
	case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
	case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
	case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
	case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
	// Address-producing nodes.
	case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
	case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
	case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
	case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
	case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
	case ISD::SHL_PARTS:
	case ISD::SRA_PARTS:
	case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
	// Integer/floating-point conversions, extensions and truncation.
	case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
	case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
	case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
	case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
	case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
	case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
	case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
	// Note: ISD::LOAD is routed to the extended-load helper specifically.
	case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
	case ISD::FABS:
	case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
	case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
	case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
	// Comparisons, selects and control flow.
	case ISD::SETCC: return LowerSETCC(Op, DAG);
	case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
	case ISD::SELECT: return LowerSELECT(Op, DAG);
	case ISD::BRCOND: return LowerBRCOND(Op, DAG);
	case ISD::JumpTable: return LowerJumpTable(Op, DAG);
	// Varargs, intrinsics, frame/return address and EH support.
	case ISD::VASTART: return LowerVASTART(Op, DAG);
	case ISD::VAARG: return LowerVAARG(Op, DAG);
	case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
	case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
	// INTRINSIC_VOID shares the handler used for chained intrinsics.
	case ISD::INTRINSIC_VOID:
	case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
	case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
	case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
	case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
	case ISD::FRAME_TO_ARGS_OFFSET:
	return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
	case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
	case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
	case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
	case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
	case ISD::EH_SJLJ_SETUP_DISPATCH:
	return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
	case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
	case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
	case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
	// Integer arithmetic, shifts, rotates and overflow-reporting operations.
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
	case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
	case ISD::MULHS:
	case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
	case ISD::UMUL_LOHI:
	case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
	case ISD::ROTL:
	case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
	case ISD::SRA:
	case ISD::SRL:
	case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
	case ISD::SADDO:
	case ISD::UADDO:
	case ISD::SSUBO:
	case ISD::USUBO:
	case ISD::SMULO:
	case ISD::UMULO: return LowerXALUO(Op, DAG);
	case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
	case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
	case ISD::ADDCARRY:
	case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
	case ISD::ADD:
	case ISD::SUB: return LowerADD_SUB(Op, DAG);
	case ISD::SMAX:
	case ISD::SMIN:
	case ISD::UMAX:
	case ISD::UMIN: return LowerMINMAX(Op, DAG);
	case ISD::ABS: return LowerABS(Op, DAG);
	case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
	// Masked memory operations and GC transitions.
	case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
	case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
	case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
	case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
	case ISD::GC_TRANSITION_START:
	return LowerGC_TRANSITION_START(Op, DAG);
	case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
	// Note: ISD::STORE is routed to the truncating-store helper specifically.
	case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
	}
	}

	/// Run LowerOperation on result 0 of \p N and append the replacement values
	/// to \p Results, one per result of \p N (their number and types must exactly
	/// match the original return values of the node). If LowerOperation yields an
	/// empty SDValue, \p Results is left untouched, which indicates that the node
	/// is not to be custom lowered after all.
	void X86TargetLowering::LowerOperationWrapper(SDNode *N,
	                                              SmallVectorImpl<SDValue> &Results,
	                                              SelectionDAG &DAG) const {
	  SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);
	  if (Lowered.getNode() == nullptr)
	    return;

	  const unsigned NumWanted = N->getNumValues();
	  assert((NumWanted <= Lowered->getNumValues()) &&
	         "Lowering returned the wrong number of results!");

	  // Copy exactly as many values as N produces. The lowered node may carry
	  // extra trailing values (LowerSINT_TO_FP, for example, yields an additional
	  // chain as its last value); those are dropped.
	  for (unsigned ValNo = 0; ValNo < NumWanted; ++ValNo)
	    Results.push_back(Lowered.getValue(ValNo));
	}

	/// Replace a node with an illegal result type with a new node built out of
	/// custom code.
	///
	/// Replacement values are appended to \p Results. Several cases return
	/// without appending anything when the operand/result types are not ones
	/// they handle. The atomic RMW/load opcodes and unhandled MGATHER types
	/// simply break out of the switch, leaving \p Results untouched. An opcode
	/// with no case here hits llvm_unreachable.
	void X86TargetLowering::ReplaceNodeResults(SDNode *N,
	                                           SmallVectorImpl<SDValue>&Results,
	                                           SelectionDAG &DAG) const {
	  SDLoc dl(N);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  switch (N->getOpcode()) {
	  default:
	    llvm_unreachable("Do not know how to custom type legalize this operation!");
	  case X86ISD::AVG: {
	    // Legalize types for X86ISD::AVG by expanding vectors.
	    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

	    auto InVT = N->getValueType(0);
	    auto InVTSize = InVT.getSizeInBits();
	    // Round the input size up to 128, 256 or 512 bits.
	    const unsigned RegSize =
	        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
	    assert((Subtarget.hasBWI() || RegSize < 512) &&
	           "512-bit vector requires AVX512BW");
	    assert((Subtarget.hasAVX2() || RegSize < 256) &&
	           "256-bit vector requires AVX2");

	    auto ElemVT = InVT.getVectorElementType();
	    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
	                                  RegSize / ElemVT.getSizeInBits());
	    assert(RegSize % InVT.getSizeInBits() == 0);
	    unsigned NumConcat = RegSize / InVT.getSizeInBits();

	    // Pad each operand with undef sub-vectors up to the register width.
	    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
	    Ops[0] = N->getOperand(0);
	    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
	    Ops[0] = N->getOperand(1);
	    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

	    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
	    if (!ExperimentalVectorWideningLegalization)
	      Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
	                        DAG.getIntPtrConstant(0, dl));
	    Results.push_back(Res);
	    return;
	  }
	  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
	  case X86ISD::FMINC:
	  case X86ISD::FMIN:
	  case X86ISD::FMAXC:
	  case X86ISD::FMAX: {
	    EVT VT = N->getValueType(0);
	    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
	    SDValue UNDEF = DAG.getUNDEF(VT);
	    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	                              N->getOperand(0), UNDEF);
	    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	                              N->getOperand(1), UNDEF);
	    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
	    return;
	  }
	  case ISD::SDIV:
	  case ISD::UDIV:
	  case ISD::SREM:
	  case ISD::UREM:
	  case ISD::SDIVREM:
	  case ISD::UDIVREM: {
	    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
	    Results.push_back(V);
	    return;
	  }
	  case ISD::FP_TO_SINT:
	  case ISD::FP_TO_UINT: {
	    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

	    if (N->getValueType(0) == MVT::v2i32) {
	      assert((IsSigned || Subtarget.hasAVX512()) &&
	             "Can only handle signed conversion without AVX512");
	      assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	      SDValue Src = N->getOperand(0);
	      if (Src.getValueType() == MVT::v2f64) {
	        MVT ResVT = MVT::v4i32;
	        unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
	        if (!IsSigned && !Subtarget.hasVLX()) {
	          // Widen to 512-bits.
	          ResVT = MVT::v8i32;
	          Opc = ISD::FP_TO_UINT;
	          Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
	                            DAG.getUNDEF(MVT::v8f64),
	                            Src, DAG.getIntPtrConstant(0, dl));
	        }
	        SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
	        ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32
	                                                       : MVT::v2i32;
	        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
	                          DAG.getIntPtrConstant(0, dl));
	        Results.push_back(Res);
	        return;
	      }
	      if (Src.getValueType() == MVT::v2f32) {
	        SDValue Idx = DAG.getIntPtrConstant(0, dl);
	        SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	                                  DAG.getUNDEF(MVT::v2f32));
	        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
	                                   : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
	        if (!ExperimentalVectorWideningLegalization)
	          Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
	        Results.push_back(Res);
	        return;
	      }

	      // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
	      // so early out here.
	      return;
	    }

	    std::pair<SDValue,SDValue> Vals =
	        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
	    SDValue FIST = Vals.first, StackSlot = Vals.second;
	    if (FIST.getNode()) {
	      EVT VT = N->getValueType(0);
	      // Return a load from the stack slot.
	      if (StackSlot.getNode())
	        Results.push_back(
	            DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
	      else
	        Results.push_back(FIST);
	    }
	    return;
	  }
	  case ISD::SINT_TO_FP: {
	    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
	    SDValue Src = N->getOperand(0);
	    if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
	      return;
	    Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
	    return;
	  }
	  case ISD::UINT_TO_FP: {
	    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	    EVT VT = N->getValueType(0);
	    if (VT != MVT::v2f32)
	      return;
	    SDValue Src = N->getOperand(0);
	    EVT SrcVT = Src.getValueType();
	    if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
	      Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
	      return;
	    }
	    if (SrcVT != MVT::v2i32)
	      return;
	    // v2i32 source: zero-extend to v2i64, OR in the bit pattern of the bias
	    // constant, reinterpret as v2f64, subtract the bias, then round to f32.
	    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
	    SDValue VBias =
	        DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
	    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
	                             DAG.getBitcast(MVT::v2i64, VBias));
	    Or = DAG.getBitcast(MVT::v2f64, Or);
	    // TODO: Are there any fast-math-flags to propagate here?
	    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
	    Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
	    return;
	  }
	  case ISD::FP_ROUND: {
	    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
	      return;
	    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
	    Results.push_back(V);
	    return;
	  }
	  case ISD::FP_EXTEND: {
	    // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
	    // No other ValueType for FP_EXTEND should reach this point.
	    assert(N->getValueType(0) == MVT::v2f32 &&
	           "Do not know how to legalize this Node");
	    return;
	  }
	  case ISD::INTRINSIC_W_CHAIN: {
	    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	    // Every label of this inner switch returns or is unreachable, so control
	    // never falls through into the next outer case.
	    switch (IntNo) {
	    default : llvm_unreachable("Do not know how to custom type "
	                               "legalize this intrinsic operation!");
	    case Intrinsic::x86_rdtsc:
	      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
	                                     Results);
	    case Intrinsic::x86_rdtscp:
	      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
	                                     Results);
	    case Intrinsic::x86_rdpmc:
	      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);

	    case Intrinsic::x86_xgetbv:
	      return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
	    }
	  }
	  case ISD::INTRINSIC_WO_CHAIN: {
	    if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
	      Results.push_back(V);
	    return;
	  }
	  case ISD::READCYCLECOUNTER: {
	    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
	                                   Results);
	  }
	  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
	    EVT T = N->getValueType(0);
	    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
	    bool Regs64bit = T == MVT::i128;
	    MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
	    // Split the compare value (operand 2) into halves and copy them into
	    // (R|E)AX and (R|E)DX.
	    SDValue cpInL, cpInH;
	    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	                        DAG.getConstant(0, dl, HalfT));
	    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	                        DAG.getConstant(1, dl, HalfT));
	    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
	                             Regs64bit ? X86::RAX : X86::EAX,
	                             cpInL, SDValue());
	    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
	                             Regs64bit ? X86::RDX : X86::EDX,
	                             cpInH, cpInL.getValue(1));
	    // Split the swap value (operand 3); the high half goes to (R|E)CX.
	    SDValue swapInL, swapInH;
	    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	                          DAG.getConstant(0, dl, HalfT));
	    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	                          DAG.getConstant(1, dl, HalfT));
	    swapInH =
	        DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
	                         swapInH, cpInH.getValue(1));
	    // If the current function needs the base pointer, RBX,
	    // we shouldn't use cmpxchg directly.
	    // Indeed the lowering of that instruction will clobber
	    // that register and since RBX will be a reserved register
	    // the register allocator will not make sure its value will
	    // be properly saved and restored around this live-range.
	    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	    SDValue Result;
	    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
	    unsigned BasePtr = TRI->getBaseRegister();
	    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
	    if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
	        (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
	      // ISel prefers the LCMPXCHG64 variant.
	      // If that assert breaks, that means it is not the case anymore,
	      // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
	      // not just EBX. This is a matter of accepting i64 input for that
	      // pseudo, and restoring into the register of the right width
	      // in expand pseudo. Everything else should just work.
	      assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
	             "Saving only half of the RBX");
	      unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
	                                  : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
	      SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
	                                           Regs64bit ? X86::RBX : X86::EBX,
	                                           HalfT, swapInH.getValue(1));
	      SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
	                       RBXSave,
	                       /*Glue*/ RBXSave.getValue(2)};
	      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	    } else {
	      unsigned Opcode =
	          Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
	      swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
	                                 Regs64bit ? X86::RBX : X86::EBX, swapInL,
	                                 swapInH.getValue(1));
	      SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
	                       swapInL.getValue(1)};
	      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	    }
	    // Read the two result halves back from (R|E)AX and (R|E)DX.
	    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
	                                        Regs64bit ? X86::RAX : X86::EAX,
	                                        HalfT, Result.getValue(1));
	    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
	                                        Regs64bit ? X86::RDX : X86::EDX,
	                                        HalfT, cpOutL.getValue(2));
	    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

	    // The success flag is derived from EFLAGS via an X86::COND_E setcc.
	    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
	                                        MVT::i32, cpOutH.getValue(2));
	    SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
	    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

	    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
	    Results.push_back(Success);
	    Results.push_back(EFLAGS.getValue(1));
	    return;
	  }
	  case ISD::ATOMIC_SWAP:
	  case ISD::ATOMIC_LOAD_ADD:
	  case ISD::ATOMIC_LOAD_SUB:
	  case ISD::ATOMIC_LOAD_AND:
	  case ISD::ATOMIC_LOAD_OR:
	  case ISD::ATOMIC_LOAD_XOR:
	  case ISD::ATOMIC_LOAD_NAND:
	  case ISD::ATOMIC_LOAD_MIN:
	  case ISD::ATOMIC_LOAD_MAX:
	  case ISD::ATOMIC_LOAD_UMIN:
	  case ISD::ATOMIC_LOAD_UMAX:
	  case ISD::ATOMIC_LOAD: {
	    // Delegate to generic TypeLegalization. Situations we can really handle
	    // should have already been dealt with by AtomicExpandPass.cpp.
	    break;
	  }
	  case ISD::BITCAST: {
	    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	    EVT DstVT = N->getValueType(0);
	    EVT SrcVT = N->getOperand(0).getValueType();

	    // Only f64 -> v2i32 / v4i16 / v8i8 is handled here.
	    if (SrcVT != MVT::f64 ||
	        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
	      return;

	    unsigned NumElts = DstVT.getVectorNumElements();
	    EVT SVT = DstVT.getVectorElementType();
	    // Same element type as the destination, twice as many elements.
	    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
	    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
	                                   MVT::v2f64, N->getOperand(0));
	    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);

	    if (ExperimentalVectorWideningLegalization) {
	      // If we are legalizing vectors by widening, we already have the desired
	      // legal vector type, just return it.
	      Results.push_back(ToVecInt);
	      return;
	    }

	    // Otherwise rebuild the narrow result from the low NumElts elements.
	    SmallVector<SDValue, 8> Elts;
	    for (unsigned i = 0, e = NumElts; i != e; ++i)
	      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
	                                 ToVecInt, DAG.getIntPtrConstant(i, dl)));

	    Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
	    return;
	  }
	  case ISD::MGATHER: {
	    EVT VT = N->getValueType(0);
	    if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
	      auto *Gather = cast<MaskedGatherSDNode>(N);
	      SDValue Index = Gather->getIndex();
	      if (Index.getValueType() != MVT::v2i64)
	        return;
	      SDValue Mask = Gather->getMask();
	      assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	      SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	                                 Gather->getValue(),
	                                 DAG.getUNDEF(MVT::v2f32));
	      if (!Subtarget.hasVLX()) {
	        // We need to widen the mask, but the instruction will only use 2
	        // of its elements. So we can use undef.
	        Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	                           DAG.getUNDEF(MVT::v2i1));
	        Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
	      }
	      SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
	                        Index };
	      SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	          DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
	          Gather->getMemoryVT(), Gather->getMemOperand());
	      Results.push_back(Res);
	      Results.push_back(Res.getValue(2));
	      return;
	    }
	    if (VT == MVT::v2i32) {
	      auto *Gather = cast<MaskedGatherSDNode>(N);
	      SDValue Index = Gather->getIndex();
	      SDValue Mask = Gather->getMask();
	      assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
	      SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
	                                 Gather->getValue(),
	                                 DAG.getUNDEF(MVT::v2i32));
	      // If the index is v2i64 we can use it directly.
	      if (Index.getValueType() == MVT::v2i64 &&
	          (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
	        if (!Subtarget.hasVLX()) {
	          // We need to widen the mask, but the instruction will only use 2
	          // of its elements. So we can use undef.
	          Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	                             DAG.getUNDEF(MVT::v2i1));
	          Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
	        }
	        SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
	                          Index };
	        SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	            DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
	            Gather->getMemoryVT(), Gather->getMemOperand());
	        // The target gather node carries its chain as result #2.
	        SDValue Chain = Res.getValue(2);
	        if (!ExperimentalVectorWideningLegalization)
	          Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
	                            DAG.getIntPtrConstant(0, dl));
	        Results.push_back(Res);
	        Results.push_back(Chain);
	        return;
	      }
	      EVT IndexVT = Index.getValueType();
	      EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
	                                        IndexVT.getScalarType(), 4);
	      // Otherwise we need to custom widen everything to avoid promotion.
	      Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
	                          DAG.getUNDEF(IndexVT));
	      // Unlike the paths above, the added mask lanes are zero, not undef.
	      Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
	                         DAG.getConstant(0, dl, MVT::v2i1));
	      SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
	                        Index };
	      SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
	                                        Gather->getMemoryVT(), dl, Ops,
	                                        Gather->getMemOperand());
	      // The generic masked gather carries its chain as result #1.
	      SDValue Chain = Res.getValue(1);
	      if (!ExperimentalVectorWideningLegalization)
	        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
	                          DAG.getIntPtrConstant(0, dl));
	      Results.push_back(Res);
	      Results.push_back(Chain);
	      return;
	    }
	    break;
	  }
	  }
	}

	const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
	switch ((X86ISD::NodeType)Opcode) {
	case X86ISD::FIRST_NUMBER: break;
	case X86ISD::BSF: return "X86ISD::BSF";
	case X86ISD::BSR: return "X86ISD::BSR";
	case X86ISD::SHLD: return "X86ISD::SHLD";
	case X86ISD::SHRD: return "X86ISD::SHRD";
	case X86ISD::FAND: return "X86ISD::FAND";
	case X86ISD::FANDN: return "X86ISD::FANDN";
	case X86ISD::FOR: return "X86ISD::FOR";
	case X86ISD::FXOR: return "X86ISD::FXOR";
	case X86ISD::FILD: return "X86ISD::FILD";
	case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
	case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
	case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
	case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
	case X86ISD::FLD: return "X86ISD::FLD";
	case X86ISD::FST: return "X86ISD::FST";
	case X86ISD::CALL: return "X86ISD::CALL";
	case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
	case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
	case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
	case X86ISD::BT: return "X86ISD::BT";
	case X86ISD::CMP: return "X86ISD::CMP";
	case X86ISD::COMI: return "X86ISD::COMI";
	case X86ISD::UCOMI: return "X86ISD::UCOMI";
	case X86ISD::CMPM: return "X86ISD::CMPM";
	case X86ISD::CMPMU: return "X86ISD::CMPMU";
	case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
	case X86ISD::SETCC: return "X86ISD::SETCC";
	case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
	case X86ISD::FSETCC: return "X86ISD::FSETCC";
	case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
	case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
	case X86ISD::CMOV: return "X86ISD::CMOV";
	case X86ISD::BRCOND: return "X86ISD::BRCOND";
	case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
	case X86ISD::IRET: return "X86ISD::IRET";
	case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
	case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
	case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
	case X86ISD::Wrapper: return "X86ISD::Wrapper";
	case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
	case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
	case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
	case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
	case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
	case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
	case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
	case X86ISD::PINSRB: return "X86ISD::PINSRB";
	case X86ISD::PINSRW: return "X86ISD::PINSRW";
	case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
	case X86ISD::ANDNP: return "X86ISD::ANDNP";
	case X86ISD::BLENDI: return "X86ISD::BLENDI";
	case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
	case X86ISD::ADDUS: return "X86ISD::ADDUS";
	case X86ISD::SUBUS: return "X86ISD::SUBUS";
	case X86ISD::HADD: return "X86ISD::HADD";
	case X86ISD::HSUB: return "X86ISD::HSUB";
	case X86ISD::FHADD: return "X86ISD::FHADD";
	case X86ISD::FHSUB: return "X86ISD::FHSUB";
	case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
	case X86ISD::FMAX: return "X86ISD::FMAX";
	case X86ISD::FMAXS: return "X86ISD::FMAXS";
	case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
	case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND";
	case X86ISD::FMIN: return "X86ISD::FMIN";
	case X86ISD::FMINS: return "X86ISD::FMINS";
	case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
	case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
	case X86ISD::FMAXC: return "X86ISD::FMAXC";
	case X86ISD::FMINC: return "X86ISD::FMINC";
	case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
	case X86ISD::FRCP: return "X86ISD::FRCP";
	case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
	case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
	case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
	case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
	case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
	case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
	case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
	case X86ISD::EH_SJLJ_SETUP_DISPATCH:
	return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
	case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
	case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
	case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
	case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
	case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
	case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
	case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
	case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
	return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
	case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
	return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
	case X86ISD::LADD: return "X86ISD::LADD";
	case X86ISD::LSUB: return "X86ISD::LSUB";
	case X86ISD::LOR: return "X86ISD::LOR";
	case X86ISD::LXOR: return "X86ISD::LXOR";
	case X86ISD::LAND: return "X86ISD::LAND";
	case X86ISD::LINC: return "X86ISD::LINC";
	case X86ISD::LDEC: return "X86ISD::LDEC";
	case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
	case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
	case X86ISD::VZEXT: return "X86ISD::VZEXT";
	case X86ISD::VSEXT: return "X86ISD::VSEXT";
	case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
	case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
	case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
	case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
	case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
	case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
	case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
	case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
	case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
	case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
	case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
	case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
	case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
	case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
	case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
	case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
	case X86ISD::VSHL: return "X86ISD::VSHL";
	case X86ISD::VSRL: return "X86ISD::VSRL";
	case X86ISD::VSRA: return "X86ISD::VSRA";
	case X86ISD::VSHLI: return "X86ISD::VSHLI";
	case X86ISD::VSRLI: return "X86ISD::VSRLI";
	case X86ISD::VSRAI: return "X86ISD::VSRAI";
	case X86ISD::VSRAV: return "X86ISD::VSRAV";
	case X86ISD::VROTLI: return "X86ISD::VROTLI";
	case X86ISD::VROTRI: return "X86ISD::VROTRI";
	case X86ISD::VPPERM: return "X86ISD::VPPERM";
	case X86ISD::CMPP: return "X86ISD::CMPP";
	case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
	case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
	case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
	case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
	case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
	case X86ISD::ADD: return "X86ISD::ADD";
	case X86ISD::SUB: return "X86ISD::SUB";
	case X86ISD::ADC: return "X86ISD::ADC";
	case X86ISD::SBB: return "X86ISD::SBB";
	case X86ISD::SMUL: return "X86ISD::SMUL";
	case X86ISD::UMUL: return "X86ISD::UMUL";
	case X86ISD::SMUL8: return "X86ISD::SMUL8";
	case X86ISD::UMUL8: return "X86ISD::UMUL8";
	case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
	case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
	case X86ISD::INC: return "X86ISD::INC";
	case X86ISD::DEC: return "X86ISD::DEC";
	case X86ISD::OR: return "X86ISD::OR";
	case X86ISD::XOR: return "X86ISD::XOR";
	case X86ISD::AND: return "X86ISD::AND";
	case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
	case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
	case X86ISD::PTEST: return "X86ISD::PTEST";
	case X86ISD::TESTP: return "X86ISD::TESTP";
	case X86ISD::TESTM: return "X86ISD::TESTM";
	case X86ISD::TESTNM: return "X86ISD::TESTNM";
	case X86ISD::KORTEST: return "X86ISD::KORTEST";
	case X86ISD::KTEST: return "X86ISD::KTEST";
	case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
	case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
	case X86ISD::PACKSS: return "X86ISD::PACKSS";
	case X86ISD::PACKUS: return "X86ISD::PACKUS";
	case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
	case X86ISD::VALIGN: return "X86ISD::VALIGN";
	case X86ISD::VSHLD: return "X86ISD::VSHLD";
	case X86ISD::VSHRD: return "X86ISD::VSHRD";
	case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
	case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
	case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
	case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
	case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
	case X86ISD::SHUFP: return "X86ISD::SHUFP";
	case X86ISD::SHUF128: return "X86ISD::SHUF128";
	case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
	case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
	case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
	case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
	case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
	case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
	case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
	case X86ISD::MOVSD: return "X86ISD::MOVSD";
	case X86ISD::MOVSS: return "X86ISD::MOVSS";
	case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
	case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
	case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
	case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
	case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
	case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
	case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
	case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
	case X86ISD::VPERMV: return "X86ISD::VPERMV";
	case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
	case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
	case X86ISD::VPERMI: return "X86ISD::VPERMI";
	case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
	case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
	case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
	case X86ISD::VRANGE: return "X86ISD::VRANGE";
	case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
	case X86ISD::VRANGES: return "X86ISD::VRANGES";
	case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
	case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
	case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
	case X86ISD::PSADBW: return "X86ISD::PSADBW";
	case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
	case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
	case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
	case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
	case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
	case X86ISD::MFENCE: return "X86ISD::MFENCE";
	case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
	case X86ISD::SAHF: return "X86ISD::SAHF";
	case X86ISD::RDRAND: return "X86ISD::RDRAND";
	case X86ISD::RDSEED: return "X86ISD::RDSEED";
	case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
	case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
	case X86ISD::VPSHA: return "X86ISD::VPSHA";
	case X86ISD::VPSHL: return "X86ISD::VPSHL";
	case X86ISD::VPCOM: return "X86ISD::VPCOM";
	case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
	case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
	case X86ISD::FMSUB: return "X86ISD::FMSUB";
	case X86ISD::FNMADD: return "X86ISD::FNMADD";
	case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
	case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
	case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
	case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
	case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
	case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
	case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
	case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
	case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
	case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
	case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
	case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
	case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
	case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
	case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
	case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
	case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
	case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
	case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
	case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
	case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
	case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
	case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
	case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
	case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
	case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
	case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
	case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
	case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
	case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
	case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
	case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
	case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
	case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
	case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
	case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
	case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
	case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
	case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
	case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
	case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
	case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
	case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
	case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
	case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
	case X86ISD::XTEST: return "X86ISD::XTEST";
	case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
	case X86ISD::EXPAND: return "X86ISD::EXPAND";
	case X86ISD::SELECT: return "X86ISD::SELECT";
	case X86ISD::SELECTS: return "X86ISD::SELECTS";
	case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
	case X86ISD::RCP14: return "X86ISD::RCP14";
	case X86ISD::RCP14S: return "X86ISD::RCP14S";
	case X86ISD::RCP28: return "X86ISD::RCP28";
	case X86ISD::RCP28S: return "X86ISD::RCP28S";
	case X86ISD::EXP2: return "X86ISD::EXP2";
	case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
	case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
	case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
	case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
	case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
	case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
	case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
	case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
	case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
	case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
	case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
	case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
	case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
	case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
	case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
	case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
	case X86ISD::SCALEF: return "X86ISD::SCALEF";
	case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
	case X86ISD::ADDS: return "X86ISD::ADDS";
	case X86ISD::SUBS: return "X86ISD::SUBS";
	case X86ISD::AVG: return "X86ISD::AVG";
	case X86ISD::MULHRS: return "X86ISD::MULHRS";
	case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
	case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
	case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
	case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
	case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
	case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
	case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
	case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
	case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
	case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
	case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
	case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
	case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
	case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
	case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
	case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
	case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
	case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
	case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
	case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
	case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
	case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
	case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
	case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
	case X86ISD::LWPINS: return "X86ISD::LWPINS";
	case X86ISD::MGATHER: return "X86ISD::MGATHER";
	case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
	case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
	case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
	case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
	case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
	case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
	case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
	case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
	case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
	}
	return nullptr;
	}

	/// Return true if the addressing mode represented by AM is legal for this
	/// target, for a load/store of the specified type.
	bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
	const AddrMode &AM, Type *Ty,
	unsigned AS,
	Instruction *I) const {
	// X86 supports extremely general addressing modes.
	CodeModel::Model M = getTargetMachine().getCodeModel();

	// X86 allows a sign-extended 32-bit immediate field as a displacement.
	if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
	return false;

	if (AM.BaseGV) {
	unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

	// If a reference to this global requires an extra load, we can't fold it.
	if (isGlobalStubReference(GVFlags))
	return false;

	// If BaseGV requires a register for the PIC base, we cannot also have a
	// BaseReg specified.
	if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
	return false;

	// If lower 4G is not available, then we must use rip-relative addressing.
	if ((M != CodeModel::Small \|\| isPositionIndependent()) &&
	Subtarget.is64Bit() && (AM.BaseOffs \|\| AM.Scale > 1))
	return false;
	}

	switch (AM.Scale) {
	case 0:
	case 1:
	case 2:
	case 4:
	case 8:
	// These scales always work.
	break;
	case 3:
	case 5:
	case 9:
	// These scales are formed with basereg+scalereg. Only accept if there is
	// no basereg yet.
	if (AM.HasBaseReg)
	return false;
	break;
	default: // Other stuff never works.
	return false;
	}

	return true;
	}

	/// Report whether shifting a vector of type Ty by a single scalar amount is
	/// significantly cheaper than shifting by a fully general vector amount.
	bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
	  switch (Ty->getScalarSizeInBits()) {
	  case 8:
	    // 8-bit shifts are expensive either way; a scalar amount does not make
	    // them noticeably cheaper.
	    return false;
	  case 32:
	  case 64:
	    // With AVX2, vpsllv[dq] (and the other variable shifts) make per-element
	    // amounts as cheap as a scalar amount.
	    return !Subtarget.hasAVX2();
	  default:
	    // Everything else is much cheaper with a scalar shift amount.
	    return true;
	  }
	}

	bool X86TargetLowering::isTruncateFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 > NumBits2;
	}

	bool X86TargetLowering::allowTruncateForTailCall(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;

	if (!isTypeLegal(EVT::getEVT(Ty1)))
	return false;

	assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

	// Assuming the caller doesn't have a zeroext or signext return parameter,
	// truncation all the way down to i1 is valid.
	return true;
	}

	/// An integer-compare immediate is legal when it fits in a signed 32-bit
	/// field.
	bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
	  const bool FitsSImm32 = isInt<32>(Imm);
	  return FitsSImm32;
	}

	/// An add immediate is legal when it fits in a signed 32-bit field; negated
	/// immediates are covered as well because a sub can be used for them.
	bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
	  const bool FitsSImm32 = isInt<32>(Imm);
	  return FitsSImm32;
	}

	/// Report whether truncating a value of type VT1 to VT2 is free: both must
	/// be integer value types and VT1 must be strictly wider than VT2.
	bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
	  return VT1.isInteger() && VT2.isInteger() &&
	         VT1.getSizeInBits() > VT2.getSizeInBits();
	}

	bool X86TargetLowering::isZExtFree(Type Ty1, Type Ty2) const {
	// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
	return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
	}

	/// Report whether zero-extending VT1 to VT2 is free. This is only the case
	/// for i32 -> i64 on a 64-bit subtarget, where 32-bit results are implicitly
	/// zero-extended in 64-bit registers.
	bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
	  if (!(VT1 == MVT::i32))
	    return false;
	  if (!(VT2 == MVT::i64))
	    return false;
	  return Subtarget.is64Bit();
	}

	/// Report whether zero-extending the DAG value Val to type VT2 is free.
	///
	/// Besides the register case handled by the EVT overload, a value produced
	/// by a load of i8, i16 or i32 qualifies.
	bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  const EVT SrcVT = Val.getValueType();

	  // The plain type-to-type rule may already settle it.
	  if (isZExtFree(SrcVT, VT2))
	    return true;

	  // Beyond that, only loads are candidates.
	  if (Val.getOpcode() != ISD::LOAD)
	    return false;

	  const bool BothSimpleInts = SrcVT.isSimple() && SrcVT.isInteger() &&
	                              VT2.isSimple() && VT2.isInteger();
	  if (!BothSimpleInts)
	    return false;

	  switch (SrcVT.getSimpleVT().SimpleTy) {
	  case MVT::i8:
	  case MVT::i16:
	  case MVT::i32:
	    // X86 has 8, 16, and 32-bit zero-extending loads.
	    return true;
	  default:
	    return false;
	  }
	}

	/// Extending vector loads are always reported as desirable; the argument is
	/// not inspected.
	bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const {
	  return true;
	}

	/// Report whether a fused multiply-add on VT should be preferred over a
	/// separate multiply and add. True only when the subtarget has some form of
	/// FMA and the scalar element type of VT is f32 or f64.
	bool
	X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
	  if (!Subtarget.hasAnyFMA())
	    return false;

	  // Only the element type matters for vectors.
	  const EVT EltVT = VT.getScalarType();
	  if (!EltVT.isSimple())
	    return false;

	  switch (EltVT.getSimpleVT().SimpleTy) {
	  case MVT::f32:
	  case MVT::f64:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Narrowing is considered profitable for every type pair except
	/// i32 -> i16.
	bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
	  // i16 instructions are longer (0x66 prefix) and potentially slower.
	  const bool IsI32ToI16 = VT1 == MVT::i32 && VT2 == MVT::i16;
	  return !IsI32ToI16;
	}

	/// Hook through which a target states which VECTOR_SHUFFLE masks it
	/// supports for a given type. Here the mask itself is not inspected: the
	/// answer depends only on the vector type VT.
	bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
	  if (!VT.isSimple())
	    return false;

	  const MVT SimpleVT = VT.getSimpleVT();

	  // i1 vectors are excluded.
	  if (SimpleVT.getScalarType() == MVT::i1)
	    return false;

	  // Very little shuffling can be done for 64-bit vectors right now.
	  if (SimpleVT.getSizeInBits() == 64)
	    return false;

	  // Any mask is fine as long as the type being shuffled is legal; the
	  // lowering can handle whatever shuffle mask results.
	  return isTypeLegal(SimpleVT);
	}

	/// Clear masks are not treated specially: the result is whatever
	/// isShuffleMaskLegal reports for the same mask and type.
	bool
	X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
	                                          EVT VT) const {
	  const bool Legal = isShuffleMaskLegal(Mask, VT);
	  return Legal;
	}

	/// Jump tables are disallowed when the subtarget uses retpolines; otherwise
	/// the generic TargetLowering decision for Fn applies.
	bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
	  return !Subtarget.useRetpoline() && TargetLowering::areJTsAllowed(Fn);
	}

	//===----------------------------------------------------------------------===//
	// X86 Scheduler Hooks
	//===----------------------------------------------------------------------===//

	/// Utility function to emit xbegin specifying the start of an RTM region.
	///
	/// Splits MBB after MI into thisMBB -> {mainMBB, fallMBB} -> sinkMBB, emits
	/// the XBEGIN and the two definitions of the result, joins them with a PHI
	/// at the top of sinkMBB, and erases MI.
	///
	/// \param MI  the xbegin pseudo; operand 0 is the destination register.
	/// \param MBB the block containing MI.
	/// \param TII instruction info used to look up opcode descriptors.
	/// \returns sinkMBB, the block holding the instructions that followed MI.
	static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
	                                     const TargetInstrInfo *TII) {
	  DebugLoc DL = MI.getDebugLoc();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // For the v = xbegin(), we generate
	  //
	  // thisMBB:
	  //  xbegin fallMBB
	  //
	  // mainMBB:
	  //  s0 = -1
	  //
	  // fallMBB:
	  //  eax = # XABORT_DEF
	  //  s1 = eax
	  //
	  // sinkMBB:
	  //  v = phi(s0/mainMBB, s1/fallMBB)

	  MachineBasicBlock *thisMBB = MBB;
	  MachineFunction *MF = MBB->getParent();
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  // Layout order after MBB: mainMBB, fallMBB, sinkMBB.
	  MF->insert(I, mainMBB);
	  MF->insert(I, fallMBB);
	  MF->insert(I, sinkMBB);

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  unsigned DstReg = MI.getOperand(0).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  unsigned mainDstReg = MRI.createVirtualRegister(RC);
	  unsigned fallDstReg = MRI.createVirtualRegister(RC);

	  // thisMBB:
	  //  xbegin fallMBB
	  //  # fallthrough to mainMBB
	  //  # abortion to fallMBB
	  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(fallMBB);

	  // mainMBB:
	  //  mainDstReg := -1
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
	  BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  mainMBB->addSuccessor(sinkMBB);

	  // fallMBB:
	  //  ; pseudo instruction to model hardware's definition from XABORT
	  //  EAX := XABORT_DEF
	  //  fallDstReg := EAX
	  // No explicit jump is emitted here; sinkMBB was inserted directly after
	  // fallMBB above.
	  BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
	  BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
	      .addReg(X86::EAX);
	  fallMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  //  DstReg := phi(mainDstReg/mainMBB, fallDstReg/fallMBB)
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
	      .addReg(mainDstReg).addMBB(mainMBB)
	      .addReg(fallDstReg).addMBB(fallMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}

	// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
	// or XMM0_V32I8 in AVX all of this code can be replaced with that
	// in the .td file.
	//
	/// Expand a (V)PCMP{I,E}STRM128 REG/MEM pseudo: emit the matching rr/rm
	/// instruction in front of MI, copy XMM0 into the pseudo's result register
	/// (operand 0), and erase the pseudo.
	///
	/// \param MI  the pseudo instruction to expand.
	/// \param BB  the block containing MI.
	/// \param TII instruction info used to look up opcode descriptors.
	/// \returns BB; no control flow is introduced.
	static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
	                                       const TargetInstrInfo *TII) {
	  unsigned Opc;
	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("illegal opcode!");
	  case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
	  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
	  case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
	  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
	  case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
	  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
	  case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
	  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
	  }

	  DebugLoc dl = MI.getDebugLoc();
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

	  // Forward every operand except operand 0 (the result) and implicit
	  // register operands.
	  unsigned NumArgs = MI.getNumOperands();
	  for (unsigned i = 1; i < NumArgs; ++i) {
	    MachineOperand &Op = MI.getOperand(i);
	    if (!(Op.isReg() && Op.isImplicit()))
	      MIB.add(Op);
	  }
	  if (MI.hasOneMemOperand())
	    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

	  // The result is read back from XMM0.
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::XMM0);

	  MI.eraseFromParent();
	  return BB;
	}

	// FIXME: Custom handling because TableGen doesn't support multiple implicit
	// defs in an instruction pattern
	//
	/// Expand a (V)PCMP{I,E}STRI REG/MEM pseudo: emit the matching rr/rm
	/// instruction in front of MI, copy ECX into the pseudo's result register
	/// (operand 0), and erase the pseudo.
	///
	/// \param MI  the pseudo instruction to expand.
	/// \param BB  the block containing MI.
	/// \param TII instruction info used to look up opcode descriptors.
	/// \returns BB; no control flow is introduced.
	static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
	                                       const TargetInstrInfo *TII) {
	  unsigned Opc;
	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("illegal opcode!");
	  case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
	  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
	  case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
	  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
	  case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
	  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
	  case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
	  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
	  }

	  DebugLoc dl = MI.getDebugLoc();
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

	  // Forward the operands, leaving out the result (operand 0, hence the loop
	  // starts at 1) and implicit register operands.
	  unsigned NumArgs = MI.getNumOperands();
	  for (unsigned i = 1; i < NumArgs; ++i) {
	    MachineOperand &Op = MI.getOperand(i);
	    if (!(Op.isReg() && Op.isImplicit()))
	      MIB.add(Op);
	  }
	  if (MI.hasOneMemOperand())
	    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

	  // The result is read back from ECX.
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::ECX);

	  MI.eraseFromParent();
	  return BB;
	}

	/// Expand the WRPKRU pseudo: copy the input value (operand 0) into EAX,
	/// zero ECX and EDX, emit WRPKRUr, and erase the pseudo.
	///
	/// \param MI        the pseudo instruction to expand.
	/// \param BB        the block containing MI.
	/// \param Subtarget provides the instruction info.
	/// \returns BB; no control flow is introduced.
	static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	  // insert input VAL into EAX
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
	      .addReg(MI.getOperand(0).getReg());
	  // insert zero to ECX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

	  // insert zero to EDX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);

	  // insert WRPKRU instruction
	  BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	/// Expand the RDPKRU pseudo: zero ECX, emit RDPKRUr, copy EAX into the
	/// pseudo's result register (operand 0), and erase the pseudo.
	///
	/// \param MI        the pseudo instruction to expand.
	/// \param BB        the block containing MI.
	/// \param Subtarget provides the instruction info.
	/// \returns BB; no control flow is introduced.
	static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	  // insert zero to ECX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

	  // insert RDPKRU instruction
	  BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
	  // The value is read back from EAX.
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::EAX);

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	/// Expand a monitor-style pseudo whose operands are a memory address
	/// followed by two register values.
	///
	/// The address (the first X86::AddrNumOperands operands) is materialized
	/// with LEA into RAX (64-bit) or EAX (32-bit); the next two operands are
	/// copied into ECX and EDX; then the operand-less instruction Opc is emitted
	/// and the pseudo is erased.
	///
	/// \param MI        the pseudo instruction to expand.
	/// \param BB        the block containing MI.
	/// \param Subtarget provides the instruction info and 64-bit mode query.
	/// \param Opc       opcode of the real instruction to emit.
	/// \returns BB; no control flow is introduced.
	static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
	                                      const X86Subtarget &Subtarget,
	                                      unsigned Opc) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  // Address into RAX/EAX, other two args into ECX, EDX.
	  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
	  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
	  for (int i = 0; i < X86::AddrNumOperands; ++i)
	    MIB.add(MI.getOperand(i));

	  // The value operands follow directly after the address operands.
	  unsigned ValOps = X86::AddrNumOperands;
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
	      .addReg(MI.getOperand(ValOps).getReg());
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
	      .addReg(MI.getOperand(ValOps + 1).getReg());

	  // The instruction doesn't actually take any operands though.
	  BuildMI(*BB, MI, dl, TII->get(Opc));

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	/// Expand the CLZERO pseudo: materialize the address (the first
	/// X86::AddrNumOperands operands) with LEA into RAX (64-bit) or EAX
	/// (32-bit), emit the operand-less CLZEROr, and erase the pseudo.
	///
	/// \param MI        the pseudo instruction to expand (pointer; dereferenced
	///                  below).
	/// \param BB        the block containing MI.
	/// \param Subtarget provides the instruction info and 64-bit mode query.
	/// \returns BB; no control flow is introduced.
	static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI->getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  // Address into RAX/EAX
	  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
	  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
	  for (int i = 0; i < X86::AddrNumOperands; ++i)
	    MIB.add(MI->getOperand(i));

	  // The instruction doesn't actually take any operands though.
	  BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));

	  MI->eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}



	/// Expand the VAARG_64 pseudo into code that computes the address of the
	/// next variadic argument from a va_list and advances the va_list in memory.
	///
	/// When ArgMode is 0 the argument is taken from the overflow area only and
	/// no control flow is created. Otherwise MBB is split: the offset field is
	/// compared against a bound and the address comes either from the
	/// reg_save_area (offsetMBB) or from the overflow area (overflowMBB), joined
	/// by a PHI in endMBB.
	///
	/// Returns the block that holds the instructions which followed MI (MBB
	/// itself when no branch was created).
	MachineBasicBlock *
	X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	// Emit va_arg instruction on X86-64.

	// Operands to this pseudo-instruction:
	// 0 ) Output : destination address (reg)
	// 1-5) Input : va_list address (addr, i64mem)
	// 6 ) ArgSize : Size (in bytes) of vararg type
	// 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
	// 8 ) Align : Alignment of type
	// 9 ) EFLAGS (implicit-def)

	assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
	static_assert(X86::AddrNumOperands == 5,
	"VAARG_64 assumes 5 address operands");

	unsigned DestReg = MI.getOperand(0).getReg();
	MachineOperand &Base = MI.getOperand(1);
	MachineOperand &Scale = MI.getOperand(2);
	MachineOperand &Index = MI.getOperand(3);
	MachineOperand &Disp = MI.getOperand(4);
	MachineOperand &Segment = MI.getOperand(5);
	unsigned ArgSize = MI.getOperand(6).getImm();
	unsigned ArgMode = MI.getOperand(7).getImm();
	unsigned Align = MI.getOperand(8).getImm();

	// Memory Reference
	// The pseudo's memory operands are reused on every va_list access emitted
	// below.
	assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
	MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	// Machine Information
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
	const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
	const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
	DebugLoc DL = MI.getDebugLoc();

	// struct va_list {
	// i32 gp_offset
	// i32 fp_offset
	// i64 overflow_area (address)
	// i64 reg_save_area (address)
	// }
	// sizeof(va_list) = 24
	// alignment(va_list) = 8

	unsigned TotalNumIntRegs = 6;
	unsigned TotalNumXMMRegs = 8;
	bool UseGPOffset = (ArgMode == 1);
	bool UseFPOffset = (ArgMode == 2);
	// Upper bound for the offset field: 6 * 8 bytes for the integer registers,
	// plus 8 * 16 bytes for the XMM registers when fp_offset is used.
	unsigned MaxOffset = TotalNumIntRegs * 8 +
	(UseFPOffset ? TotalNumXMMRegs * 16 : 0);

	/* Align ArgSize to a multiple of 8 */
	unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
	bool NeedsAlign = (Align > 8);

	MachineBasicBlock *thisMBB = MBB;
	MachineBasicBlock *overflowMBB;
	MachineBasicBlock *offsetMBB;
	MachineBasicBlock *endMBB;

	unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
	unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
	unsigned OffsetReg = 0;

	if (!UseGPOffset && !UseFPOffset) {
	// If we only pull from the overflow region, we don't create a branch.
	// We don't need to alter control flow.
	OffsetDestReg = 0; // unused
	OverflowDestReg = DestReg;

	offsetMBB = nullptr;
	overflowMBB = thisMBB;
	endMBB = thisMBB;
	} else {
	// First emit code to check if gp_offset (or fp_offset) is below the bound.
	// If so, pull the argument from reg_save_area. (branch to offsetMBB)
	// If not, pull from overflow_area. (branch to overflowMBB)
	//
	//       thisMBB
	//         |     .
	//         |        .
	//     offsetMBB   overflowMBB
	//         |        .
	//         |     .
	//        endMBB

	// Registers for the PHI in endMBB
	OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
	OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

	const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	MachineFunction *MF = MBB->getParent();
	overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	MachineFunction::iterator MBBIter = ++MBB->getIterator();

	// Insert the new basic blocks
	// Layout order after MBB: offsetMBB, overflowMBB, endMBB.
	MF->insert(MBBIter, offsetMBB);
	MF->insert(MBBIter, overflowMBB);
	MF->insert(MBBIter, endMBB);

	// Transfer the remainder of MBB and its successor edges to endMBB.
	endMBB->splice(endMBB->begin(), thisMBB,
	std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
	endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

	// Make offsetMBB and overflowMBB successors of thisMBB
	thisMBB->addSuccessor(offsetMBB);
	thisMBB->addSuccessor(overflowMBB);

	// endMBB is a successor of both offsetMBB and overflowMBB
	offsetMBB->addSuccessor(endMBB);
	overflowMBB->addSuccessor(endMBB);

	// Load the offset value into a register
	// (gp_offset is at displacement 0, fp_offset at displacement 4.)
	OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// Check if there is enough room left to pull this argument.
	BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
	.addReg(OffsetReg)
	.addImm(MaxOffset + 8 - ArgSizeA8);

	// Branch to "overflowMBB" if offset >= max
	// Fall through to "offsetMBB" otherwise
	BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
	.addMBB(overflowMBB);
	}

	// In offsetMBB, emit code to use the reg_save_area.
	if (offsetMBB) {
	assert(OffsetReg != 0);

	// Read the reg_save_area address.
	// (reg_save_area is at displacement 16 in the va_list.)
	unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 16)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// Zero-extend the offset
	unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
	.addImm(0)
	.addReg(OffsetReg)
	.addImm(X86::sub_32bit);

	// Add the offset to the reg_save_area to get the final address.
	BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
	.addReg(OffsetReg64)
	.addReg(RegSaveReg);

	// Compute the offset for the next argument
	// (advance by 16 bytes for fp_offset, 8 bytes for gp_offset.)
	unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
	.addReg(OffsetReg)
	.addImm(UseFPOffset ? 16 : 8);

	// Store it back into the va_list.
	BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.addReg(NextOffsetReg)
	.setMemRefs(MMOBegin, MMOEnd);

	// Jump to endMBB
	BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
	.addMBB(endMBB);
	}

	//
	// Emit code to use overflow area
	//

	// Load the overflow_area address into a register.
	// (overflow_area is at displacement 8 in the va_list.)
	unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// If we need to align it, do so. Otherwise, just copy the address
	// to OverflowDestReg.
	if (NeedsAlign) {
	// Align the overflow address
	assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
	unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

	// aligned_addr = (addr + (align-1)) & ~(align-1)
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
	.addReg(OverflowAddrReg)
	.addImm(Align-1);

	BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
	.addReg(TmpReg)
	.addImm(~(uint64_t)(Align-1));
	} else {
	BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
	.addReg(OverflowAddrReg);
	}

	// Compute the next overflow address after this argument.
	// (the overflow address should be kept 8-byte aligned)
	unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
	.addReg(OverflowDestReg)
	.addImm(ArgSizeA8);

	// Store the new overflow address.
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.addReg(NextAddrReg)
	.setMemRefs(MMOBegin, MMOEnd);

	// If we branched, emit the PHI to the front of endMBB.
	if (offsetMBB) {
	BuildMI(*endMBB, endMBB->begin(), DL,
	TII->get(X86::PHI), DestReg)
	.addReg(OffsetDestReg).addMBB(offsetMBB)
	.addReg(OverflowDestReg).addMBB(overflowMBB);
	}

	// Erase the pseudo instruction
	MI.eraseFromParent();

	return endMBB;
	}

	/// Lowers the pseudo instruction \p MI by storing the XMM argument registers
	/// into the register save area, splitting \p MBB around the stores.
	///
	/// Operands read from \p MI:
	///   0      - register holding the XMM register count (%al per the ABI);
	///   1      - frame index of the register save area (immediate);
	///   2      - offset of the first XMM slot inside that area (immediate);
	///   3..N-2 - the registers to store, placed 16 bytes apart;
	///   N-1    - never stored (asserted to be EFLAGS when it is a register).
	///
	/// Unless the function uses a Win64 calling convention, a TEST/JE pair is
	/// emitted so that the stores are skipped when the count register is zero.
	///
	/// \returns the new block that received the instructions following \p MI.
	MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
	    MachineInstr &MI, MachineBasicBlock *MBB) const {
	  // Emit code to save XMM registers to the stack. The ABI says that the
	  // number of registers to save is given in %al, so it's theoretically
	  // possible to do an indirect jump trick to avoid saving all of them,
	  // however this code takes a simpler approach and just executes all
	  // of the stores if %al is non-zero. It's less code, and it's probably
	  // easier on the hardware branch predictor, and stores aren't all that
	  // expensive anyway.

	  // Create the new basic blocks. One block contains all the XMM stores,
	  // and one block is the final destination regardless of whether any
	  // stores were performed.
	  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	  MachineFunction *F = MBB->getParent();
	  MachineFunction::iterator MBBIter = ++MBB->getIterator();
	  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  F->insert(MBBIter, XMMSaveMBB);
	  F->insert(MBBIter, EndMBB);

	  // Transfer the remainder of MBB and its successor edges to EndMBB.
	  EndMBB->splice(EndMBB->begin(), MBB,
	                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // The original block will now fall through to the XMM save block.
	  MBB->addSuccessor(XMMSaveMBB);
	  // The XMMSaveMBB will fall through to the end block.
	  XMMSaveMBB->addSuccessor(EndMBB);

	  // Now add the instructions.
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  unsigned CountReg = MI.getOperand(0).getReg();
	  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
	  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

	  if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
	    // If %al is 0, branch around the XMM save block.
	    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
	    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
	    MBB->addSuccessor(EndMBB);
	  }

	  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
	  // that was just emitted, but clearly shouldn't be "saved".
	  assert((MI.getNumOperands() <= 3 ||
	          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
	          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
	         "Expected last argument to be EFLAGS");
	  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
	  // In the XMM save block, save all the XMM argument registers. The bound is
	  // checked with '<' so that an instruction carrying no register operands
	  // (which the assert above permits) never indexes past its operand list.
	  for (int i = 3, e = MI.getNumOperands() - 1; i < e; ++i) {
	    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
	    MachineMemOperand *MMO = F->getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
	        MachineMemOperand::MOStore,
	        /*Size=*/16, /*Align=*/16);
	    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
	        .addFrameIndex(RegSaveFrameIndex)
	        .addImm(/*Scale=*/1)
	        .addReg(/*IndexReg=*/0)
	        .addImm(/*Disp=*/Offset)
	        .addReg(/*Segment=*/0)
	        .addReg(MI.getOperand(i).getReg())
	        .addMemOperand(MMO);
	  }

	  MI.eraseFromParent(); // The pseudo instruction is gone now.

	  return EndMBB;
	}

	// ISel may leave the EFLAGS use on SelectItr without a kill marker when
	// EFLAGS had several uses and it could not tell which one was last. This
	// helper decides whether SelectItr is in fact the last reader:
	//  - a later instruction in BB that reads EFLAGS means "not killed";
	//  - a later instruction in BB that redefines EFLAGS means "killed";
	//  - otherwise EFLAGS is killed unless some successor lists it as live-in.
	// When the answer is "killed" the marker is added to SelectItr. Returns the
	// resulting kill state (true = killed here).
	static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
	                                     MachineBasicBlock* BB,
	                                     const TargetRegisterInfo* TRI) {
	  const MachineBasicBlock::iterator BlockEnd = BB->end();
	  MachineBasicBlock::iterator Cursor = std::next(SelectItr);
	  bool RedefinedInBlock = false;

	  // Walk the instructions that follow the select inside this block.
	  while (Cursor != BlockEnd) {
	    if (Cursor->readsRegister(X86::EFLAGS))
	      return false;
	    if (Cursor->definesRegister(X86::EFLAGS)) {
	      RedefinedInBlock = true;
	      break;
	    }
	    ++Cursor;
	  }

	  // Fell off the end of the block: EFLAGS survives only if a successor
	  // expects it on entry.
	  if (!RedefinedInBlock) {
	    MachineBasicBlock::succ_iterator SuccIt = BB->succ_begin();
	    const MachineBasicBlock::succ_iterator SuccEnd = BB->succ_end();
	    for (; SuccIt != SuccEnd; ++SuccIt) {
	      if ((*SuccIt)->isLiveIn(X86::EFLAGS))
	        return false;
	    }
	  }

	  // Either a redefinition was found or nothing downstream consumes the
	  // flags, so the select is the final use.
	  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
	  return true;
	}

	// Tells whether MI is one of the CMOV pseudo-opcodes that may be grouped
	// with neighbouring CMOV pseudos into a single basic block guarded by one
	// conditional jump.
	static bool isCMOVPseudo(MachineInstr &MI) {
	  bool IsCascadable = false;

	  switch (MI.getOpcode()) {
	  // Scalar floating point and x87.
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR64:
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  // General purpose registers.
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  // Vector value types.
	  case X86::CMOV_V2F64:
	  case X86::CMOV_V2I64:
	  case X86::CMOV_V4F32:
	  case X86::CMOV_V4F64:
	  case X86::CMOV_V4I64:
	  case X86::CMOV_V8F32:
	  case X86::CMOV_V8F64:
	  case X86::CMOV_V8I64:
	  case X86::CMOV_V16F32:
	  // Mask (i1 vector) types.
	  case X86::CMOV_V8I1:
	  case X86::CMOV_V16I1:
	  case X86::CMOV_V32I1:
	  case X86::CMOV_V64I1:
	    IsCascadable = true;
	    break;

	  default:
	    break;
	  }

	  return IsCascadable;
	}

	// Helper function, which inserts PHI functions into SinkMBB:
	//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
	// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
	// in the [MIItBegin, MIItEnd) range.
	//
	// The condition code of the first CMOV (operand 3) is the reference; a CMOV
	// whose condition is the opposite one gets its two value operands swapped.
	// The blocks are passed by pointer: they are only used as PHI predecessors
	// and to reach the parent function.
	//
	// Returns the MachineInstrBuilder of the last PHI inserted (a
	// default-constructed builder when the range is empty).
	static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
	    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
	    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
	    MachineBasicBlock *SinkMBB) {
	  MachineFunction *MF = TrueMBB->getParent();
	  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
	  DebugLoc DL = MIItBegin->getDebugLoc();

	  X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
	  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);

	  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

	  // As we are creating the PHIs, we have to be careful if there is more than
	  // one. Later CMOVs may reference the results of earlier CMOVs, but later
	  // PHIs have to reference the individual true/false inputs from earlier PHIs.
	  // That also means that PHI construction must work forward from earlier to
	  // later, and that the code must maintain a mapping from earlier PHI's
	  // destination registers, and the registers that went into the PHI.
	  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
	  MachineInstrBuilder MIB;

	  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
	    unsigned DestReg = MIIt->getOperand(0).getReg();
	    unsigned Op1Reg = MIIt->getOperand(1).getReg();
	    unsigned Op2Reg = MIIt->getOperand(2).getReg();

	    // If this CMOV we are generating is the opposite condition from
	    // the jump we generated, then we have to swap the operands for the
	    // PHI that is going to be generated.
	    if (MIIt->getOperand(3).getImm() == OppCC)
	      std::swap(Op1Reg, Op2Reg);

	    // An operand produced by an earlier PHI of this group is replaced by the
	    // value that PHI receives along the matching edge (single lookup each).
	    auto Op1It = RegRewriteTable.find(Op1Reg);
	    if (Op1It != RegRewriteTable.end())
	      Op1Reg = Op1It->second.first;

	    auto Op2It = RegRewriteTable.find(Op2Reg);
	    if (Op2It != RegRewriteTable.end())
	      Op2Reg = Op2It->second.second;

	    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
	              .addReg(Op1Reg)
	              .addMBB(FalseMBB)
	              .addReg(Op2Reg)
	              .addMBB(TrueMBB);

	    // Add this PHI to the rewrite table.
	    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
	  }

	  return MIB;
	}

	// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
	//
	// FirstCMOV and SecondCascadedCMOV are the two pseudo instructions located in
	// ThisMBB. Both are replaced by two conditional branches to a common sink
	// block, one PHI and one COPY. Returns the sink block, which receives every
	// instruction that followed FirstCMOV in ThisMBB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
	MachineInstr &SecondCascadedCMOV,
	MachineBasicBlock *ThisMBB) const {
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	DebugLoc DL = FirstCMOV.getDebugLoc();

	// We lower cascaded CMOVs such as
	//
	//   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
	//
	// to two successive branches.
	//
	// Without this, we would add a PHI between the two jumps, which ends up
	// creating a few copies all around. For instance, for
	//
	//   (sitofp (zext (fcmp une)))
	//
	// we would generate:
	//
	//   ucomiss %xmm1, %xmm0
	//   movss <1.0f>, %xmm0
	//   movaps %xmm0, %xmm1
	//   jne .LBB5_2
	//   xorps %xmm1, %xmm1
	//   .LBB5_2:
	//   jp .LBB5_4
	//   movaps %xmm1, %xmm0
	//   .LBB5_4:
	//   retq
	//
	// because this custom-inserter would have generated:
	//
	//   A
	//   | \
	//   |  B
	//   | /
	//   C
	//   | \
	//   |  D
	//   | /
	//   E
	//
	// A: X = ...; Y = ...
	// B: empty
	// C: Z = PHI [X, A], [Y, B]
	// D: empty
	// E: PHI [X, C], [Z, D]
	//
	// If we lower both CMOVs in a single step, we can instead generate:
	//
	//   A
	//   | \
	//   |  C
	//   | /|
	//   |/ |
	//   |  |
	//   |  D
	//   | /
	//   E
	//
	// A: X = ...; Y = ...
	// D: empty
	// E: PHI [X, A], [X, C], [Y, D]
	//
	// Which, in our sitofp/fcmp example, gives us something like:
	//
	//   ucomiss %xmm1, %xmm0
	//   movss <1.0f>, %xmm0
	//   jne .LBB5_4
	//   jp .LBB5_4
	//   xorps %xmm0, %xmm0
	//   .LBB5_4:
	//   retq
	//

	// We lower cascaded CMOV into two successive branches to the same block.
	// EFLAGS is used by both, so mark it as live in the second.
	const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
	MachineFunction *F = ThisMBB->getParent();
	MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

	// Layout order after ThisMBB: FirstInsertedMBB, SecondInsertedMBB, SinkMBB.
	MachineFunction::iterator It = ++ThisMBB->getIterator();
	F->insert(It, FirstInsertedMBB);
	F->insert(It, SecondInsertedMBB);
	F->insert(It, SinkMBB);

	// For a cascaded CMOV, we lower it to two successive branches to
	// the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
	// the FirstInsertedMBB.
	FirstInsertedMBB->addLiveIn(X86::EFLAGS);

	// If the EFLAGS register isn't dead in the terminator, then claim that it's
	// live into the sink and copy blocks.
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
	!checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
	SecondInsertedMBB->addLiveIn(X86::EFLAGS);
	SinkMBB->addLiveIn(X86::EFLAGS);
	}

	// Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
	SinkMBB->splice(SinkMBB->begin(), ThisMBB,
	std::next(MachineBasicBlock::iterator(FirstCMOV)),
	ThisMBB->end());
	SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

	// Fallthrough block for ThisMBB.
	ThisMBB->addSuccessor(FirstInsertedMBB);
	// The true block target of the first branch is always SinkMBB.
	ThisMBB->addSuccessor(SinkMBB);
	// Fallthrough block for FirstInsertedMBB.
	FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
	// The true block for the branch of FirstInsertedMBB.
	FirstInsertedMBB->addSuccessor(SinkMBB);
	// This is fallthrough.
	SecondInsertedMBB->addSuccessor(SinkMBB);

	// Create the conditional branch instructions.
	X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
	unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
	BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);

	X86::CondCode SecondCC =
	X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
	unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
	BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);

	// SinkMBB:
	// %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
	unsigned DestReg = FirstCMOV.getOperand(0).getReg();
	unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
	unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
	MachineInstrBuilder MIB =
	BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
	.addReg(Op1Reg)
	.addMBB(SecondInsertedMBB)
	.addReg(Op2Reg)
	.addMBB(ThisMBB);

	// The edge from FirstInsertedMBB provides the same incoming value as the
	// edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
	MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
	// Copy the PHI result to the register defined by the second CMOV.
	BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
	TII->get(TargetOpcode::COPY),
	SecondCascadedCMOV.getOperand(0).getReg())
	.addReg(FirstCMOV.getOperand(0).getReg());

	// Now remove the CMOVs.
	FirstCMOV.eraseFromParent();
	SecondCascadedCMOV.eraseFromParent();

	return SinkMBB;
	}

	/// Lowers the CMOV pseudo instruction \p MI, together with any directly
	/// following CMOV pseudos that test the same or the exactly opposite
	/// condition, into a conditional branch plus PHI nodes.
	///
	/// A pair matching the cascaded pattern described under "Case 2" below is
	/// handed to EmitLoweredCascadedSelect instead.
	///
	/// \param MI      first (or only) CMOV pseudo of the group; operand 3 holds
	///                its condition code.
	/// \param ThisMBB block containing \p MI.
	/// \returns the sink block that now holds the PHIs and every instruction that
	///          followed the lowered CMOV group.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
	                                     MachineBasicBlock *ThisMBB) const {
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  // To "insert" a SELECT_CC instruction, we actually have to insert the
	  // diamond control-flow pattern. The incoming instruction knows the
	  // destination vreg to set, the condition code register to branch on, the
	  // true/false values to select between and a branch opcode to use.

	  //  ThisMBB:
	  //  ...
	  //   TrueVal = ...
	  //   cmpTY ccX, r1, r2
	  //   bCC SinkMBB
	  //   fallthrough --> FalseMBB

	  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
	  // as described above, by inserting a BB, and then making a PHI at the join
	  // point to select the true and false operands of the CMOV in the PHI.
	  //
	  // The code also handles two different cases of multiple CMOV opcodes
	  // in a row.
	  //
	  // Case 1:
	  // In this case, there are multiple CMOVs in a row, all which are based on
	  // the same condition setting (or the exact opposite condition setting).
	  // In this case we can lower all the CMOVs using a single inserted BB, and
	  // then make a number of PHIs at the join point to model the CMOVs. The only
	  // trickiness here, is that in a case like:
	  //
	  // t2 = CMOV cond1 t1, f1
	  // t3 = CMOV cond1 t2, f2
	  //
	  // when rewriting this into PHIs, we have to perform some renaming on the
	  // temps since you cannot have a PHI operand refer to a PHI result earlier
	  // in the same block. The "simple" but wrong lowering would be:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t2(BB1), f2(BB2)
	  //
	  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
	  // renaming is to note that on the path through BB1, t2 is really just a
	  // copy of t1, and do that renaming, properly generating:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t1(BB1), f2(BB2)
	  //
	  // Case 2:
	  // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
	  // function - EmitLoweredCascadedSelect.

	  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
	  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
	  MachineInstr *LastCMOV = &MI;
	  MachineBasicBlock::iterator NextMIIt =
	      std::next(MachineBasicBlock::iterator(MI));

	  // Check for case 1, where there are multiple CMOVs with the same condition
	  // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
	  // number of jumps the most.

	  if (isCMOVPseudo(MI)) {
	    // See if we have a string of CMOVS with the same condition.
	    while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
	           (NextMIIt->getOperand(3).getImm() == CC ||
	            NextMIIt->getOperand(3).getImm() == OppCC)) {
	      LastCMOV = &*NextMIIt;
	      ++NextMIIt;
	    }
	  }

	  // This checks for case 2, but only do this if we didn't already find
	  // case 1, as indicated by LastCMOV == MI.
	  if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
	      NextMIIt->getOpcode() == MI.getOpcode() &&
	      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
	      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
	      NextMIIt->getOperand(1).isKill()) {
	    return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
	  }

	  const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
	  MachineFunction *F = ThisMBB->getParent();
	  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);

	  MachineFunction::iterator It = ++ThisMBB->getIterator();
	  F->insert(It, FalseMBB);
	  F->insert(It, SinkMBB);

	  // If the EFLAGS register isn't dead in the terminator, then claim that it's
	  // live into the sink and copy blocks.
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  if (!LastCMOV->killsRegister(X86::EFLAGS) &&
	      !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
	    FalseMBB->addLiveIn(X86::EFLAGS);
	    SinkMBB->addLiveIn(X86::EFLAGS);
	  }

	  // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
	  SinkMBB->splice(SinkMBB->begin(), ThisMBB,
	                  std::next(MachineBasicBlock::iterator(LastCMOV)),
	                  ThisMBB->end());
	  SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);

	  // Fallthrough block for ThisMBB.
	  ThisMBB->addSuccessor(FalseMBB);
	  // The true block target of the first (or only) branch is always a SinkMBB.
	  ThisMBB->addSuccessor(SinkMBB);
	  // Fallthrough block for FalseMBB.
	  FalseMBB->addSuccessor(SinkMBB);

	  // Create the conditional branch instruction.
	  unsigned Opc = X86::GetCondBranchFromCond(CC);
	  BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);

	  //  SinkMBB:
	  //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
	  //  ...
	  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
	  MachineBasicBlock::iterator MIItEnd =
	      std::next(MachineBasicBlock::iterator(LastCMOV));
	  createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);

	  // Now remove the CMOV(s).
	  ThisMBB->erase(MIItBegin, MIItEnd);

	  return SinkMBB;
	}

	/// Expands a RELEASE_FADD32mr / RELEASE_FADD64mr pseudo into an FP add that
	/// reads its second input from memory, followed by a store of the sum back to
	/// the same address. Any other opcode is a programming error.
	///
	/// \param MI pseudo instruction; its first X86::AddrNumOperands operands form
	///           the memory address and the operand right after them is the
	///           register value to add.
	/// \param BB block containing \p MI; returned unchanged in identity.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  // Combine the following atomic floating-point modification pattern:
	  //   a.store(reg OP a.load(acquire), release)
	  // Transform them into:
	  //   OPss (%gpr), %xmm
	  //   movss %xmm, (%gpr)
	  // Or sd equivalent for 64-bit operations.
	  unsigned StoreOpc, ArithOpc;
	  switch (MI.getOpcode()) {
	  case X86::RELEASE_FADD32mr:
	    ArithOpc = X86::ADDSSrm;
	    StoreOpc = X86::MOVSSmr;
	    break;
	  case X86::RELEASE_FADD64mr:
	    ArithOpc = X86::ADDSDrm;
	    StoreOpc = X86::MOVSDmr;
	    break;
	  default:
	    llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
	  }

	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

	  // The value operand sits immediately after the address operands.
	  const unsigned SrcReg = MI.getOperand(X86::AddrNumOperands).getReg();
	  const unsigned SumReg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));

	  // Arithmetic instruction: SumReg = SrcReg OP [address].
	  MachineInstrBuilder Arith =
	      BuildMI(*BB, MI, DL, TII->get(ArithOpc), SumReg).addReg(SrcReg);
	  for (int Idx = 0; Idx != X86::AddrNumOperands; ++Idx) {
	    MachineOperand &AddrOp = MI.getOperand(Idx);
	    // The address is reused by the store below, so no register in it may
	    // be marked as killed by the first user.
	    if (AddrOp.isReg())
	      AddrOp.setIsKill(false);
	    Arith.add(AddrOp);
	  }

	  // Store instruction: [address] = SumReg, which dies here.
	  MachineInstrBuilder Store = BuildMI(*BB, MI, DL, TII->get(StoreOpc));
	  for (int Idx = 0; Idx != X86::AddrNumOperands; ++Idx)
	    Store.add(MI.getOperand(Idx));
	  Store.addReg(SumReg, RegState::Kill);

	  // The pseudo instruction is gone now.
	  MI.eraseFromParent();
	  return BB;
	}

	/// Lowers a dynamic stack allocation pseudo for functions compiled with split
	/// stacks (asserted via MF->shouldSplitStack()).
	///
	/// The requested size (operand 1 of \p MI) is subtracted from the stack
	/// pointer and the result is compared with the value stored at
	/// TlsReg:TlsOffset. Depending on the outcome, the memory either comes from
	/// bumping the stack pointer or from a call to
	/// __morestack_allocate_stack_space. The resulting pointer is merged with a
	/// PHI into operand 0 of \p MI.
	///
	/// \returns the block that holds the PHI and the rest of the original \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
	                                        MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  const BasicBlock *LLVM_BB = BB->getBasicBlock();

	  assert(MF->shouldSplitStack());

	  const bool Is64Bit = Subtarget.is64Bit();
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();

	  // Segment register and displacement used for the limit comparison below.
	  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
	  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;

	  // BB:
	  //  ... [Till the alloca]
	  // If stacklet is not large enough, jump to mallocMBB
	  //
	  // bumpMBB:
	  //  Allocate by subtracting from RSP
	  //  Jump to continueMBB
	  //
	  // mallocMBB:
	  //  Allocate by call to runtime
	  //
	  // continueMBB:
	  //  ...
	  //  [rest of original BB]
	  //

	  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  const TargetRegisterClass *AddrRegClass =
	      getRegClassFor(getPointerTy(MF->getDataLayout()));

	  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
	           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
	           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
	           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
	           sizeVReg = MI.getOperand(1).getReg(),
	           physSPReg =
	               IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;

	  MachineFunction::iterator MBBIter = ++BB->getIterator();

	  MF->insert(MBBIter, bumpMBB);
	  MF->insert(MBBIter, mallocMBB);
	  MF->insert(MBBIter, continueMBB);

	  continueMBB->splice(continueMBB->begin(), BB,
	                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
	  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

	  // Add code to the main basic block to check if the stack limit has been hit,
	  // and if so, jump to mallocMBB otherwise to bumpMBB.
	  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
	      .addReg(tmpSPVReg).addReg(sizeVReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
	      .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
	      .addReg(SPLimitVReg);
	  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);

	  // bumpMBB simply decreases the stack pointer, since we know the current
	  // stacklet has enough space.
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
	      .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
	      .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Calls into a routine in libgcc to allocate more space from the heap.
	  const uint32_t *RegMask =
	      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
	  if (IsLP64) {
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
	        .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::RDI, RegState::Implicit)
	        .addReg(X86::RAX, RegState::ImplicitDefine);
	  } else if (Is64Bit) {
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
	        .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::EDI, RegState::Implicit)
	        .addReg(X86::EAX, RegState::ImplicitDefine);
	  } else {
	    // 32-bit: the size is passed on the stack. 12 bytes are reserved before
	    // the 4-byte push; the matching 16-byte release follows the call below.
	    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
	        .addImm(12);
	    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
	        .addExternalSymbol("__morestack_allocate_stack_space")
	        .addRegMask(RegMask)
	        .addReg(X86::EAX, RegState::ImplicitDefine);
	  }

	  if (!Is64Bit)
	    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
	        .addImm(16);

	  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
	      .addReg(IsLP64 ? X86::RAX : X86::EAX);
	  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Set up the CFG correctly.
	  BB->addSuccessor(bumpMBB);
	  BB->addSuccessor(mallocMBB);
	  mallocMBB->addSuccessor(continueMBB);
	  bumpMBB->addSuccessor(continueMBB);

	  // Take care of the PHI nodes.
	  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
	          MI.getOperand(0).getReg())
	      .addReg(mallocPtrVReg)
	      .addMBB(mallocMBB)
	      .addReg(bumpSPPtrVReg)
	      .addMBB(bumpMBB);

	  // Delete the original pseudo instruction.
	  MI.eraseFromParent();

	  // And we're done.
	  return continueMBB;
	}

	/// Custom inserter for the catchret pseudo \p MI. On 32-bit subtargets the
	/// branch is redirected through a fresh block that runs EH_RESTORE and then
	/// jumps to the original destination; on every other subtarget nothing is
	/// changed. \p BB is returned in both cases.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
	  // Remember where the catchret originally goes before it is retargeted.
	  MachineBasicBlock *OrigDestMBB = MI.getOperand(0).getMBB();
	  DebugLoc DL = MI.getDebugLoc();

	  assert(!isAsynchronousEHPersonality(
	             classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
	         "SEH does not use catchret!");

	  // Only 32-bit EH needs to worry about manually restoring stack pointers.
	  if (Subtarget.is32Bit()) {
	    // C++ EH creates a new target block to hold the restore code, and wires
	    // up the new block to the return destination with a normal JMP_4.
	    MachineBasicBlock *RestoreMBB =
	        MF->CreateMachineBasicBlock(BB->getBasicBlock());
	    assert(BB->succ_size() == 1);
	    MF->insert(std::next(BB->getIterator()), RestoreMBB);
	    RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
	    BB->addSuccessor(RestoreMBB);
	    MI.getOperand(0).setMBB(RestoreMBB);

	    // The new block is still empty, so both instructions are appended in
	    // order: restore first, jump second.
	    BuildMI(RestoreMBB, DL, TII.get(X86::EH_RESTORE));
	    BuildMI(RestoreMBB, DL, TII.get(X86::JMP_4)).addMBB(OrigDestMBB);
	  }

	  return BB;
	}

	/// Custom inserter for the catchpad pseudo \p MI: the pseudo is always
	/// removed, and for an asynchronous (SEH) personality on a 32-bit subtarget
	/// an EH_RESTORE is emitted in its place. Returns \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  const Constant *Personality =
	      BB->getParent()->getFunction().getPersonalityFn();
	  const bool UsesSEH =
	      isAsynchronousEHPersonality(classifyEHPersonality(Personality));

	  // Only 32-bit SEH requires special handling for catchpad.
	  const bool NeedsRestore = UsesSEH && Subtarget.is32Bit();
	  if (NeedsRestore) {
	    const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
	    BuildMI(*BB, MI, MI.getDebugLoc(), TII.get(X86::EH_RESTORE));
	  }

	  MI.eraseFromParent();
	  return BB;
	}

	/// Brackets the TLSADDR instruction \p MI with a call-frame setup instruction
	/// directly before it and a call-frame destroy instruction directly after it.
	/// \p MI itself is kept. Returns \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  // So, here we replace TLSADDR with the sequence:
	  // adjust_stackdown -> TLSADDR -> adjust_stackup.
	  // We need this because TLSADDR is lowered into calls
	  // inside MC, therefore without the two markers shrink-wrapping
	  // may push the prologue/epilogue pass them.
	  MachineFunction &MF = *BB->getParent();
	  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  const MachineBasicBlock::iterator Anchor(MI);

	  // CALLSEQ_START goes right before the instruction.
	  MachineInstr *FrameSetup =
	      BuildMI(MF, DL, TII.get(TII.getCallFrameSetupOpcode()))
	          .addImm(0)
	          .addImm(0)
	          .addImm(0);
	  BB->insert(Anchor, FrameSetup);

	  // CALLSEQ_END goes right after the instruction. The original instruction
	  // is deliberately not erased: it has to stay in place.
	  MachineInstr *FrameDestroy =
	      BuildMI(MF, DL, TII.get(TII.getCallFrameDestroyOpcode()))
	          .addImm(0)
	          .addImm(0);
	  BB->insertAfter(Anchor, FrameDestroy);

	  return BB;
	}

	/// Expands the Darwin TLS call pseudo \p MI into a load of the address named
	/// by operand 3 followed by an indirect call through the loaded pointer.
	///
	/// Three variants exist and differ only in opcodes and registers:
	///   64-bit:          load into RDI relative to RIP, CALL64m, defines RAX;
	///   32-bit, non-PIC: load into EAX with no base,    CALL32m, defines EAX;
	///   32-bit, PIC:     load into EAX off the global base register, CALL32m,
	///                    defines EAX.
	///
	/// The pseudo is erased afterwards; \p BB is returned.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  // This is pretty easy. We're taking the value that we received from
	  // our load from the relocation, sticking it in either RDI (x86-64)
	  // or EAX and doing an indirect call. The return value will then
	  // be in the normal return register.
	  MachineFunction *F = BB->getParent();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
	  assert(MI.getOperand(3).isGlobal() && "This should be a global");

	  const bool Is64Bit = Subtarget.is64Bit();

	  // Get a register mask for the lowered call.
	  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
	  // proper register mask.
	  const uint32_t *RegMask;
	  if (Is64Bit)
	    RegMask = Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask();
	  else
	    RegMask =
	        Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);

	  // Pick the per-variant opcodes and registers.
	  unsigned LoadOpc, CallOpc, PtrReg, BaseReg, RetReg;
	  if (Is64Bit) {
	    LoadOpc = X86::MOV64rm;
	    CallOpc = X86::CALL64m;
	    PtrReg = X86::RDI;
	    BaseReg = X86::RIP;
	    RetReg = X86::RAX;
	  } else {
	    LoadOpc = X86::MOV32rm;
	    CallOpc = X86::CALL32m;
	    PtrReg = X86::EAX;
	    BaseReg = !isPositionIndependent() ? 0 : TII->getGlobalBaseReg(F);
	    RetReg = X86::EAX;
	  }

	  // Load: PtrReg = [BaseReg + symbol].
	  const MachineOperand &SymOp = MI.getOperand(3);
	  BuildMI(*BB, MI, DL, TII->get(LoadOpc), PtrReg)
	      .addReg(BaseReg)
	      .addImm(0)
	      .addReg(0)
	      .addGlobalAddress(SymOp.getGlobal(), 0, SymOp.getTargetFlags())
	      .addReg(0);

	  // Indirect call through [PtrReg]; the result lands in RetReg.
	  MachineInstrBuilder CallMIB = BuildMI(*BB, MI, DL, TII->get(CallOpc));
	  addDirectMem(CallMIB, PtrReg);
	  CallMIB.addReg(RetReg, RegState::ImplicitDefine).addRegMask(RegMask);

	  MI.eraseFromParent(); // The pseudo instruction is gone now.
	  return BB;
	}

	/// Maps a RETPOLINE_* pseudo opcode to the direct call / tail-call opcode it
	/// is lowered to. Any other opcode is a programming error.
	static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
	  unsigned DirectOpc;
	  switch (RPOpc) {
	  case X86::RETPOLINE_CALL32:
	    DirectOpc = X86::CALLpcrel32;
	    break;
	  case X86::RETPOLINE_CALL64:
	    DirectOpc = X86::CALL64pcrel32;
	    break;
	  case X86::RETPOLINE_TCRETURN32:
	    DirectOpc = X86::TCRETURNdi;
	    break;
	  case X86::RETPOLINE_TCRETURN64:
	    DirectOpc = X86::TCRETURNdi64;
	    break;
	  default:
	    llvm_unreachable("not retpoline opcode");
	  }
	  return DirectOpc;
	}

	static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
	unsigned Reg) {
	+ if (Subtarget.useRetpolineExternalThunk()) {
	+ // When using an external thunk for retpolines, we pick names that match the
	+ // names GCC happens to use as well. This helps simplify the implementation
	+ // of the thunks for kernels where they have no easy ability to create
	+ // aliases and are doing non-trivial configuration of the thunk's body. For
	+ // example, the Linux kernel will do boot-time hot patching of the thunk
	+ // bodies and cannot easily export aliases of these to loaded modules.
	+ //
	+ // Note that at any point in the future, we may need to change the semantics
	+ // of how we implement retpolines and at that time will likely change the
	+ // name of the called thunk. Essentially, there is no hard guarantee that
	+ // LLVM will generate calls to specific thunks, we merely make a best-effort
	+ // attempt to help out kernels and other systems where duplicating the
	+ // thunks is costly.
	+ switch (Reg) {
	+ case X86::EAX:
	+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	+ return "__x86_indirect_thunk_eax";
	+ case X86::ECX:
	+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	+ return "__x86_indirect_thunk_ecx";
	+ case X86::EDX:
	+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	+ return "__x86_indirect_thunk_edx";
	+ case X86::EDI:
	+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	+ return "__x86_indirect_thunk_edi";
	+ case X86::R11:
	+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
	+ return "__x86_indirect_thunk_r11";
	+ }
	+ llvm_unreachable("unexpected reg for retpoline");
	+ }
	+
	+ // When targeting an internal COMDAT thunk use an LLVM-specific name.
	switch (Reg) {
	- case 0:
	- assert(!Subtarget.is64Bit() && "R11 should always be available on x64");
	- return Subtarget.useRetpolineExternalThunk()
	- ? "__llvm_external_retpoline_push"
	- : "__llvm_retpoline_push";
	case X86::EAX:
	- return Subtarget.useRetpolineExternalThunk()
	- ? "__llvm_external_retpoline_eax"
	- : "__llvm_retpoline_eax";
	+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	+ return "__llvm_retpoline_eax";
	case X86::ECX:
	- return Subtarget.useRetpolineExternalThunk()
	- ? "__llvm_external_retpoline_ecx"
	- : "__llvm_retpoline_ecx";
	+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	+ return "__llvm_retpoline_ecx";
	case X86::EDX:
	- return Subtarget.useRetpolineExternalThunk()
	- ? "__llvm_external_retpoline_edx"
	- : "__llvm_retpoline_edx";
	+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	+ return "__llvm_retpoline_edx";
	+ case X86::EDI:
	+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
	+ return "__llvm_retpoline_edi";
	case X86::R11:
	- return Subtarget.useRetpolineExternalThunk()
	- ? "__llvm_external_retpoline_r11"
	- : "__llvm_retpoline_r11";
	+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
	+ return "__llvm_retpoline_r11";
	}
	llvm_unreachable("unexpected reg for retpoline");
	}

	// Custom inserter for the RETPOLINE_* call / tail-call pseudos: the callee
	// address is copied into a free scratch register and MI is retargeted at the
	// retpoline thunk symbol that corresponds to that register. Returns BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
	MachineBasicBlock *BB) const {
	// Copy the callee's virtual register into a scratch physical register
	// (selected below) and call the matching retpoline thunk.
	DebugLoc DL = MI.getDebugLoc();
	const X86InstrInfo *TII = Subtarget.getInstrInfo();
	unsigned CalleeVReg = MI.getOperand(0).getReg();
	unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());

	// Find an available scratch register to hold the callee. On 64-bit, we can
	// just use R11, but we scan for uses anyway to ensure we don't generate
	// incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
	// already a register use operand to the call to hold the callee. If none
	- // are available, push the callee instead. This is less efficient, but is
	- // necessary for functions using 3 regparms. Such function calls are
	- // (currently) not eligible for tail call optimization, because there is no
	- // scratch register available to hold the address of the callee.
	+ // are available, use EDI instead. EDI is chosen because EBX is the PIC base
	+ // register and ESI is the base pointer to realigned stack frames with VLAs.
	SmallVector<unsigned, 3> AvailableRegs;
	if (Subtarget.is64Bit())
	AvailableRegs.push_back(X86::R11);
	else
	- AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX});
	+ AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});

	// Zero out any registers that are already used.
	for (const auto &MO : MI.operands()) {
	if (MO.isReg() && MO.isUse())
	for (unsigned &Reg : AvailableRegs)
	if (Reg == MO.getReg())
	Reg = 0;
	}

	// Choose the first remaining non-zero available register.
	unsigned AvailableReg = 0;
	for (unsigned MaybeReg : AvailableRegs) {
	if (MaybeReg) {
	AvailableReg = MaybeReg;
	break;
	}
	}
	+ if (!AvailableReg)
	+ report_fatal_error("calling convention incompatible with retpoline, no "
	+ "available registers");

	const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);

	- if (AvailableReg == 0) {
	- // No register available. Use PUSH. This must not be a tailcall, and this
	- // must not be x64.
	- if (Subtarget.is64Bit())
	- report_fatal_error(
	- "Cannot make an indirect call on x86-64 using both retpoline and a "
	- "calling convention that preservers r11");
	- if (Opc != X86::CALLpcrel32)
	- report_fatal_error("Cannot make an indirect tail call on x86 using "
	- "retpoline without a preserved register");
	- BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg);
	- MI.getOperand(0).ChangeToES(Symbol);
	- MI.setDesc(TII->get(Opc));
	- } else {
	- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
	- .addReg(CalleeVReg);
	- MI.getOperand(0).ChangeToES(Symbol);
	- MI.setDesc(TII->get(Opc));
	- MachineInstrBuilder(*BB->getParent(), &MI)
	- .addReg(AvailableReg, RegState::Implicit \| RegState::Kill);
	- }
	+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
	+ .addReg(CalleeVReg);
	+ MI.getOperand(0).ChangeToES(Symbol);
	+ MI.setDesc(TII->get(Opc));
	+ MachineInstrBuilder(*BB->getParent(), &MI)
	+ .addReg(AvailableReg, RegState::Implicit \| RegState::Kill);
	return BB;
	}

	/// Custom inserter for the EH_SjLj_SetJmp32/64 pseudos.
	///
	/// Expands `v = setjmp(buf)` into four blocks (see the diagram in the body):
	/// the address of restoreMBB is stored at buf[LabelOffset], mainMBB produces
	/// 0, restoreMBB produces 1, and sinkMBB merges both values with a PHI into
	/// the pseudo's destination register.
	///
	/// \param MI  the setjmp pseudo; it is erased before returning.
	/// \param MBB the block that currently contains MI.
	/// \returns sinkMBB, which receives every instruction that followed MI.
	MachineBasicBlock *
	X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
	                                    MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // Memory Reference
	  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	  // Operand 0 is the destination; the memory operand (the buffer) follows.
	  unsigned CurOp = 0;
	  unsigned DstReg = MI.getOperand(CurOp++).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
	  (void)TRI; // Only referenced by the assert above.
	  unsigned mainDstReg = MRI.createVirtualRegister(RC);
	  unsigned restoreDstReg = MRI.createVirtualRegister(RC);

	  unsigned MemOpndSlot = CurOp;

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  // For v = setjmp(buf), we generate
	  //
	  // thisMBB:
	  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
	  //  SjLjSetup restoreMBB
	  //
	  // mainMBB:
	  //  v_main = 0
	  //
	  // sinkMBB:
	  //  v = phi(main, restore)
	  //
	  // restoreMBB:
	  //  if base pointer being used, load it from frame
	  //  v_restore = 1

	  MachineBasicBlock *thisMBB = MBB;
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
	  MF->insert(I, mainMBB);
	  MF->insert(I, sinkMBB);
	  MF->push_back(restoreMBB);
	  restoreMBB->setHasAddressTaken();

	  MachineInstrBuilder MIB;

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // thisMBB:
	  unsigned PtrStoreOpc = 0;
	  unsigned LabelReg = 0;
	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  // Prepare IP either in reg or imm.
	  if (!UseImmLabel) {
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
	    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
	    LabelReg = MRI.createVirtualRegister(PtrRC);
	    if (Subtarget.is64Bit()) {
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
	                .addReg(X86::RIP)
	                .addImm(0)
	                .addReg(0)
	                .addMBB(restoreMBB)
	                .addReg(0);
	    } else {
	      // getGlobalBaseReg() is X86-specific, so downcast the generic
	      // instruction-info pointer before using it.
	      const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
	                .addReg(XII->getGlobalBaseReg(MF))
	                .addImm(0)
	                .addReg(0)
	                .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
	                .addReg(0);
	    }
	  } else
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  // Store IP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    // Reuse the buffer address from MI, displaced to the label slot.
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
	    else
	      MIB.add(MI.getOperand(MemOpndSlot + i));
	  }
	  if (!UseImmLabel)
	    MIB.addReg(LabelReg);
	  else
	    MIB.addMBB(restoreMBB);
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Setup
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
	            .addMBB(restoreMBB);

	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  MIB.addRegMask(RegInfo->getNoPreservedMask());
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(restoreMBB);

	  // mainMBB:
	  //  EAX = 0
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
	  mainMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
	          TII->get(X86::PHI), DstReg)
	      .addReg(mainDstReg).addMBB(mainMBB)
	      .addReg(restoreDstReg).addMBB(restoreMBB);

	  // restoreMBB:
	  if (RegInfo->hasBasePointer(*MF)) {
	    const bool Uses64BitFramePtr =
	        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
	    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
	    X86FI->setRestoreBasePointer(MF);
	    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
	    unsigned BasePtr = RegInfo->getBaseRegister();
	    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
	    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
	                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
	        .setMIFlag(MachineInstr::FrameSetup);
	  }
	  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
	  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  restoreMBB->addSuccessor(sinkMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}

	/// Custom inserter for the EH_SjLj_LongJmp32/64 pseudos.
	///
	/// Emits three pointer-sized loads from the buffer addressed by MI's memory
	/// operand -- the frame pointer from slot 0, the resume address from slot 1
	/// and the stack pointer from slot 2 -- followed by an indirect jump to the
	/// loaded resume address.
	///
	/// \param MI  the longjmp pseudo; it is erased before returning.
	/// \param MBB the block that contains MI.
	/// \returns MBB.
	MachineBasicBlock *
	X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
	                                     MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  // Memory Reference
	  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  const TargetRegisterClass *RC =
	      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	  unsigned Tmp = MRI.createVirtualRegister(RC);
	  // Since FP is only updated here but NOT referenced, it's treated as GPR.
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
	  unsigned SP = RegInfo->getStackRegister();

	  MachineInstrBuilder MIB;

	  // Byte offsets of the resume-address and stack-pointer slots.
	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  const int64_t SPOffset = 2 * PVT.getStoreSize();

	  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
	  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

	  // Reload FP
	  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
	    MIB.add(MI.getOperand(i));
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Reload IP
	  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(i), LabelOffset);
	    else
	      MIB.add(MI.getOperand(i));
	  }
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Reload SP
	  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(i), SPOffset);
	    else
	      MIB.add(MI.getOperand(i));
	  }
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Jump
	  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

	  MI.eraseFromParent();
	  return MBB;
	}

	/// Emits, immediately before MI in MBB, a store of DispatchBB's address into
	/// the stack object FI at byte offset 56 (64-bit) or 36 (32-bit).
	///
	/// With the small code model and non-PIC code the block address is stored as
	/// an immediate; otherwise it is first materialized into a fresh virtual
	/// register with an LEA and that register is stored.
	///
	/// NOTE(review): the 56/36 offsets presumably select the resume-address slot
	/// of the SjLj function context -- confirm against the context layout.
	///
	/// \param MI         insertion point; not modified here.
	/// \param MBB        block that receives the new instructions.
	/// \param DispatchBB block whose address is stored.
	/// \param FI         frame index of the object that is written.
	void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
	                                               MachineBasicBlock *MBB,
	                                               MachineBasicBlock *DispatchBB,
	                                               int FI) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

	  unsigned Op = 0;
	  unsigned VR = 0;

	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  if (UseImmLabel) {
	    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  } else {
	    const TargetRegisterClass *TRC =
	        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	    VR = MRI->createVirtualRegister(TRC);
	    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

	    if (Subtarget.is64Bit())
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
	          .addReg(X86::RIP)
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB)
	          .addReg(0);
	    else
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
	          .addReg(0) /* TII->getGlobalBaseReg(MF) */
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
	          .addReg(0);
	  }

	  // Store either the immediate block address or the register computed above.
	  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
	  addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
	  if (UseImmLabel)
	    MIB.addMBB(DispatchBB);
	  else
	    MIB.addReg(VR);
	}

	/// Custom inserter for the Int_eh_sjlj_setup_dispatch pseudo.
	///
	/// Builds a single dispatch landing pad for the function:
	///  1. collects every EH pad that has call-site numbers and orders the pads
	///     by call-site number to form a jump table;
	///  2. creates DispatchBB (range check), DispContBB (indexed jump through the
	///     table) and TrapBB (out-of-range index);
	///  3. stores DispatchBB's address into the function context through
	///     SetupEntryBlockForSjLj;
	///  4. redirects every invoke block from its landing pads to DispatchBB and
	///     marks the callee-saved registers as implicitly defined (dead) on the
	///     last call of each such block;
	///  5. clears the EH-pad flag on the former landing pads.
	///
	/// \param MI the pseudo instruction; it is erased before returning.
	/// \param BB the block that contains MI.
	/// \returns BB.
	MachineBasicBlock *
	X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
	                                         MachineBasicBlock *BB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = BB->getParent();
	  MachineFrameInfo &MFI = MF->getFrameInfo();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  int FI = MFI.getFunctionContextIndex();

	  // Get a mapping of the call site numbers to all of the landing pads they're
	  // associated with.
	  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
	  unsigned MaxCSNum = 0;
	  for (auto &MBB : *MF) {
	    if (!MBB.isEHPad())
	      continue;

	    // The first non-debug instruction of a pad is expected to be its
	    // EH_LABEL; its symbol keys the call-site lookup below.
	    MCSymbol *Sym = nullptr;
	    for (const auto &PadMI : MBB) {
	      if (PadMI.isDebugValue())
	        continue;

	      assert(PadMI.isEHLabel() && "expected EH_LABEL");
	      Sym = PadMI.getOperand(0).getMCSymbol();
	      break;
	    }

	    if (!MF->hasCallSiteLandingPad(Sym))
	      continue;

	    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
	      CallSiteNumToLPad[CSI].push_back(&MBB);
	      MaxCSNum = std::max(MaxCSNum, CSI);
	    }
	  }

	  // Get an ordered list of the machine basic blocks for the jump table.
	  std::vector<MachineBasicBlock *> LPadList;
	  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
	  LPadList.reserve(CallSiteNumToLPad.size());

	  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
	    for (auto &LP : CallSiteNumToLPad[CSI]) {
	      LPadList.push_back(LP);
	      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
	    }
	  }

	  assert(!LPadList.empty() &&
	         "No landing pad destinations for the dispatch jump table!");

	  // Create the MBBs for the dispatch code.

	  // Shove the dispatch's address into the return slot in the function context.
	  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
	  DispatchBB->setIsEHPad(true);

	  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
	  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
	  DispatchBB->addSuccessor(TrapBB);

	  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
	  DispatchBB->addSuccessor(DispContBB);

	  // Insert MBBs.
	  MF->push_back(DispatchBB);
	  MF->push_back(DispContBB);
	  MF->push_back(TrapBB);

	  // Insert code into the entry block that creates and registers the function
	  // context.
	  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);

	  // Create the jump table and associated information
	  unsigned JTE = getJumpTableEncoding();
	  MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
	  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

	  const X86RegisterInfo &RI = TII->getRegisterInfo();
	  // Add a register mask with no preserved registers. This results in all
	  // registers being marked as clobbered.
	  if (RI.hasBasePointer(*MF)) {
	    const bool FPIs64Bit =
	        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
	    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
	    X86FI->setRestoreBasePointer(MF);

	    // Reload the base pointer from its frame slot; the mask rides along on
	    // that load.
	    unsigned FP = RI.getFrameRegister(*MF);
	    unsigned BP = RI.getBaseRegister();
	    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
	    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
	                 X86FI->getRestoreBasePointerOffset())
	        .addRegMask(RI.getNoPreservedMask());
	  } else {
	    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
	        .addRegMask(RI.getNoPreservedMask());
	  }

	  // IReg is used as an index in a memory operand and therefore can't be SP
	  // NOTE(review): offset 8/4 presumably addresses the call-site index field
	  // of the function context -- confirm against the context layout.
	  unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
	  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
	                    Subtarget.is64Bit() ? 8 : 4);
	  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
	      .addReg(IReg)
	      .addImm(LPadList.size());
	  BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);

	  if (Subtarget.is64Bit()) {
	    unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
	    unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);

	    // leaq .LJTI0_0(%rip), BReg
	    BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
	        .addReg(X86::RIP)
	        .addImm(1)
	        .addReg(0)
	        .addJumpTableIndex(MJTI)
	        .addReg(0);
	    // movzx IReg64, IReg
	    BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
	        .addImm(0)
	        .addReg(IReg)
	        .addImm(X86::sub_32bit);

	    switch (JTE) {
	    case MachineJumpTableInfo::EK_BlockAddress:
	      // jmpq *(BReg,IReg64,8)
	      BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
	          .addReg(BReg)
	          .addImm(8)
	          .addReg(IReg64)
	          .addImm(0)
	          .addReg(0);
	      break;
	    case MachineJumpTableInfo::EK_LabelDifference32: {
	      unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
	      unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
	      unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);

	      // movl (BReg,IReg64,4), OReg
	      BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
	          .addReg(BReg)
	          .addImm(4)
	          .addReg(IReg64)
	          .addImm(0)
	          .addReg(0);
	      // movsx OReg64, OReg
	      BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
	      // addq BReg, OReg64, TReg
	      BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
	          .addReg(OReg64)
	          .addReg(BReg);
	      // jmpq *TReg
	      BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
	      break;
	    }
	    default:
	      llvm_unreachable("Unexpected jump table encoding");
	    }
	  } else {
	    // jmpl *.LJTI0_0(,IReg,4)
	    BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
	        .addReg(0)
	        .addImm(4)
	        .addReg(IReg)
	        .addJumpTableIndex(MJTI)
	        .addReg(0);
	  }

	  // Add the jump table entries as successors to the MBB.
	  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
	  for (auto &LP : LPadList)
	    if (SeenMBBs.insert(LP).second)
	      DispContBB->addSuccessor(LP);

	  // N.B. the order the invoke BBs are processed in doesn't matter here.
	  SmallVector<MachineBasicBlock *, 64> MBBLPads;
	  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
	  for (MachineBasicBlock *MBB : InvokeBBs) {
	    // Remove the landing pad successor from the invoke block and replace it
	    // with the new dispatch block.
	    // Keep a copy of Successors since it's modified inside the loop.
	    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
	                                                   MBB->succ_rend());
	    // FIXME: Avoid quadratic complexity.
	    for (auto MBBS : Successors) {
	      if (MBBS->isEHPad()) {
	        MBB->removeSuccessor(MBBS);
	        MBBLPads.push_back(MBBS);
	      }
	    }

	    MBB->addSuccessor(DispatchBB);

	    // Find the invoke call and mark all of the callee-saved registers as
	    // 'implicit defined' so that they're spilled. This prevents code from
	    // moving instructions to before the EH block, where they will never be
	    // executed.
	    for (auto &II : reverse(*MBB)) {
	      if (!II.isCall())
	        continue;

	      // Registers already mentioned by the call are left untouched.
	      DenseMap<unsigned, bool> DefRegs;
	      for (auto &MOp : II.operands())
	        if (MOp.isReg())
	          DefRegs[MOp.getReg()] = true;

	      MachineInstrBuilder MIB(*MF, &II);
	      // SavedRegs is a zero-terminated list.
	      for (unsigned Idx = 0; SavedRegs[Idx]; ++Idx) {
	        unsigned Reg = SavedRegs[Idx];
	        if (!DefRegs[Reg])
	          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
	      }

	      break;
	    }
	  }

	  // Mark all former landing pads as non-landing pads. The dispatch is the only
	  // landing pad now.
	  for (auto &LP : MBBLPads)
	    LP->setIsEHPad(false);

	  // The instruction is gone now.
	  MI.eraseFromParent();
	  return BB;
	}

	/// Expands the pseudo instruction MI (one marked as needing a custom
	/// inserter) into real machine instructions.
	///
	/// Most opcodes are forwarded to a dedicated helper; a few (RDFLAGS/WRFLAGS,
	/// the FP*_TO_INT*_IN_MEM family and LCMPXCHG8B) are expanded inline. Opcodes
	/// that need no work here return BB unchanged.
	///
	/// \param MI the instruction to expand; several paths erase it.
	/// \param BB the block that contains MI.
	/// \returns the block in which instruction insertion should continue.
	MachineBasicBlock *
	X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	                                               MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("Unexpected instr type to insert");
	  case X86::TAILJMPd64:
	  case X86::TAILJMPr64:
	  case X86::TAILJMPm64:
	  case X86::TAILJMPr64_REX:
	  case X86::TAILJMPm64_REX:
	    llvm_unreachable("TAILJMP64 would not be touched here.");
	  case X86::TCRETURNdi64:
	  case X86::TCRETURNri64:
	  case X86::TCRETURNmi64:
	    return BB;
	  case X86::TLS_addr32:
	  case X86::TLS_addr64:
	  case X86::TLS_base_addr32:
	  case X86::TLS_base_addr64:
	    return EmitLoweredTLSAddr(MI, BB);
	  case X86::RETPOLINE_CALL32:
	  case X86::RETPOLINE_CALL64:
	  case X86::RETPOLINE_TCRETURN32:
	  case X86::RETPOLINE_TCRETURN64:
	    return EmitLoweredRetpoline(MI, BB);
	  case X86::CATCHRET:
	    return EmitLoweredCatchRet(MI, BB);
	  case X86::CATCHPAD:
	    return EmitLoweredCatchPad(MI, BB);
	  case X86::SEG_ALLOCA_32:
	  case X86::SEG_ALLOCA_64:
	    return EmitLoweredSegAlloca(MI, BB);
	  case X86::TLSCall_32:
	  case X86::TLSCall_64:
	    return EmitLoweredTLSCall(MI, BB);
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR64:
	  case X86::CMOV_FR128:
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  case X86::CMOV_V2F64:
	  case X86::CMOV_V2I64:
	  case X86::CMOV_V4F32:
	  case X86::CMOV_V4F64:
	  case X86::CMOV_V4I64:
	  case X86::CMOV_V16F32:
	  case X86::CMOV_V8F32:
	  case X86::CMOV_V8F64:
	  case X86::CMOV_V8I64:
	  case X86::CMOV_V8I1:
	  case X86::CMOV_V16I1:
	  case X86::CMOV_V32I1:
	  case X86::CMOV_V64I1:
	    return EmitLoweredSelect(MI, BB);

	  case X86::RDFLAGS32:
	  case X86::RDFLAGS64: {
	    unsigned PushF =
	        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
	    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
	    // BuildMI takes the block by reference and converts to MachineInstr *.
	    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
	    // Permit reads of the FLAGS register without it being defined.
	    // This intrinsic exists to read external processor state in flags, such as
	    // the trap flag, interrupt flag, and direction flag, none of which are
	    // modeled by the backend.
	    Push->getOperand(2).setIsUndef();
	    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::WRFLAGS32:
	  case X86::WRFLAGS64: {
	    unsigned Push =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
	    unsigned PopF =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
	    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
	    BuildMI(*BB, MI, DL, TII->get(PopF));

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::RELEASE_FADD32mr:
	  case X86::RELEASE_FADD64mr:
	    return EmitLoweredAtomicFP(MI, BB);

	  case X86::FP32_TO_INT16_IN_MEM:
	  case X86::FP32_TO_INT32_IN_MEM:
	  case X86::FP32_TO_INT64_IN_MEM:
	  case X86::FP64_TO_INT16_IN_MEM:
	  case X86::FP64_TO_INT32_IN_MEM:
	  case X86::FP64_TO_INT64_IN_MEM:
	  case X86::FP80_TO_INT16_IN_MEM:
	  case X86::FP80_TO_INT32_IN_MEM:
	  case X86::FP80_TO_INT64_IN_MEM: {
	    // Change the floating point control register to use "round towards zero"
	    // mode when truncating to an integer value.
	    int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FNSTCW16m)), CWFrameIdx);

	    // Load the old value of the high byte of the control word...
	    unsigned OldCW =
	        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
	                      CWFrameIdx);

	    // Set the high part to be round to zero...
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
	        .addImm(0xC7F);

	    // Reload the modified control word now...
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), CWFrameIdx);

	    // Restore the memory image of control word to original value
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
	        .addReg(OldCW);

	    // Get the X86 opcode to use.
	    unsigned Opc;
	    switch (MI.getOpcode()) {
	    default: llvm_unreachable("illegal opcode!");
	    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
	    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
	    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
	    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
	    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
	    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
	    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
	    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
	    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
	    }

	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
	        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());

	    // Reload the original control word now.
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), CWFrameIdx);

	    MI.eraseFromParent(); // The pseudo instruction is gone now.
	    return BB;
	  }
	  // String/text processing lowering.
	  case X86::PCMPISTRM128REG:
	  case X86::VPCMPISTRM128REG:
	  case X86::PCMPISTRM128MEM:
	  case X86::VPCMPISTRM128MEM:
	  case X86::PCMPESTRM128REG:
	  case X86::VPCMPESTRM128REG:
	  case X86::PCMPESTRM128MEM:
	  case X86::VPCMPESTRM128MEM:
	    assert(Subtarget.hasSSE42() &&
	           "Target must have SSE4.2 or AVX features enabled");
	    return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());

	  // String/text processing lowering.
	  case X86::PCMPISTRIREG:
	  case X86::VPCMPISTRIREG:
	  case X86::PCMPISTRIMEM:
	  case X86::VPCMPISTRIMEM:
	  case X86::PCMPESTRIREG:
	  case X86::VPCMPESTRIREG:
	  case X86::PCMPESTRIMEM:
	  case X86::VPCMPESTRIMEM:
	    assert(Subtarget.hasSSE42() &&
	           "Target must have SSE4.2 or AVX features enabled");
	    return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());

	  // Thread synchronization.
	  case X86::MONITOR:
	    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
	  case X86::MONITORX:
	    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);

	  // Cache line zero
	  case X86::CLZERO:
	    return emitClzero(&MI, BB, Subtarget);

	  // PKU feature
	  case X86::WRPKRU:
	    return emitWRPKRU(MI, BB, Subtarget);
	  case X86::RDPKRU:
	    return emitRDPKRU(MI, BB, Subtarget);
	  // xbegin
	  case X86::XBEGIN:
	    return emitXBegin(MI, BB, Subtarget.getInstrInfo());

	  case X86::VASTART_SAVE_XMM_REGS:
	    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

	  case X86::VAARG_64:
	    return EmitVAARG64WithCustomInserter(MI, BB);

	  case X86::EH_SjLj_SetJmp32:
	  case X86::EH_SjLj_SetJmp64:
	    return emitEHSjLjSetJmp(MI, BB);

	  case X86::EH_SjLj_LongJmp32:
	  case X86::EH_SjLj_LongJmp64:
	    return emitEHSjLjLongJmp(MI, BB);

	  case X86::Int_eh_sjlj_setup_dispatch:
	    return EmitSjLjDispatchBlock(MI, BB);

	  case TargetOpcode::STATEPOINT:
	    // As an implementation detail, STATEPOINT shares the STACKMAP format at
	    // this point in the process. We diverge later.
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::STACKMAP:
	  case TargetOpcode::PATCHPOINT:
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::PATCHABLE_EVENT_CALL:
	    // Do nothing here, handle in xray instrumentation pass.
	    return BB;

	  case X86::LCMPXCHG8B: {
	    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	    // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
	    // requires a memory operand. If it happens that current architecture is
	    // i686 and for current function we need a base pointer
	    // - which is ESI for i686 - register allocator would not be able to
	    // allocate registers for an address in form of X(%reg, %reg, Y)
	    // - there never would be enough unreserved registers during regalloc
	    // (without the need for base ptr the only option would be X(%edi, %esi, Y).
	    // We are giving a hand to register allocator by precomputing the address in
	    // a new vreg using LEA.

	    // If it is not i686 or there is no base pointer - nothing to do here.
	    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
	      return BB;

	    // Even though this code does not necessarily needs the base pointer to
	    // be ESI, we check for that. The reason: if this assert fails, there are
	    // some changes happened in the compiler base pointer handling, which most
	    // probably have to be addressed somehow here.
	    assert(TRI->getBaseRegister() == X86::ESI &&
	           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
	           "base pointer in mind");

	    MachineRegisterInfo &MRI = MF->getRegInfo();
	    MVT SPTy = getPointerTy(MF->getDataLayout());
	    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	    unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);

	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    // Regalloc does not need any help when the memory operand of CMPXCHG8B
	    // does not use index register.
	    if (AM.IndexReg == X86::NoRegister)
	      return BB;

	    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
	    // four operand definitions that are E[ABCD] registers. We skip them and
	    // then insert the LEA.
	    MachineBasicBlock::iterator MBBI(MI);
	    while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
	           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
	      --MBBI;
	    // Insert the LEA before the instruction MBBI stopped on.
	    addFullAddress(
	        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

	    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

	    return BB;
	  }
	  case X86::LCMPXCHG16B:
	    return BB;
	  case X86::LCMPXCHG8B_SAVE_EBX:
	  case X86::LCMPXCHG16B_SAVE_RBX: {
	    // Make sure EBX/RBX is recorded as live into this block.
	    unsigned BasePtr =
	        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
	    if (!BB->isLiveIn(BasePtr))
	      BB->addLiveIn(BasePtr);
	    return BB;
	  }
	  }
	}

	//===----------------------------------------------------------------------===//
	// X86 Optimization Hooks
	//===----------------------------------------------------------------------===//

	void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
	KnownBits &Known,
	const APInt &DemandedElts,
	const SelectionDAG &DAG,
	unsigned Depth) const {
	unsigned BitWidth = Known.getBitWidth();
	unsigned Opc = Op.getOpcode();
	EVT VT = Op.getValueType();
	assert((Opc >= ISD::BUILTIN_OP_END \|\|
	Opc == ISD::INTRINSIC_WO_CHAIN \|\|
	Opc == ISD::INTRINSIC_W_CHAIN \|\|
	Opc == ISD::INTRINSIC_VOID) &&
	"Should use MaskedValueIsZero if you don't know whether Op"
	" is a target node!");

	Known.resetAll();
	switch (Opc) {
	default: break;
	case X86ISD::SETCC:
	Known.Zero.setBitsFrom(1);
	break;
	case X86ISD::MOVMSK: {
	unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
	Known.Zero.setBitsFrom(NumLoBits);
	break;
	}
	case X86ISD::PEXTRB:
	case X86ISD::PEXTRW: {
	SDValue Src = Op.getOperand(0);
	EVT SrcVT = Src.getValueType();
	APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
	Op.getConstantOperandVal(1));
	DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
	Known = Known.zextOrTrunc(BitWidth);
	Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
	break;
	}
	case X86ISD::VSHLI:
	case X86ISD::VSRLI: {
	if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
	Known.setAllZero();
	break;
	}

	DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
	unsigned ShAmt = ShiftImm->getZExtValue();
	if (Opc == X86ISD::VSHLI) {
	Known.Zero <<= ShAmt;
	Known.One <<= ShAmt;
	// Low bits are known zero.
	Known.Zero.setLowBits(ShAmt);
	} else {
	Known.Zero.lshrInPlace(ShAmt);
	Known.One.lshrInPlace(ShAmt);
	// High bits are known zero.
	Known.Zero.setHighBits(ShAmt);
	}
	}
	break;
	}
	case X86ISD::VZEXT: {
	// TODO: Add DemandedElts support.
	SDValue N0 = Op.getOperand(0);
	unsigned NumElts = VT.getVectorNumElements();

	EVT SrcVT = N0.getValueType();
	unsigned InNumElts = SrcVT.getVectorNumElements();
	unsigned InBitWidth = SrcVT.getScalarSizeInBits();
	assert(InNumElts >= NumElts && "Illegal VZEXT input");

	Known = KnownBits(InBitWidth);
	APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
	DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
	Known = Known.zext(BitWidth);
	Known.Zero.setBitsFrom(InBitWidth);
	break;
	}
	case X86ISD::CMOV: {
	DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
	// If we don't know any bits, early out.
	if (Known.isUnknown())
	break;
	KnownBits Known2;
	DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	break;
	}
	case X86ISD::UDIVREM8_ZEXT_HREG:
	// TODO: Support more than just the zero extended bits?
	if (Op.getResNo() != 1)
	break;
	// The remainder is zero extended.
	Known.Zero.setBitsFrom(8);
	break;
	}
	}

	/// Returns a lower bound on the number of leading bits of the X86 target
	/// node Op that are copies of its sign bit. Opcodes that are not handled
	/// yield the conservative answer 1.
	///
	/// \param Op           the node to analyse.
	/// \param DemandedElts vector lanes of interest; only forwarded for the
	///                     VSHLI and VSRAI cases.
	/// \param DAG          the DAG used for recursive queries.
	/// \param Depth        current recursion depth; recursive queries use
	///                     Depth + 1.
	unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
	    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
	    unsigned Depth) const {
	  const unsigned VTBits = Op.getScalarValueSizeInBits();

	  switch (Op.getOpcode()) {
	  case X86ISD::SETCC_CARRY:
	    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
	    return VTBits;

	  case X86ISD::VSEXT: {
	    // TODO: Add DemandedElts support.
	    // Sign extension adds one sign bit per bit of widening.
	    SDValue In = Op.getOperand(0);
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, Depth + 1);
	    return InSignBits + (VTBits - In.getScalarValueSizeInBits());
	  }

	  case X86ISD::VTRUNC: {
	    // TODO: Add DemandedElts support.
	    SDValue In = Op.getOperand(0);
	    unsigned InBits = In.getScalarValueSizeInBits();
	    assert(VTBits < InBits && "Illegal truncation input type");
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, Depth + 1);
	    // Sign bits survive only beyond the bits that are truncated away.
	    unsigned Dropped = InBits - VTBits;
	    return InSignBits > Dropped ? InSignBits - Dropped : 1;
	  }

	  case X86ISD::PACKSS: {
	    // PACKSS is just a truncation if the sign bits extend to the packed size.
	    // TODO: Add DemandedElts support.
	    unsigned InBits = Op.getOperand(0).getScalarValueSizeInBits();
	    unsigned LHSSignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
	    unsigned RHSSignBits = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
	    unsigned Common = std::min(LHSSignBits, RHSSignBits);
	    unsigned Dropped = InBits - VTBits;
	    return Common > Dropped ? Common - Dropped : 1;
	  }

	  case X86ISD::VSHLI: {
	    SDValue In = Op.getOperand(0);
	    APInt Amt = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
	    if (Amt.uge(VTBits))
	      return VTBits; // Shifted all bits out --> zero.
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, DemandedElts, Depth + 1);
	    if (Amt.uge(InSignBits))
	      return 1; // Shifted all sign bits out --> unknown.
	    return InSignBits - Amt.getZExtValue();
	  }

	  case X86ISD::VSRAI: {
	    SDValue In = Op.getOperand(0);
	    APInt Amt = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
	    if (Amt.uge(VTBits - 1))
	      return VTBits; // Sign splat.
	    // Each shifted position adds one more sign-bit copy, capped at VTBits.
	    unsigned InSignBits = DAG.ComputeNumSignBits(In, DemandedElts, Depth + 1);
	    Amt += InSignBits;
	    if (Amt.uge(VTBits))
	      return VTBits;
	    return Amt.getZExtValue();
	  }

	  case X86ISD::PCMPGT:
	  case X86ISD::PCMPEQ:
	  case X86ISD::CMPP:
	  case X86ISD::VPCOM:
	  case X86ISD::VPCOMU:
	    // Vector compares return zero/all-bits result values.
	    return VTBits;

	  case X86ISD::CMOV: {
	    unsigned FirstSignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
	    if (FirstSignBits == 1)
	      return 1; // Early out.
	    unsigned SecondSignBits = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
	    return std::min(FirstSignBits, SecondSignBits);
	  }

	  case X86ISD::SDIVREM8_SEXT_HREG:
	    // TODO: Support more than just the sign extended bits?
	    // The remainder is sign extended.
	    if (Op.getResNo() == 1)
	      return VTBits - 7;
	    break;
	  }

	  // Fallback case.
	  return 1;
	}

	/// Strip an X86ISD::Wrapper / X86ISD::WrapperRIP node and return the wrapped
	/// operand; any other node is returned unchanged.
	SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
	  switch (N->getOpcode()) {
	  case X86ISD::Wrapper:
	  case X86ISD::WrapperRIP:
	    return N->getOperand(0);
	  default:
	    return N;
	  }
	}

	/// Returns true, filling in \p GA and \p Offset, when \p N is a
	/// GlobalAddress + offset. An X86ISD::Wrapper around a global address node is
	/// recognised here; everything else is deferred to the generic
	/// TargetLowering implementation.
	bool X86TargetLowering::isGAPlusOffset(SDNode *N,
	                                       const GlobalValue* &GA,
	                                       int64_t &Offset) const {
	  if (N->getOpcode() == X86ISD::Wrapper)
	    if (auto *GlobalNode = dyn_cast<GlobalAddressSDNode>(N->getOperand(0))) {
	      GA = GlobalNode->getGlobal();
	      Offset = GlobalNode->getOffset();
	      return true;
	    }

	  return TargetLowering::isGAPlusOffset(N, GA, Offset);
	}

	// Try to express a combined single-input shuffle mask as one unary shuffle
	// instruction that takes no immediate.
	//
	// On success returns true and reports the opcode in Shuffle, the type the
	// input must be bitcast to in SrcVT and the result type in DstVT. V1 is only
	// replaced (by an extracted subvector) on the VZEXT path.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                    bool AllowFloatDomain, bool AllowIntDomain,
	                                    SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget,
	                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
	  const unsigned NumElts = Mask.size();
	  const unsigned EltBits = MaskVT.getScalarSizeInBits();

	  // Records a match whose source and destination types coincide.
	  auto Accept = [&](unsigned Opcode, MVT VT) {
	    Shuffle = Opcode;
	    SrcVT = DstVT = VT;
	    return true;
	  };

	  // ZERO_EXTEND_VECTOR_INREG / VZEXT: every Scale-th element is taken in
	  // order and the elements in between are zero (or undef).
	  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
	  const bool CanZeroExtend =
	      AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
	                         (MaskVT.is256BitVector() && Subtarget.hasInt256()));
	  if (CanZeroExtend) {
	    for (unsigned Scale = 2, MaxScale = 64 / EltBits; Scale <= MaxScale;
	         Scale *= 2) {
	      const unsigned NumDstElts = NumElts / Scale;

	      bool Match = true;
	      for (unsigned i = 0; Match && i != NumDstElts; ++i) {
	        const unsigned Base = i * Scale;
	        bool LowOk = isUndefOrEqual(Mask[Base], (int)i);
	        bool HighOk = isUndefOrZeroInRange(Mask, Base + 1, Scale - 1);
	        Match = LowOk && HighOk;
	      }
	      if (!Match)
	        continue;

	      // The source is at least 128 bits wide and always integer typed.
	      const unsigned SrcSize = std::max(128u, NumDstElts * EltBits);
	      MVT SrcEltVT = MVT::getIntegerVT(EltBits);
	      if (MaskVT.isInteger())
	        SrcEltVT = MaskVT.getScalarType();
	      SrcVT = MVT::getVectorVT(SrcEltVT, SrcSize / EltBits);

	      // A narrower source means the low subvector is extended with VZEXT;
	      // otherwise the in-register extension node is used.
	      if (SrcVT.getSizeInBits() == MaskVT.getSizeInBits()) {
	        Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
	      } else {
	        V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
	        Shuffle = unsigned(X86ISD::VZEXT);
	      }

	      DstVT = MVT::getVectorVT(MVT::getIntegerVT(Scale * EltBits), NumDstElts);
	      return true;
	    }
	  }

	  // VZEXT_MOVL: keep element 0, zero the rest. SSE1 only supports 32-bit
	  // elements (MOVSS); 64-bit elements need SSE2.
	  const bool MovlEltOk =
	      (EltBits == 32) || (EltBits == 64 && Subtarget.hasSSE2());
	  if (MovlEltOk && isUndefOrEqual(Mask[0], 0) &&
	      isUndefOrZeroInRange(Mask, 1, NumElts - 1))
	    return Accept(X86ISD::VZEXT_MOVL,
	                  Subtarget.hasSSE2() ? MaskVT : MVT::v4f32);

	  if (AllowFloatDomain) {
	    // With SSE3 we can use MOVDDUP and friends. They are no slower than
	    // UNPCKLPD but have the option to fold the input operand into even an
	    // unaligned memory load.
	    if (MaskVT.is128BitVector() && Subtarget.hasSSE3()) {
	      if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0}))
	        return Accept(X86ISD::MOVDDUP, MVT::v2f64);
	      if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}))
	        return Accept(X86ISD::MOVSLDUP, MVT::v4f32);
	      if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3}))
	        return Accept(X86ISD::MOVSHDUP, MVT::v4f32);
	    }

	    if (MaskVT.is256BitVector()) {
	      assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
	      if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2}))
	        return Accept(X86ISD::MOVDDUP, MVT::v4f64);
	      if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
	        return Accept(X86ISD::MOVSLDUP, MVT::v8f32);
	      if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7}))
	        return Accept(X86ISD::MOVSHDUP, MVT::v8f32);
	    }

	    if (MaskVT.is512BitVector()) {
	      assert(Subtarget.hasAVX512() &&
	             "AVX512 required for 512-bit vector shuffles");
	      if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
	        return Accept(X86ISD::MOVDDUP, MVT::v8f64);
	      if (isTargetShuffleEquivalent(
	              Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}))
	        return Accept(X86ISD::MOVSLDUP, MVT::v16f32);
	      if (isTargetShuffleEquivalent(
	              Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}))
	        return Accept(X86ISD::MOVSHDUP, MVT::v16f32);
	    }
	  }

	  // Broadcast-from-vector: every element selects element 0.
	  if (Subtarget.hasAVX2()) {
	    SmallVector<int, 64> SplatOfZero(NumElts, 0);
	    if (isTargetShuffleEquivalent(Mask, SplatOfZero))
	      return Accept(X86ISD::VBROADCAST, MaskVT);
	  }

	  return false;
	}

	// Attempt to match a combined shuffle mask against supported unary immediate
	// permute instructions.
	//
	// MaskVT/Mask describe the combined single-input shuffle, Zeroable marks the
	// mask elements known to be undef/zero, and the Allow*Domain flags select
	// which execution domains may be used. On success returns true and reports
	// the opcode in Shuffle, the type to perform it in via ShuffleVT and the
	// immediate operand in PermuteImm; the outputs are only meaningful when true
	// is returned.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           bool AllowFloatDomain,
	                                           bool AllowIntDomain,
	                                           const X86Subtarget &Subtarget,
	                                           unsigned &Shuffle, MVT &ShuffleVT,
	                                           unsigned &PermuteImm) {
	  unsigned NumMaskElts = Mask.size();
	  unsigned InputSizeInBits = MaskVT.getSizeInBits();
	  unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
	  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

	  // The immediate permutes below cannot produce zero elements.
	  bool ContainsZeros =
	      llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });

	  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
	  if (!ContainsZeros && MaskScalarSizeInBits == 64) {
	    // Check for lane crossing permutes.
	    if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
	      // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
	      if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
	        Shuffle = X86ISD::VPERMI;
	        ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
	        PermuteImm = getV4X86ShuffleImm(Mask);
	        return true;
	      }
	      // For 512-bit vectors the same permute must repeat in both 256-bit
	      // halves.
	      if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
	        SmallVector<int, 4> RepeatedMask;
	        if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
	          Shuffle = X86ISD::VPERMI;
	          ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
	          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
	          return true;
	        }
	      }
	    } else if (AllowFloatDomain && Subtarget.hasAVX()) {
	      // VPERMILPD can permute with a non-repeating shuffle.
	      Shuffle = X86ISD::VPERMILPI;
	      ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
	      PermuteImm = 0;
	      // One immediate bit per element: select the low or high f64 of the
	      // element's own 128-bit lane. Undef elements leave their bit clear.
	      for (int i = 0, e = Mask.size(); i != e; ++i) {
	        int M = Mask[i];
	        if (M == SM_SentinelUndef)
	          continue;
	        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
	        PermuteImm |= (M & 1) << i;
	      }
	      return true;
	    }
	  }

	  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
	  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
	  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
	  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
	      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
	    SmallVector<int, 4> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	      // Narrow the repeated mask to create 32-bit element permutes.
	      SmallVector<int, 4> WordMask = RepeatedMask;
	      if (MaskScalarSizeInBits == 64)
	        scaleShuffleMask<int>(2, RepeatedMask, WordMask);

	      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
	      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
	      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
	      PermuteImm = getV4X86ShuffleImm(WordMask);
	      return true;
	    }
	  }

	  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
	  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
	    SmallVector<int, 4> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
	      // Build the immediate from the per-lane repeated mask rather than
	      // from the first eight entries of Mask: for vectors wider than 128
	      // bits an element that is undef in lane 0 may be defined by another
	      // lane, and that constraint is only visible in RepeatedMask.
	      ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
	      ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);

	      // PSHUFLW: permute lower 4 elements only.
	      if (isUndefOrInRange(LoMask, 0, 4) &&
	          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
	        Shuffle = X86ISD::PSHUFLW;
	        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	        PermuteImm = getV4X86ShuffleImm(LoMask);
	        return true;
	      }

	      // PSHUFHW: permute upper 4 elements only.
	      if (isUndefOrInRange(HiMask, 4, 8) &&
	          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
	        // Offset the HiMask so that we can create the shuffle immediate.
	        // Negative (sentinel) entries are passed through untouched.
	        int OffsetHiMask[4];
	        for (int i = 0; i != 4; ++i)
	          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

	        Shuffle = X86ISD::PSHUFHW;
	        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
	        return true;
	      }
	    }
	  }

	  // Attempt to match against byte/bit shifts.
	  // FIXME: Add 512-bit support.
	  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
	                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
	    // The helper reports the opcode and type through Shuffle/ShuffleVT; a
	    // positive return value is the shift amount.
	    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
	                                             MaskScalarSizeInBits, Mask,
	                                             0, Zeroable, Subtarget);
	    if (0 < ShiftAmt) {
	      PermuteImm = (unsigned)ShiftAmt;
	      return true;
	    }
	  }

	  return false;
	}

	// Try to express a combined shuffle mask as one two-operand shuffle
	// instruction that takes no immediate. The mask may be unary (IsUnary), in
	// which case V2 may be rewritten to alias V1.
	//
	// On success returns true and reports the opcode in Shuffle, the operand type
	// in SrcVT and the result type in DstVT; V1/V2 hold the operands to use.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                     bool AllowFloatDomain, bool AllowIntDomain,
	                                     SDValue &V1, SDValue &V2, SDLoc &DL,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget,
	                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
	                                     bool IsUnary) {
	  const unsigned EltBits = MaskVT.getScalarSizeInBits();
	  const bool Is128 = MaskVT.is128BitVector();
	  const bool Is256 = MaskVT.is256BitVector();

	  // Records a match whose source and destination types coincide.
	  auto Accept = [&](unsigned Opcode, MVT VT) {
	    Shuffle = Opcode;
	    SrcVT = DstVT = VT;
	    return true;
	  };

	  if (Is128) {
	    // MOVLHPS / MOVHLPS duplicate one 64-bit half of the single input.
	    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
	      V2 = V1;
	      return Accept(X86ISD::MOVLHPS, MVT::v4f32);
	    }
	    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
	      V2 = V1;
	      return Accept(X86ISD::MOVHLPS, MVT::v4f32);
	    }

	    // MOVSD / MOVSS are only used in the float domain or when SSE4.1 is
	    // unavailable.
	    const bool MovScalarOk = AllowFloatDomain || !Subtarget.hasSSE41();
	    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
	        MovScalarOk) {
	      std::swap(V1, V2);
	      return Accept(X86ISD::MOVSD, MaskVT);
	    }
	    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && MovScalarOk)
	      return Accept(X86ISD::MOVSS, MaskVT);
	  }

	  // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
	  // TODO add support for 256/512-bit types.
	  const bool PackCandidate =
	      (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2();
	  if (PackCandidate &&
	      matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
	                                 Subtarget)) {
	    DstVT = MaskVT;
	    return true;
	  }

	  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
	  const bool UnpckCandidate =
	      (MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
	      (Is128 && Subtarget.hasSSE2()) ||
	      (Is256 && 32 <= EltBits && Subtarget.hasAVX()) ||
	      (Is256 && Subtarget.hasAVX2()) ||
	      (MaskVT.is512BitVector() && Subtarget.hasAVX512());
	  if (UnpckCandidate &&
	      matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
	                                  DAG, Subtarget)) {
	    // Without AVX2, 256-bit unpacks are performed on the float types.
	    MVT UnpckVT = MaskVT;
	    if (Is256 && !Subtarget.hasAVX2())
	      UnpckVT = (EltBits == 32) ? MVT::v8f32 : MVT::v4f64;
	    SrcVT = DstVT = UnpckVT;
	    return true;
	  }

	  return false;
	}

	// Attempt to match a combined shuffle mask against supported two-operand
	// shuffle instructions that take an 8-bit immediate (PALIGNR, BLENDI,
	// INSERTPS, SHUFPD/SHUFPS).
	//
	// MaskVT/Mask describe the combined shuffle, Zeroable marks mask elements
	// that may be treated as zero, and the Allow*Domain flags select which
	// execution domains may be used. On success returns true and reports the
	// opcode in Shuffle, the type to perform it in via ShuffleVT and the
	// immediate in PermuteImm; V1/V2 are passed by reference to the matchers and
	// may be replaced (e.g. by a zero vector) with the operands to use.
	static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	const APInt &Zeroable,
	bool AllowFloatDomain,
	bool AllowIntDomain,
	SDValue &V1, SDValue &V2, SDLoc &DL,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	unsigned &Shuffle, MVT &ShuffleVT,
	unsigned &PermuteImm) {
	unsigned NumMaskElts = Mask.size();
	unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

	// Attempt to match against PALIGNR byte rotate.
	if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
	// A positive result is used directly as the PALIGNR immediate.
	int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
	if (0 < ByteRotation) {
	Shuffle = X86ISD::PALIGNR;
	ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
	PermuteImm = ByteRotation;
	return true;
	}
	}

	// Attempt to combine to X86ISD::BLENDI.
	// Only masks of at most 8 elements are considered, plus v16i16 with AVX2
	// (handled separately below because its lanes must repeat).
	if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) \|\|
	(Subtarget.hasAVX() && MaskVT.is256BitVector()))) \|\|
	(MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
	uint64_t BlendMask = 0;
	bool ForceV1Zero = false, ForceV2Zero = false;
	// Work on a private copy of the mask; the matcher receives it by name and
	// it is re-examined below for the v16i16 case.
	SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
	if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
	BlendMask)) {
	if (MaskVT == MVT::v16i16) {
	// We can only use v16i16 PBLENDW if the lanes are repeated.
	SmallVector<int, 8> RepeatedMask;
	if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
	RepeatedMask)) {
	assert(RepeatedMask.size() == 8 &&
	"Repeated mask size doesn't match!");
	// Set immediate bit i when repeated element i has an index >= 8.
	PermuteImm = 0;
	for (int i = 0; i < 8; ++i)
	if (RepeatedMask[i] >= 8)
	PermuteImm \|= 1 << i;
	// Substitute a zero vector for any input the matcher flagged.
	V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
	V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
	Shuffle = X86ISD::BLENDI;
	ShuffleVT = MaskVT;
	return true;
	}
	// Non-repeating v16i16 blends fall through to the matchers below.
	} else {
	// Determine a type compatible with X86ISD::BLENDI.
	ShuffleVT = MaskVT;
	if (Subtarget.hasAVX2()) {
	// With AVX2 the 64-bit integer types are remapped to 32-bit elements.
	if (ShuffleVT == MVT::v4i64)
	ShuffleVT = MVT::v8i32;
	else if (ShuffleVT == MVT::v2i64)
	ShuffleVT = MVT::v4i32;
	} else {
	// Without AVX2, 128-bit integer types are remapped to v8i16 and
	// 256-bit integer types to the equivalent floating point type.
	if (ShuffleVT == MVT::v2i64 \|\| ShuffleVT == MVT::v4i32)
	ShuffleVT = MVT::v8i16;
	else if (ShuffleVT == MVT::v4i64)
	ShuffleVT = MVT::v4f64;
	else if (ShuffleVT == MVT::v8i32)
	ShuffleVT = MVT::v8f32;
	}

	if (!ShuffleVT.isFloatingPoint()) {
	// Scale is the ratio between the mask element width and the element
	// width of the chosen blend type; the blend mask and ShuffleVT are
	// rescaled by it.
	int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
	BlendMask =
	scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
	ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
	ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
	}

	// Substitute a zero vector for any input the matcher flagged.
	V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
	V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
	PermuteImm = (unsigned)BlendMask;
	Shuffle = X86ISD::BLENDI;
	return true;
	}
	}
	}

	// Attempt to combine to INSERTPS.
	if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
	MaskVT.is128BitVector()) {
	// Only tried when at least one element is zeroable.
	if (Zeroable.getBoolValue() &&
	matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
	Shuffle = X86ISD::INSERTPS;
	ShuffleVT = MVT::v4f32;
	return true;
	}
	}

	// Attempt to combine to SHUFPD.
	if (AllowFloatDomain && EltSizeInBits == 64 &&
	((MaskVT.is128BitVector() && Subtarget.hasSSE2()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX()) \|\|
	(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
	if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
	Shuffle = X86ISD::SHUFP;
	ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
	return true;
	}
	}

	// Attempt to combine to SHUFPS.
	if (AllowFloatDomain && EltSizeInBits == 32 &&
	((MaskVT.is128BitVector() && Subtarget.hasSSE1()) \|\|
	(MaskVT.is256BitVector() && Subtarget.hasAVX()) \|\|
	(MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
	SmallVector<int, 4> RepeatedMask;
	if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
	// Match each half of the repeated mask, to determine if it's just
	// referencing one of the vectors, is zeroable or entirely undef.
	// Returns the operand to use for that half (a null SDValue if the half
	// mixes sources) and writes the two element selectors to S0/S1.
	auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
	int M0 = RepeatedMask[Offset];
	int M1 = RepeatedMask[Offset + 1];

	if (isUndefInRange(RepeatedMask, Offset, 2)) {
	// Entirely undef: S0/S1 are left as the caller initialised them.
	return DAG.getUNDEF(MaskVT);
	} else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
	// Zero/undef only: read from a zero vector.
	S0 = (SM_SentinelUndef == M0 ? -1 : 0);
	S1 = (SM_SentinelUndef == M1 ? -1 : 1);
	return getZeroVector(MaskVT, Subtarget, DAG, DL);
	} else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
	// Both selectors are undef or in [0, 4): use V1.
	S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
	S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
	return V1;
	} else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
	// Both selectors are undef or in [4, 8): use V2.
	S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
	S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
	return V2;
	}

	return SDValue();
	};

	// The low result half takes its operand from Lo and the high half from
	// Hi.
	int ShufMask[4] = {-1, -1, -1, -1};
	SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
	SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

	if (Lo && Hi) {
	V1 = Lo;
	V2 = Hi;
	Shuffle = X86ISD::SHUFP;
	ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
	PermuteImm = getV4X86ShuffleImm(ShufMask);
	return true;
	}
	}
	}

	return false;
	}

	/// \brief Combine an arbitrary chain of shuffles into a single instruction if
	/// possible.
	///
	/// This is the leaf of the recursive combine below. When we have found some
	/// chain of single-use x86 shuffle instructions and accumulated the combined
	/// shuffle mask represented by them, this will try to pattern match that mask
	/// into either a single instruction if there is a special purpose instruction
	/// for this operation, or into a PSHUFB instruction which is a fully general
	/// instruction but should only be used to replace chains over a certain depth.
	static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
	ArrayRef<int> BaseMask, int Depth,
	bool HasVariableMask, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
	assert((Inputs.size() == 1 \|\| Inputs.size() == 2) &&
	"Unexpected number of shuffle inputs!");

	// Find the inputs that enter the chain. Note that multiple uses are OK
	// here, we're not going to remove the operands we find.
	bool UnaryShuffle = (Inputs.size() == 1);
	SDValue V1 = peekThroughBitcasts(Inputs[0]);
	SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
	: peekThroughBitcasts(Inputs[1]));

	MVT VT1 = V1.getSimpleValueType();
	MVT VT2 = V2.getSimpleValueType();
	MVT RootVT = Root.getSimpleValueType();
	assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
	VT2.getSizeInBits() == RootVT.getSizeInBits() &&
	"Vector size mismatch");

	SDLoc DL(Root);
	SDValue Res;

	unsigned NumBaseMaskElts = BaseMask.size();
	if (NumBaseMaskElts == 1) {
	assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
	return DAG.getBitcast(RootVT, V1);
	}

	unsigned RootSizeInBits = RootVT.getSizeInBits();
	unsigned NumRootElts = RootVT.getVectorNumElements();
	unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
	bool FloatDomain = VT1.isFloatingPoint() \|\| VT2.isFloatingPoint() \|\|
	(RootVT.is256BitVector() && !Subtarget.hasAVX2());

	// Don't combine if we are an AVX512/EVEX target and the mask element size
	// is different from the root element size - this would prevent writemasks
	// from being reused.
	// TODO - this currently prevents all lane shuffles from occurring.
	// TODO - check for writemasks usage instead of always preventing combining.
	// TODO - attempt to narrow Mask back to writemask size.
	bool IsEVEXShuffle =
	RootSizeInBits == 512 \|\| (Subtarget.hasVLX() && RootSizeInBits >= 128);

	// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.

	// Handle 128-bit lane shuffles of 256-bit vectors.
	// If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
	// we need to use the zeroing feature.
	// TODO - this should support binary shuffles.
	if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
	!(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
	!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
	return SDValue(); // Nothing to do!
	MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
	unsigned PermMask = 0;
	PermMask \|= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
	PermMask \|= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);

	Res = DAG.getBitcast(ShuffleVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
	DAG.getUNDEF(ShuffleVT),
	DAG.getConstant(PermMask, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// For masks that have been widened to 128-bit elements or more,
	// narrow back down to 64-bit elements.
	SmallVector<int, 64> Mask;
	if (BaseMaskEltSizeInBits > 64) {
	assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
	int MaskScale = BaseMaskEltSizeInBits / 64;
	scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
	} else {
	Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
	}

	unsigned NumMaskElts = Mask.size();
	unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;

	// Determine the effective mask value type.
	FloatDomain &= (32 <= MaskEltSizeInBits);
	MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
	: MVT::getIntegerVT(MaskEltSizeInBits);
	MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);

	// Only allow legal mask types.
	if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
	return SDValue();

	// Attempt to match the mask against known shuffle patterns.
	MVT ShuffleSrcVT, ShuffleVT;
	unsigned Shuffle, PermuteImm;

	// Which shuffle domains are permitted?
	// Permit domain crossing at higher combine depths.
	bool AllowFloatDomain = FloatDomain \|\| (Depth > 3);
	bool AllowIntDomain = (!FloatDomain \|\| (Depth > 3)) && Subtarget.hasSSE2() &&
	(!MaskVT.is256BitVector() \|\| Subtarget.hasAVX2());

	// Determine zeroable mask elements.
	APInt Zeroable(NumMaskElts, 0);
	for (unsigned i = 0; i != NumMaskElts; ++i)
	if (isUndefOrZero(Mask[i]))
	Zeroable.setBit(i);

	if (UnaryShuffle) {
	// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
	// directly if we don't shuffle the lower element and we shuffle the upper
	// (zero) elements within themselves.
	if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
	(V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
	unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
	ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
	if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
	isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
	return DAG.getBitcast(RootVT, V1);
	}
	}

	SDValue NewV1 = V1; // Save operand in case early exit happens.
	if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
	NewV1, DL, DAG, Subtarget, Shuffle,
	ShuffleSrcVT, ShuffleVT) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
	AllowIntDomain, Subtarget, Shuffle,
	ShuffleVT, PermuteImm) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	Res = DAG.getBitcast(ShuffleVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
	DAG.getConstant(PermuteImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}
	}

	SDValue NewV1 = V1; // Save operands in case early exit happens.
	SDValue NewV2 = V2;
	if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
	NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
	ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
	DCI.AddToWorklist(NewV1.getNode());
	NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
	DCI.AddToWorklist(NewV2.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	NewV1 = V1; // Save operands in case early exit happens.
	NewV2 = V2;
	if (matchBinaryPermuteVectorShuffle(
	MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
	NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
	(!IsEVEXShuffle \|\| (NumRootElts == ShuffleVT.getVectorNumElements()))) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return SDValue(); // Nothing to do!
	NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
	DCI.AddToWorklist(NewV1.getNode());
	NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
	DCI.AddToWorklist(NewV2.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
	DAG.getConstant(PermuteImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// Typically from here on, we need an integer version of MaskVT.
	MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
	IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);

	// Annoyingly, SSE4A instructions don't map into the above match helpers.
	if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
	uint64_t BitLen, BitIdx;
	if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
	Zeroable)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
	return SDValue(); // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
	DAG.getConstant(BitLen, DL, MVT::i8),
	DAG.getConstant(BitIdx, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
	return SDValue(); // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(IntMaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
	DAG.getConstant(BitLen, DL, MVT::i8),
	DAG.getConstant(BitIdx, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}
	}

	// Don't try to re-form single instruction chains under any circumstances now
	// that we've done encoding canonicalization for them.
	if (Depth < 2)
	return SDValue();

	// Depth threshold above which we can efficiently use variable mask shuffles.
	int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
	bool AllowVariableMask = (Depth >= VariableShuffleDepth) \|\| HasVariableMask;

	bool MaskContainsZeros =
	any_of(Mask, [](int M) { return M == SM_SentinelZero; });

	if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
	// If we have a single input lane-crossing shuffle then lower to VPERMV.
	if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX2() &&
	(MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
	// vector as the second source.
	if (UnaryShuffle && AllowVariableMask &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	// Adjust shuffle mask - replace SM_SentinelZero with second source index.
	for (unsigned i = 0; i != NumMaskElts; ++i)
	if (Mask[i] == SM_SentinelZero)
	Mask[i] = NumMaskElts + i;

	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
	DCI.AddToWorklist(Zero.getNode());
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
	if (AllowVariableMask && !MaskContainsZeros &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	V1 = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(MaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}
	return SDValue();
	}

	// See if we can combine a single input shuffle with zeros to a bit-mask,
	// which is much simpler than any shuffle.
	if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
	isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
	DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
	APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
	APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
	APInt UndefElts(NumMaskElts, 0);
	SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
	for (unsigned i = 0; i != NumMaskElts; ++i) {
	int M = Mask[i];
	if (M == SM_SentinelUndef) {
	UndefElts.setBit(i);
	continue;
	}
	if (M == SM_SentinelZero)
	continue;
	EltBits[i] = AllOnes;
	}
	SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
	DCI.AddToWorklist(BitMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	unsigned AndOpcode =
	FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
	Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have a single input shuffle with different shuffle patterns in
	// the 128-bit lanes, use the variable mask to VPERMILPS.
	// TODO Combine other mask types at higher depths.
	if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
	((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) \|\|
	(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
	SmallVector<SDValue, 16> VPermIdx;
	for (int M : Mask) {
	SDValue Idx =
	M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
	VPermIdx.push_back(Idx);
	}
	SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
	// to VPERMIL2PD/VPERMIL2PS.
	if (AllowVariableMask && Subtarget.hasXOP() &&
	(MaskVT == MVT::v2f64 \|\| MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4f32 \|\|
	MaskVT == MVT::v8f32)) {
	// VPERMIL2 Operation.
	// Bits[3] - Match Bit.
	// Bits[2:1] - (Per Lane) PD Shuffle Mask.
	// Bits[2:0] - (Per Lane) PS Shuffle Mask.
	unsigned NumLanes = MaskVT.getSizeInBits() / 128;
	unsigned NumEltsPerLane = NumMaskElts / NumLanes;
	SmallVector<int, 8> VPerm2Idx;
	unsigned M2ZImm = 0;
	for (int M : Mask) {
	if (M == SM_SentinelUndef) {
	VPerm2Idx.push_back(-1);
	continue;
	}
	if (M == SM_SentinelZero) {
	M2ZImm = 2;
	VPerm2Idx.push_back(8);
	continue;
	}
	int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
	Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
	VPerm2Idx.push_back(Index);
	}
	V1 = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(MaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPerm2MaskOp.getNode());
	Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
	DAG.getConstant(M2ZImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// If we have 3 or more shuffle instructions or a chain involving a variable
	// mask, we can replace them with a single PSHUFB instruction profitably.
	// Intel's manuals suggest only using PSHUFB if doing so replaces 5
	// instructions, but in practice PSHUFB tends to be very fast so we're
	// more aggressive.
	if (UnaryShuffle && AllowVariableMask &&
	((RootVT.is128BitVector() && Subtarget.hasSSSE3()) \|\|
	(RootVT.is256BitVector() && Subtarget.hasAVX2()) \|\|
	(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
	SmallVector<SDValue, 16> PSHUFBMask;
	int NumBytes = RootVT.getSizeInBits() / 8;
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	if (M == SM_SentinelZero) {
	PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
	continue;
	}
	M = Ratio * M + i % Ratio;
	assert((M / 16) == (i / 16) && "Lane crossing detected");
	PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
	Res = DAG.getBitcast(ByteVT, V1);
	DCI.AddToWorklist(Res.getNode());
	SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
	DCI.AddToWorklist(PSHUFBMaskOp.getNode());
	Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// With XOP, if we have a 128-bit binary input shuffle we can always combine
	// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
	// slower than PSHUFB on targets that support both.
	if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
	// VPPERM Mask Operation
	// Bits[4:0] - Byte Index (0 - 31)
	// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
	SmallVector<SDValue, 16> VPPERMMask;
	int NumBytes = 16;
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	if (M == SM_SentinelZero) {
	VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
	continue;
	}
	M = Ratio * M + i % Ratio;
	VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::v16i8;
	V1 = DAG.getBitcast(ByteVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(ByteVT, V2);
	DCI.AddToWorklist(V2.getNode());
	SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
	DCI.AddToWorklist(VPPERMMaskOp.getNode());
	Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
	DCI.AddToWorklist(Res.getNode());
	return DAG.getBitcast(RootVT, Res);
	}

	// Failed to find any combines.
	return SDValue();
	}

	// Attempt to constant fold all of the constant source ops.
	//
	// Returns the folded constant vector, bitcast to Root's type, when constant
	// bits could be extracted from every entry of Ops and the fold is allowed by
	// the one-use / variable-mask heuristic below. Returns an empty SDValue
	// otherwise.
	//
	// Ops             - shuffle source operands; Mask indexes their concatenation.
	// Mask            - accumulated shuffle mask (may contain SM_SentinelUndef and
	//                   SM_SentinelZero entries).
	// Root            - node being replaced; supplies the result type and SDLoc.
	// HasVariableMask - true if a variable-mask shuffle is part of the chain.
	// Subtarget       - currently unused by this function.
	//
	// TODO: Extend this to merge multiple constant Ops and update the mask.
	static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
	                                           ArrayRef<int> Mask, SDValue Root,
	                                           bool HasVariableMask,
	                                           SelectionDAG &DAG,
	                                           TargetLowering::DAGCombinerInfo &DCI,
	                                           const X86Subtarget &Subtarget) {
	  MVT VT = Root.getSimpleValueType();

	  unsigned SizeInBits = VT.getSizeInBits();
	  unsigned NumMaskElts = Mask.size();
	  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
	  unsigned NumOps = Ops.size();

	  // Extract constant bits from each source op. Bail out as soon as one of
	  // them cannot be decoded at the mask element width.
	  bool OneUseConstantOp = false;
	  SmallVector<APInt, 16> UndefEltsOps(NumOps);
	  SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
	  for (unsigned i = 0; i != NumOps; ++i) {
	    SDValue SrcOp = Ops[i];
	    OneUseConstantOp |= SrcOp.hasOneUse();
	    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
	                                       RawBitsOps[i]))
	      return SDValue();
	  }

	  // Only fold if at least one of the constants is only used once or
	  // the combined shuffle has included a variable mask shuffle, this
	  // is to avoid constant pool bloat.
	  if (!OneUseConstantOp && !HasVariableMask)
	    return SDValue();

	  // Shuffle the constant bits according to the mask.
	  // ZeroElts and ConstantElts are only consumed by the sanity assert below;
	  // zero elements keep the null value ConstantBitData was initialised with.
	  APInt UndefElts(NumMaskElts, 0);
	  APInt ZeroElts(NumMaskElts, 0);
	  APInt ConstantElts(NumMaskElts, 0);
	  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
	                                        APInt::getNullValue(MaskSizeInBits));
	  for (unsigned i = 0; i != NumMaskElts; ++i) {
	    int M = Mask[i];
	    if (M == SM_SentinelUndef) {
	      UndefElts.setBit(i);
	      continue;
	    }
	    if (M == SM_SentinelZero) {
	      ZeroElts.setBit(i);
	      continue;
	    }
	    assert(0 <= M && M < (int)(NumMaskElts * NumOps));

	    // Split the flat mask index into (source operand, element in operand).
	    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
	    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

	    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
	    if (SrcUndefElts[SrcMaskIdx]) {
	      UndefElts.setBit(i);
	      continue;
	    }

	    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
	    const APInt &Bits = SrcEltBits[SrcMaskIdx];
	    if (!Bits) {
	      ZeroElts.setBit(i);
	      continue;
	    }

	    ConstantElts.setBit(i);
	    ConstantBitData[i] = Bits;
	  }
	  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());

	  // Create the constant data. Use a floating point element type only when the
	  // root is floating point and the element width is 32 or 64 bits.
	  MVT MaskSVT;
	  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
	    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
	  else
	    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

	  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

	  SDLoc DL(Root);
	  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
	  DCI.AddToWorklist(CstOp.getNode());
	  return DAG.getBitcast(VT, CstOp);
	}

	/// \brief Fully generic combining of x86 shuffle instructions.
	///
	/// This should be the last combine run over the x86 shuffle instructions. Once
	/// they have been fully optimized, this will recursively consider all chains
	/// of single-use shuffle instructions, build a generic model of the cumulative
	/// shuffle operation, and check for simpler instructions which implement this
	/// operation. We use this primarily for two purposes:
	///
	/// 1) Collapse generic shuffles to specialized single instructions when
	///    equivalent. In most cases, this is just an encoding size win, but
	///    sometimes we will collapse multiple generic shuffles into a single
	///    special-purpose shuffle.
	/// 2) Look for sequences of shuffle instructions with 3 or more total
	///    instructions, and replace them with the slightly more expensive SSSE3
	///    PSHUFB instruction if available. We do this as the last combining step
	///    to ensure we avoid using PSHUFB if we can implement the shuffle with
	///    a suitable short sequence of other instructions. The PSHUFB will either
	///    use a register or have to read from memory and so is slightly (but only
	///    slightly) more expensive than the other shuffle instructions.
	///
	/// Because this is inherently a quadratic operation (for each shuffle in
	/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
	/// This should never be an issue in practice as the shuffle lowering doesn't
	/// produce sequences of more than 8 instructions.
	///
	/// FIXME: We will currently miss some cases where the redundant shuffling
	/// would simplify under the threshold for PSHUFB formation because of
	/// combine-ordering. To fix this, we should do the redundant instruction
	/// combining in this recursive walk.
	///
	/// \param SrcOps          Source operands accumulated so far.
	/// \param SrcOpIndex      Index into SrcOps of the operand to look through.
	/// \param Root            Node being replaced; supplies the result type.
	/// \param RootMask        Mask accumulated so far, indexing the concatenation
	///                        of SrcOps (negative entries are sentinels).
	/// \param SrcNodes        Shuffle nodes that have already been combined.
	/// \param Depth           Current recursion depth (bails out above 8).
	/// \param HasVariableMask True if a variable-mask shuffle was already seen.
	/// \returns The replacement value, or an empty SDValue if nothing combined.
	static SDValue combineX86ShufflesRecursively(
	    ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
	    ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,
	    bool HasVariableMask, SelectionDAG &DAG,
	    TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
	  // Bound the depth of our recursive combine because this is ultimately
	  // quadratic in nature.
	  if (Depth > 8)
	    return SDValue();

	  // Directly rip through bitcasts to find the underlying operand.
	  SDValue Op = SrcOps[SrcOpIndex];
	  Op = peekThroughOneUseBitcasts(Op);

	  MVT VT = Op.getSimpleValueType();
	  if (!VT.isVector())
	    return SDValue(); // Bail if we hit a non-vector.

	  assert(Root.getSimpleValueType().isVector() &&
	         "Shuffles operate on vector types!");
	  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
	         "Can only combine shuffles of the same vector register size.");

	  // Extract target shuffle mask and resolve sentinels and inputs.
	  SmallVector<int, 64> OpMask;
	  SmallVector<SDValue, 2> OpInputs;
	  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
	    return SDValue();

	  assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
	  SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
	  SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());

	  // Add the inputs to the Ops list, avoiding duplicates.
	  SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());

	  // Look for existing entries that are the same value modulo bitcasts.
	  int InputIdx0 = -1, InputIdx1 = -1;
	  for (int i = 0, e = Ops.size(); i < e; ++i) {
	    SDValue BC = peekThroughBitcasts(Ops[i]);
	    if (Input0 && BC == peekThroughBitcasts(Input0))
	      InputIdx0 = i;
	    if (Input1 && BC == peekThroughBitcasts(Input1))
	      InputIdx1 = i;
	  }

	  // A new first input takes over the slot of the op we looked through; a new
	  // second input is appended.
	  if (Input0 && InputIdx0 < 0) {
	    InputIdx0 = SrcOpIndex;
	    Ops[SrcOpIndex] = Input0;
	  }
	  if (Input1 && InputIdx1 < 0) {
	    InputIdx1 = Ops.size();
	    Ops.push_back(Input1);
	  }

	  assert(((RootMask.size() > OpMask.size() &&
	           RootMask.size() % OpMask.size() == 0) ||
	          (OpMask.size() > RootMask.size() &&
	           OpMask.size() % RootMask.size() == 0) ||
	          OpMask.size() == RootMask.size()) &&
	         "The smaller number of elements must divide the larger.");

	  // This function can be performance-critical, so we rely on the power-of-2
	  // knowledge that we have about the mask sizes to replace div/rem ops with
	  // bit-masks and shifts.
	  assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
	  assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
	  unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
	  unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

	  unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
	  unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
	  unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
	  assert((RootRatio == 1 || OpRatio == 1) &&
	         "Must not have a ratio for both incoming and op masks!");

	  assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
	  assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
	  assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
	  unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
	  unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

	  SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);

	  // Merge this shuffle operation's mask into our accumulated mask. Note that
	  // this shuffle's mask will be the first applied to the input, followed by the
	  // root mask to get us all the way to the root value arrangement. The reason
	  // for this order is that we are recursing up the operation chain.
	  for (unsigned i = 0; i < MaskWidth; ++i) {
	    unsigned RootIdx = i >> RootRatioLog2;
	    if (RootMask[RootIdx] < 0) {
	      // This is a zero or undef lane, we're done.
	      Mask[i] = RootMask[RootIdx];
	      continue;
	    }

	    unsigned RootMaskedIdx =
	        RootRatio == 1
	            ? RootMask[RootIdx]
	            : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

	    // Just insert the scaled root mask value if it references an input other
	    // than the SrcOp we're currently inserting.
	    if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
	        (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
	      Mask[i] = RootMaskedIdx;
	      continue;
	    }

	    RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
	    unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
	    if (OpMask[OpIdx] < 0) {
	      // The incoming lanes are zero or undef, it doesn't matter which ones we
	      // are using.
	      Mask[i] = OpMask[OpIdx];
	      continue;
	    }

	    // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
	    unsigned OpMaskedIdx =
	        OpRatio == 1
	            ? OpMask[OpIdx]
	            : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));

	    // Reduce to an index within one input, then rebase it onto the slot that
	    // input occupies in Ops.
	    OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
	    if (OpMask[OpIdx] < (int)OpMask.size()) {
	      assert(0 <= InputIdx0 && "Unknown target shuffle input");
	      OpMaskedIdx += InputIdx0 * MaskWidth;
	    } else {
	      assert(0 <= InputIdx1 && "Unknown target shuffle input");
	      OpMaskedIdx += InputIdx1 * MaskWidth;
	    }

	    Mask[i] = OpMaskedIdx;
	  }

	  // Handle the all undef/zero cases early.
	  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
	    return DAG.getUNDEF(Root.getValueType());

	  // TODO - should we handle the mixed zero/undef case as well? Just returning
	  // a zero mask will lose information on undef elements possibly reducing
	  // future combine possibilities.
	  if (all_of(Mask, [](int Idx) { return Idx < 0; }))
	    return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
	                         SDLoc(Root));

	  // Remove unused shuffle source ops.
	  resolveTargetShuffleInputsAndMask(Ops, Mask);
	  assert(!Ops.empty() && "Shuffle with no inputs detected");

	  HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());

	  // Update the list of shuffle nodes that have been combined so far.
	  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
	                                                SrcNodes.end());
	  CombinedNodes.push_back(Op.getNode());

	  // See if we can recurse into each shuffle source op (if it's a target
	  // shuffle). The source op should only be combined if it either has a
	  // single use (i.e. current Op) or all its users have already been combined.
	  for (int i = 0, e = Ops.size(); i < e; ++i)
	    if (Ops[i].getNode()->hasOneUse() ||
	        SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
	      if (SDValue Res = combineX86ShufflesRecursively(
	              Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
	              DAG, DCI, Subtarget))
	        return Res;

	  // Attempt to constant fold all of the constant source ops.
	  if (SDValue Cst = combineX86ShufflesConstants(
	          Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
	    return Cst;

	  // We can only combine unary and binary shuffle mask cases.
	  if (Ops.size() > 2)
	    return SDValue();

	  // Minor canonicalization of the accumulated shuffle mask to make it easier
	  // to match below. All this does is detect masks with sequential pairs of
	  // elements, and shrink them to the half-width mask. It does this in a loop
	  // so it will reduce the size of the mask to the minimal width mask which
	  // performs an equivalent shuffle.
	  SmallVector<int, 64> WidenedMask;
	  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
	    Mask = std::move(WidenedMask);
	  }

	  // Canonicalization of binary shuffle masks to improve pattern matching by
	  // commuting the inputs.
	  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
	    ShuffleVectorSDNode::commuteMask(Mask);
	    std::swap(Ops[0], Ops[1]);
	  }

	  // Finally, try to combine into a single shuffle instruction.
	  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
	                                DCI, Subtarget);
	}

	/// \brief Decode a PSHUFD/PSHUFLW/PSHUFHW node into a four-element mask.
	///
	/// Thin helper over getTargetShuffleMask that trims the decoded mask down to
	/// the v4 form which can be fed straight back into a PSHUF-style instruction.
	/// For PSHUFHW the first four entries are dropped and the remaining indices
	/// are rebased by -4. Any other opcode is unreachable.
	static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
	  const MVT NodeVT = N.getSimpleValueType();
	  SmallVector<SDValue, 2> Inputs;
	  SmallVector<int, 4> Decoded;
	  bool IsUnary;
	  bool HaveMask = getTargetShuffleMask(N.getNode(), NodeVT, false, Inputs,
	                                       Decoded, IsUnary);
	  (void)HaveMask;
	  assert(HaveMask);

	  // Vectors wider than 128 bits repeat the same pattern in every lane, so
	  // only the first lane's worth of mask entries is kept.
	  const unsigned VecBits = NodeVT.getSizeInBits();
	  if (VecBits > 128) {
	    const int EltsPerLane = 128 / NodeVT.getScalarSizeInBits();
	#ifndef NDEBUG
	    const int LaneCount = VecBits / 128;
	    for (int Lane = 1; Lane < LaneCount; ++Lane) {
	      const int LaneBase = EltsPerLane * Lane;
	      for (int Elt = 0; Elt < EltsPerLane; ++Elt)
	        assert(Decoded[Elt] == Decoded[LaneBase + Elt] - LaneBase &&
	               "Mask doesn't repeat in high 128-bit lanes!");
	    }
	#endif
	    Decoded.resize(EltsPerLane);
	  }

	  const unsigned Opc = N.getOpcode();

	  // Dword shuffle: the lane mask already is the v4 mask.
	  if (Opc == X86ISD::PSHUFD)
	    return Decoded;

	  // Low-word shuffle: only the first four entries are of interest.
	  if (Opc == X86ISD::PSHUFLW) {
	    Decoded.resize(4);
	    return Decoded;
	  }

	  // High-word shuffle: drop the first four entries and rebase the rest.
	  if (Opc == X86ISD::PSHUFHW) {
	    Decoded.erase(Decoded.begin(), Decoded.begin() + 4);
	    for (unsigned Idx = 0, End = Decoded.size(); Idx != End; ++Idx)
	      Decoded[Idx] -= 4;
	    return Decoded;
	  }

	  llvm_unreachable("No valid shuffle instruction found!");
	}

	/// \brief Search for a combinable shuffle across a chain ending in pshufd.
	///
	/// We walk up the chain and look for a combinable shuffle, skipping over
	/// shuffles that we could hoist this shuffle's transformation past without
	/// altering anything.
	///
	/// \param N    The PSHUFD node at the bottom of the chain (asserted).
	/// \param Mask The four-entry mask of N. It is rewritten in place to the
	///             merged mask once a combinable shuffle has been found.
	/// \returns A freshly built chain to replace N, or an empty SDValue when no
	///          combinable shuffle was found.
	static SDValue
	combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
	                             SelectionDAG &DAG) {
	  assert(N.getOpcode() == X86ISD::PSHUFD &&
	         "Called with something other than an x86 PSHUFD dword shuffle!");
	  SDLoc DL(N);

	  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
	  // of the shuffles in the chain so that we can form a fresh chain to replace
	  // this one.
	  SmallVector<SDValue, 8> Chain;
	  SDValue V = N.getOperand(0);
	  for (; V.hasOneUse(); V = V.getOperand(0)) {
	    switch (V.getOpcode()) {
	    default:
	      return SDValue(); // Nothing combined!

	    case ISD::BITCAST:
	      // Skip bitcasts as we always know the type for the target specific
	      // instructions.
	      continue;

	    case X86ISD::PSHUFD:
	      // Found another dword shuffle.
	      break;

	    case X86ISD::PSHUFLW:
	      // Check that the low words (being shuffled) are the identity in the
	      // dword shuffle, and the high words are self-contained.
	      if (Mask[0] != 0 || Mask[1] != 1 ||
	          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
	        return SDValue();

	      Chain.push_back(V);
	      continue;

	    case X86ISD::PSHUFHW:
	      // Check that the high words (being shuffled) are the identity in the
	      // dword shuffle, and the low words are self-contained.
	      if (Mask[2] != 2 || Mask[3] != 3 ||
	          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
	        return SDValue();

	      Chain.push_back(V);
	      continue;

	    case X86ISD::UNPCKL:
	    case X86ISD::UNPCKH:
	      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
	      // shuffle into a preceding word shuffle.
	      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
	          V.getSimpleValueType().getVectorElementType() != MVT::i16)
	        return SDValue();

	      // Search for a half-shuffle which we can combine with.
	      unsigned CombineOp =
	          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
	      // Both unpack inputs must be the same value, used by nothing else.
	      if (V.getOperand(0) != V.getOperand(1) ||
	          !V->isOnlyUserOf(V.getOperand(0).getNode()))
	        return SDValue();
	      Chain.push_back(V);
	      V = V.getOperand(0);
	      do {
	        switch (V.getOpcode()) {
	        default:
	          return SDValue(); // Nothing to combine.

	        case X86ISD::PSHUFLW:
	        case X86ISD::PSHUFHW:
	          if (V.getOpcode() == CombineOp)
	            break;

	          // The other half shuffle is kept on the chain and stepped over.
	          Chain.push_back(V);

	          LLVM_FALLTHROUGH;
	        case ISD::BITCAST:
	          V = V.getOperand(0);
	          continue;
	        }
	        break;
	      } while (V.hasOneUse());
	      break;
	    }
	    // Break out of the loop if we break out of the switch.
	    break;
	  }

	  if (!V.hasOneUse())
	    // We fell out of the loop without finding a viable combining instruction.
	    return SDValue();

	  // Merge this node's mask and our incoming mask.
	  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	  for (int &M : Mask)
	    M = VMask[M];
	  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
	                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	  // Rebuild the chain around this new shuffle.
	  while (!Chain.empty()) {
	    SDValue W = Chain.pop_back_val();

	    if (V.getValueType() != W.getOperand(0).getValueType())
	      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

	    switch (W.getOpcode()) {
	    default:
	      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

	    case X86ISD::UNPCKL:
	    case X86ISD::UNPCKH:
	      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
	      break;

	    case X86ISD::PSHUFD:
	    case X86ISD::PSHUFLW:
	    case X86ISD::PSHUFHW:
	      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
	      break;
	    }
	  }
	  if (V.getValueType() != N.getValueType())
	    V = DAG.getBitcast(N.getValueType(), V);

	  // Return the new chain to replace N.
	  return V;
	}

	/// \brief Search for a combinable shuffle across a chain ending in pshuflw or
	/// pshufhw.
	///
	/// We walk up the single-use chain above N, skipping bitcasts and shuffles of
	/// the other half, trying to find a shuffle with the same opcode as N.
	///
	/// \param N    The PSHUFLW/PSHUFHW node at the bottom of the chain (asserted).
	/// \param Mask The four-entry mask of N. It is rewritten in place to the
	///             merged mask once a matching shuffle has been found.
	/// \returns true if N was combined away (the DAG has been updated through
	///          DCI), false if no matching shuffle was found.
	static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
	                                        SelectionDAG &DAG,
	                                        TargetLowering::DAGCombinerInfo &DCI) {
	  assert(
	      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
	      "Called with something other than an x86 128-bit half shuffle!");
	  SDLoc DL(N);
	  unsigned CombineOpcode = N.getOpcode();

	  // Walk up a single-use chain looking for a combinable shuffle.
	  SDValue V = N.getOperand(0);
	  for (; V.hasOneUse(); V = V.getOperand(0)) {
	    switch (V.getOpcode()) {
	    default:
	      return false; // Nothing combined!

	    case ISD::BITCAST:
	      // Skip bitcasts as we always know the type for the target specific
	      // instructions.
	      continue;

	    case X86ISD::PSHUFLW:
	    case X86ISD::PSHUFHW:
	      if (V.getOpcode() == CombineOpcode)
	        break;

	      // Other-half shuffles are no-ops.
	      continue;
	    }
	    // Break out of the loop if we break out of the switch.
	    break;
	  }

	  if (!V.hasOneUse())
	    // We fell out of the loop without finding a viable combining instruction.
	    return false;

	  // Combine away the bottom node as its shuffle will be accumulated into
	  // a preceding shuffle.
	  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

	  // Record the old value.
	  SDValue Old = V;

	  // Merge the found node's mask and our incoming mask.
	  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	  for (int &M : Mask)
	    M = VMask[M];
	  V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
	                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	  // Check that the shuffles didn't cancel each other out. If not, we need to
	  // combine to the new one.
	  if (Old != V)
	    // Replace the combinable shuffle with the combined one, updating all users
	    // so that we re-evaluate the chain here.
	    DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);

	  return true;
	}

	/// \brief Try to combine x86 target specific shuffles.
	static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);
	MVT VT = N.getSimpleValueType();
	SmallVector<int, 4> Mask;
	unsigned Opcode = N.getOpcode();

	// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
	// single instruction.
	if (VT.getScalarSizeInBits() == 64 &&
	(Opcode == X86ISD::MOVSD \|\| Opcode == X86ISD::UNPCKH \|\|
	Opcode == X86ISD::UNPCKL)) {
	auto BC0 = peekThroughBitcasts(N.getOperand(0));
	auto BC1 = peekThroughBitcasts(N.getOperand(1));
	EVT VT0 = BC0.getValueType();
	EVT VT1 = BC1.getValueType();
	unsigned Opcode0 = BC0.getOpcode();
	unsigned Opcode1 = BC1.getOpcode();
	if (Opcode0 == Opcode1 && VT0 == VT1 &&
	(Opcode0 == X86ISD::FHADD \|\| Opcode0 == X86ISD::HADD \|\|
	Opcode0 == X86ISD::FHSUB \|\| Opcode0 == X86ISD::HSUB \|\|
	Opcode0 == X86ISD::PACKSS \|\| Opcode0 == X86ISD::PACKUS)) {
	SDValue Lo, Hi;
	if (Opcode == X86ISD::MOVSD) {
	Lo = BC1.getOperand(0);
	Hi = BC0.getOperand(1);
	} else {
	Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
	Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
	}
	SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
	DCI.AddToWorklist(Horiz.getNode());
	return DAG.getBitcast(VT, Horiz);
	}
	}

	switch (Opcode) {
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	Mask = getPSHUFShuffleMask(N);
	assert(Mask.size() == 4);
	break;
	case X86ISD::UNPCKL: {
	// Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
	// which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
	// moves upper half elements into the lower half part. For example:
	//
	// t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
	// undef:v16i8
	// t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
	//
	// will be combined to:
	//
	// t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1

	// This is only for 128-bit vectors. From SSE4.1 onward this combine may not
	// happen due to advanced instructions.
	if (!VT.is128BitVector())
	return SDValue();

	auto Op0 = N.getOperand(0);
	auto Op1 = N.getOperand(1);
	if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
	ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

	unsigned NumElts = VT.getVectorNumElements();
	SmallVector<int, 8> ExpectedMask(NumElts, -1);
	std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
	NumElts / 2);

	auto ShufOp = Op1.getOperand(0);
	if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
	return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
	}
	return SDValue();
	}
	case X86ISD::BLENDI: {
	SDValue V0 = N->getOperand(0);
	SDValue V1 = N->getOperand(1);
	assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
	"Unexpected input vector types");

	// Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
	// operands and changing the mask to 1. This saves us a bunch of
	// pattern-matching possibilities related to scalar math ops in SSE/AVX.
	// x86InstrInfo knows how to commute this back after instruction selection
	// if it would help register allocation.

	// TODO: If optimizing for size or a processor that doesn't suffer from
	// partial register update stalls, this should be transformed into a MOVSD
	// instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.

	if (VT == MVT::v2f64)
	if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
	if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
	SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
	return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
	}

	return SDValue();
	}
	case X86ISD::MOVSD:
	case X86ISD::MOVSS: {
	SDValue V0 = peekThroughBitcasts(N->getOperand(0));
	SDValue V1 = peekThroughBitcasts(N->getOperand(1));
	bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
	bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
	if (isZero0 && isZero1)
	return SDValue();

	// We often lower to MOVSD/MOVSS from integer as well as native float
	// types; remove unnecessary domain-crossing bitcasts if we can to make it
	// easier to combine shuffles later on. We've already accounted for the
	// domain switching cost when we decided to lower with it.
	bool isFloat = VT.isFloatingPoint();
	bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
	bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
	if ((isFloat != isFloat0 \|\| isZero0) && (isFloat != isFloat1 \|\| isZero1)) {
	MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
	: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
	V0 = DAG.getBitcast(NewVT, V0);
	V1 = DAG.getBitcast(NewVT, V1);
	return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
	}

	return SDValue();
	}
	case X86ISD::INSERTPS: {
	assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
	SDValue Op0 = N.getOperand(0);
	SDValue Op1 = N.getOperand(1);
	SDValue Op2 = N.getOperand(2);
	unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
	unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
	unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
	unsigned ZeroMask = InsertPSMask & 0xF;

	// If we zero out all elements from Op0 then we don't need to reference it.
	if (((ZeroMask \| (1u << DstIdx)) == 0xF) && !Op0.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	// If we zero out the element from Op1 then we don't need to reference it.
	if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	// Attempt to merge insertps Op1 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask1;
	SmallVector<SDValue, 2> Ops1;
	if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
	int M = TargetMask1[SrcIdx];
	if (isUndefOrZero(M)) {
	// Zero/UNDEF insertion - zero out element and remove dependency.
	InsertPSMask \|= (1u << DstIdx);
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getConstant(InsertPSMask, DL, MVT::i8));
	}
	// Update insertps mask srcidx and reference the source input directly.
	assert(0 <= M && M < 8 && "Shuffle index out of range");
	InsertPSMask = (InsertPSMask & 0x3f) \| ((M & 0x3) << 6);
	Op1 = Ops1[M < 4 ? 0 : 1];
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));
	}

	// Attempt to merge insertps Op0 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask0;
	SmallVector<SDValue, 2> Ops0;
	if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
	return SDValue();

	bool Updated = false;
	bool UseInput00 = false;
	bool UseInput01 = false;
	for (int i = 0; i != 4; ++i) {
	int M = TargetMask0[i];
	if ((InsertPSMask & (1u << i)) \|\| (i == (int)DstIdx)) {
	// No change if element is already zero or the inserted element.
	continue;
	} else if (isUndefOrZero(M)) {
	// If the target mask is undef/zero then we must zero the element.
	InsertPSMask \|= (1u << i);
	Updated = true;
	continue;
	}

	// The input vector element must be inline.
	if (M != i && M != (i + 4))
	return SDValue();

	// Determine which inputs of the target shuffle we're using.
	UseInput00 \|= (0 <= M && M < 4);
	UseInput01 \|= (4 <= M);
	}

	// If we're not using both inputs of the target shuffle then use the
	// referenced input directly.
	if (UseInput00 && !UseInput01) {
	Updated = true;
	Op0 = Ops0[0];
	} else if (!UseInput00 && UseInput01) {
	Updated = true;
	Op0 = Ops0[1];
	}

	if (Updated)
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	return SDValue();
	}
	default:
	return SDValue();
	}

	// Nuke no-op shuffles that show up after combining.
	if (isNoopShuffleMask(Mask))
	return DCI.CombineTo(N.getNode(), N.getOperand(0), /AddTo/ true);

	// Look for simplifications involving one or two shuffle instructions.
	SDValue V = N.getOperand(0);
	switch (N.getOpcode()) {
	default:
	break;
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

	if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
	return SDValue(); // We combined away this shuffle, so we're done.

	// See if this reduces to a PSHUFD which is no more expensive and can
	// combine with more operations. Note that it has to at least flip the
	// dwords as otherwise it would have been removed as a no-op.
	if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
	int DMask[] = {0, 1, 2, 3};
	int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
	DMask[DOffset + 0] = DOffset + 1;
	DMask[DOffset + 1] = DOffset + 0;
	MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
	V = DAG.getBitcast(DVT, V);
	DCI.AddToWorklist(V.getNode());
	V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
	getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
	DCI.AddToWorklist(V.getNode());
	return DAG.getBitcast(VT, V);
	}

	// Look for shuffle patterns which can be implemented as a single unpack.
	// FIXME: This doesn't handle the location of the PSHUFD generically, and
	// only works when we have a PSHUFD followed by two half-shuffles.
	if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
	(V.getOpcode() == X86ISD::PSHUFLW \|\|
	V.getOpcode() == X86ISD::PSHUFHW) &&
	V.getOpcode() != N.getOpcode() &&
	V.hasOneUse()) {
	SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
	if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
	SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
	int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int WordMask[8];
	for (int i = 0; i < 4; ++i) {
	WordMask[i + NOffset] = Mask[i] + NOffset;
	WordMask[i + VOffset] = VMask[i] + VOffset;
	}
	// Map the word mask through the DWord mask.
	int MappedMask[8];
	for (int i = 0; i < 8; ++i)
	MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
	if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) \|\|
	makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
	// We can replace all three shuffles with an unpack.
	V = DAG.getBitcast(VT, D.getOperand(0));
	DCI.AddToWorklist(V.getNode());
	return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
	: X86ISD::UNPCKH,
	DL, VT, V, V);
	}
	}
	}

	break;

	case X86ISD::PSHUFD:
	if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
	return NewN;

	break;
	}

	return SDValue();
	}

	/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
	/// operation (or, when \p matchSubAdd is true, a SUBADD operation).
	///
	/// The shuffle must be an ISD::VECTOR_SHUFFLE that blends an FSUB node and an
	/// FADD node computed from the same two operands, taking even result lanes
	/// from one node and odd result lanes from the other:
	///  - ADDSUB (matchSubAdd == false): even lanes from FSUB, odd from FADD.
	///  - SUBADD (matchSubAdd == true):  even lanes from FADD, odd from FSUB.
	///
	/// This function only matches; it does not create any nodes.
	///
	/// \param N            the candidate shuffle node.
	/// \param Subtarget    used to check that the vector type is supported
	///                     (SSE3 for 128-bit, AVX for 256-bit, AVX512 for 512-bit).
	/// \param Opnd0        on success, receives the first operand of the FSUB.
	/// \param Opnd1        on success, receives the second operand of the FSUB.
	/// \param matchSubAdd  match the SUBADD lane pattern instead of ADDSUB.
	/// \returns true on a match; \p Opnd0 / \p Opnd1 are written only in that case.
	static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
	                             SDValue &Opnd0, SDValue &Opnd1,
	                             bool matchSubAdd = false) {

	  EVT VT = N->getValueType(0);
	  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
	      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
	      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
	    return false;

	  // We only handle target-independent shuffles.
	  // FIXME: It would be easy and harmless to use the target shuffle mask
	  // extraction tool to support more.
	  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
	    return false;

	  ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
	  SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());

	  SDValue V1 = N->getOperand(0);
	  SDValue V2 = N->getOperand(1);

	  unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB;
	  unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD;

	  // We require the first shuffle operand to be the ExpectedOpcode node,
	  // and the second to be the NextExpectedOpcode node. If they arrive the
	  // other way round, commute the shuffle so that they do.
	  if (V1.getOpcode() == NextExpectedOpcode &&
	      V2.getOpcode() == ExpectedOpcode) {
	    ShuffleVectorSDNode::commuteMask(Mask);
	    std::swap(V1, V2);
	  } else if (V1.getOpcode() != ExpectedOpcode ||
	             V2.getOpcode() != NextExpectedOpcode)
	    return false;

	  // If there are other uses of these operations we can't fold them.
	  if (!V1->hasOneUse() || !V2->hasOneUse())
	    return false;

	  // Ensure that both operations have the same operands. FSUB is not
	  // commutative, so the operand order must always be taken from the FSUB
	  // node; only the FADD operands may appear in either order. After the
	  // canonicalisation above the FSUB is V1 for ADDSUB and V2 for SUBADD.
	  SDValue SubNode = matchSubAdd ? V2 : V1;
	  SDValue AddNode = matchSubAdd ? V1 : V2;
	  SDValue LHS = SubNode->getOperand(0), RHS = SubNode->getOperand(1);
	  if ((AddNode->getOperand(0) != LHS || AddNode->getOperand(1) != RHS) &&
	      (AddNode->getOperand(0) != RHS || AddNode->getOperand(1) != LHS))
	    return false;

	  // We're looking for blends between FADD and FSUB nodes. We insist on these
	  // nodes being lined up in a specific expected pattern: even lanes from V1,
	  // odd lanes from V2.
	  if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
	        isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
	        isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
	        isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
	                                           8, 25, 10, 27, 12, 29, 14, 31})))
	    return false;

	  Opnd0 = LHS;
	  Opnd1 = RHS;
	  return true;
	}

	/// Try to replace a shuffle that blends an FSUB with an FADD by a single
	/// target node: X86ISD::FMADDSUB when the fused form can be matched,
	/// otherwise X86ISD::ADDSUB.
	///
	/// \returns the new node, or an empty SDValue when \p N is not an add-sub
	/// blend, or when only a plain ADDSUB would apply to a 512-bit vector type.
	static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
	                                                const X86Subtarget &Subtarget,
	                                                SelectionDAG &DAG) {
	  SDValue A, B;
	  if (!isAddSubOrSubAdd(N, Subtarget, A, B))
	    return SDValue();

	  const EVT ResultVT = N->getValueType(0);
	  SDLoc Loc(N);

	  // Prefer the fused multiply form whenever it can be matched.
	  SDValue C;
	  const bool FoldsToFMA = isFMAddSubOrFMSubAdd(Subtarget, DAG, A, B, C, 2);
	  if (FoldsToFMA)
	    return DAG.getNode(X86ISD::FMADDSUB, Loc, ResultVT, A, B, C);

	  // The ADDSUB idiom was recognized, but there are no known X86 targets with
	  // 512-bit ADDSUB instructions, so give up on 512-bit types.
	  return ResultVT.is512BitVector()
	             ? SDValue()
	             : DAG.getNode(X86ISD::ADDSUB, Loc, ResultVT, A, B);
	}

	/// Try to replace a shuffle that blends an FADD with an FSUB (the sub-add
	/// lane pattern) by a single X86ISD::FMSUBADD node.
	///
	/// \returns the new node, or an empty SDValue when \p N is not a sub-add
	/// blend or the fused form cannot be matched.
	static SDValue combineShuffleToFMSubAdd(SDNode *N,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  SDValue A, B;
	  const bool IsSubAdd =
	      isAddSubOrSubAdd(N, Subtarget, A, B, /*matchSubAdd=*/true);
	  if (!IsSubAdd)
	    return SDValue();

	  const EVT ResultVT = N->getValueType(0);
	  SDLoc Loc(N);

	  // Only the fused form is generated here; a failed match yields nothing.
	  SDValue C;
	  if (!isFMAddSubOrFMSubAdd(Subtarget, DAG, A, B, C, 2))
	    return SDValue();

	  return DAG.getNode(X86ISD::FMSUBADD, Loc, ResultVT, A, B, C);
	}

	// Rewrite
	//   (vector_shuffle <mask> (concat_vectors A, undef), (concat_vectors B, undef))
	// as the single-source shuffle
	//   (vector_shuffle <mask'> (concat_vectors A, B), undef)
	// AVX2 has VPERMD/Q, so a single-source shuffle is preferable when we can
	// express the operation that way. Only 128/256-bit vectors with 32/64-bit
	// elements are handled. Returns an empty SDValue if the pattern is absent.
	static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
	                                           const X86Subtarget &Subtarget) {
	  auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N);
	  if (!Subtarget.hasAVX2() || !Shuf)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  if (!(VT.is128BitVector() || VT.is256BitVector()))
	    return SDValue();

	  const EVT EltVT = VT.getVectorElementType();
	  const bool Is32Or64BitElt = EltVT == MVT::i32 || EltVT == MVT::i64 ||
	                              EltVT == MVT::f32 || EltVT == MVT::f64;
	  if (!Is32Or64BitElt)
	    return SDValue();

	  // Both sources must be two-operand concats whose upper half is undef.
	  auto IsConcatWithUndefHigh = [](SDValue V) {
	    return V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2 &&
	           V.getOperand(1).isUndef();
	  };
	  SDValue Src0 = N->getOperand(0);
	  SDValue Src1 = N->getOperand(1);
	  if (!IsConcatWithUndefHigh(Src0) || !IsConcatWithUndefHigh(Src1))
	    return SDValue();

	  // Build the new mask. Indices into the first source are kept as they are;
	  // indices into the second source move down by half a vector because they no
	  // longer have to skip over the first source's undef half.
	  const int NumElts = VT.getVectorNumElements();
	  const int HalfElts = NumElts / 2;
	  SmallVector<int, 8> NewMask;
	  for (int Idx : Shuf->getMask()) {
	    if (Idx >= NumElts)
	      Idx -= HalfElts;
	    NewMask.push_back(Idx);
	  }

	  SDLoc DL(N);
	  SDValue Merged = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src0.getOperand(0),
	                               Src1.getOperand(0));
	  return DAG.getVectorShuffle(VT, DL, Merged, DAG.getUNDEF(VT), NewMask);
	}

	/// Drop a shuffle that only replicates the halves of a 128-bit horizontal
	/// add/sub whose two operands are identical.
	///
	/// \returns the horizontal op itself when the shuffle is redundant, otherwise
	/// an empty SDValue.
	static SDValue foldShuffleOfHorizOp(SDNode *N) {
	  // Only unary target-independent shuffles are considered.
	  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
	    return SDValue();
	  if (!N->getOperand(1).isUndef())
	    return SDValue();

	  SDValue HorizOp = N->getOperand(0);
	  switch (HorizOp.getOpcode()) {
	  case X86ISD::HADD:
	  case X86ISD::FHADD:
	  case X86ISD::HSUB:
	  case X86ISD::FHSUB:
	    break;
	  default:
	    return SDValue();
	  }

	  // 128-bit horizontal math instructions are defined to operate on adjacent
	  // lanes of each operand as:
	  //   v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
	  // ...similarly for v2f64 and v8i16.
	  // TODO: 256-bit is not the same because...x86.
	  const bool SameOperands = HorizOp.getOperand(0) == HorizOp.getOperand(1);
	  if (!SameOperands || HorizOp.getValueSizeInBits() != 128)
	    return SDValue();

	  // With identical operands the low half of the result equals the high half,
	  // so a shuffle that just repeats the low half is a no-op:
	  //   shuffle (hadd X, X), undef, [low half...low half] --> hadd X, X
	  // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
	  // but this should be tied to whatever horizontal op matching and shuffle
	  // canonicalization are producing.
	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
	  const bool ReplicatesLowHalf =
	      isTargetShuffleEquivalent(Mask, {0, 0}) ||
	      isTargetShuffleEquivalent(Mask, {0, 1, 0, 1}) ||
	      isTargetShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3});
	  return ReplicatesLowHalf ? HorizOp : SDValue();
	}

	/// Try the X86-specific combines for the shuffle node \p N, which may be a
	/// generic ISD::VECTOR_SHUFFLE or an X86 target shuffle.
	///
	/// The attempts are made in this order:
	///  1. ADDSUB / FMADDSUB / FMSUBADD and redundant horizontal-op shuffle folds
	///     (only when the result type is legal);
	///  2. moving a bitcast below a one-use binop feeding the shuffle;
	///  3. merging the shuffled scalar loads into one consecutive load;
	///  4. merging two concat-with-undef sources into one source;
	///  5. the target-shuffle specific combines.
	///
	/// \returns the replacement value, or an empty SDValue if nothing matched.
	/// When the recursive target-shuffle combine succeeds, \p N is replaced via
	/// DCI.CombineTo and an empty SDValue is returned.
	static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc dl(N);
	EVT VT = N->getValueType(0);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	// If we have legalized the vector types, look for blends of FADD and FSUB
	// nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
	if (TLI.isTypeLegal(VT)) {
	if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
	return AddSub;

	if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG))
	return FMSubAdd;

	if (SDValue HAddSub = foldShuffleOfHorizOp(N))
	return HAddSub;
	}

	// During Type Legalization, when promoting illegal vector types,
	// the backend might introduce new shuffle dag nodes and bitcasts.
	//
	// This code performs the following transformation:
	// fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
	// (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
	//
	// We do this only if both the bitcast and the BINOP dag nodes have
	// one use. Also, perform this transformation only if the new binary
	// operation is legal. This is to avoid introducing dag nodes that
	// potentially need to be further expanded (or custom lowered) into a
	// less optimal sequence of dag nodes.
	if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
	N->getOpcode() == ISD::VECTOR_SHUFFLE &&
	N->getOperand(0).getOpcode() == ISD::BITCAST &&
	N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);

	SDValue BC0 = N0.getOperand(0);
	EVT SVT = BC0.getValueType();
	unsigned Opcode = BC0.getOpcode();
	unsigned NumElts = VT.getVectorNumElements();

	// The bitcast source must have exactly half as many (twice as wide)
	// elements as the shuffle type.
	if (BC0.hasOneUse() && SVT.isVector() &&
	SVT.getVectorNumElements() * 2 == NumElts &&
	TLI.isOperationLegal(Opcode, VT)) {
	bool CanFold = false;
	switch (Opcode) {
	default : break;
	case ISD::ADD:
	case ISD::SUB:
	case ISD::MUL:
	// isOperationLegal lies for integer ops on floating point types.
	CanFold = VT.isInteger();
	break;
	case ISD::FADD:
	case ISD::FSUB:
	case ISD::FMUL:
	// isOperationLegal lies for floating point ops on integer types.
	CanFold = VT.isFloatingPoint();
	break;
	}

	// The fold is only performed when the low half of the mask is
	// <0, 2, 4, ...> and every element of the high half is undef.
	unsigned SVTNumElts = SVT.getVectorNumElements();
	ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
	for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
	CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
	for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
	CanFold = SVOp->getMaskElt(i) < 0;

	if (CanFold) {
	SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
	SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
	SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
	return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
	}
	}
	}

	// Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
	// load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
	// consecutive, non-overlapping, and in the right order.
	// Collect the scalar feeding every result element; give up (empty list) as
	// soon as one element cannot be resolved.
	SmallVector<SDValue, 16> Elts;
	for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
	if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
	Elts.push_back(Elt);
	continue;
	}
	Elts.clear();
	break;
	}

	if (Elts.size() == VT.getVectorNumElements())
	if (SDValue LD =
	EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
	return LD;

	// For AVX2, we sometimes want to combine
	// (vector_shuffle <mask> (concat_vectors t1, undef)
	// (concat_vectors t2, undef))
	// Into:
	// (vector_shuffle <mask> (concat_vectors t1, t2), undef)
	// Since the latter can be efficiently lowered with VPERMD/VPERMQ
	if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
	return ShufConcat;

	if (isTargetShuffle(N->getOpcode())) {
	SDValue Op(N, 0);
	if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
	return Shuffle;

	// Try recursively combining arbitrary sequences of x86 shuffle
	// instructions into higher-order shuffles. We do this after combining
	// specific PSHUF instruction sequences into their minimal form so that we
	// can evaluate how many specialized shuffle instructions are involved in
	// a particular chain.
	if (SDValue Res = combineX86ShufflesRecursively(
	{Op}, 0, Op, {0}, {}, /Depth/ 1,
	/HasVarMask/ false, DAG, DCI, Subtarget)) {
	// The node is replaced in place here, so report "no new value" to the
	// caller.
	DCI.CombineTo(N, Res);
	return SDValue();
	}
	}

	return SDValue();
	}

	/// Check if a vector extract from a target-specific shuffle of a load can be
	/// folded into a single element load.
	/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
	/// shuffles have been custom lowered so we need to handle those here.
	///
	/// \param N    the extract node; operand 0 is the input vector and operand 1
	///             the element index (only constant indices are handled).
	/// \param DAG  the DAG in which replacement nodes are created.
	/// \param DCI  combiner state; nothing is done before operation legalization.
	/// \returns a zero or undef constant when the selected shuffle lane is known
	///          to be zero/undef (an out-of-range index is treated as undef); an
	///          EXTRACT_VECTOR_ELT of an equivalent generic vector shuffle when
	///          all load checks pass; otherwise an empty SDValue.
	static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
	                                                TargetLowering::DAGCombinerInfo &DCI) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue InVec = N->getOperand(0);
	  SDValue EltNo = N->getOperand(1);
	  EVT EltVT = N->getValueType(0);

	  if (!isa<ConstantSDNode>(EltNo))
	    return SDValue();

	  EVT OriginalVT = InVec.getValueType();

	  // Peek through bitcasts, don't duplicate a load with other uses.
	  InVec = peekThroughOneUseBitcasts(InVec);

	  // The element index is only meaningful for the shuffle if the bitcasts did
	  // not change the number of elements.
	  EVT CurrentVT = InVec.getValueType();
	  if (!CurrentVT.isVector() ||
	      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
	    return SDValue();

	  if (!isTargetShuffle(InVec.getOpcode()))
	    return SDValue();

	  // Don't duplicate a load with other uses.
	  if (!InVec.hasOneUse())
	    return SDValue();

	  SmallVector<int, 16> ShuffleMask;
	  SmallVector<SDValue, 2> ShuffleOps;
	  bool UnaryShuffle;
	  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
	                            ShuffleOps, ShuffleMask, UnaryShuffle))
	    return SDValue();

	  // Select the input vector, guarding against out of range extract vector.
	  // Valid indices are [0, NumElems); the comparison is done on the unsigned
	  // 64-bit value so that huge indices cannot wrap around to a negative int.
	  unsigned NumElems = CurrentVT.getVectorNumElements();
	  uint64_t Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
	  int Idx = (Elt >= NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];

	  if (Idx == SM_SentinelZero)
	    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
	                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
	  if (Idx == SM_SentinelUndef)
	    return DAG.getUNDEF(EltVT);

	  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
	  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
	                                         : ShuffleOps[1];

	  // If inputs to shuffle are the same for both ops, then allow 2 uses
	  unsigned AllowedUses =
	      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

	  if (LdNode.getOpcode() == ISD::BITCAST) {
	    // Don't duplicate a load with other uses.
	    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
	      return SDValue();

	    AllowedUses = 1; // only allow 1 load use if we have a bitcast
	    LdNode = LdNode.getOperand(0);
	  }

	  if (!ISD::isNormalLoad(LdNode.getNode()))
	    return SDValue();

	  // isNormalLoad succeeded above, so this cast cannot fail.
	  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

	  if (!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
	    return SDValue();

	  // If there's a bitcast before the shuffle, check if the load type and
	  // alignment is valid.
	  unsigned Align = LN0->getAlignment();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
	      EltVT.getTypeForEVT(*DAG.getContext()));

	  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
	    return SDValue();

	  // All checks match so transform back to vector_shuffle so that DAG combiner
	  // can finish the job
	  SDLoc dl(N);

	  // Create shuffle node taking into account the case that its a unary shuffle
	  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
	  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
	                                 ShuffleMask);
	  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
	                     EltNo);
	}

	// Try to match patterns such as
	// (i16 bitcast (v16i1 x))
	// ->
	// (i16 movmsk (v16i8 sext (v16i1 x)))
	// before the illegal vector is scalarized on subtargets that don't have legal
	// vxi1 types.
	//
	// BitCast is the bitcast node to rewrite. Returns the replacement value, or an
	// empty SDValue when the result is not a scalar integer, the source type is
	// not one of v2i1/v4i1/v8i1/v16i1/v32i1, or the subtarget has AVX512 or lacks
	// SSE2.
	//
	// NOTE(review): the lines prefixed with '-' below look like removal lines of
	// the enclosing diff (a vector-of-i1 <- scalar OR pattern) rather than live
	// code - confirm against the committed revision.
	static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
	const X86Subtarget &Subtarget) {
	EVT VT = BitCast.getValueType();
	SDValue N0 = BitCast.getOperand(0);
	EVT VecVT = N0->getValueType(0);
	-
	- if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
	- N0->getOpcode() == ISD::OR) {
	- SDValue Op0 = N0->getOperand(0);
	- SDValue Op1 = N0->getOperand(1);
	- MVT TrunckVT;
	- MVT BitcastVT;
	- switch (VT.getSimpleVT().SimpleTy) {
	- default:
	- return SDValue();
	- case MVT::v16i1:
	- TrunckVT = MVT::i8;
	- BitcastVT = MVT::v8i1;
	- break;
	- case MVT::v32i1:
	- TrunckVT = MVT::i16;
	- BitcastVT = MVT::v16i1;
	- break;
	- case MVT::v64i1:
	- TrunckVT = MVT::i32;
	- BitcastVT = MVT::v32i1;
	- break;
	- }
	- bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
	- bool isArg0UndefLeft =
	- Op0->getOpcode() == ISD::ZERO_EXTEND \|\| Op0->getOpcode() == ISD::AND;
	- bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
	- bool isArg1UndefLeft =
	- Op1->getOpcode() == ISD::ZERO_EXTEND \|\| Op1->getOpcode() == ISD::AND;
	- SDValue OpLeft;
	- SDValue OpRight;
	- if (isArg0UndefRight && isArg1UndefLeft) {
	- OpLeft = Op0;
	- OpRight = Op1;
	- } else if (isArg1UndefRight && isArg0UndefLeft) {
	- OpLeft = Op1;
	- OpRight = Op0;
	- } else
	- return SDValue();
	- SDLoc DL(BitCast);
	- SDValue Shr = OpLeft->getOperand(0);
	- SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
	- SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
	- SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
	- SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
	- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
	- }

	if (!VT.isScalarInteger() \|\| !VecVT.isSimple())
	return SDValue();

	// With AVX512 vxi1 types are legal and we prefer using k-regs.
	// MOVMSK is supported in SSE2 or later.
	if (Subtarget.hasAVX512() \|\| !Subtarget.hasSSE2())
	return SDValue();

	// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
	// v4f64. So all legal 128-bit and 256-bit vectors are covered except for
	// v8i16 and v16i16.
	// For these two cases, we can shuffle the upper element bytes to a
	// consecutive sequence at the start of the vector and treat the results as
	// v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
	// for v16i16 this is not the case, because the shuffle is expensive, so we
	// avoid sign-extending to this type entirely.
	// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
	// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
	//
	// SExtVT is the integer vector type the i1 vector is sign-extended to;
	// FPCastVT, when set, is the floating-point type of the same width that the
	// extended vector is bitcast to before MOVMSK.
	MVT SExtVT;
	MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	switch (VecVT.getSimpleVT().SimpleTy) {
	default:
	return SDValue();
	case MVT::v2i1:
	SExtVT = MVT::v2i64;
	FPCastVT = MVT::v2f64;
	break;
	case MVT::v4i1:
	SExtVT = MVT::v4i32;
	FPCastVT = MVT::v4f32;
	// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
	// sign-extend to a 256-bit operation to avoid truncation.
	if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
	N0->getOperand(0).getValueType().is256BitVector()) {
	SExtVT = MVT::v4i64;
	FPCastVT = MVT::v4f64;
	}
	break;
	case MVT::v8i1:
	SExtVT = MVT::v8i16;
	// For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
	// sign-extend to a 256-bit operation to match the compare.
	// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
	// 256-bit because the shuffle is cheaper than sign extending the result of
	// the compare.
	if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
	(N0->getOperand(0).getValueType().is256BitVector() \|\|
	N0->getOperand(0).getValueType().is512BitVector())) {
	SExtVT = MVT::v8i32;
	FPCastVT = MVT::v8f32;
	}
	break;
	case MVT::v16i1:
	SExtVT = MVT::v16i8;
	// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
	// it is not profitable to sign-extend to 256-bit because this will
	// require an extra cross-lane shuffle which is more expensive than
	// truncating the result of the compare to 128-bits.
	break;
	case MVT::v32i1:
	SExtVT = MVT::v32i8;
	break;
	};

	SDLoc DL(BitCast);
	SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);

	if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
	// Handle pre-AVX2 cases by splitting to two v16i1's.
	// Each 128-bit half yields a 16-bit mask; the high half's mask is shifted
	// left by 16 and OR'ed with the low half's mask.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
	SDValue Lo = extract128BitVector(V, 0, DAG, DL);
	SDValue Hi = extract128BitVector(V, 16, DAG, DL);
	Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
	Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
	Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
	DAG.getConstant(16, DL, ShiftTy));
	V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
	return DAG.getZExtOrTrunc(V, DL, VT);
	}

	if (SExtVT == MVT::v8i16) {
	// Pack the eight i16 lanes down to bytes so that MOVMSK can be applied to
	// a v16i8 value.
	assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
	V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
	DAG.getUNDEF(MVT::v8i16));
	} else
	assert(SExtVT.getScalarType() != MVT::i16 &&
	"Vectors of i16 must be packed");
	if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
	V = DAG.getBitcast(FPCastVT, V);
	V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
	return DAG.getZExtOrTrunc(V, DL, VT);
	}

	/// Try the X86-specific combines for the BITCAST node \p N.
	///
	/// Before type legalization this tries the vXi1 -> MOVMSK rewrite and, with
	/// VLX, widens bitcasts between v2i1/v4i1 and scalar integers through v8i1/i8.
	/// Afterwards it handles bitcasts to x86mmx and turns a bitcasted integer
	/// AND/OR/XOR with one bitcasted f32/f64 operand into the matching
	/// X86ISD::FAND/FOR/FXOR node.
	///
	/// \returns the replacement value, or an empty SDValue if nothing matched.
	static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT SrcVT = N0.getValueType();

	// Try to match patterns such as
	// (i16 bitcast (v16i1 x))
	// ->
	// (i16 movmsk (v16i8 sext (v16i1 x)))
	// before the setcc result is scalarized on subtargets that don't have legal
	// vxi1 types.
	if (DCI.isBeforeLegalize()) {
	if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
	return V;

	// If this is a bitcast from an illegal integer type to MVT::v4i1/v2i1,
	// widen both sides to avoid a trip through memory: any-extend the scalar
	// to i8, bitcast to v8i1 and extract the low subvector.
	if ((VT == MVT::v4i1 \|\| VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
	Subtarget.hasVLX()) {
	SDLoc dl(N);
	N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
	N0 = DAG.getBitcast(MVT::v8i1, N0);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
	DAG.getIntPtrConstant(0, dl));
	}

	// If this is a bitcast from MVT::v4i1/v2i1 to an illegal integer type,
	// widen both sides to avoid a trip through memory: pad the vector with
	// undef up to v8i1, bitcast to i8 and truncate.
	if ((SrcVT == MVT::v4i1 \|\| SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
	Subtarget.hasVLX()) {
	SDLoc dl(N);
	unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
	SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
	Ops[0] = N0;
	N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
	N0 = DAG.getBitcast(MVT::i8, N0);
	return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
	}
	}

	// Since MMX types are special and don't usually play with other vector types,
	// it's better to handle them early to be sure we emit efficient code by
	// avoiding store-load conversions.

	// Detect a bitcast to x86mmx of a v2i32 build_vector whose second element is
	// zero and whose first element is an i32.
	if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
	SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
	SDValue N00 = N0->getOperand(0);
	if (N00.getValueType() == MVT::i32)
	return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
	}

	// Detect bitcasts between element or subvector extraction to x86mmx.
	// Only extraction at index 0 from a 128-bit vector is handled.
	if (VT == MVT::x86mmx &&
	(N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT \|\|
	N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
	isNullConstant(N0.getOperand(1))) {
	SDValue N00 = N0->getOperand(0);
	if (N00.getValueType().is128BitVector())
	return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
	DAG.getBitcast(MVT::v2i64, N00));
	}

	// Detect bitcasts from FP_TO_SINT to x86mmx.
	// The v2i32 result is widened to v4i32 with an undef upper half first.
	if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
	N0.getOpcode() == ISD::FP_TO_SINT) {
	SDLoc DL(N0);
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
	DAG.getUNDEF(MVT::v2i32));
	return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
	DAG.getBitcast(MVT::v2i64, Res));
	}

	// Convert a bitcasted integer logic operation that has one bitcasted
	// floating-point operand into a floating-point logic operation. This may
	// create a load of a constant, but that is cheaper than materializing the
	// constant in an integer register and transferring it to an SSE register or
	// transferring the SSE operand to integer register and back.
	unsigned FPOpcode;
	switch (N0.getOpcode()) {
	case ISD::AND: FPOpcode = X86ISD::FAND; break;
	case ISD::OR: FPOpcode = X86ISD::FOR; break;
	case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
	default: return SDValue();
	}

	// Only scalar f32 (with SSE1) and f64 (with SSE2) results are handled.
	if (!((Subtarget.hasSSE1() && VT == MVT::f32) \|\|
	(Subtarget.hasSSE2() && VT == MVT::f64)))
	return SDValue();

	SDValue LogicOp0 = N0.getOperand(0);
	SDValue LogicOp1 = N0.getOperand(1);
	SDLoc DL0(N0);

	// bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
	if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
	LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
	!isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
	SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
	return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
	}
	// bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
	if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
	LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
	!isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
	SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
	return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
	}

	return SDValue();
	}

	// Match a binop + shuffle pyramid that represents a horizontal reduction over
	// the elements of a vector.
	//
	// Extract is the candidate root and must be an EXTRACT_VECTOR_ELT of index 0.
	// CandidateBinOps lists the opcodes the reduction is allowed to use; every
	// stage must use the same one, and on success it is written to BinOp.
	//
	// Returns the vector that is being reduced on, or SDValue() if a reduction
	// was not matched (BinOp is left untouched in that case).
	static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
	ArrayRef<ISD::NodeType> CandidateBinOps) {
	// The pattern must end in an extract from index 0.
	if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) \|\|
	!isNullConstant(Extract->getOperand(1)))
	return SDValue();

	SDValue Op = Extract->getOperand(0);
	// One pyramid stage per halving of the vector.
	unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());

	// Match against one of the candidate binary ops.
	if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
	return Op.getOpcode() == unsigned(BinOp);
	}))
	return SDValue();

	// At each stage, we're looking for something that looks like:
	// %s = shufflevector <8 x i32> %op, <8 x i32> undef,
	// <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
	// i32 undef, i32 undef, i32 undef, i32 undef>
	// %a = binop <8 x i32> %op, %s
	// Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
	// we expect something like:
	// <4,5,6,7,u,u,u,u>
	// <2,3,u,u,u,u,u,u>
	// <1,u,u,u,u,u,u,u>
	// The walk starts at the extract and moves towards the reduced vector, so
	// iteration i == 0 checks the last mask listed above.
	unsigned CandidateBinOp = Op.getOpcode();
	for (unsigned i = 0; i < Stages; ++i) {
	if (Op.getOpcode() != CandidateBinOp)
	return SDValue();

	// The shuffle may be either operand of the binop; Op becomes the other
	// operand.
	ShuffleVectorSDNode *Shuffle =
	dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
	if (Shuffle) {
	Op = Op.getOperand(1);
	} else {
	Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
	Op = Op.getOperand(0);
	}

	// The first operand of the shuffle should be the same as the other operand
	// of the binop.
	if (!Shuffle \|\| Shuffle->getOperand(0) != Op)
	return SDValue();

	// Verify the shuffle has the expected (at this stage of the pyramid) mask.
	// Only the first 2^i mask elements are checked; each must select element
	// 2^i + Index.
	for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
	if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
	return SDValue();
	}

	BinOp = CandidateBinOp;
	return Op;
	}

	// Given a select, detect the following pattern:
	// 1: %2 = zext <N x i8> %0 to <N x i32>
	// 2: %3 = zext <N x i8> %1 to <N x i32>
	// 3: %4 = sub nsw <N x i32> %2, %3
	// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
	// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
	// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
	// The mirrored form, where the compare is "slt %4, [0 x N] or [1 x N]" and
	// the two select arms are swapped, is accepted as well.
	// This is useful as it is the input into a SAD pattern.
	//
	// Returns true on a match. Op0 and Op1 then hold the two zext nodes (lines 1
	// and 2 above). Note that Op0 and Op1 may already have been overwritten with
	// the sub operands when false is returned from the final zext check.
	static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
	SDValue &Op1) {
	// Check the condition of the select instruction is greater-than or
	// less-than.
	SDValue SetCC = Select->getOperand(0);
	if (SetCC.getOpcode() != ISD::SETCC)
	return false;
	ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
	if (CC != ISD::SETGT && CC != ISD::SETLT)
	return false;

	SDValue SelectOp1 = Select->getOperand(1);
	SDValue SelectOp2 = Select->getOperand(2);

	// The following instructions assume SelectOp1 is the subtraction operand
	// and SelectOp2 is the negation operand.
	// In the case of SETLT this is the other way around.
	if (CC == ISD::SETLT)
	std::swap(SelectOp1, SelectOp2);

	// The second operand of the select should be the negation of the first
	// operand, which is implemented as 0 - SelectOp1.
	if (!(SelectOp2.getOpcode() == ISD::SUB &&
	ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
	SelectOp2.getOperand(1) == SelectOp1))
	return false;

	// The first operand of SetCC is the first operand of the select, which is the
	// difference between the two input vectors.
	if (SetCC.getOperand(0) != SelectOp1)
	return false;

	// In the SETLT case, the second operand of the comparison can be either 1
	// or 0.
	APInt SplatVal;
	if ((CC == ISD::SETLT) &&
	!((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
	SplatVal.isOneValue()) \|\|
	(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
	return false;

	// In the SETGT case, the second operand of the comparison can be either -1
	// or 0.
	if ((CC == ISD::SETGT) &&
	!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) \|\|
	ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
	return false;

	// The first operand of the select is the difference between the two input
	// vectors.
	if (SelectOp1.getOpcode() != ISD::SUB)
	return false;

	Op0 = SelectOp1.getOperand(0);
	Op1 = SelectOp1.getOperand(1);

	// Check if the operands of the sub are zero-extended from vectors of i8.
	if (Op0.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 \|\|
	Op1.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
	return false;

	return true;
	}

	// Build an X86ISD::PSADBW node from the i8-vector sources of two zero-extends.
	// Zext0/Zext1 are the zext nodes themselves; only their operand 0 is used.
	static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
	                            const SDValue &Zext1, const SDLoc &DL) {
	  // The PSADBW operates on at least 128 bits, or on the input width if that
	  // is larger.
	  const EVT NarrowVT = Zext0.getOperand(0).getValueType();
	  const unsigned NarrowBits = NarrowVT.getSizeInBits();
	  const unsigned RegSize = std::max(128u, NarrowBits);

	  // Pad each input out to RegSize by concatenating zero vectors behind it.
	  // This widens the vector (adds zero elements); it is not a per-element zext.
	  const unsigned NumPieces = RegSize / NarrowBits;
	  SmallVector<SDValue, 16> Pieces(NumPieces, DAG.getConstant(0, DL, NarrowVT));
	  const MVT WideVT = MVT::getVectorVT(MVT::i8, RegSize / 8);

	  Pieces[0] = Zext0.getOperand(0);
	  SDValue WideLHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Pieces);
	  Pieces[0] = Zext1.getOperand(0);
	  SDValue WideRHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Pieces);

	  // The SAD result has one i64 lane per 64 bits of input.
	  const MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
	  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, WideLHS, WideRHS);
	}

	// Try to turn a horizontal SMAX/SMIN/UMAX/UMIN reduction over i16 or i8
	// elements, whose result is read by Extract, into a PHMINPOS-based sequence.
	// Returns an empty SDValue when the pattern does not apply.
	static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
	                                             const X86Subtarget &Subtarget) {
	  // Nothing to do without SSE41.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  EVT ExtractVT = Extract->getValueType(0);
	  const bool IsByteElt = ExtractVT == MVT::i8;
	  if (!IsByteElt && ExtractVT != MVT::i16)
	    return SDValue();

	  // Look for one of the four min/max horizontal reduction patterns.
	  unsigned BinOp;
	  SDValue Src = matchBinOpReduction(
	      Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
	  if (!Src)
	    return SDValue();

	  // The reduced vector must have the extracted element type and be a whole
	  // number of 128-bit chunks wide.
	  EVT SrcVT = Src.getValueType();
	  EVT SrcSVT = SrcVT.getScalarType();
	  if (SrcSVT != ExtractVT)
	    return SDValue();
	  if ((SrcVT.getSizeInBits() % 128) != 0)
	    return SDValue();

	  SDLoc DL(Extract);
	  SDValue MinPos = Src;

	  // Halve the vector repeatedly, combining the low and high halves with
	  // BinOp, until a single 128-bit vector remains.
	  while (SrcVT.getSizeInBits() > 128) {
	    unsigned HalfElts = SrcVT.getVectorNumElements() / 2;
	    SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, HalfElts);
	    unsigned HalfBits = SrcVT.getSizeInBits();
	    SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, HalfBits);
	    SDValue Hi = extractSubVector(MinPos, HalfElts, DAG, DL, HalfBits);
	    MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
	  }
	  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
	          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
	         "Unexpected value type");

	  // PHMINPOSUW computes an unsigned minimum. The other three reductions are
	  // mapped onto it by XOR-ing with a per-element mask before and after;
	  // UMIN needs no mask.
	  SDValue Mask;
	  unsigned MaskEltsBits = ExtractVT.getSizeInBits();
	  switch (BinOp) {
	  case ISD::SMAX:
	    Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
	    break;
	  case ISD::SMIN:
	    Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
	    break;
	  case ISD::UMAX:
	    Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
	    break;
	  default:
	    break;
	  }

	  // Applies the mask when there is one; otherwise passes the value through.
	  auto FlipWithMask = [&](SDValue V) {
	    return Mask ? DAG.getNode(ISD::XOR, DL, SrcVT, Mask, V) : V;
	  };

	  MinPos = FlipWithMask(MinPos);

	  // For v16i8, first take the UMIN of each byte pair: shuffle every odd byte
	  // down next to a zero byte and UMIN with the original. The upper byte of
	  // each pair then becomes zero, which is the zero-extension the PHMINPOS
	  // below needs.
	  if (IsByteElt) {
	    SDValue Upper = DAG.getVectorShuffle(
	        SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
	        {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
	    MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
	  }

	  // Run the PHMINPOS on a v8i16 view and cast back to the source type.
	  MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
	  MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
	  MinPos = DAG.getBitcast(SrcVT, MinPos);

	  MinPos = FlipWithMask(MinPos);

	  // The reduced value lives in element 0.
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
	                     DAG.getIntPtrConstant(0, DL));
	}

	// Try to turn an OR (any_of) or AND (all_of) horizontal reduction of a
	// sign-splat vector, read by Extract, into a MOVMSK plus a scalar compare.
	// Returns an empty SDValue when the pattern does not apply.
	static SDValue combineHorizontalPredicateResult(SDNode *Extract,
	                                                SelectionDAG &DAG,
	                                                const X86Subtarget &Subtarget) {
	  // Needs SSE2; skipped with AVX512VL, which uses predicate registers.
	  if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
	    return SDValue();

	  EVT ExtractVT = Extract->getValueType(0);
	  unsigned BitWidth = ExtractVT.getSizeInBits();
	  if (!(ExtractVT == MVT::i64 || ExtractVT == MVT::i32 ||
	        ExtractVT == MVT::i16 || ExtractVT == MVT::i8))
	    return SDValue();

	  // Look for an OR (any_of) or AND (all_of) horizontal reduction.
	  unsigned BinOp = 0;
	  SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
	  if (!Match)
	    return SDValue();

	  // EXTRACT_VECTOR_ELT may implicitly extend the element; that case is not
	  // supported here for now.
	  if (Match.getScalarValueSizeInBits() != BitWidth)
	    return SDValue();

	  // 128-bit vectors are always fine. 256-bit vectors need AVX for 32/64-bit
	  // elements, and AVX2 (PMOVMSKB on v16i16/v32i8) for narrower ones.
	  unsigned MatchSizeInBits = Match.getValueSizeInBits();
	  const bool Is128 = MatchSizeInBits == 128;
	  const bool Is256Supported =
	      MatchSizeInBits == 256 &&
	      ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2());
	  if (!Is128 && !Is256Supported)
	    return SDValue();

	  // Not worth doing for vectors of two or fewer elements.
	  if (Match.getValueType().getVectorNumElements() <= 2)
	    return SDValue();

	  // Every bit of each element must be a copy of its sign bit.
	  if (DAG.ComputeNumSignBits(Match) != BitWidth)
	    return SDValue();

	  // 32/64-bit elements go through MOVMSKPS/MOVMSKPD (a floating-point mask
	  // type); anything narrower goes through PMOVMSKB (a byte mask type).
	  const bool UseFPMask = BitWidth == 64 || BitWidth == 32;
	  MVT MaskVT;
	  if (UseFPMask)
	    MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
	                              MatchSizeInBits / BitWidth);
	  else
	    MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);

	  // any_of -> MOVMSK != 0
	  // all_of -> MOVMSK == ((1 << NumElts) - 1)
	  const bool IsAnyOf = BinOp == ISD::OR;
	  APInt CompareBits =
	      IsAnyOf ? APInt::getNullValue(32)
	              : APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
	  ISD::CondCode CondCode =
	      IsAnyOf ? ISD::CondCode::SETNE : ISD::CondCode::SETEQ;

	  // Do the select at i32/i64 and truncate afterwards to avoid partial
	  // register stalls.
	  unsigned ResWidth = std::max(BitWidth, 32u);
	  EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
	  SDLoc DL(Extract);
	  SDValue Zero = DAG.getConstant(0, DL, ResVT);
	  SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
	  SDValue MaskBits = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
	                                 DAG.getBitcast(MaskVT, Match));
	  SDValue Selected =
	      DAG.getSelectCC(DL, MaskBits, DAG.getConstant(CompareBits, DL, MVT::i32),
	                      Ones, Zero, CondCode);
	  return DAG.getSExtOrTrunc(Selected, DL, ExtractVT);
	}

	// Try to replace an ADD reduction over a zero-extended-i8 absolute-difference
	// pattern, read by Extract, with a PSADBW followed by a short add pyramid.
	// Returns an empty SDValue when the pattern does not apply.
	static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  // PSADBW requires SSE2 or later.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // The vector being extracted from must be a simple type whose elements are
	  // wider than i16.
	  EVT VT = Extract->getOperand(0).getValueType();
	  if (!VT.isSimple())
	    return SDValue();
	  if (VT.getVectorElementType().getSizeInBits() <= 16)
	    return SDValue();

	  const unsigned RegSize =
	      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);

	  // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
	  // TODO: We should be able to handle larger vectors by splitting them before
	  // feeding them into several SADs, and then reducing over those.
	  if (RegSize / VT.getVectorNumElements() < 8)
	    return SDValue();

	  // Find the shuffle + add pyramid.
	  unsigned BinOp = 0;
	  SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});

	  // The reduced value is expected to originate from a zext of i8 (checked in
	  // detectZextAbsDiff). Reaching i64 and wider element types takes one more
	  // any/zero/sign extend on top. A zero extend does not change the value, and
	  // a sign extend behaves the same here because the sign bit is zero, so the
	  // extra extend can simply be looked through.
	  if (Root) {
	    switch (Root.getOpcode()) {
	    case ISD::SIGN_EXTEND:
	    case ISD::ZERO_EXTEND:
	    case ISD::ANY_EXTEND:
	      Root = Root.getOperand(0);
	      break;
	    default:
	      break;
	    }
	  }

	  // Root must now be the vector select at the top of an abs-diff pattern.
	  if (!Root || Root.getOpcode() != ISD::VSELECT)
	    return SDValue();

	  // Verify the abs-diff pattern and fetch its two zero-extend inputs.
	  SDValue Zext0, Zext1;
	  if (!detectZextAbsDiff(Root, Zext0, Zext1))
	    return SDValue();

	  // Emit the SAD instruction.
	  SDLoc DL(Extract);
	  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);

	  // When the original vector had more than 8 elements the SAD result has
	  // several lanes; fold the upper half onto the lower half repeatedly until
	  // the total sits in lane 0.
	  unsigned Stages = Log2_32(VT.getVectorNumElements());
	  MVT SadVT = SAD.getSimpleValueType();
	  if (Stages > 3) {
	    unsigned SadElems = SadVT.getVectorNumElements();

	    for (unsigned Half = 1u << (Stages - 4); Half != 0; Half >>= 1) {
	      // Move lanes [Half, 2*Half) down to [0, Half); the rest is undef.
	      SmallVector<int, 16> Mask(SadElems, -1);
	      for (unsigned Lane = 0; Lane != Half; ++Lane)
	        Mask[Lane] = Half + Lane;

	      SDValue Shuffle =
	          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
	      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
	    }
	  }

	  // Reinterpret the SAD vector with the extract's element type and read the
	  // requested element, i.e. the lowest TypeSizeInBits bits for index 0.
	  MVT Type = Extract->getSimpleValueType(0);
	  unsigned TypeSizeInBits = Type.getSizeInBits();
	  MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
	  SAD = DAG.getBitcast(ResVT, SAD);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
	                     Extract->getOperand(1));
	}

	// Try to look through a target shuffle feeding an element extract and pull
	// the scalar straight out of the shuffle's source operand instead.
	// Returns an empty SDValue when nothing could be simplified.
	static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
	                                         TargetLowering::DAGCombinerInfo &DCI,
	                                         const X86Subtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue Vec = N->getOperand(0);
	  SDValue Index = N->getOperand(1);

	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Vec.getValueType();
	  EVT SrcSVT = SrcVT.getVectorElementType();
	  unsigned NumSrcElts = SrcVT.getVectorNumElements();

	  // Boolean mask vectors are left alone, and the index must be a constant.
	  if (SrcSVT == MVT::i1)
	    return SDValue();
	  if (!isa<ConstantSDNode>(Index))
	    return SDValue();

	  // extract(broadcast(scalar)) is the scalar itself, whatever the index.
	  if (Vec.getOpcode() == X86ISD::VBROADCAST) {
	    SDValue Scalar = Vec.getOperand(0);
	    if (Scalar.getValueType() == VT)
	      return Scalar;
	  }

	  // Work out the shuffle's inputs and mask.
	  SmallVector<int, 16> Mask;
	  SmallVector<SDValue, 2> Inputs;
	  if (!resolveTargetShuffleInputs(peekThroughBitcasts(Vec), Inputs, Mask, DAG))
	    return SDValue();

	  // Bring the mask to one entry per source element: scale it up when it is
	  // too coarse, or widen it step by step when it is too fine.
	  if (Mask.size() != NumSrcElts) {
	    if ((NumSrcElts % Mask.size()) == 0) {
	      SmallVector<int, 16> Scaled;
	      int Scale = NumSrcElts / Mask.size();
	      scaleShuffleMask<int>(Scale, Mask, Scaled);
	      Mask = std::move(Scaled);
	    } else if ((Mask.size() % NumSrcElts) == 0) {
	      SmallVector<int, 16> Widened;
	      while (Mask.size() > NumSrcElts &&
	             canWidenShuffleElements(Mask, Widened))
	        Mask = std::move(Widened);
	      // TODO - investigate support for wider shuffle masks with known upper
	      // undef/zero elements for implicit zero-extension.
	    }
	  }

	  // Give up if the mask still does not line up with the source elements.
	  if (Mask.size() != NumSrcElts)
	    return SDValue();

	  int SrcIdx = Mask[N->getConstantOperandVal(1)];
	  SDLoc dl(N);

	  // An undef or zero mask entry yields the matching constant directly.
	  if (SrcIdx == SM_SentinelUndef)
	    return DAG.getUNDEF(VT);

	  if (SrcIdx == SM_SentinelZero) {
	    if (VT.isFloatingPoint())
	      return DAG.getConstantFP(0.0, dl, VT);
	    return DAG.getConstant(0, dl, VT);
	  }

	  // Split the flat mask index into (which input, which element of it).
	  SDValue SrcOp = DAG.getBitcast(SrcVT, Inputs[SrcIdx / Mask.size()]);
	  SrcIdx = SrcIdx % Mask.size();

	  // We can only extract other elements from 128-bit vectors and in certain
	  // circumstances, depending on SSE-level.
	  // TODO: Investigate using extract_subvector for larger vectors.
	  // TODO: Investigate float/double extraction if it will be just stored.
	  const bool IsV4i32OrV2i64 = SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64;
	  if (IsV4i32OrV2i64 &&
	      (Subtarget.hasSSE41() || (SrcIdx == 0 && Subtarget.hasSSE2()))) {
	    assert(SrcSVT == VT && "Unexpected extraction type");
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
	                       DAG.getIntPtrConstant(SrcIdx, dl));
	  }

	  // Word and byte elements go through PEXTRW / PEXTRB, which produce an i32
	  // that is then zero-extended or truncated to the requested type.
	  const bool IsV8i16 = SrcVT == MVT::v8i16;
	  if ((IsV8i16 && Subtarget.hasSSE2()) ||
	      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
	    assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
	           "Unexpected extraction type");
	    unsigned OpCode = IsV8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB;
	    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
	                                DAG.getIntPtrConstant(SrcIdx, dl));
	    return DAG.getZExtOrTrunc(ExtOp, dl, VT);
	  }

	  return SDValue();
	}

	/// Detect vector gather/scatter index generation and convert it from being a
	/// bunch of shuffles and extracts into a somewhat faster sequence.
	/// For i686, the best sequence is apparently storing the value and loading
	/// scalars back, while for x64 we should use 64-bit extracts and shifts.
	///
	/// Before that final transform, several other extract combines are tried in
	/// order and the first one that produces a value wins: peeking through a
	/// target shuffle, folding a shuffle into a load, two MMX bitcast special
	/// cases, constant folding of an i1 extract from a bitcast constant, the SAD
	/// pattern, the all_of/any_of MOVMSK pattern and the PHMINPOSUW min/max
	/// pattern. Nodes whose opcode is not ISD::EXTRACT_VECTOR_ELT only get the
	/// shuffle peek-through.
	///
	/// Returns the replacement value, SDValue(N, 0) when the v4i32 extracts were
	/// replaced in place, or an empty SDValue when nothing was changed.
	static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const X86Subtarget &Subtarget) {
	  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
	    return NewOp;

	  // TODO - Remove this once we can handle the implicit zero-extension of
	  // X86ISD::PEXTRW/X86ISD::PEXTRB in:
	  // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
	  // combineBasicSADPattern.
	  if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();

	  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
	    return NewOp;

	  SDValue InputVector = N->getOperand(0);
	  SDValue EltIdx = N->getOperand(1);

	  EVT SrcVT = InputVector.getValueType();
	  EVT VT = N->getValueType(0);
	  SDLoc dl(InputVector);

	  // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
	  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
	    SDValue MMXSrc = InputVector.getOperand(0);

	    // The bitcast source is a direct mmx result.
	    if (MMXSrc.getValueType() == MVT::x86mmx)
	      return DAG.getBitcast(VT, InputVector);
	  }

	  // Detect mmx to i32 conversion through a v2i32 elt extract.
	  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
	    SDValue MMXSrc = InputVector.getOperand(0);

	    // The bitcast source is a direct mmx result.
	    if (MMXSrc.getValueType() == MVT::x86mmx)
	      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
	  }

	  // Extracting an i1 at a constant index from a bitcast of a constant is
	  // just a single bit of that constant.
	  if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
	      isa<ConstantSDNode>(EltIdx) &&
	      isa<ConstantSDNode>(InputVector.getOperand(0))) {
	    uint64_t ExtractedElt = N->getConstantOperandVal(1);
	    uint64_t InputValue = InputVector.getConstantOperandVal(0);
	    uint64_t Res = (InputValue >> ExtractedElt) & 1;
	    return DAG.getConstant(Res, dl, MVT::i1);
	  }

	  // Check whether this extract is the root of a sum of absolute differences
	  // pattern. This has to be done here because we really want it to happen
	  // pre-legalization,
	  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
	    return SAD;

	  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
	  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
	    return Cmp;

	  // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
	  if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
	    return MinMax;

	  // Only operate on vectors of 4 elements, where the alternative shuffling
	  // gets to be more expensive.
	  if (SrcVT != MVT::v4i32)
	    return SDValue();

	  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
	  // single use which is a sign-extend or zero-extend, and all elements are
	  // used.
	  SmallVector<SDNode *, 4> Uses;
	  unsigned ExtractedElements = 0;
	  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
	                            UE = InputVector.getNode()->use_end();
	       UI != UE; ++UI) {
	    // The use must be of the same result number as InputVector.
	    if (UI.getUse().getResNo() != InputVector.getResNo())
	      return SDValue();

	    // Dereference the use iterator to get the using node. Every access
	    // below goes through '->', so this has to be a pointer.
	    SDNode *Extract = *UI;
	    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    if (Extract->getValueType(0) != MVT::i32)
	      return SDValue();
	    if (!Extract->hasOneUse())
	      return SDValue();
	    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
	        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
	      return SDValue();
	    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
	      return SDValue();

	    // Record which element was extracted.
	    ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
	    Uses.push_back(Extract);
	  }

	  // If not all the elements were used, this may not be worthwhile.
	  // 15 == 0b1111: one bit for each of the four elements.
	  if (ExtractedElements != 15)
	    return SDValue();

	  // Ok, we've now decided to do the transformation.
	  // If 64-bit shifts are legal, use the extract-shift sequence,
	  // otherwise bounce the vector off the cache.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue Vals[4];

	  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
	    // View the vector as two i64 halves; each half holds two i32 elements,
	    // obtained by truncating the half and the half shifted right by 32.
	    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
	    auto &DL = DAG.getDataLayout();
	    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
	    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
	                                     DAG.getConstant(0, dl, VecIdxTy));
	    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
	                                  DAG.getConstant(1, dl, VecIdxTy));

	    SDValue ShAmt = DAG.getConstant(
	        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
	    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
	    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
	                          DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
	    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
	    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
	                          DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
	  } else {
	    // Store the value to a temporary stack slot.
	    SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
	    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
	                              MachinePointerInfo());

	    EVT ElementType = SrcVT.getVectorElementType();
	    unsigned EltSize = ElementType.getSizeInBits() / 8;

	    // Replace each use (extract) with a load of the appropriate element.
	    for (unsigned i = 0; i < 4; ++i) {
	      uint64_t Offset = EltSize * i;
	      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);

	      SDValue ScalarAddr =
	          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);

	      // Load the scalar.
	      Vals[i] =
	          DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
	    }
	  }

	  // Replace the extracts
	  for (SDNode *Extract : Uses) {
	    uint64_t IdxVal = Extract->getConstantOperandVal(1);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
	  }

	  // The replacement was made in place; return N so it won't be revisited.
	  return SDValue(N, 0);
	}

	/// Simplify a vector select whose true or false operand is all-ones or
	/// all-zeros into a bitwise logic operation on the condition.
	/// Returns an empty SDValue when N is not a VSELECT or no rewrite applies.
	/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
	static SDValue
	combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  if (N->getOpcode() != ISD::VSELECT)
	    return SDValue();

	  SDValue Cond = N->getOperand(0);
	  SDValue TVal = N->getOperand(1);
	  SDValue FVal = N->getOperand(2);
	  EVT VT = TVal.getValueType();
	  EVT CondVT = Cond.getValueType();
	  SDLoc DL(N);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  assert(CondVT.isVector() && "Vector select expects a vector selector!");

	  bool TValIsAllZeros = ISD::isBuildVectorAllZeros(TVal.getNode());

	  // AVX512 only: with a vXi1 condition and an all-zeros true operand, invert
	  // the condition and swap the operands instead:
	  //   vselect cond, op1, op2  ==  vselect not(cond), op2, op1
	  if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
	      CondVT.getVectorElementType() == MVT::i1) {
	    // not(cond) is spelled xor(cond, allones).
	    SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
	                                  DAG.getAllOnesConstant(DL, CondVT));
	    return DAG.getSelect(DL, VT, NotCond, FVal, TVal);
	  }

	  // The condition can only act as a bitwise mask when its elements are as
	  // wide as the selected elements, i.e. it has already been promoted from the
	  // IR-level <N x i1>. Only sizes are compared, not the types themselves, so
	  // that floating-point vector selects are not excluded.
	  if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
	    return SDValue();

	  bool TValIsAllOnes = ISD::isBuildVectorAllOnes(TVal.getNode());
	  bool FValIsAllZeros = ISD::isBuildVectorAllZeros(FVal.getNode());

	  // If neither the "true is all-ones" nor the "false is all-zeros" form is
	  // present, try to reach one of them by inverting the condition. That needs
	  // a SETCC (which will be produced by CMPP*/PCMP*) that has already been
	  // promoted to the setcc result type for VT.
	  const bool MayInvert =
	      !TValIsAllOnes && !FValIsAllZeros && Cond.getOpcode() == ISD::SETCC &&
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
	          CondVT;
	  if (MayInvert) {
	    bool FValIsAllOnes = ISD::isBuildVectorAllOnes(FVal.getNode());

	    if (TValIsAllZeros || FValIsAllOnes) {
	      SDValue CmpLHS = Cond.getOperand(0);
	      SDValue CmpRHS = Cond.getOperand(1);
	      ISD::CondCode OldCC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	      ISD::CondCode NewCC =
	          ISD::getSetCCInverse(OldCC, CmpLHS.getValueType().isInteger());
	      Cond = DAG.getSetCC(DL, CondVT, CmpLHS, CmpRHS, NewCC);
	      std::swap(TVal, FVal);
	      // The flags follow the swapped operands. TValIsAllZeros is deliberately
	      // left untouched, exactly as before the swap.
	      TValIsAllOnes = FValIsAllOnes;
	      FValIsAllZeros = TValIsAllZeros;
	    }
	  }

	  // Only a 'sign splat' condition can be used as a logical-op mask.
	  if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
	    return SDValue();

	  // vselect Cond, 111..., 000... -> Cond
	  if (TValIsAllOnes && FValIsAllZeros)
	    return DAG.getBitcast(VT, Cond);

	  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
	    return SDValue();

	  // Emits bitcast(VT, Opc(Cond, bitcast(CondVT, Other))).
	  auto LogicWithCond = [&](unsigned Opc, SDValue Other) {
	    SDValue CastOther = DAG.getBitcast(CondVT, Other);
	    SDValue Logic = DAG.getNode(Opc, DL, CondVT, Cond, CastOther);
	    return DAG.getBitcast(VT, Logic);
	  };

	  // vselect Cond, 111..., X -> or Cond, X
	  if (TValIsAllOnes)
	    return LogicWithCond(ISD::OR, FVal);

	  // vselect Cond, X, 000... -> and Cond, X
	  if (FValIsAllZeros)
	    return LogicWithCond(ISD::AND, TVal);

	  // vselect Cond, 000..., X -> andn Cond, X
	  if (TValIsAllZeros) {
	    // ANDNP is emitted on an i64-element vector of the same total width.
	    MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
	    SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
	    SDValue CastFVal = DAG.getBitcast(AndNVT, FVal);
	    SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastFVal);
	    return DAG.getBitcast(VT, AndN);
	  }

	  return SDValue();
	}

	// Rewrite a select between two integer constants, controlled by an i1
	// condition, as  zext(Cond) * |TC - FC| + base  when the multiply is cheap.
	// Returns an empty SDValue when the rewrite does not apply.
	static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
	  SDValue Cond = N->getOperand(0);
	  SDValue TVal = N->getOperand(1);
	  SDValue FVal = N->getOperand(2);
	  SDLoc DL(N);

	  // Both select arms have to be scalar integer constants.
	  auto *TrueC = dyn_cast<ConstantSDNode>(TVal);
	  auto *FalseC = dyn_cast<ConstantSDNode>(FVal);
	  if (!TrueC || !FalseC)
	    return SDValue();

	  // Skip integer types that are not legal for the target.
	  EVT VT = N->getValueType(0);
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // The condition bit is about to be used in math/logic ops. A wider
	  // condition value could be allowed (post-legalization it becomes an i8),
	  // but if nothing is creating selects that late, it doesn't matter.
	  if (Cond.getValueType() != MVT::i1)
	    return SDValue();

	  // TODO: For constants that overflow or do not differ by power-of-2 or small
	  // multiplier, convert to 'and' + 'add'.
	  const APInt &TrueVal = TrueC->getAPIntValue();
	  const APInt &FalseVal = FalseC->getAPIntValue();
	  bool Overflowed;
	  APInt Diff = TrueVal.ssub_ov(FalseVal, Overflowed);
	  if (Overflowed)
	    return SDValue();

	  // A power-of-2 multiply is just a shift. LEA also cheaply handles a
	  // multiply by 3, 5, or 9 with i32/i64, so those are accepted as well.
	  APInt AbsDiff = Diff.abs();
	  const bool IsLEAScale = (VT == MVT::i32 || VT == MVT::i64) &&
	                          (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9);
	  if (!AbsDiff.isPowerOf2() && !IsLEAScale)
	    return SDValue();

	  // Shift/LEA codegen wants a positive multiplier, so when the true constant
	  // is the smaller one, invert the condition and swap the constants. The
	  // 'not' can usually be folded into a compare predicate, but even without
	  // that, the sequence should be cheaper than a CMOV alternative.
	  if (TrueVal.slt(FalseVal)) {
	    Cond = DAG.getNOT(DL, Cond, MVT::i1);
	    std::swap(TrueC, FalseC);
	  }

	  // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
	  SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);

	  // The multiply is skipped when the difference is one.
	  if (!AbsDiff.isOneValue())
	    Result = DAG.getNode(ISD::MUL, DL, VT, Result,
	                         DAG.getConstant(AbsDiff, DL, VT));

	  // The add is skipped when the base constant is zero.
	  if (!FalseC->isNullValue())
	    Result = DAG.getNode(ISD::ADD, DL, VT, Result, SDValue(FalseC, 0));

	  return Result;
	}

	// If OrigOp is a bitcast of an operation that can equally be expressed in the
	// bitcast's result type, push the bitcast onto the operation's inputs. This
	// exposes more opportunities for pattern matching masked instructions. It is
	// called when the operation is known to be one of the inputs of a vselect.
	// Returns true when OrigOp was replaced (via DCI.CombineTo).
	static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
	                                      TargetLowering::DAGCombinerInfo &DCI) {
	  // Only bitcasts are of interest.
	  if (OrigOp.getOpcode() != ISD::BITCAST)
	    return false;

	  SDValue Op = OrigOp.getOperand(0);

	  // If the operation has users besides this bitcast, rewriting it would
	  // replicate the operation, so leave it alone.
	  if (!Op.hasOneUse())
	    return false;

	  MVT VT = OrigOp.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  SDLoc DL(Op.getNode());

	  const unsigned Opcode = Op.getOpcode();
	  const bool IsShuf128 = Opcode == X86ISD::SHUF128;
	  if (!IsShuf128 && Opcode != X86ISD::SUBV_BROADCAST)
	    return false;

	  // Both supported opcodes need 32- or 64-bit elements in the new type.
	  const unsigned EltSize = EltVT.getSizeInBits();
	  if (EltSize != 32 && EltSize != 64)
	    return false;

	  // Only change element size, not type.
	  if (VT.isInteger() != Op.getSimpleValueType().isInteger())
	    return false;

	  if (IsShuf128) {
	    // Bitcast both vector inputs to VT and rebuild the shuffle with the
	    // original third operand.
	    SDValue In0 = DAG.getBitcast(VT, Op.getOperand(0));
	    DCI.AddToWorklist(In0.getNode());
	    SDValue In1 = DAG.getBitcast(VT, Op.getOperand(1));
	    DCI.AddToWorklist(In1.getNode());
	    DCI.CombineTo(OrigOp.getNode(),
	                  DAG.getNode(Opcode, DL, VT, In0, In1, Op.getOperand(2)));
	    return true;
	  }

	  // SUBV_BROADCAST: bitcast the subvector input to a vector of EltVT with
	  // the same total size, then rebuild the broadcast in VT.
	  SDValue SubVec = Op.getOperand(0);
	  MVT SubVecVT = MVT::getVectorVT(
	      EltVT, SubVec.getSimpleValueType().getSizeInBits() / EltSize);
	  SubVec = DAG.getBitcast(SubVecVT, Op.getOperand(0));
	  DCI.AddToWorklist(SubVec.getNode());
	  DCI.CombineTo(OrigOp.getNode(), DAG.getNode(Opcode, DL, VT, SubVec));
	  return true;
	}

	/// Do target-specific dag combines on SELECT and VSELECT nodes.
	static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);
	SDValue Cond = N->getOperand(0);
	// Get the LHS/RHS of the select.
	SDValue LHS = N->getOperand(1);
	SDValue RHS = N->getOperand(2);
	EVT VT = LHS.getValueType();
	EVT CondVT = Cond.getValueType();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// If we have SSE[12] support, try to form min/max nodes. SSE min/max
	// instructions match the semantics of the common C idiom x<y?x:y but not
	// x<=y?x:y, because of how they handle negative zero (which can be
	// ignored in unsafe-math mode).
	// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
	if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
	VT != MVT::f80 && VT != MVT::f128 &&
	(TLI.isTypeLegal(VT) \|\| VT == MVT::v2f32) &&
	(Subtarget.hasSSE2() \|\|
	(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
	ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	unsigned Opcode = 0;
	// Check for x CC y ? x : y.
	if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	switch (CC) {
	default: break;
	case ISD::SETULT:
	// Converting this to a min would handle NaNs incorrectly, and swapping
	// the operands would cause it to handle comparisons between positive
	// and negative zero incorrectly.
	if (!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS)) {
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!(DAG.isKnownNeverZero(LHS) \|\| DAG.isKnownNeverZero(RHS)))
	break;
	std::swap(LHS, RHS);
	}
	Opcode = X86ISD::FMIN;
	break;
	case ISD::SETOLE:
	// Converting this to a min would handle comparisons between positive
	// and negative zero incorrectly.
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
	break;
	Opcode = X86ISD::FMIN;
	break;
	case ISD::SETULE:
	// Converting this to a min would handle both negative zeros and NaNs
	// incorrectly, but we can swap the operands to fix both.
	std::swap(LHS, RHS);
	LLVM_FALLTHROUGH;
	case ISD::SETOLT:
	case ISD::SETLT:
	case ISD::SETLE:
	Opcode = X86ISD::FMIN;
	break;

	case ISD::SETOGE:
	// Converting this to a max would handle comparisons between positive
	// and negative zero incorrectly.
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
	break;
	Opcode = X86ISD::FMAX;
	break;
	case ISD::SETUGT:
	// Converting this to a max would handle NaNs incorrectly, and swapping
	// the operands would cause it to handle comparisons between positive
	// and negative zero incorrectly.
	if (!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS)) {
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!(DAG.isKnownNeverZero(LHS) \|\| DAG.isKnownNeverZero(RHS)))
	break;
	std::swap(LHS, RHS);
	}
	Opcode = X86ISD::FMAX;
	break;
	case ISD::SETUGE:
	// Converting this to a max would handle both negative zeros and NaNs
	// incorrectly, but we can swap the operands to fix both.
	std::swap(LHS, RHS);
	LLVM_FALLTHROUGH;
	case ISD::SETOGT:
	case ISD::SETGT:
	case ISD::SETGE:
	Opcode = X86ISD::FMAX;
	break;
	}
	// Check for x CC y ? y : x -- a min/max with reversed arms.
	} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
	DAG.isEqualTo(RHS, Cond.getOperand(0))) {
	switch (CC) {
	default: break;
	case ISD::SETOGE:
	// Converting this to a min would handle comparisons between positive
	// and negative zero incorrectly, and swapping the operands would
	// cause it to handle NaNs incorrectly.
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!(DAG.isKnownNeverZero(LHS) \|\| DAG.isKnownNeverZero(RHS))) {
	if (!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS))
	break;
	std::swap(LHS, RHS);
	}
	Opcode = X86ISD::FMIN;
	break;
	case ISD::SETUGT:
	// Converting this to a min would handle NaNs incorrectly.
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	(!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS)))
	break;
	Opcode = X86ISD::FMIN;
	break;
	case ISD::SETUGE:
	// Converting this to a min would handle both negative zeros and NaNs
	// incorrectly, but we can swap the operands to fix both.
	std::swap(LHS, RHS);
	LLVM_FALLTHROUGH;
	case ISD::SETOGT:
	case ISD::SETGT:
	case ISD::SETGE:
	Opcode = X86ISD::FMIN;
	break;

	case ISD::SETULT:
	// Converting this to a max would handle NaNs incorrectly.
	if (!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS))
	break;
	Opcode = X86ISD::FMAX;
	break;
	case ISD::SETOLE:
	// Converting this to a max would handle comparisons between positive
	// and negative zero incorrectly, and swapping the operands would
	// cause it to handle NaNs incorrectly.
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
	if (!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS))
	break;
	std::swap(LHS, RHS);
	}
	Opcode = X86ISD::FMAX;
	break;
	case ISD::SETULE:
	// Converting this to a max would handle both negative zeros and NaNs
	// incorrectly, but we can swap the operands to fix both.
	std::swap(LHS, RHS);
	LLVM_FALLTHROUGH;
	case ISD::SETOLT:
	case ISD::SETLT:
	case ISD::SETLE:
	Opcode = X86ISD::FMAX;
	break;
	}
	}

	if (Opcode)
	return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
	}

	// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
	// lowering on KNL. In this case we convert it to
	// v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
	// The same situation for all 128 and 256-bit vectors of i8 and i16.
	// Since SKX these selects have a proper lowering.
	if (Subtarget.hasAVX512() && CondVT.isVector() &&
	CondVT.getVectorElementType() == MVT::i1 &&
	(VT.is128BitVector() \|\| VT.is256BitVector()) &&
	(VT.getVectorElementType() == MVT::i8 \|\|
	VT.getVectorElementType() == MVT::i16) &&
	!(Subtarget.hasBWI() && Subtarget.hasVLX())) {
	Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
	DCI.AddToWorklist(Cond.getNode());
	return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
	}

	if (SDValue V = combineSelectOfTwoConstants(N, DAG))
	return V;

	// Canonicalize max and min:
	// (x > y) ? x : y -> (x >= y) ? x : y
	// (x < y) ? x : y -> (x <= y) ? x : y
	// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
	// the need for an extra compare
	// against zero. e.g.
	// (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
	// subl %esi, %edi
	// testl %edi, %edi
	// movl $0, %eax
	// cmovgl %edi, %eax
	// =>
	// xorl %eax, %eax
	// subl %esi, $edi
	// cmovsl %eax, %edi
	if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
	DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	switch (CC) {
	default: break;
	case ISD::SETLT:
	case ISD::SETGT: {
	ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
	Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
	Cond.getOperand(0), Cond.getOperand(1), NewCC);
	return DAG.getSelect(DL, VT, Cond, LHS, RHS);
	}
	}
	}

	// Early exit check
	if (!TLI.isTypeLegal(VT))
	return SDValue();

	// Match VSELECTs into subs with unsigned saturation.
	if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
	// psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
	((Subtarget.hasSSE2() && (VT == MVT::v16i8 \|\| VT == MVT::v8i16)) \|\|
	(Subtarget.hasAVX2() && (VT == MVT::v32i8 \|\| VT == MVT::v16i16)))) {
	ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	// Check if one of the arms of the VSELECT is a zero vector. If it's on the
	// left side invert the predicate to simplify logic below.
	SDValue Other;
	if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
	Other = RHS;
	CC = ISD::getSetCCInverse(CC, true);
	} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
	Other = LHS;
	}

	if (Other.getNode() && Other->getNumOperands() == 2 &&
	DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
	SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
	SDValue CondRHS = Cond->getOperand(1);

	// Look for a general sub with unsigned saturation first.
	// x >= y ? x-y : 0 --> subus x, y
	// x > y ? x-y : 0 --> subus x, y
	if ((CC == ISD::SETUGE \|\| CC == ISD::SETUGT) &&
	Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
	return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);

	if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
	if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
	if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
	if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
	// If the RHS is a constant we have to reverse the const
	// canonicalization.
	// x > C-1 ? x+-C : 0 --> subus x, C
	if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
	CondRHSConst->getAPIntValue() ==
	(-OpRHSConst->getAPIntValue() - 1))
	return DAG.getNode(
	X86ISD::SUBUS, DL, VT, OpLHS,
	DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));

	// Another special case: If C was a sign bit, the sub has been
	// canonicalized into a xor.
	// FIXME: Would it be better to use computeKnownBits to determine
	// whether it's safe to decanonicalize the xor?
	// x s< 0 ? x^C : 0 --> subus x, C
	if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
	ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
	OpRHSConst->getAPIntValue().isSignMask())
	// Note that we have to rebuild the RHS constant here to ensure we
	// don't rely on particular values of undef lanes.
	return DAG.getNode(
	X86ISD::SUBUS, DL, VT, OpLHS,
	DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
	}
	}
	}

	if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
	return V;

	// If this is a dynamic select (non-constant condition) and we can match
	// this node with one of the variable blend instructions, restructure the
	// condition so that blends can use the high (sign) bit of each element and
	// use SimplifyDemandedBits to simplify the condition operand.
	if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
	!DCI.isBeforeLegalize() &&
	!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
	unsigned BitWidth = Cond.getScalarValueSizeInBits();

	// Don't optimize vector selects that map to mask-registers.
	if (BitWidth == 1)
	return SDValue();

	// We can only handle the cases where VSELECT is directly legal on the
	// subtarget. We custom lower VSELECT nodes with constant conditions and
	// this makes it hard to see whether a dynamic VSELECT will correctly
	// lower, so we both check the operation's status and explicitly handle the
	// cases where a dynamic blend will fail even though a constant-condition
	// blend could be custom lowered.
	// FIXME: We should find a better way to handle this class of problems.
	// Potentially, we should combine constant-condition vselect nodes
	// pre-legalization into shuffles and not mark as many types as custom
	// lowered.
	if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
	return SDValue();
	// FIXME: We don't support i16-element blends currently. We could and
	// should support them by making all the bits in the condition be set
	// rather than just the high bit and using an i8-element blend.
	if (VT.getVectorElementType() == MVT::i16)
	return SDValue();
	// Dynamic blending was only available from SSE4.1 onward.
	if (VT.is128BitVector() && !Subtarget.hasSSE41())
	return SDValue();
	// Byte blends are only available in AVX2
	if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
	return SDValue();
	// There are no 512-bit blend instructions that use sign bits.
	if (VT.is512BitVector())
	return SDValue();

	assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
	APInt DemandedMask(APInt::getSignMask(BitWidth));
	KnownBits Known;
	TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	!DCI.isBeforeLegalizeOps());
	if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) \|\|
	TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
	// If we changed the computation somewhere in the DAG, this change will
	// affect all users of Cond. Make sure it is fine and update all the nodes
	// so that we do not use the generic VSELECT anymore. Otherwise, we may
	// perform wrong optimizations as we messed with the actual expectation
	// for the vector boolean values.
	if (Cond != TLO.Old) {
	// Check all uses of the condition operand to check whether it will be
	// consumed by non-BLEND instructions. Those may require that all bits
	// are set properly.
	for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
	UI != UE; ++UI) {
	// TODO: Add other opcodes eventually lowered into BLEND.
	if (UI->getOpcode() != ISD::VSELECT \|\| UI.getOperandNo() != 0)
	return SDValue();
	}

	// Update all users of the condition before committing the change, so
	// that the VSELECT optimizations that expect the correct vector boolean
	// value will not be triggered.
	for (SDNode *U : Cond->uses()) {
	SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
	U->getValueType(0), Cond, U->getOperand(1),
	U->getOperand(2));
	DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
	}
	DCI.CommitTargetLoweringOpt(TLO);
	return SDValue();
	}
	// Only Cond (rather than other nodes in the computation chain) was
	// changed. Change the condition just for N to keep the opportunity to
	// optimize all other users their own way.
	SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
	DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
	return SDValue();
	}
	}

	// Look for vselects with LHS/RHS being bitcasted from an operation that
	// can be executed on another type. Push the bitcast to the inputs of
	// the operation. This exposes opportunities for using masking instructions.
	if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
	CondVT.getVectorElementType() == MVT::i1) {
	if (combineBitcastForMaskedOp(LHS, DAG, DCI))
	return SDValue(N, 0);
	if (combineBitcastForMaskedOp(RHS, DAG, DCI))
	return SDValue(N, 0);
	}

	// Custom action for SELECT MMX
	if (VT == MVT::x86mmx) {
	LHS = DAG.getBitcast(MVT::i64, LHS);
	RHS = DAG.getBitcast(MVT::i64, RHS);
	SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
	return DAG.getBitcast(VT, newSelect);
	}

	return SDValue();
	}

	/// Combine:
	///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
	/// to:
	///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
	/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
	/// Note that this is only legal for some op/cc combinations.
	///
	/// \param Cmp       The flag-producing compare (X86ISD::CMP, or an X86ISD::SUB
	///                  whose value result is unused).
	/// \param CC        In/out condition code. It is rewritten only on the
	///                  compare-against-zero path; the "full comparison" path
	///                  leaves it untouched.
	/// \param DAG       The DAG being combined; on success the atomic node's
	///                  results are replaced in it.
	/// \param Subtarget Forwarded to lowerAtomicArithWithLOCK.
	/// \returns the node built by lowerAtomicArithWithLOCK on success (the caller
	///          uses it in place of \p Cmp), or an empty SDValue if the pattern
	///          does not match.
	static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  // This combine only operates on CMP-like nodes.
	  if (!(Cmp.getOpcode() == X86ISD::CMP ||
	        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
	    return SDValue();

	  // Can't replace the cmp if it has more uses than the one we're looking at.
	  // FIXME: We would like to be able to handle this, but would need to make sure
	  // all uses were updated.
	  if (!Cmp.hasOneUse())
	    return SDValue();

	  // This only applies to variations of the common case:
	  //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
	  //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
	  //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
	  //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
	  // Using the proper condcodes (see below), overflow is checked for.

	  // FIXME: We can generalize both constraints:
	  // - XOR/OR/AND (if they were made to survive AtomicExpand)
	  // - LHS != 1
	  // if the result is compared.

	  SDValue CmpLHS = Cmp.getOperand(0);
	  SDValue CmpRHS = Cmp.getOperand(1);

	  // The loaded value must feed only this compare, because it is replaced
	  // with UNDEF below.
	  if (!CmpLHS.hasOneUse())
	    return SDValue();

	  unsigned Opc = CmpLHS.getOpcode();
	  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
	    return SDValue();

	  // Operand 2 of the atomic node is the value being added/subtracted; only
	  // constant amounts are handled.
	  SDValue OpRHS = CmpLHS.getOperand(2);
	  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
	  if (!OpRHSC)
	    return SDValue();

	  // Normalize to an addend so that add and sub are treated uniformly.
	  APInt Addend = OpRHSC->getAPIntValue();
	  if (Opc == ISD::ATOMIC_LOAD_SUB)
	    Addend = -Addend;

	  auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
	  if (!CmpRHSC)
	    return SDValue();

	  APInt Comparison = CmpRHSC->getAPIntValue();

	  // If the addend is the negation of the comparison value, then we can do
	  // a full comparison by emitting the atomic arithmetic as a locked sub.
	  if (Comparison == -Addend) {
	    // The CC is fine, but we need to rewrite the LHS of the comparison as an
	    // atomic sub.
	    auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
	    auto AtomicSub = DAG.getAtomic(
	        ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
	        /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
	        /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
	        AN->getMemOperand());
	    // If the comparison uses the CF flag we can't use INC/DEC instructions.
	    bool NeedCF = false;
	    switch (CC) {
	    default: break;
	    case X86::COND_A: case X86::COND_AE:
	    case X86::COND_B: case X86::COND_BE:
	      NeedCF = true;
	      break;
	    }
	    auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
	    // The loaded value is no longer needed (its only user was Cmp); the
	    // chain result is taken over by the locked operation.
	    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
	                                  DAG.getUNDEF(CmpLHS.getValueType()));
	    DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
	    return LockOp;
	  }

	  // We can handle comparisons with zero in a number of cases by manipulating
	  // the CC used.
	  if (!Comparison.isNullValue())
	    return SDValue();

	  // Translate a test on the old value into a test on the flags of the
	  // updated value (see the icmp table above).
	  if (CC == X86::COND_S && Addend == 1)
	    CC = X86::COND_LE;
	  else if (CC == X86::COND_NS && Addend == 1)
	    CC = X86::COND_G;
	  else if (CC == X86::COND_G && Addend == -1)
	    CC = X86::COND_GE;
	  else if (CC == X86::COND_LE && Addend == -1)
	    CC = X86::COND_L;
	  else
	    return SDValue();

	  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
	  // As above: drop the loaded value and rewire the chain to the locked op.
	  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
	                                DAG.getUNDEF(CmpLHS.getValueType()));
	  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
	  return LockOp;
	}

	// Check whether a boolean test is testing a boolean value generated by
	// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
	// code.
	//
	// Simplify the following patterns:
	// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
	// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
	// to (Op EFLAGS Cond)
	//
	// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
	// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
	// to (Op EFLAGS !Cond)
	//
	// where Op could be BRCOND or CMOV.
	//
	// Cmp is the compare being tested and CC the condition it is tested with
	// (only COND_E / COND_NE are handled). On success CC is overwritten with the
	// condition code to use against the returned flags value; on failure an empty
	// SDValue is returned and CC is left unchanged.
	//
	static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
	  // This combine only operates on CMP-like nodes.
	  if (!(Cmp.getOpcode() == X86ISD::CMP ||
	        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
	    return SDValue();

	  // Quit if not used as a boolean value.
	  if (CC != X86::COND_E && CC != X86::COND_NE)
	    return SDValue();

	  // Check CMP operands. One of them should be 0 or 1 and the other should be
	  // an SetCC or extended from it.
	  SDValue Op1 = Cmp.getOperand(0);
	  SDValue Op2 = Cmp.getOperand(1);

	  SDValue SetCC;
	  const ConstantSDNode* C = nullptr;
	  // "x == 0" tests the inverse of x; "x != 0" tests x itself.
	  bool needOppositeCond = (CC == X86::COND_E);
	  bool checkAgainstTrue = false; // Is it a comparison against 1?

	  if ((C = dyn_cast<ConstantSDNode>(Op1)))
	    SetCC = Op2;
	  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
	    SetCC = Op1;
	  else // Quit if neither operand is a constant.
	    return SDValue();

	  if (C->getZExtValue() == 1) {
	    // Comparing against 1 flips the sense relative to comparing against 0.
	    needOppositeCond = !needOppositeCond;
	    checkAgainstTrue = true;
	  } else if (C->getZExtValue() != 0)
	    // Quit if the constant is neither 0 nor 1.
	    return SDValue();

	  bool truncatedToBoolWithAnd = false;
	  // Skip (zext $x), (trunc $x), or (and $x, 1) node.
	  while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
	         SetCC.getOpcode() == ISD::TRUNCATE ||
	         SetCC.getOpcode() == ISD::AND) {
	    if (SetCC.getOpcode() == ISD::AND) {
	      // Look through the AND only if one side is the constant 1; continue
	      // with the other side.
	      int OpIdx = -1;
	      if (isOneConstant(SetCC.getOperand(0)))
	        OpIdx = 1;
	      if (isOneConstant(SetCC.getOperand(1)))
	        OpIdx = 0;
	      if (OpIdx < 0)
	        break;
	      SetCC = SetCC.getOperand(OpIdx);
	      truncatedToBoolWithAnd = true;
	    } else
	      SetCC = SetCC.getOperand(0);
	  }

	  switch (SetCC.getOpcode()) {
	  case X86ISD::SETCC_CARRY:
	    // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
	    // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
	    // i.e. it's a comparison against true but the result of SETCC_CARRY is not
	    // truncated to i1 using 'and'.
	    if (checkAgainstTrue && !truncatedToBoolWithAnd)
	      break;
	    assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
	           "Invalid use of SETCC_CARRY!");
	    LLVM_FALLTHROUGH;
	  case X86ISD::SETCC:
	    // Set the condition code or opposite one if necessary.
	    CC = X86::CondCode(SetCC.getConstantOperandVal(0));
	    if (needOppositeCond)
	      CC = X86::GetOppositeBranchCondition(CC);
	    return SetCC.getOperand(1);
	  case X86ISD::CMOV: {
	    // Check whether false/true value has canonical one, i.e. 0 or 1.
	    ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
	    ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
	    // Quit if true value is not a constant.
	    if (!TVal)
	      return SDValue();
	    // A non-constant false value is only accepted for rdrand/rdseed (below).
	    if (!FVal) {
	      SDValue Op = SetCC.getOperand(0);
	      // Skip 'zext' or 'trunc' node.
	      if (Op.getOpcode() == ISD::ZERO_EXTEND ||
	          Op.getOpcode() == ISD::TRUNCATE)
	        Op = Op.getOperand(0);
	      // A special case for rdrand/rdseed, where 0 is set if false cond is
	      // found.
	      if ((Op.getOpcode() != X86ISD::RDRAND &&
	           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
	        return SDValue();
	    }
	    // Quit if false value is not the constant 0 or 1.
	    bool FValIsFalse = true;
	    if (FVal && FVal->getZExtValue() != 0) {
	      if (FVal->getZExtValue() != 1)
	        return SDValue();
	      // If FVal is 1, opposite cond is needed.
	      needOppositeCond = !needOppositeCond;
	      FValIsFalse = false;
	    }
	    // Quit if TVal is not the constant opposite of FVal.
	    if (FValIsFalse && TVal->getZExtValue() != 1)
	      return SDValue();
	    if (!FValIsFalse && TVal->getZExtValue() != 0)
	      return SDValue();
	    // CMOV operands are [false value, true value, condcode, flags].
	    CC = X86::CondCode(SetCC.getConstantOperandVal(2));
	    if (needOppositeCond)
	      CC = X86::GetOppositeBranchCondition(CC);
	    return SetCC.getOperand(3);
	  }
	  }

	  return SDValue();
	}

	/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
	/// Match:
	///   (X86or (X86setcc) (X86setcc))
	///   (X86cmp (and (X86setcc) (X86setcc)), 0)
	///
	/// \param Cond        The condition value to inspect; a compare against zero
	///                    is looked through first.
	/// \param [out] CC0   Condition code of the first SETCC (set on success).
	/// \param [out] CC1   Condition code of the second SETCC (set on success).
	/// \param [out] Flags The flags value both SETCCs read (set on success).
	/// \param [out] isAnd True if the SETCCs are combined with AND, false for OR.
	///                    Written once the optional compare has been looked
	///                    through, even if the match later fails.
	/// \returns true if the pattern matched.
	static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
	                                           X86::CondCode &CC1, SDValue &Flags,
	                                           bool &isAnd) {
	  if (Cond->getOpcode() == X86ISD::CMP) {
	    // Only a comparison against zero can be looked through.
	    if (!isNullConstant(Cond->getOperand(1)))
	      return false;

	    Cond = Cond->getOperand(0);
	  }

	  isAnd = false;

	  SDValue SetCC0, SetCC1;
	  switch (Cond->getOpcode()) {
	  default: return false;
	  case ISD::AND:
	  case X86ISD::AND:
	    isAnd = true;
	    LLVM_FALLTHROUGH;
	  case ISD::OR:
	  case X86ISD::OR:
	    SetCC0 = Cond->getOperand(0);
	    SetCC1 = Cond->getOperand(1);
	    break;
	  }

	  // Make sure we have SETCC nodes, using the same flags value.
	  if (SetCC0.getOpcode() != X86ISD::SETCC ||
	      SetCC1.getOpcode() != X86ISD::SETCC ||
	      SetCC0->getOperand(1) != SetCC1->getOperand(1))
	    return false;

	  CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
	  CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
	  Flags = SetCC0->getOperand(1);
	  return true;
	}

	// When legalizing carry, we create carries via add X, -1
	// If that comes from an actual carry, via setcc, we use the
	// carry directly.
	//
	// Returns the flags operand of the underlying SETCC / SETCC_CARRY when EFLAGS
	// is (X86ISD::ADD X, -1) and X is a (possibly extended, truncated or
	// and-with-1 masked) COND_B setcc; otherwise returns an empty SDValue.
	static SDValue combineCarryThroughADD(SDValue EFLAGS) {
	  // Only (add X, -1) is of interest.
	  if (EFLAGS.getOpcode() != X86ISD::ADD ||
	      !isAllOnesConstant(EFLAGS.getOperand(1)))
	    return SDValue();

	  // Strip the wrappers that may sit between the add and the setcc.
	  SDValue Carry = EFLAGS.getOperand(0);
	  while (Carry.getOpcode() == ISD::TRUNCATE ||
	         Carry.getOpcode() == ISD::ZERO_EXTEND ||
	         Carry.getOpcode() == ISD::SIGN_EXTEND ||
	         Carry.getOpcode() == ISD::ANY_EXTEND ||
	         (Carry.getOpcode() == ISD::AND &&
	          isOneConstant(Carry.getOperand(1))))
	    Carry = Carry.getOperand(0);

	  // The carry must come from a setcc on the carry flag (COND_B).
	  if ((Carry.getOpcode() == X86ISD::SETCC ||
	       Carry.getOpcode() == X86ISD::SETCC_CARRY) &&
	      Carry.getConstantOperandVal(0) == X86::COND_B)
	    return Carry.getOperand(1);

	  return SDValue();
	}

	/// Try to replace an EFLAGS definition, as consumed under condition code
	/// \p CC, with a simpler EFLAGS value. \p CC is an in/out parameter and may be
	/// rewritten to match the returned flags; chain values may be replaced in
	/// \p DAG as a side effect. Returns an empty SDValue if nothing applies.
	static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
	                                  SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  // A carry test may be able to read the original carry directly.
	  if (CC == X86::COND_B) {
	    SDValue CarryFlags = combineCarryThroughADD(EFLAGS);
	    if (CarryFlags)
	      return CarryFlags;
	  }

	  // Next, see through a boolean test of a SETCC/CMOV result.
	  SDValue BoolFlags = checkBoolTestSetCCCombine(EFLAGS, CC);
	  if (BoolFlags)
	    return BoolFlags;

	  // Finally, try to reuse the flags of a LOCKed atomic add/sub.
	  return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
	}

	/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
	///
	/// Operand 0 is the value produced when the condition is false and operand 1
	/// the value produced when it is true (the opposite order of ISD::SELECT).
	/// Returns the replacement value, or an empty SDValue if no combine applies.
	static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDLoc DL(N);

	  SDValue FalseOp = N->getOperand(0);
	  SDValue TrueOp = N->getOperand(1);
	  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
	  SDValue Cond = N->getOperand(3);

	  if (CC == X86::COND_E || CC == X86::COND_NE) {
	    switch (Cond.getOpcode()) {
	    default: break;
	    case X86ISD::BSR:
	    case X86ISD::BSF:
	      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
	      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
	        return (CC == X86::COND_E) ? FalseOp : TrueOp;
	    }
	  }

	  // Try to simplify the EFLAGS and condition code operands.
	  // We can't always do this as FCMOV only supports a subset of X86 cond.
	  // NOTE(review): combineSetCCEFLAGS takes CC by reference, so CC may already
	  // have been rewritten when the f80 check below rejects the new flags --
	  // confirm the later combines cannot fire on such a CC/Cond pair.
	  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
	    if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
	      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
	                       Flags};
	      return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	    }
	  }

	  // If this is a select between two integer constants, try to do some
	  // optimizations. Note that the operands are ordered the opposite of SELECT
	  // operands.
	  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
	    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
	      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
	      // larger than FalseC (the false value).
	      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
	        CC = X86::GetOppositeBranchCondition(CC);
	        std::swap(TrueC, FalseC);
	        std::swap(TrueOp, FalseOp);
	      }

	      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
	      // This is efficient for any integer data type (including i8/i16) and
	      // shift amount.
	      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
	        Cond = getSETCC(CC, Cond, DL, DAG);

	        // Zero extend the condition if needed.
	        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

	        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
	        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
	                           DAG.getConstant(ShAmt, DL, MVT::i8));
	        return Cond;
	      }

	      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
	      // for any integer data type, including i8/i16.
	      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
	        Cond = getSETCC(CC, Cond, DL, DAG);

	        // Zero extend the condition if needed.
	        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
	                           FalseC->getValueType(0), Cond);
	        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	                           SDValue(FalseC, 0));
	        return Cond;
	      }

	      // Optimize cases that will turn into an LEA instruction. This requires
	      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
	      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
	        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
	        // For i32 only the low 32 bits of the difference are meaningful.
	        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

	        bool isFastMultiplier = false;
	        if (Diff < 10) {
	          switch ((unsigned char)Diff) {
	          default: break;
	          case 1:  // result = add base, cond
	          case 2:  // result = lea base(    , cond*2)
	          case 3:  // result = lea base(cond, cond*2)
	          case 4:  // result = lea base(    , cond*4)
	          case 5:  // result = lea base(cond, cond*4)
	          case 8:  // result = lea base(    , cond*8)
	          case 9:  // result = lea base(cond, cond*8)
	            isFastMultiplier = true;
	            break;
	          }
	        }

	        if (isFastMultiplier) {
	          // Difference at the constants' own bit width, used as multiplier.
	          APInt Scale = TrueC->getAPIntValue()-FalseC->getAPIntValue();
	          Cond = getSETCC(CC, Cond, DL, DAG);
	          // Zero extend the condition if needed.
	          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
	                             Cond);
	          // Scale the condition by the difference.
	          if (Scale != 1)
	            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
	                               DAG.getConstant(Scale, DL, Cond.getValueType()));

	          // Add the base if non-zero.
	          if (FalseC->getAPIntValue() != 0)
	            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	                               SDValue(FalseC, 0));
	          return Cond;
	        }
	      }
	    }
	  }

	  // Handle these cases:
	  //   (select (x != c), e, c) -> (select (x != c), e, x),
	  //   (select (x == c), c, e) -> (select (x == c), x, e)
	  // where the c is an integer constant, and the "select" is the combination
	  // of CMOV and CMP.
	  //
	  // The rationale for this change is that the conditional-move from a constant
	  // needs two instructions, however, conditional-move from a register needs
	  // only one instruction.
	  //
	  // CAVEAT: By replacing a constant with a symbolic value, it may obscure
	  //  some instruction-combining opportunities. This opt needs to be
	  //  postponed as late as possible.
	  //
	  if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
	    // the DCI.xxxx conditions are provided to postpone the optimization as
	    // late as possible.

	    ConstantSDNode *CmpAgainst = nullptr;
	    if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
	        (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
	        !isa<ConstantSDNode>(Cond.getOperand(0))) {

	      // Canonicalize the "!=" form into the "==" form handled below.
	      if (CC == X86::COND_NE &&
	          CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
	        CC = X86::GetOppositeBranchCondition(CC);
	        std::swap(TrueOp, FalseOp);
	      }

	      // Replace the constant true value with the compared register.
	      if (CC == X86::COND_E &&
	          CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
	        SDValue Ops[] = { FalseOp, Cond.getOperand(0),
	                          DAG.getConstant(CC, DL, MVT::i8), Cond };
	        return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	      }
	    }
	  }

	  // Fold and/or of setcc's to double CMOV:
	  //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
	  //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
	  //
	  // This combine lets us generate:
	  //   cmovcc1 (jcc1 if we don't have CMOV)
	  //   cmovcc2 (same)
	  // instead of:
	  //   setcc1
	  //   setcc2
	  //   and/or
	  //   cmovne (jne if we don't have CMOV)
	  // When we can't use the CMOV instruction, it might increase branch
	  // mispredicts.
	  // When we can use CMOV, or when there is no mispredict, this improves
	  // throughput and reduces register pressure.
	  //
	  if (CC == X86::COND_NE) {
	    SDValue Flags;
	    X86::CondCode CC0, CC1;
	    bool isAndSetCC;
	    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
	      if (isAndSetCC) {
	        // The AND form is handled as an OR of the inverted conditions with
	        // the arms swapped.
	        std::swap(FalseOp, TrueOp);
	        CC0 = X86::GetOppositeBranchCondition(CC0);
	        CC1 = X86::GetOppositeBranchCondition(CC1);
	      }

	      SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
	                        Flags};
	      SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
	      SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
	      SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
	      return CMOV;
	    }
	  }

	  return SDValue();
	}

	/// The ways a 32-bit-element vector multiply can be narrowed, chosen from the
	/// value range of its operands.
	enum ShrinkMode {
	  MULS8,  ///< Operands fit the signed 8-bit range (-128 ~ 127).
	  MULU8,  ///< Operands fit the unsigned 8-bit range (0 ~ 255).
	  MULS16, ///< Operands fit the signed 16-bit range (-32768 ~ 32767).
	  MULU16  ///< Operands fit the unsigned 16-bit range (0 ~ 65535).
	};

	/// Decide whether the vector multiply \p N (32-bit elements) can be carried
	/// out on narrower elements, based on how many sign bits each operand is
	/// known to have and whether each operand is known to be non-negative.
	///
	/// \param N         The multiply node; it must have exactly two operands.
	/// \param DAG       Used to compute the number of sign bits of an operand.
	/// \param [out] Mode Set to the selected shrinking mode when true is
	///                  returned; left untouched otherwise.
	/// \returns true if one of the ShrinkMode modes applies.
	static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
	  EVT VT = N->getOperand(0).getValueType();
	  if (VT.getScalarSizeInBits() != 32)
	    return false;

	  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
	  unsigned NumSignBits[2] = {1, 1};
	  bool NonNegative[2] = {false, false};
	  for (unsigned OpNo = 0; OpNo != 2; ++OpNo) {
	    SDValue Operand = N->getOperand(OpNo);
	    const unsigned OperandOpc = Operand.getOpcode();

	    if (OperandOpc == ISD::ANY_EXTEND) {
	      // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so derive the
	      // count from the source element type instead: for anyextend it is safe
	      // to assume an appropriate number of leading sign/zero bits.
	      EVT SrcEltVT =
	          Operand.getOperand(0).getValueType().getVectorElementType();
	      if (SrcEltVT == MVT::i8)
	        NumSignBits[OpNo] = 25;
	      else if (SrcEltVT == MVT::i16)
	        NumSignBits[OpNo] = 17;
	      else
	        return false;
	      NonNegative[OpNo] = true;
	      continue;
	    }

	    if (OperandOpc == ISD::BUILD_VECTOR) {
	      // Every defined element must be an integer constant. Track the
	      // smallest sign-bit count over all elements, and whether any of them
	      // is negative.
	      NumSignBits[OpNo] = 32;
	      NonNegative[OpNo] = true;
	      for (const SDValue &Elt : Operand.getNode()->op_values()) {
	        if (Elt.isUndef())
	          continue;
	        auto *EltC = dyn_cast<ConstantSDNode>(Elt);
	        if (!EltC)
	          return false;
	        const APInt &EltVal = EltC->getAPIntValue();
	        if (EltVal.isNegative())
	          NonNegative[OpNo] = false;
	        NumSignBits[OpNo] =
	            std::min(NumSignBits[OpNo], EltVal.getNumSignBits());
	      }
	      continue;
	    }

	    // Generic operand: ask the DAG; a zero-extend is known non-negative.
	    NumSignBits[OpNo] = DAG.ComputeNumSignBits(Operand);
	    if (OperandOpc == ISD::ZERO_EXTEND)
	      NonNegative[OpNo] = true;
	  }

	  const bool BothNonNegative = NonNegative[0] && NonNegative[1];
	  const unsigned FewestSignBits = std::min(NumSignBits[0], NumSignBits[1]);

	  // When ranges are from -128 ~ 127, use MULS8 mode.
	  if (FewestSignBits >= 25) {
	    Mode = MULS8;
	    return true;
	  }
	  // When ranges are from 0 ~ 255, use MULU8 mode.
	  if (BothNonNegative && FewestSignBits >= 24) {
	    Mode = MULU8;
	    return true;
	  }
	  // When ranges are from -32768 ~ 32767, use MULS16 mode.
	  if (FewestSignBits >= 17) {
	    Mode = MULS16;
	    return true;
	  }
	  // When ranges are from 0 ~ 65535, use MULU16 mode.
	  if (BothNonNegative && FewestSignBits >= 16) {
	    Mode = MULU16;
	    return true;
	  }
	  return false;
	}

	/// When the operands of vector mul are extended from smaller size values,
	/// like i8 and i16, the type of mul may be shrunk to generate more
	/// efficient code. Two typical patterns are handled:
	/// Pattern1:
	/// %2 = sext/zext <N x i8> %1 to <N x i32>
	/// %4 = sext/zext <N x i8> %3 to <N x i32>
	/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	/// %5 = mul <N x i32> %2, %4
	///
	/// Pattern2:
	/// %2 = zext/sext <N x i16> %1 to <N x i32>
	/// %4 = zext/sext <N x i16> %3 to <N x i32>
	/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	/// %5 = mul <N x i32> %2, %4
	///
	/// There are four mul shrinking modes:
	/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
	/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
	/// generate pmullw+sext32 for it (MULS8 mode).
	/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
	/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
	/// generate pmullw+zext32 for it (MULU8 mode).
	/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
	/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
	/// generate pmullw+pmulhw for it (MULS16 mode).
	/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
	/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
	/// generate pmullw+pmulhuw for it (MULU16 mode).
	static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// Returns a replacement for the vector multiply N that is built from 16-bit
	// element multiplies, or an empty SDValue when the rewrite is not legal, not
	// profitable, or not applicable.

	// Check for legality
	// pmullw/pmulhw are not supported by SSE.
	if (!Subtarget.hasSSE2())
	return SDValue();

	// Check for profitability
	// pmulld is supported since SSE41. It is better to use pmulld
	// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
	// the expansion.
	bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
	if (Subtarget.hasSSE41() && (OptForMinSize \|\| !Subtarget.isPMULLDSlow()))
	return SDValue();

	// Mode is an out-parameter of canReduceVMulWidth; it selects between the
	// MULU8 / MULS8 / MULU16 / MULS16 strategies used below.
	ShrinkMode Mode;
	if (!canReduceVMulWidth(N, DAG, Mode))
	return SDValue();

	SDLoc DL(N);
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getOperand(0).getValueType();
	unsigned NumElts = VT.getVectorNumElements();
	// The repacking code below works on halves (NumElts / 2), so an odd element
	// count is not handled.
	if ((NumElts % 2) != 0)
	return SDValue();

	// If the upper 17 bits of each element are zero then we can use PMADD.
	APInt Mask17 = APInt::getHighBitsSet(32, 17);
	if (VT == MVT::v4i32 && DAG.MaskedValueIsZero(N0, Mask17) &&
	DAG.MaskedValueIsZero(N1, Mask17))
	return DAG.getNode(X86ISD::VPMADDWD, DL, VT, DAG.getBitcast(MVT::v8i16, N0),
	DAG.getBitcast(MVT::v8i16, N1));

	// Work in units of one 128-bit register: OpsVT is v8i16, while ReducedVT has
	// the same element count as VT but with i16 elements.
	unsigned RegSize = 128;
	MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
	EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);

	// Shrink the operands of mul.
	SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
	SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);

	if (NumElts >= OpsVT.getVectorNumElements()) {
	// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
	// lower part is needed.
	SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
	if (Mode == MULU8 \|\| Mode == MULS8) {
	return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
	DL, VT, MulLo);
	} else {
	// Each half of the interleaved i16 data is reinterpreted as NumElts / 2
	// elements of i32.
	MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
	// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
	// the higher part is also needed.
	SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
	ReducedVT, NewN0, NewN1);

	// Repack the lower part and higher part result of mul into a wider
	// result.
	// Generate shuffle functioning as punpcklwd.
	// Mask indices below NumElts select from MulLo, the rest from MulHi.
	SmallVector<int, 16> ShuffleMask(NumElts);
	for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
	ShuffleMask[2 * i] = i;
	ShuffleMask[2 * i + 1] = i + NumElts;
	}
	SDValue ResLo =
	DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
	ResLo = DAG.getBitcast(ResVT, ResLo);
	// Generate shuffle functioning as punpckhwd.
	for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
	ShuffleMask[2 * i] = i + NumElts / 2;
	ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
	}
	SDValue ResHi =
	DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
	ResHi = DAG.getBitcast(ResVT, ResHi);
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
	}
	} else {
	// When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
	// to legalize the mul explicitly because implicit legalization for type
	// <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
	// instructions which will not exist when we explicitly legalize it by
	// extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
	// <4 x i16> undef).
	//
	// Legalize the operands of mul.
	// FIXME: We may be able to handle non-concatenated vectors by insertion.
	unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
	if ((RegSize % ReducedSizeInBits) != 0)
	return SDValue();

	// Build {operand, undef, undef, ...} so that the concatenation is exactly
	// OpsVT wide; the same Ops buffer is reused for both operands.
	SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
	DAG.getUNDEF(ReducedVT));
	Ops[0] = NewN0;
	NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
	Ops[0] = NewN1;
	NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);

	if (Mode == MULU8 \|\| Mode == MULS8) {
	// Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
	// part is needed.
	SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);

	// convert the type of mul result to VT.
	MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
	SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
	: ISD::SIGN_EXTEND_VECTOR_INREG,
	DL, ResVT, Mul);
	// Keep only the leading elements that make up VT; the remaining lanes
	// derive from the undef padding.
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
	DAG.getIntPtrConstant(0, DL));
	} else {
	// Generate the lower and higher part of mul: pmulhw/pmulhuw. For
	// MULU16/MULS16, both parts are needed.
	SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
	SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
	OpsVT, NewN0, NewN1);

	// Repack the lower part and higher part result of mul into a wider
	// result. Make sure the type of mul result is VT.
	// NOTE(review): getUnpackl is defined elsewhere; presumably it interleaves
	// the low halves of MulLo and MulHi (punpcklwd) -- confirm.
	MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
	SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
	Res = DAG.getBitcast(ResVT, Res);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
	DAG.getIntPtrConstant(0, DL));
	}
	}
	}

	/// Expand a multiply by one of a handful of small constants (11, 13, 14, 19,
	/// 21, 22, 23, 26, 28, 29, 30) into a short sequence of MUL_IMM / MUL / SHL /
	/// ADD / SUB nodes.
	///
	/// \param MulAmt  the constant multiplier.
	/// \param N       the multiply node; only operand 0 (the multiplicand) is read.
	/// \param DAG     the DAG in which the new nodes are created.
	/// \param VT      the type of every value node created here.
	/// \param DL      the debug location attached to the new nodes.
	/// \returns the replacement value, or an empty SDValue when MulAmt is not one
	///          of the constants listed above.
	static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
	                                 EVT VT, const SDLoc &DL) {

	  // Builds ((x * Mult) << Shift) + x   when isAdd is true,
	  //        ((x * Mult) << Shift) - x   otherwise.
	  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
	    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	                                 DAG.getConstant(Mult, DL, VT));
	    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
	                         DAG.getConstant(Shift, DL, MVT::i8));
	    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
	                         N->getOperand(0));
	    return Result;
	  };

	  // Builds ((x * 9) * 3) + x   when isAdd is true,
	  //        ((x * 9) * 3) - x   otherwise.
	  auto combineMulMulAddOrSub = [&](bool isAdd) {
	    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	                                 DAG.getConstant(9, DL, VT));
	    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
	    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
	                         N->getOperand(0));
	    return Result;
	  };

	  switch (MulAmt) {
	  default:
	    break;
	  case 11:
	    // mul x, 11 => add ((shl (mul x, 5), 1), x)
	    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
	  case 21:
	    // mul x, 21 => add ((shl (mul x, 5), 2), x)
	    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
	  case 22:
	    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
	  case 19:
	    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
	    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
	  case 13:
	    // mul x, 13 => add ((shl (mul x, 3), 2), x)
	    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
	  case 23:
	    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
	    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
	  case 14:
	    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
	  case 26:
	    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
	    return combineMulMulAddOrSub(/*isAdd*/ false);
	  case 28:
	    // mul x, 28 => add ((mul (mul x, 9), 3), x)
	    return combineMulMulAddOrSub(/*isAdd*/ true);
	  case 29:
	    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulMulAddOrSub(/*isAdd*/ true));
	  case 30:
	    // mul x, 30 => sub (sub ((shl x, 5), x), x)
	    return DAG.getNode(
	        ISD::SUB, DL, VT,
	        DAG.getNode(ISD::SUB, DL, VT,
	                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	                                DAG.getConstant(5, DL, MVT::i8)),
	                    N->getOperand(0)),
	        N->getOperand(0));
	  }
	  return SDValue();
	}

	/// Optimize a single multiply with constant into two operations in order to
	/// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
	///
	/// Vector multiplies are only considered before legalization, where they are
	/// forwarded to reduceVMULWidth. Scalar i32/i64 multiplies by a constant are
	/// rewritten after legalization. On the scalar path the replacement is
	/// installed through DCI.CombineTo and an empty SDValue is returned.
	static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	EVT VT = N->getValueType(0);
	if (DCI.isBeforeLegalize() && VT.isVector())
	return reduceVMULWidth(N, DAG, Subtarget);

	if (!MulConstantOptimization)
	return SDValue();
	// An imul is usually smaller than the alternative sequence.
	if (DAG.getMachineFunction().getFunction().optForMinSize())
	return SDValue();

	if (DCI.isBeforeLegalize() \|\| DCI.isCalledByLegalizer())
	return SDValue();

	if (VT != MVT::i64 && VT != MVT::i32)
	return SDValue();

	ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
	if (!C)
	return SDValue();
	uint64_t MulAmt = C->getZExtValue();
	// Powers of two and 3/5/9 are left untouched by this combine.
	if (isPowerOf2_64(MulAmt) \|\| MulAmt == 3 \|\| MulAmt == 5 \|\| MulAmt == 9)
	return SDValue();

	// Try to split MulAmt as MulAmt1 * MulAmt2 with MulAmt1 in {9, 5, 3}; 9 is
	// tried first. Both stay 0 when none of them divides MulAmt.
	uint64_t MulAmt1 = 0;
	uint64_t MulAmt2 = 0;
	if ((MulAmt % 9) == 0) {
	MulAmt1 = 9;
	MulAmt2 = MulAmt / 9;
	} else if ((MulAmt % 5) == 0) {
	MulAmt1 = 5;
	MulAmt2 = MulAmt / 5;
	} else if ((MulAmt % 3) == 0) {
	MulAmt1 = 3;
	MulAmt2 = MulAmt / 3;
	}

	SDLoc DL(N);
	SDValue NewMul;
	// The split is usable only if the cofactor is itself a power of two or one of
	// 3/5/9.
	if (MulAmt2 &&
	(isPowerOf2_64(MulAmt2) \|\| MulAmt2 == 3 \|\| MulAmt2 == 5 \|\| MulAmt2 == 9)){

	if (isPowerOf2_64(MulAmt2) &&
	!(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
	// If second multiplier is pow2, issue it first. We want the multiply by
	// 3, 5, or 9 to be folded into the addressing mode unless the lone use
	// is an add.
	std::swap(MulAmt1, MulAmt2);

	// First factor: a shift for a power of two, MUL_IMM otherwise.
	if (isPowerOf2_64(MulAmt1))
	NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
	else
	NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	DAG.getConstant(MulAmt1, DL, VT));

	// Second factor, applied to the result of the first.
	if (isPowerOf2_64(MulAmt2))
	NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
	DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
	else
	NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
	DAG.getConstant(MulAmt2, DL, VT));
	} else if (!Subtarget.slowLEA())
	NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);

	// Fallback: constants of the form +/-(2^N + 1) and +/-(2^N - 1).
	if (!NewMul) {
	assert(MulAmt != 0 &&
	MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
	"Both cases that could cause potential overflows should have "
	"already been handled.");
	int64_t SignMulAmt = C->getSExtValue();
	// The excluded values are the ones for which NumSign * SignMulAmt, or the
	// "+ 1" applied to it below, would overflow int64_t.
	if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
	(SignMulAmt != -INT64_MAX)) {
	int NumSign = SignMulAmt > 0 ? 1 : -1;
	bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
	bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
	if (IsPowerOf2_64PlusOne) {
	// (mul x, 2^N + 1) => (add (shl x, N), x)
	NewMul = DAG.getNode(
	ISD::ADD, DL, VT, N->getOperand(0),
	DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
	MVT::i8)));
	} else if (IsPowerOf2_64MinusOne) {
	// (mul x, 2^N - 1) => (sub (shl x, N), x)
	NewMul = DAG.getNode(
	ISD::SUB, DL, VT,
	DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
	MVT::i8)),
	N->getOperand(0));
	}
	// To negate, subtract the number from zero
	if ((IsPowerOf2_64PlusOne \|\| IsPowerOf2_64MinusOne) && NumSign == -1)
	NewMul =
	DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
	}
	}

	if (NewMul)
	// Do not add new nodes to DAG combiner worklist.
	DCI.CombineTo(N, NewMul, false);

	// The replacement (if any) was installed by CombineTo above, so an empty
	// value is returned in every case.
	return SDValue();
	}

	// Combines for ISD::SHL. Two folds are attempted:
	//  * a scalar shift of (and (setcc_c-derived value), c1) by a constant is
	//    folded into the AND mask;
	//  * a shift by a constant-splat build vector of 1 becomes an ADD of the
	//    value with itself.
	// Returns the replacement value, or an empty SDValue if neither applies.
	static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
	EVT VT = N0.getValueType();

	// fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
	// since the result of setcc_c is all zeros or all ones.
	if (VT.isInteger() && !VT.isVector() &&
	N1C && N0.getOpcode() == ISD::AND &&
	N0.getOperand(1).getOpcode() == ISD::Constant) {
	SDValue N00 = N0.getOperand(0);
	// Mask becomes c1 << c2, the constant of the replacement AND.
	APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
	Mask <<= N1C->getAPIntValue();
	bool MaskOK = false;
	// We can handle cases concerning bit-widening nodes containing setcc_c if
	// we carefully interrogate the mask to make sure we are semantics
	// preserving.
	// The transform is not safe if the result of C1 << C2 exceeds the bitwidth
	// of the underlying setcc_c operation if the setcc_c was zero extended.
	// Consider the following example:
	// zext(setcc_c) -> i32 0x0000FFFF
	// c1 -> i32 0x0000FFFF
	// c2 -> i32 0x00000001
	// (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
	// (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
	if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
	MaskOK = true;
	} else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
	N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
	MaskOK = true;
	} else if ((N00.getOpcode() == ISD::ZERO_EXTEND \|\|
	N00.getOpcode() == ISD::ANY_EXTEND) &&
	N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
	// Only safe when the shifted mask still fits in the width of the
	// un-extended setcc_c value.
	MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
	}
	// A shifted mask of zero is not folded here.
	if (MaskOK && Mask != 0) {
	SDLoc DL(N);
	return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
	}
	}

	// Hardware support for vector shifts is sparse which makes us scalarize the
	// vector operations in many cases. Also, on sandybridge ADD is faster than
	// shl.
	// (shl V, 1) -> add V,V
	if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
	if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
	assert(N0.getValueType().isVector() && "Invalid vector shift type");
	// We shift all of the values by one. In many cases we do not have
	// hardware support for this operation. This is better expressed as an ADD
	// of two values.
	if (N1SplatC->getAPIntValue() == 1)
	return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
	}

	return SDValue();
	}

	// Combine for ISD::SRA: rewrite a scalar (sra (shl a, C1), C2), where the
	// shl has a single use and C1 leaves exactly 8, 16 or 32 low bits of 'a', as
	// a sign-extension in register followed by at most one residual shift.
	// Returns the replacement value, or an empty SDValue if the pattern does not
	// match.
	static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N0.getValueType();
	unsigned Size = VT.getSizeInBits();

	// fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
	// into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
	// into (sra, (sext (a), SarConst - [56,48,32,24,16]))
	// depending on sign of (SarConst - [56,48,32,24,16])

	// sexts in X86 are MOVs. The MOVs have the same code size
	// as above SHIFTs (only SHIFT on 1 has lower code size).
	// However the MOVs have 2 advantages to a SHIFT:
	// 1. MOVs can write to a register that differs from source
	// 2. MOVs accept memory operands

	if (VT.isVector() \|\| N1.getOpcode() != ISD::Constant \|\|
	N0.getOpcode() != ISD::SHL \|\| !N0.hasOneUse() \|\|
	N0.getOperand(1).getOpcode() != ISD::Constant)
	return SDValue();

	SDValue N00 = N0.getOperand(0);
	SDValue N01 = N0.getOperand(1);
	APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
	APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
	// CVT is the type of the original shift-amount operand; the residual shift
	// amount is created with the same type.
	EVT CVT = N1.getValueType();

	if (SarConst.isNegative())
	return SDValue();

	for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
	unsigned ShiftSize = SVT.getSizeInBits();
	// skipping types without corresponding sext/zext and
	// ShlConst that is not one of [56,48,32,24,16]
	if (ShiftSize >= Size \|\| ShlConst != Size - ShiftSize)
	continue;
	SDLoc DL(N);
	SDValue NN =
	DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
	// What remains after the sign extension is a shift by the difference of the
	// two constants: none if zero, left if negative, arithmetic right otherwise.
	SarConst = SarConst - (Size - ShiftSize);
	if (SarConst == 0)
	return NN;
	else if (SarConst.isNegative())
	return DAG.getNode(ISD::SHL, DL, VT, NN,
	DAG.getConstant(-SarConst, DL, CVT));
	else
	return DAG.getNode(ISD::SRA, DL, VT, NN,
	DAG.getConstant(SarConst, DL, CVT));
	}
	return SDValue();
	}

	// Combine for ISD::SRL: turn srl (and X, C1), C2 into
	// and (srl X, C2), (C1 >> C2) when doing so lets the mask constant shrink to
	// at most 8 or at most 32 significant (signed) bits. Returns the replacement
	// value, or an empty SDValue when the pattern does not match or nothing is
	// gained.
	static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
	  SDValue AndOp = N->getOperand(0);
	  SDValue ShAmt = N->getOperand(1);
	  EVT VT = AndOp.getValueType();

	  // TODO: This is a generic DAG combine that became an x86-only combine to
	  // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
	  // and-not ('andn').
	  if (AndOp.getOpcode() != ISD::AND || !AndOp.hasOneUse())
	    return SDValue();

	  // Both the shift amount and the AND mask must be scalar constants.
	  auto *ShiftC = dyn_cast<ConstantSDNode>(ShAmt);
	  auto *AndC = dyn_cast<ConstantSDNode>(AndOp.getOperand(1));
	  if (!ShiftC || !AndC)
	    return SDValue();

	  // Shrinking the mask below 8 or 32 bits should reduce code size, and may
	  // enable further folds through improved known-bits analysis or instruction
	  // selection.
	  APInt OldMask = AndC->getAPIntValue();
	  APInt ShiftedMask = OldMask.lshr(ShiftC->getAPIntValue());
	  unsigned OldBits = OldMask.getMinSignedBits();
	  unsigned NewBits = ShiftedMask.getMinSignedBits();
	  bool ShrinksTo8 = OldBits > 8 && NewBits <= 8;
	  bool ShrinksTo32 = OldBits > 32 && NewBits <= 32;
	  if (!ShrinksTo8 && !ShrinksTo32)
	    return SDValue();

	  // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
	  SDLoc DL(N);
	  SDValue MaskNode = DAG.getConstant(ShiftedMask, DL, VT);
	  SDValue ShiftNode =
	      DAG.getNode(ISD::SRL, DL, VT, AndOp.getOperand(0), ShAmt);
	  return DAG.getNode(ISD::AND, DL, VT, ShiftNode, MaskNode);
	}

	// Dispatch a scalar/vector shift node to the combine for its opcode
	// (SHL, SRA or SRL). Any other opcode yields an empty SDValue. DCI and
	// Subtarget are accepted for signature uniformity but are not used here.
	static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  switch (N->getOpcode()) {
	  case ISD::SHL:
	    return combineShiftLeft(N, DAG);
	  case ISD::SRA:
	    return combineShiftRightArithmetic(N, DAG);
	  case ISD::SRL:
	    return combineShiftRightLogical(N, DAG);
	  default:
	    return SDValue();
	  }
	}

	/// Combine X86ISD::PACKSS / X86ISD::PACKUS nodes.
	///
	/// If both inputs are constants (each one undef or used only by N), the pack
	/// is folded to a constant vector by saturating every source element to the
	/// destination element width, 128-bit lane by 128-bit lane. Otherwise the node
	/// is handed to the recursive shuffle combiner; on success the replacement is
	/// installed through DCI.CombineTo.
	///
	/// \returns the folded constant vector, or an empty SDValue (including the
	///          case where CombineTo already replaced N).
	static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  unsigned Opcode = N->getOpcode();
	  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
	         "Unexpected pack opcode");

	  EVT VT = N->getValueType(0);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  unsigned DstBitsPerElt = VT.getScalarSizeInBits();
	  unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
	  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
	         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
	         "Unexpected PACKSS/PACKUS input type");

	  // Constant Folding.
	  APInt UndefElts0, UndefElts1;
	  SmallVector<APInt, 32> EltBits0, EltBits1;
	  if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
	      (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
	      getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
	      getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
	    unsigned NumLanes = VT.getSizeInBits() / 128;
	    unsigned NumDstElts = VT.getVectorNumElements();
	    unsigned NumSrcElts = NumDstElts / 2;
	    unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
	    unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
	    bool IsSigned = (X86ISD::PACKSS == Opcode);

	    APInt Undefs(NumDstElts, 0);
	    SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
	    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
	      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
	        // Within a lane, the first half of the destination elements comes
	        // from operand 0 and the second half from operand 1.
	        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
	        auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
	        auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);

	        if (UndefElts[SrcIdx]) {
	          Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
	          continue;
	        }

	        // Val aliases the source element and is narrowed in place; each
	        // source element is visited exactly once.
	        APInt &Val = EltBits[SrcIdx];
	        if (IsSigned) {
	          // PACKSS: Truncate signed value with signed saturation.
	          // Source values less than dst minint are saturated to minint.
	          // Source values greater than dst maxint are saturated to maxint.
	          if (Val.isSignedIntN(DstBitsPerElt))
	            Val = Val.trunc(DstBitsPerElt);
	          else if (Val.isNegative())
	            Val = APInt::getSignedMinValue(DstBitsPerElt);
	          else
	            Val = APInt::getSignedMaxValue(DstBitsPerElt);
	        } else {
	          // PACKUS: Truncate signed value with unsigned saturation.
	          // Source values less than zero are saturated to zero.
	          // Source values greater than dst maxuint are saturated to maxuint.
	          if (Val.isIntN(DstBitsPerElt))
	            Val = Val.trunc(DstBitsPerElt);
	          else if (Val.isNegative())
	            Val = APInt::getNullValue(DstBitsPerElt);
	          else
	            Val = APInt::getAllOnesValue(DstBitsPerElt);
	        }
	        Bits[Lane * NumDstEltsPerLane + Elt] = Val;
	      }
	    }

	    return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
	  }

	  // Attempt to combine as shuffle.
	  SDValue Op(N, 0);
	  if (SDValue Res = combineX86ShufflesRecursively(
	          {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
	          /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
	    DCI.CombineTo(N, Res);
	    return SDValue();
	  }

	  return SDValue();
	}

	// Combine X86ISD::VSHLI / VSRAI / VSRLI (vector shift by immediate) nodes:
	// simplify out-of-range and zero shift amounts, shifts of an all-zeros
	// vector, a couple of shift-of-shift patterns, whole-byte logical shifts (via
	// the shuffle combiner) and shifts of constant vectors. Returns the
	// replacement value, or an empty SDValue when nothing applies or when the
	// replacement was already installed through DCI.CombineTo.
	static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned Opcode = N->getOpcode();
	assert((X86ISD::VSHLI == Opcode \|\| X86ISD::VSRAI == Opcode \|\|
	X86ISD::VSRLI == Opcode) &&
	"Unexpected shift opcode");
	bool LogicalShift = X86ISD::VSHLI == Opcode \|\| X86ISD::VSRLI == Opcode;
	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	unsigned NumBitsPerElt = VT.getScalarSizeInBits();
	assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
	"Unexpected value type");

	// Out of range logical bit shifts are guaranteed to be zero.
	// Out of range arithmetic bit shifts splat the sign bit.
	// The range check looks only at the low 8 bits of the shift amount.
	APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
	if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
	if (LogicalShift)
	return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
	else
	// Clamp to the largest in-range amount.
	ShiftVal = NumBitsPerElt - 1;
	}

	// Shift N0 by zero -> N0.
	if (!ShiftVal)
	return N0;

	// Shift zero -> zero.
	if (ISD::isBuildVectorAllZeros(N0.getNode()))
	return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));

	// fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
	// This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
	// TODO - support other sra opcodes as needed.
	if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
	N0.getOpcode() == X86ISD::VSRAI)
	return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);

	// fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
	if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
	N1 == N0.getOperand(1)) {
	SDValue N00 = N0.getOperand(0);
	unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
	if (ShiftVal.ult(NumSignBits))
	return N00;
	}

	// We can decode 'whole byte' logical bit shifts as shuffles.
	if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(
	{Op}, 0, Op, {0}, {}, /Depth/ 1,
	/HasVarMask/ false, DAG, DCI, Subtarget)) {
	// The replacement is installed here, so nothing is returned to the caller.
	DCI.CombineTo(N, Res);
	return SDValue();
	}
	}

	// Constant Folding.
	// Only done when N is the sole user of the constant operand.
	APInt UndefElts;
	SmallVector<APInt, 32> EltBits;
	if (N->isOnlyUserOf(N0.getNode()) &&
	getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
	assert(EltBits.size() == VT.getVectorNumElements() &&
	"Unexpected shift value type");
	unsigned ShiftImm = ShiftVal.getZExtValue();
	// Apply the shift to each constant element in place.
	for (APInt &Elt : EltBits) {
	if (X86ISD::VSHLI == Opcode)
	Elt <<= ShiftImm;
	else if (X86ISD::VSRAI == Opcode)
	Elt.ashrInPlace(ShiftImm);
	else
	Elt.lshrInPlace(ShiftImm);
	}
	return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
	}

	return SDValue();
	}

	// Combine X86ISD::PINSRB (v16i8) / X86ISD::PINSRW (v8i16) nodes by handing
	// them to the recursive shuffle combiner. A successful replacement is
	// installed through DCI.CombineTo; the function itself always returns an
	// empty SDValue.
	static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   const X86Subtarget &Subtarget) {
	  assert(
	      ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
	       (N->getOpcode() == X86ISD::PINSRW &&
	        N->getValueType(0) == MVT::v8i16)) &&
	      "Unexpected vector insertion");

	  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
	  SDValue Root(N, 0);
	  SDValue Shuffled = combineX86ShufflesRecursively(
	      {Root}, 0, Root, {0}, {}, /*Depth*/ 1,
	      /*HasVarMask*/ false, DAG, DCI, Subtarget);
	  if (Shuffled)
	    DCI.CombineTo(N, Shuffled);

	  return SDValue();
	}

	/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
	/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
	/// OR -> CMPNEQSS.
	///
	/// Returns the replacement value, or an empty SDValue when the pattern does
	/// not match or when some user of N needs the flags.
	static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned opcode;

	// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
	// we're requiring SSE2 for both.
	if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	SDValue CMP0 = N0->getOperand(1);
	SDValue CMP1 = N1->getOperand(1);
	SDLoc DL(N);

	// The SETCCs should both refer to the same CMP.
	if (CMP0.getOpcode() != X86ISD::CMP \|\| CMP0 != CMP1)
	return SDValue();

	SDValue CMP00 = CMP0->getOperand(0);
	SDValue CMP01 = CMP0->getOperand(1);
	EVT VT = CMP00.getValueType();

	if (VT == MVT::f32 \|\| VT == MVT::f64) {
	bool ExpectingFlags = false;
	// Check for any users that want flags:
	// every user other than CopyToReg or an integer extension is treated as
	// wanting flags.
	for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
	!ExpectingFlags && UI != UE; ++UI)
	switch (UI->getOpcode()) {
	default:
	case ISD::BR_CC:
	case ISD::BRCOND:
	case ISD::SELECT:
	ExpectingFlags = true;
	break;
	case ISD::CopyToReg:
	case ISD::SIGN_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::ANY_EXTEND:
	break;
	}

	if (!ExpectingFlags) {
	enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
	enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

	// Normalize: if cc1 is the E/NE code, swap so that it ends up in cc0.
	if (cc1 == X86::COND_E \|\| cc1 == X86::COND_NE) {
	X86::CondCode tmp = cc0;
	cc0 = cc1;
	cc1 = tmp;
	}

	if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) \|\|
	(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
	// FIXME: need symbolic constants for these magic numbers.
	// See X86ATTInstPrinter.cpp:printSSECC().
	// NOTE(review): 0 and 4 are presumably the SSE compare-predicate
	// immediates for EQ and NEQ -- confirm against printSSECC().
	unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
	if (Subtarget.hasAVX512()) {
	SDValue FSetCC =
	DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
	DAG.getConstant(x86cc, DL, MVT::i8));
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
	N->getSimpleValueType(0), FSetCC,
	DAG.getIntPtrConstant(0, DL));
	}
	SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
	CMP00.getValueType(), CMP00, CMP01,
	DAG.getConstant(x86cc, DL,
	MVT::i8));

	bool is64BitFP = (CMP00.getValueType() == MVT::f64);
	MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

	if (is64BitFP && !Subtarget.is64Bit()) {
	// On a 32-bit target, we cannot bitcast the 64-bit float to a
	// 64-bit integer, since that's not a legal type. Since
	// OnesOrZeroesF is all ones or all zeroes, we don't need all the
	// bits, but can do this little dance to extract the lowest 32 bits
	// and work with those going forward.
	SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
	OnesOrZeroesF);
	SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
	OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
	Vector32, DAG.getIntPtrConstant(0, DL));
	IntVT = MVT::i32;
	}

	// Reduce the all-ones/all-zeroes pattern to a single 0/1 bit of type i8.
	SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
	SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
	DAG.getConstant(1, DL, IntVT));
	SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
	ANDed);
	return OneBitOfTruth;
	}
	}
	}
	}
	return SDValue();
	}

	/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
	///
	/// Only v2i64, v4i64 and v8i64 are handled. Either AND operand may be the
	/// inverted one; operand 0 is checked first. Returns an empty SDValue when
	/// neither operand is an XOR with an all-ones build vector.
	static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == ISD::AND);

	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  SDLoc DL(N);

	  bool IsHandledType =
	      VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64;
	  if (!IsHandledType)
	    return SDValue();

	  // True when V is (xor X, all-ones), i.e. a bitwise NOT of X.
	  auto IsBitwiseNot = [](SDValue V) {
	    return V.getOpcode() == ISD::XOR &&
	           ISD::isBuildVectorAllOnes(V.getOperand(1).getNode());
	  };

	  if (IsBitwiseNot(LHS))
	    return DAG.getNode(X86ISD::ANDNP, DL, VT, LHS.getOperand(0), RHS);

	  if (IsBitwiseNot(RHS))
	    return DAG.getNode(X86ISD::ANDNP, DL, VT, RHS.getOperand(0), LHS);

	  return SDValue();
	}

	// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
	// register. In most cases we actually compare or select YMM-sized registers
	// and mixing the two types creates horrible code. This method optimizes
	// some of the transition sequences.
	// Even with AVX-512 this is still useful for removing casts around logical
	// operations on vXi1 mask types.
	//
	// N is an ANY_EXTEND / ZERO_EXTEND / SIGN_EXTEND of a narrow AND/OR/XOR whose
	// left operand is a TRUNCATE from N's result type. The logic op is redone in
	// the wide type and the extension is re-applied in register. Returns an empty
	// SDValue when the pattern does not match. DCI and Subtarget are not used in
	// this body.
	static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	EVT VT = N->getValueType(0);
	assert(VT.isVector() && "Expected vector type");

	assert((N->getOpcode() == ISD::ANY_EXTEND \|\|
	N->getOpcode() == ISD::ZERO_EXTEND \|\|
	N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

	SDValue Narrow = N->getOperand(0);
	EVT NarrowVT = Narrow.getValueType();

	if (Narrow->getOpcode() != ISD::XOR &&
	Narrow->getOpcode() != ISD::AND &&
	Narrow->getOpcode() != ISD::OR)
	return SDValue();

	SDValue N0 = Narrow->getOperand(0);
	SDValue N1 = Narrow->getOperand(1);
	SDLoc DL(Narrow);

	// The Left side has to be a trunc.
	if (N0.getOpcode() != ISD::TRUNCATE)
	return SDValue();

	// The type of the truncated inputs.
	// It must equal the wide result type so the op can be redone on it directly.
	if (N0->getOperand(0).getValueType() != VT)
	return SDValue();

	// The right side has to be a 'trunc' or a constant vector.
	bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
	N1.getOperand(0).getValueType() == VT;
	if (!RHSTrunc &&
	!ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
	return SDValue();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
	return SDValue();

	// Set N0 and N1 to hold the inputs to the new wide operation.
	// A constant-vector right-hand side is widened with ZERO_EXTEND.
	N0 = N0->getOperand(0);
	if (RHSTrunc)
	N1 = N1->getOperand(0);
	else
	N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);

	// Generate the wide operation.
	SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
	unsigned Opcode = N->getOpcode();
	// Re-apply the original extension to the wide result, relative to the
	// narrow type.
	switch (Opcode) {
	default: llvm_unreachable("Unexpected opcode");
	case ISD::ANY_EXTEND:
	return Op;
	case ISD::ZERO_EXTEND:
	return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
	case ISD::SIGN_EXTEND:
	return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
	Op, DAG.getValueType(NarrowVT));
	}
	}

	/// If both input operands of a logic op are being cast from floating point
	/// types, try to convert this into a floating point logic node to avoid
	/// unnecessary moves from SSE to integer registers.
	///
	/// N must be an ISD::AND, ISD::OR or ISD::XOR node. The fold applies only when
	/// both operands are bitcasts from the *same* scalar FP type: f32 (requires
	/// SSE1, integer type i32) or f64 (requires SSE2, integer type i64).
	///
	/// \returns (bitcast (FAND/FOR/FXOR a, b)) or an empty SDValue.
	static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  unsigned FPOpcode = ISD::DELETED_NODE;
	  if (N->getOpcode() == ISD::AND)
	    FPOpcode = X86ISD::FAND;
	  else if (N->getOpcode() == ISD::OR)
	    FPOpcode = X86ISD::FOR;
	  else if (N->getOpcode() == ISD::XOR)
	    FPOpcode = X86ISD::FXOR;

	  assert(FPOpcode != ISD::DELETED_NODE &&
	         "Unexpected input node for FP logic conversion");

	  EVT VT = N->getValueType(0);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  SDLoc DL(N);

	  if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  if (!((Subtarget.hasSSE1() && VT == MVT::i32) ||
	        (Subtarget.hasSSE2() && VT == MVT::i64)))
	    return SDValue();

	  SDValue N00 = N0.getOperand(0);
	  SDValue N10 = N1.getOperand(0);
	  EVT N00Type = N00.getValueType();
	  EVT N10Type = N10.getValueType();

	  // Both sources must have the same scalar FP type. Merely checking
	  // isFloatingPoint() would also accept FP vectors (e.g. v2f32 under an i64
	  // bitcast) or two different source types, neither of which is a valid
	  // operand pair for the scalar FP logic node built below.
	  if (N00Type != N10Type || (N00Type != MVT::f32 && N00Type != MVT::f64))
	    return SDValue();

	  SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
	  return DAG.getBitcast(VT, FPLogic);
	}

	/// If this is a zero/all-bits result that is bitwise-anded with a low bits
	/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
	/// with a shift-right to eliminate loading the vector constant mask value.
	///
	/// Returns the shifted value bitcast to N's result type, or an empty SDValue
	/// when the pattern does not match.
	static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
	SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
	EVT VT0 = Op0.getValueType();
	EVT VT1 = Op1.getValueType();

	if (VT0 != VT1 \|\| !VT0.isSimple() \|\| !VT0.isInteger())
	return SDValue();

	// Op1 must be a constant splat whose value is a mask of low bits.
	APInt SplatVal;
	if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) \|\|
	!SplatVal.isMask())
	return SDValue();

	if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
	return SDValue();

	// Every bit of each Op0 element must be a copy of its sign bit, i.e. each
	// element is either zero or all ones.
	unsigned EltBitWidth = VT0.getScalarSizeInBits();
	if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
	return SDValue();

	SDLoc DL(N);
	// Shifting right by (element width - mask width) leaves exactly the mask's
	// low bits set in an all-ones element and keeps a zero element zero.
	unsigned ShiftVal = SplatVal.countTrailingOnes();
	SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
	SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
	return DAG.getBitcast(N->getValueType(0), Shift);
	}

	// Get the index node from the lowered DAG of a GEP IR instruction with one
	// indexing dimension.
	//
	// Matches an unindexed load whose base pointer is (add (shl Index, ...), ...)
	// and returns Index. The shift amount and the other ADD operand are not
	// inspected. Returns an empty SDValue for any other shape.
	static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
	  if (!Ld->isIndexed()) {
	    SDValue Addr = Ld->getBasePtr();
	    if (Addr.getOpcode() == ISD::ADD) {
	      SDValue ScaledIdx = Addr.getOperand(0);
	      if (ScaledIdx.getOpcode() == ISD::SHL)
	        return ScaledIdx.getOperand(0);
	    }
	  }
	  return SDValue();
	}

	// Returns true when a BZHI instruction is available for the scalar integer
	// type VT: BMI2 must be present, and VT must be 32 bits wide, or 64 bits wide
	// on a 64-bit subtarget.
	static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
	  if (!Subtarget.hasBMI2() || !VT.isScalarInteger())
	    return false;

	  unsigned Bits = VT.getSizeInBits();
	  if (Bits == 32)
	    return true;
	  if (Bits == 64)
	    return Subtarget.is64Bit();
	  return false;
	}

	// This function recognizes cases where the X86 bzhi instruction can replace an
	// 'and-load' sequence.
	// In case of loading integer value from an array of constants which is defined
	// as follows:
	//
	// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
	//
	// then applying a bitwise and on the result with another input.
	// It's equivalent to performing bzhi (zero high bits) on the input, with the
	// same index of the load.
	//
	// Node is the AND node. Each of its two operands is tried in turn as the
	// table load; an operand that does not fit the pattern is skipped rather than
	// aborting the whole combine. Returns the replacement AND, or an empty SDValue
	// when neither operand matches.
	static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT VT = Node->getSimpleValueType(0);
	  SDLoc dl(Node);

	  // Check if subtarget has BZHI instruction for the node's type
	  if (!hasBZHI(Subtarget, VT))
	    return SDValue();

	  // Try matching the pattern for both operands.
	  for (unsigned i = 0; i < 2; i++) {
	    SDValue N = Node->getOperand(i);
	    LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());

	    // continue if the operand is not a load instruction
	    if (!Ld)
	      continue;

	    // Without an IR-level memory operand the source table cannot be inspected.
	    const Value *MemOp = Ld->getMemOperand()->getValue();
	    if (!MemOp)
	      continue;

	    // The load must go through a GEP into a constant global with a known
	    // initializer.
	    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp);
	    if (!GEP)
	      continue;
	    GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0));
	    if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
	      continue;

	    // The initializer must be an integer array whose element width equals the
	    // width of VT and which has no more elements than an element has bits.
	    Constant *Init = GV->getInitializer();
	    Type *Ty = Init->getType();
	    if (!isa<ConstantDataArray>(Init) ||
	        !Ty->getArrayElementType()->isIntegerTy() ||
	        Ty->getArrayElementType()->getScalarSizeInBits() !=
	            VT.getSizeInBits() ||
	        Ty->getArrayNumElements() >
	            Ty->getArrayElementType()->getScalarSizeInBits())
	      continue;

	    // Check if the array's constant elements are suitable to our case:
	    // element j must equal 2^j - 1.
	    uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
	    bool ConstantsMatch = true;
	    for (uint64_t j = 0; j < ArrayElementCount; j++) {
	      ConstantInt *Elem = dyn_cast<ConstantInt>(Init->getAggregateElement(j));
	      if (!Elem || Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
	        ConstantsMatch = false;
	        break;
	      }
	    }
	    if (!ConstantsMatch)
	      continue;

	    // Do the transformation (For 32-bit type):
	    // -> (and (load arr[idx]), inp)
	    // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
	    // that will be replaced with one bzhi instruction.
	    SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
	    SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);

	    // Get the Node which indexes into the array.
	    SDValue Index = getIndexFromUnindexedLoad(Ld);
	    if (!Index)
	      continue;
	    Index = DAG.getZExtOrTrunc(Index, dl, VT);

	    SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);

	    SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
	    SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);

	    return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
	  }
	  return SDValue();
	}

	// DAG combine for ISD::AND nodes.
	//
	// Tries a sequence of X86-specific rewrites on \p N and returns the first
	// replacement that applies. An empty SDValue is returned when nothing
	// matched, or when the node was already replaced through DCI.CombineTo.
	static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	EVT VT = N->getValueType(0);

	// If this is SSE1 only convert to FAND to avoid scalarization.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
	return DAG.getBitcast(
	MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
	DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
	DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
	}

	// All remaining combines only run once operations have been legalized.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
	return R;

	if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
	return ShiftRight;

	if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
	return R;

	// Attempt to recursively combine a bitmask AND with shuffles.
	if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	SDValue Op(N, 0);
	if (SDValue Res = combineX86ShufflesRecursively(
	{Op}, 0, Op, {0}, {}, /Depth/ 1,
	/HasVarMask/ false, DAG, DCI, Subtarget)) {
	// N is replaced in place here, so there is no new value to hand back.
	DCI.CombineTo(N, Res);
	return SDValue();
	}
	}

	// Attempt to combine a scalar bitmask AND with an extracted shuffle.
	if ((VT.getScalarSizeInBits() % 8) == 0 &&
	N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
	SDValue BitMask = N->getOperand(1);
	SDValue SrcVec = N->getOperand(0).getOperand(0);
	EVT SrcVecVT = SrcVec.getValueType();

	// Check that the constant bitmask masks whole bytes.
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (VT == SrcVecVT.getScalarType() &&
	N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
	getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
	llvm::all_of(EltBits, [](APInt M) {
	return M.isNullValue() \|\| M.isAllOnesValue();
	})) {
	unsigned NumElts = SrcVecVT.getVectorNumElements();
	// Number of bytes per source vector element.
	unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
	unsigned Idx = N->getOperand(0).getConstantOperandVal(1);

	// Create a root shuffle mask from the byte mask and the extracted index.
	// Only the bytes of the extracted element are filled in: a zero mask byte
	// becomes SM_SentinelZero, an all-ones byte keeps its own position, and
	// every other entry stays undef.
	SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
	for (unsigned i = 0; i != Scale; ++i) {
	if (UndefElts[i])
	continue;
	int VecIdx = Scale * Idx + i;
	ShuffleMask[VecIdx] =
	EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
	}

	// If the shuffle combines, re-extract the same element from the result.
	if (SDValue Shuffle = combineX86ShufflesRecursively(
	{SrcVec}, 0, SrcVec, ShuffleMask, {}, /Depth/ 2,
	/HasVarMask/ false, DAG, DCI, Subtarget))
	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
	N->getOperand(0).getOperand(1));
	}
	}

	return SDValue();
	}

	// Try to fold:
	// (or (and (m, y), (pandn m, x)))
	// into:
	// (vselect m, x, y)
	// As a special case, try to fold:
	// (or (and (m, (sub 0, x)), (pandn m, x)))
	// into:
	// (sub (xor X, M), M)
	//
	// \p N must be an ISD::OR node (asserted). Returns the replacement value
	// bitcast back to the type of \p N, or an empty SDValue if the pattern does
	// not match or the subtarget lacks the required features.
	static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	// Only 128-bit vectors with SSE2 and 256-bit vectors with Int256 support
	// are handled.
	if (!((VT.is128BitVector() && Subtarget.hasSSE2()) \|\|
	(VT.is256BitVector() && Subtarget.hasInt256())))
	return SDValue();

	// Canonicalize AND to LHS.
	if (N1.getOpcode() == ISD::AND)
	std::swap(N0, N1);

	// TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
	// ANDNP combine allows other combines to happen that prevent matching.
	if (N0.getOpcode() != ISD::AND \|\| N1.getOpcode() != X86ISD::ANDNP)
	return SDValue();

	// The ANDNP supplies the mask and X; Y is whichever AND operand is not the
	// mask. Y stays empty if the AND does not use the same mask.
	SDValue Mask = N1.getOperand(0);
	SDValue X = N1.getOperand(1);
	SDValue Y;
	if (N0.getOperand(0) == Mask)
	Y = N0.getOperand(1);
	if (N0.getOperand(1) == Mask)
	Y = N0.getOperand(0);

	// Check to see if the mask appeared in both the AND and ANDNP.
	if (!Y.getNode())
	return SDValue();

	// Look through any bitcasts on X, Y, and Mask.
	Mask = peekThroughBitcasts(Mask);
	X = peekThroughBitcasts(X);
	Y = peekThroughBitcasts(Y);

	EVT MaskVT = Mask.getValueType();
	unsigned EltBits = MaskVT.getScalarSizeInBits();

	// Every mask element must consist entirely of sign bits, i.e. be all ones
	// or all zeros.
	// TODO: Attempt to handle floating point cases as well?
	if (!MaskVT.isInteger() \|\| DAG.ComputeNumSignBits(Mask) != EltBits)
	return SDValue();

	SDLoc DL(N);

	// Try to match:
	// (or (and (M, (sub 0, X)), (pandn M, X)))
	// which is a special case of vselect:
	// (vselect M, (sub 0, X), X)
	// Per:
	// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
	// We know that, if fNegate is 0 or 1:
	// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
	//
	// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
	// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
	// ( M ? -X : X) == ((X ^ M ) + (M & 1))
	// This lets us transform our vselect to:
	// (add (xor X, M), (and M, 1))
	// And further to:
	// (sub (xor X, M), M)
	if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
	DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
	// True if N computes (sub 0, V) with an all-zeros build vector as the 0.
	auto IsNegV = [](SDNode *N, SDValue V) {
	return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
	ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
	};
	// V is the non-negated value; it stays empty if neither side negates
	// the other.
	SDValue V;
	if (IsNegV(Y.getNode(), X))
	V = X;
	else if (IsNegV(X.getNode(), Y))
	V = Y;

	if (V) {
	SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
	SDValue SubOp2 = Mask;

	// If the negate was on the false side of the select, then
	// the operands of the SUB need to be swapped. PR 27251.
	// This is because the pattern being matched above is
	// (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
	// but if the pattern matched was
	// (vselect M, X, (sub (0, X))), that is really negation of the pattern
	// above, -(vselect M, (sub 0, X), X), and therefore the replacement
	// pattern also needs to be a negation of the replacement pattern above.
	// And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
	// sub accomplishes the negation of the replacement pattern.
	if (V == Y)
	std::swap(SubOp1, SubOp2);

	SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
	return DAG.getBitcast(VT, Res);
	}
	}

	// PBLENDVB is only available on SSE 4.1.
	if (!Subtarget.hasSSE41())
	return SDValue();

	// NOTE(review): only v4i64 selects the 32-byte blend type; every other
	// accepted VT uses v16i8. Presumably 256-bit ORs always reach here as
	// v4i64 -- confirm.
	MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;

	// Perform the select on byte vectors; Mask is reused to hold the result.
	X = DAG.getBitcast(BlendVT, X);
	Y = DAG.getBitcast(BlendVT, Y);
	Mask = DAG.getBitcast(BlendVT, Mask);
	Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
	return DAG.getBitcast(VT, Mask);
	}

	// Helper function for combineOrCmpEqZeroToCtlzSrl.
	// Transforms:
	//   seteq(cmp x, 0)
	// into:
	//   srl(ctlz x), log2(bitsize(x))
	// and zero-extends or truncates the result to \p ExtTy.
	//
	// \p Op is the setcc node; its operand 1 is the compare and the compare's
	// operand 0 is x. The input pattern is checked by the caller.
	static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
	                                          SelectionDAG &DAG) {
	  SDValue Cmp = Op.getOperand(1);
	  EVT VT = Cmp.getOperand(0).getValueType();
	  unsigned Log2b = Log2_32(VT.getSizeInBits());
	  SDLoc dl(Op);
	  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
	  // The result of the shift is true or false, and on X86, the 32-bit
	  // encoding of shr and lzcnt is more desirable.
	  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
	  // The shift is always done in i32, so its amount must not be typed as VT
	  // (which may be i64). Use i8, the shift-amount type used elsewhere in this
	  // file.
	  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
	                            DAG.getConstant(Log2b, dl, MVT::i8));
	  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
	}

	// Try to transform:
	// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
	// into:
	// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
	// Will also attempt to match more generic cases, eg:
	// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
	// Only applies if the target supports the FastLZCNT feature.
	//
	// \p N is the zero-extend node. Returns the replacement value, or an empty
	// SDValue if the pattern does not match.
	static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalize() \|\| !Subtarget.getTargetLowering()->isCtlzFast())
	return SDValue();

	// An OR is only a candidate if nothing else uses its result.
	auto isORCandidate = [](SDValue N) {
	return (N->getOpcode() == ISD::OR && N->hasOneUse());
	};

	// Check the zero extend is extending to 32-bit or more. The code generated by
	// srl(ctlz) for 16-bit or less variants of the pattern would require extra
	// instructions to clear the upper bits.
	if (!N->hasOneUse() \|\| !N->getSimpleValueType(0).bitsGE(MVT::i32) \|\|
	!isORCandidate(N->getOperand(0)))
	return SDValue();

	// Check the node matches: setcc(eq, cmp 0)
	// NOTE(review): the final width check reads the value type of the CMP node
	// itself, not of the operand being compared -- confirm this is intended.
	auto isSetCCCandidate = [](SDValue N) {
	return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
	X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
	N->getOperand(1).getOpcode() == X86ISD::CMP &&
	isNullConstant(N->getOperand(1).getOperand(1)) &&
	N->getOperand(1).getValueType().bitsGE(MVT::i32);
	};

	SDNode *OR = N->getOperand(0).getNode();
	SDValue LHS = OR->getOperand(0);
	SDValue RHS = OR->getOperand(1);

	// Save nodes matching or(or, setcc(eq, cmp 0)).
	// Walk down the chain of ORs, following whichever operand is the nested OR,
	// until an OR without a nested OR candidate is reached.
	SmallVector<SDNode *, 2> ORNodes;
	while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
	ORNodes.push_back(OR);
	OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	}

	// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
	if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	!isORCandidate(SDValue(OR, 0)))
	return SDValue();

	// We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
	// to
	// or(srl(ctlz),srl(ctlz)).
	// The dag combiner can then fold it into:
	// srl(or(ctlz, ctlz)).
	EVT VT = OR->getValueType(0);
	SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
	SDValue Ret, NewRHS;
	if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

	if (!Ret)
	return SDValue();

	// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
	// The saved ORs are processed innermost first, each one OR-ing its lowered
	// setcc into the accumulated result.
	while (ORNodes.size() > 0) {
	OR = ORNodes.pop_back_val();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
	if (RHS->getOpcode() == ISD::OR)
	std::swap(LHS, RHS);
	EVT VT = OR->getValueType(0);
	SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
	if (!NewRHS)
	return SDValue();
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
	}

	// Re-apply the zero extension of the original node to the combined result.
	if (Ret)
	Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

	return Ret;
	}

	// DAG combine for ISD::OR nodes.
	//
	// Tries a sequence of X86-specific rewrites on \p N and returns the first
	// replacement that applies. The last part folds an OR of two opposite shifts
	// into a double shift (X86ISD::SHLD or X86ISD::SHRD). Returns an empty
	// SDValue when nothing matched.
	static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	// If this is SSE1 only convert to FOR to avoid scalarization.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
	return DAG.getBitcast(MVT::v4i32,
	DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
	DAG.getBitcast(MVT::v4f32, N0),
	DAG.getBitcast(MVT::v4f32, N1)));
	}

	// All remaining combines only run once operations have been legalized.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
	return R;

	// The double-shift fold below only handles i16, i32 and i64.
	if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
	return SDValue();

	// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
	bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();

	// SHLD/SHRD instructions have lower register pressure, but on some
	// platforms they have higher latency than the equivalent
	// series of shifts/or that would otherwise be generated.
	// Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
	// have higher latencies and we are not optimizing for size.
	if (!OptForSize && Subtarget.isSHLDSlow())
	return SDValue();

	// Canonicalize so that N0 is the SHL and N1 is the SRL; both must have a
	// single use.
	if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
	std::swap(N0, N1);
	if (N0.getOpcode() != ISD::SHL \|\| N1.getOpcode() != ISD::SRL)
	return SDValue();
	if (!N0.hasOneUse() \|\| !N1.hasOneUse())
	return SDValue();

	// Both shift amounts must be i8; look through a truncate to the wider
	// value that computes the amount.
	SDValue ShAmt0 = N0.getOperand(1);
	if (ShAmt0.getValueType() != MVT::i8)
	return SDValue();
	SDValue ShAmt1 = N1.getOperand(1);
	if (ShAmt1.getValueType() != MVT::i8)
	return SDValue();
	if (ShAmt0.getOpcode() == ISD::TRUNCATE)
	ShAmt0 = ShAmt0.getOperand(0);
	if (ShAmt1.getOpcode() == ISD::TRUNCATE)
	ShAmt1 = ShAmt1.getOperand(0);

	SDLoc DL(N);
	unsigned Opc = X86ISD::SHLD;
	SDValue Op0 = N0.getOperand(0);
	SDValue Op1 = N1.getOperand(0);
	// If the SHL amount is the derived one (a SUB or XOR), treat this as a right
	// double shift: swap the roles so that ShAmt0 is the plain amount and ShAmt1
	// the derived one.
	if (ShAmt0.getOpcode() == ISD::SUB \|\|
	ShAmt0.getOpcode() == ISD::XOR) {
	Opc = X86ISD::SHRD;
	std::swap(Op0, Op1);
	std::swap(ShAmt0, ShAmt1);
	}

	// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
	// OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
	// OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
	// OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
	unsigned Bits = VT.getSizeInBits();
	if (ShAmt1.getOpcode() == ISD::SUB) {
	// Variable amount: ShAmt1 must be (Bits - ShAmt0).
	SDValue Sum = ShAmt1.getOperand(0);
	if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
	SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
	if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
	ShAmt1Op1 = ShAmt1Op1.getOperand(0);
	if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
	return DAG.getNode(Opc, DL, VT,
	Op0, Op1,
	DAG.getNode(ISD::TRUNCATE, DL,
	MVT::i8, ShAmt0));
	}
	} else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
	// Constant amounts: the two shift amounts must add up to Bits.
	ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
	if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
	return DAG.getNode(Opc, DL, VT,
	N0.getOperand(0), N1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL,
	MVT::i8, ShAmt0));
	} else if (ShAmt1.getOpcode() == ISD::XOR) {
	// XOR form: ShAmt1 must be (ShAmt0 ^ (Bits - 1)) and the other shifted
	// value must already be shifted by one in the same direction.
	SDValue Mask = ShAmt1.getOperand(1);
	if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
	unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
	SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
	if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
	ShAmt1Op0 = ShAmt1Op0.getOperand(0);
	if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
	if (Op1.getOpcode() == InnerShift &&
	isa<ConstantSDNode>(Op1.getOperand(1)) &&
	Op1.getConstantOperandVal(1) == 1) {
	return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
	}
	// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
	if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
	Op1.getOperand(0) == Op1.getOperand(1)) {
	return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
	}
	}
	}
	}

	return SDValue();
	}

	/// Rewrite a sign-bit test of the form:
	///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
	/// as the comparison:
	///   SETGT(X, -1)
	/// Returns an empty SDValue when \p N does not have this shape.
	static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
	  // Only worthwhile when the xor produces an i8 or an i1.
	  EVT ResVT = N->getValueType(0);
	  if (!(ResVT == MVT::i8 || ResVT == MVT::i1))
	    return SDValue();

	  SDValue Trunc = N->getOperand(0);
	  SDValue XorRHS = N->getOperand(1);

	  // The xor must combine a single-use truncate ...
	  if (Trunc.getOpcode() != ISD::TRUNCATE || !Trunc.hasOneUse())
	    return SDValue();

	  // ... with the constant one.
	  if (!isOneConstant(XorRHS))
	    return SDValue();

	  // SetCC on x86 zero extends, so the truncated value has to come from a
	  // logical (not arithmetic) right shift that has no other users.
	  SDValue Srl = Trunc.getOperand(0);
	  if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse())
	    return SDValue();

	  // The shift must operate on i16, i32 or i64.
	  EVT SrlVT = Srl.getValueType();
	  bool IsHandledWidth =
	      SrlVT == MVT::i16 || SrlVT == MVT::i32 || SrlVT == MVT::i64;
	  if (!IsHandledWidth)
	    return SDValue();

	  // The shift amount must be the constant that moves the sign bit to bit 0.
	  bool ExtractsSignBit =
	      isa<ConstantSDNode>(Srl.getOperand(1)) &&
	      Srl.getConstantOperandVal(1) == SrlVT.getSizeInBits() - 1;
	  if (!ExtractsSignBit)
	    return SDValue();

	  // Build a greater-than comparison against -1.
	  // N.B. SETGE against 0 would also work, but SETGT gives the canonical
	  // looking comparison that matches up with TranslateX86CC.
	  SDLoc DL(N);
	  SDValue X = Srl.getOperand(0);
	  EVT XVT = X.getValueType();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT CCVT =
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ResVT);
	  SDValue Cmp =
	      DAG.getSetCC(DL, CCVT, X, DAG.getConstant(-1, DL, XVT), ISD::SETGT);

	  // Widen the comparison result if it is not already of the xor's type.
	  if (CCVT == ResVT)
	    return Cmp;
	  return DAG.getNode(ISD::ZERO_EXTEND, DL, ResVT, Cmp);
	}

	/// Rewrite a vector sign-bit test of the form:
	///   xor (sra X, elt_size(X)-1), -1
	/// as:
	///   pcmpgt X, -1
	///
	/// This should be called before type legalization because the pattern may not
	/// persist after that. Returns an empty SDValue when there is no match.
	static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (!VT.isSimple())
	    return SDValue();

	  // Each supported vector type needs a particular feature level.
	  bool HasCompare = false;
	  switch (VT.getSimpleVT().SimpleTy) {
	  case MVT::v16i8:
	  case MVT::v8i16:
	  case MVT::v4i32:
	    HasCompare = Subtarget.hasSSE2();
	    break;
	  case MVT::v2i64:
	    HasCompare = Subtarget.hasSSE42();
	    break;
	  case MVT::v32i8:
	  case MVT::v16i16:
	  case MVT::v8i32:
	  case MVT::v4i64:
	    HasCompare = Subtarget.hasAVX2();
	    break;
	  default:
	    break;
	  }
	  if (!HasCompare)
	    return SDValue();

	  // The xor must be a 'not' (xor with all ones) of a single-use arithmetic
	  // shift right.
	  SDValue Sra = N->getOperand(0);
	  SDValue AllOnes = N->getOperand(1);
	  if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse() ||
	      !ISD::isBuildVectorAllOnes(AllOnes.getNode()))
	    return SDValue();

	  // The shift amount must be a build vector ...
	  auto *AmtBV = dyn_cast<BuildVectorSDNode>(Sra.getOperand(1));
	  if (!AmtBV)
	    return SDValue();

	  // ... that splats (element size - 1), smearing the sign bit across each
	  // element.
	  EVT EltVT = Sra.getValueType().getVectorElementType();
	  auto *SplatAmt = AmtBV->getConstantSplatNode();
	  if (!SplatAmt || SplatAmt->getZExtValue() != EltVT.getSizeInBits() - 1)
	    return SDValue();

	  // Compare greater-than against -1. The more obvious
	  // greater-than-or-equal-to-zero is not used because SSE/AVX don't have that
	  // instruction.
	  SDValue X = Sra.getOperand(0);
	  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, X, AllOnes);
	}

	/// Return true if truncation with saturation from type \p SrcVT to \p DstVT
	/// is valid for the given \p Subtarget.
	///
	/// Requires AVX512, a simple vector source of at most 512 bits with 16 to
	/// 64-bit elements, and 8 to 32-bit destination elements. Sources narrower
	/// than 512 bits additionally need VLX, and source elements narrower than 32
	/// bits need BWI.
	static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
	                                        const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasAVX512())
	    return false;

	  // FIXME: Scalar type may be supported if we move it to vector register.
	  bool IsUsableVector =
	      SrcVT.isVector() && SrcVT.isSimple() && SrcVT.getSizeInBits() <= 512;
	  if (!IsUsableVector)
	    return false;

	  unsigned SrcEltBits = SrcVT.getScalarType().getSizeInBits();
	  unsigned DstEltBits = DstVT.getScalarType().getSizeInBits();
	  if (SrcEltBits < 16 || SrcEltBits > 64)
	    return false;
	  if (DstEltBits < 8 || DstEltBits > 32)
	    return false;

	  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
	    return false;
	  return SrcEltBits >= 32 || Subtarget.hasBWI();
	}

	/// Match the input of a truncation with unsigned saturation:
	///   (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
	/// \p In is the value being truncated and \p VT the destination type.
	/// Returns x, the source value to be truncated, or SDValue() if the pattern
	/// was not matched.
	static SDValue detectUSatPattern(SDValue In, EVT VT) {
	  if (In.getOpcode() != ISD::UMIN)
	    return SDValue();

	  // Saturation with truncation: the input elements must be wider than the
	  // elements of VT.
	  assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
	         "Unexpected types for truncate operation");

	  // The second umin operand has to be a constant splat ...
	  APInt SplatVal;
	  if (!ISD::isConstantSplatVector(In.getOperand(1).getNode(), SplatVal))
	    return SDValue();

	  // ... equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to the
	  // element size of the destination type.
	  if (!SplatVal.isMask(VT.getScalarSizeInBits()))
	    return SDValue();

	  return In.getOperand(0);
	}

	/// Match the input of a truncation with unsigned saturation:
	///   (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type)
	/// but only when the types allow the VPMOVUS* instructions on AVX512 to be
	/// used. Returns the source value to be truncated, or SDValue() if the
	/// pattern was not matched.
	static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
	                                       const X86Subtarget &Subtarget) {
	  bool TypesOK = isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget);
	  return TypesOK ? detectUSatPattern(In, VT) : SDValue();
	}

	/// Turn a truncate of \p In to \p VT into an X86ISD::VTRUNCUS node when
	/// \p In matches the unsigned-saturation pattern, both types are legal and
	/// the conversion is valid on the AVX512 subtarget. Returns SDValue()
	/// otherwise.
	static SDValue
	combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
	                        const X86Subtarget &Subtarget) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT InVT = In.getValueType();
	  if (!TLI.isTypeLegal(InVT) || !TLI.isTypeLegal(VT))
	    return SDValue();

	  SDValue Src = detectUSatPattern(In, VT);
	  if (!Src)
	    return SDValue();
	  if (!isSATValidOnAVX512Subtarget(InVT, VT, Subtarget))
	    return SDValue();

	  return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Src);
	}

	/// This function detects the AVG pattern between vectors of unsigned i8/i16,
	/// which is c = (a + b + 1) / 2, and replaces this operation with the
	/// efficient X86ISD::AVG instruction.
	///
	/// \p In is the wide value that is about to be truncated and \p VT is the
	/// narrow result type. Returns the AVG node (or a concatenation of AVG nodes
	/// when the vector has to be split), or SDValue() if the pattern does not
	/// match.
	static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	const SDLoc &DL) {
	if (!VT.isVector() \|\| !VT.isSimple())
	return SDValue();
	EVT InVT = In.getValueType();
	unsigned NumElems = VT.getVectorNumElements();

	// Only i8/i16 element types with a power-of-two element count are handled.
	EVT ScalarVT = VT.getVectorElementType();
	if (!((ScalarVT == MVT::i8 \|\| ScalarVT == MVT::i16) &&
	isPowerOf2_32(NumElems)))
	return SDValue();

	// InScalarVT is the intermediate type in AVG pattern and it should be greater
	// than the original input type (i8/i16).
	EVT InScalarVT = InVT.getVectorElementType();
	if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
	return SDValue();

	if (!Subtarget.hasSSE2())
	return SDValue();

	// Detect the following pattern:
	//
	// %1 = zext <N x i8> %a to <N x i32>
	// %2 = zext <N x i8> %b to <N x i32>
	// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
	// %4 = add nuw nsw <N x i32> %3, %2
	// %5 = lshr <N x i32> %4, <i32 1 x N>
	// %6 = trunc <N x i32> %5 to <N x i8>
	//
	// In AVX512, the last instruction can also be a trunc store.
	if (In.getOpcode() != ISD::SRL)
	return SDValue();

	// A lambda checking the given SDValue is a constant vector and each element
	// is in the range [Min, Max].
	auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
	BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
	if (!BV \|\| !BV->isConstant())
	return false;
	for (SDValue Op : V->ops()) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	if (!C)
	return false;
	uint64_t Val = C->getZExtValue();
	if (Val < Min \|\| Val > Max)
	return false;
	}
	return true;
	};

	// Split vectors to legal target size and apply AVG.
	// The widest piece is 512 bits with BWI, 256 bits with AVX2 and 128 bits
	// otherwise.
	auto LowerToAVG = [&](SDValue Op0, SDValue Op1) {
	unsigned NumSubs = 1;
	if (Subtarget.hasBWI()) {
	if (VT.getSizeInBits() > 512)
	NumSubs = VT.getSizeInBits() / 512;
	} else if (Subtarget.hasAVX2()) {
	if (VT.getSizeInBits() > 256)
	NumSubs = VT.getSizeInBits() / 256;
	} else {
	if (VT.getSizeInBits() > 128)
	NumSubs = VT.getSizeInBits() / 128;
	}

	// No split needed: emit a single AVG on the full type.
	if (NumSubs == 1)
	return DAG.getNode(X86ISD::AVG, DL, VT, Op0, Op1);

	// Otherwise average each pair of subvectors and concatenate the results.
	SmallVector<SDValue, 4> Subs;
	EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
	VT.getVectorNumElements() / NumSubs);
	for (unsigned i = 0; i != NumSubs; ++i) {
	unsigned Idx = i * SubVT.getVectorNumElements();
	SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
	SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
	Subs.push_back(DAG.getNode(X86ISD::AVG, DL, SubVT, LHS, RHS));
	}
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
	};

	// Check if each element of the vector is right-shifted by one, and that the
	// shifted value is an addition.
	auto LHS = In.getOperand(0);
	auto RHS = In.getOperand(1);
	if (!IsConstVectorInRange(RHS, 1, 1))
	return SDValue();
	if (LHS.getOpcode() != ISD::ADD)
	return SDValue();

	// Detect a pattern of a + b + 1 where the order doesn't matter.
	SDValue Operands[3];
	Operands[0] = LHS.getOperand(0);
	Operands[1] = LHS.getOperand(1);

	// Take care of the case when one of the operands is a constant vector whose
	// elements are in the range [1, 256] for i8 or [1, 65536] for i16, i.e. the
	// "+ 1" has already been folded into the constant operand.
	if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
	Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
	Operands[0].getOperand(0).getValueType() == VT) {
	// The pattern is detected. Subtract one from the constant vector, then
	// demote it and emit X86ISD::AVG instruction.
	SDValue VecOnes = DAG.getConstant(1, DL, InVT);
	Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
	Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
	return LowerToAVG(Operands[0].getOperand(0), Operands[1]);
	}

	// One operand of the outer add must itself be an add; move it to
	// Operands[1] and split it into Operands[1] and Operands[2].
	if (Operands[0].getOpcode() == ISD::ADD)
	std::swap(Operands[0], Operands[1]);
	else if (Operands[1].getOpcode() != ISD::ADD)
	return SDValue();
	Operands[2] = Operands[1].getOperand(0);
	Operands[1] = Operands[1].getOperand(1);

	// Now we have three operands of two additions. Check that one of them is a
	// constant vector with ones, and the other two are promoted from i8/i16.
	for (int i = 0; i < 3; ++i) {
	if (!IsConstVectorInRange(Operands[i], 1, 1))
	continue;
	// Move the vector of ones out of the way into Operands[2].
	std::swap(Operands[i], Operands[2]);

	// Check if Operands[0] and Operands[1] are results of type promotion.
	for (int j = 0; j < 2; ++j)
	if (Operands[j].getOpcode() != ISD::ZERO_EXTEND \|\|
	Operands[j].getOperand(0).getValueType() != VT)
	return SDValue();

	// The pattern is detected, emit X86ISD::AVG instruction.
	return LowerToAVG(Operands[0].getOperand(0), Operands[1].getOperand(0));
	}

	return SDValue();
	}

	/// DAG combine for load nodes.
	///
	/// Splits a non-extending 256-bit vector load into two half-width loads, 16
	/// bytes apart, in two situations:
	///  - the target reports the 32-byte access as allowed but not fast, or
	///  - the load is non-temporal, aligned to at least 16 bytes, and the
	///    subtarget lacks Int256 support (such 32-byte loads would otherwise
	///    lower to regular temporal loads).
	/// Returns the result of DCI.CombineTo when the split is performed, and an
	/// empty SDValue otherwise.
	static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  LoadSDNode *Ld = cast<LoadSDNode>(N);
	  EVT RegVT = Ld->getValueType(0);
	  EVT MemVT = Ld->getMemoryVT();
	  SDLoc dl(Ld);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  unsigned AddressSpace = Ld->getAddressSpace();
	  unsigned Alignment = Ld->getAlignment();

	  // Only plain 256-bit vector loads are candidates, and only once operations
	  // have been legalized.
	  if (!RegVT.is256BitVector() || DCI.isBeforeLegalizeOps() ||
	      Ld->getExtensionType() != ISD::NON_EXTLOAD)
	    return SDValue();

	  bool SplitNonTemporal =
	      Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16;
	  if (!SplitNonTemporal) {
	    // Otherwise split only when the access is allowed but reported as slow.
	    bool Fast;
	    if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
	                                AddressSpace, Alignment, &Fast) ||
	        Fast)
	      return SDValue();
	  }

	  unsigned NumElems = RegVT.getVectorNumElements();
	  if (NumElems < 2)
	    return SDValue();

	  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	                                NumElems / 2);

	  // Low half: same address, alignment and flags as the original load.
	  SDValue LoPtr = Ld->getBasePtr();
	  SDValue LoLoad =
	      DAG.getLoad(HalfVT, dl, Ld->getChain(), LoPtr, Ld->getPointerInfo(),
	                  Alignment, Ld->getMemOperand()->getFlags());

	  // High half: 16 bytes further on, with the alignment reduced accordingly.
	  SDValue HiPtr = DAG.getMemBasePlusOffset(LoPtr, 16, dl);
	  SDValue HiLoad =
	      DAG.getLoad(HalfVT, dl, Ld->getChain(), HiPtr,
	                  Ld->getPointerInfo().getWithOffset(16),
	                  MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());

	  // Merge the two output chains and rebuild the full-width value.
	  SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	                              LoLoad.getValue(1), HiLoad.getValue(1));
	  SDValue Joined =
	      DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, LoLoad, HiLoad);
	  return DCI.CombineTo(N, Joined, Chain, true);
	}

	/// If \p V is a build vector of i1 constants in which exactly one element is
	/// all-ones, return the operand index of that element. Undef elements are
	/// ignored. Return -1 in every other case, including when no element is
	/// true.
	static int getOneTrueElt(SDValue V) {
	  // This needs to be a build vector of booleans.
	  // TODO: Checking for the i1 type matches the IR definition for the mask,
	  // but the mask check could be loosened to i8 or other types. That might
	  // also require checking more than 'allOnesValue'; eg, the x86 HW
	  // instructions only require that the MSB is set for each mask element.
	  // The ISD::MSTORE comments/definition do not specify how the mask operand
	  // is formatted.
	  auto *MaskBV = dyn_cast<BuildVectorSDNode>(V);
	  if (!MaskBV)
	    return -1;
	  EVT MaskVT = MaskBV->getValueType(0);
	  if (MaskVT.getVectorElementType() != MVT::i1)
	    return -1;

	  int Found = -1;
	  for (unsigned Idx = 0, End = MaskVT.getVectorNumElements(); Idx < End;
	       ++Idx) {
	    const SDValue &Elt = MaskBV->getOperand(Idx);
	    if (Elt.isUndef())
	      continue;

	    // Any non-constant element makes the mask unusable.
	    auto *EltC = dyn_cast<ConstantSDNode>(Elt);
	    if (!EltC)
	      return -1;
	    if (!EltC->getAPIntValue().isAllOnesValue())
	      continue;

	    // A second true element is one too many.
	    if (Found >= 0)
	      return -1;
	    Found = Idx;
	  }
	  return Found;
	}

	/// Given a masked memory load/store operation, return true if exactly one
	/// mask bit is set. In that case also return, through the reference
	/// parameters, the memory address of the scalar element to load/store
	/// (\p Addr), the vector index to insert/extract that scalar element
	/// (\p Index), and the alignment for the scalar memory access
	/// (\p Alignment). The outputs are left untouched when false is returned.
	static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
	                                         SelectionDAG &DAG, SDValue &Addr,
	                                         SDValue &Index, unsigned &Alignment) {
	  int EltNo = getOneTrueElt(MaskedOp->getMask());
	  if (EltNo < 0)
	    return false;

	  EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();

	  // Element 0 lives at the base pointer itself; any other element needs the
	  // base advanced by its byte offset.
	  Addr = MaskedOp->getBasePtr();
	  if (EltNo != 0) {
	    unsigned ByteOffset = EltNo * EltVT.getStoreSize();
	    Addr = DAG.getMemBasePlusOffset(Addr, ByteOffset, SDLoc(MaskedOp));
	  }

	  Index = DAG.getIntPtrConstant(EltNo, SDLoc(MaskedOp));
	  Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
	  return true;
	}

	/// If exactly one element of the mask is set for a non-extending masked load,
	/// replace the masked load with a scalar load of that element followed by an
	/// insert into the pass-through vector.
	/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
	/// mask have already been optimized in IR, so we don't bother with those here.
	/// Returns the result of DCI.CombineTo, or SDValue() when the mask does not
	/// have exactly one element set.
	static SDValue
	reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue EltAddr, EltIndex;
	  unsigned EltAlign;
	  bool HasOneElt =
	      getParamsForOneTrueMaskedElt(ML, DAG, EltAddr, EltIndex, EltAlign);
	  if (!HasOneElt)
	    return SDValue();

	  SDLoc DL(ML);
	  EVT VecVT = ML->getValueType(0);
	  EVT ScalarVT = VecVT.getVectorElementType();

	  // Load only the selected scalar element from its computed address.
	  SDValue ScalarLd =
	      DAG.getLoad(ScalarVT, DL, ML->getChain(), EltAddr, ML->getPointerInfo(),
	                  EltAlign, ML->getMemOperand()->getFlags());

	  // Place the loaded element into the pass-through vector at its index.
	  SDValue Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
	                               ML->getSrc0(), ScalarLd, EltIndex);
	  return DCI.CombineTo(ML, Result, ScalarLd.getValue(1), true);
	}

	/// Simplify a masked load whose mask is a build-vector of constants.
	///
	/// Two rewrites are attempted, in this order:
	///   - if both the first and the last mask element are non-null, replace the
	///     masked load by a full vector load plus a select;
	///   - otherwise, unless the pass-through operand is already undef, replace
	///     it by a masked load with an undef pass-through plus a select against
	///     the original pass-through.
	/// Returns an empty SDValue when the mask is not constant or neither rewrite
	/// applies.
	static SDValue
	combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                              TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Mask = ML->getMask();
	  if (!ISD::isBuildVectorOfConstantSDNodes(Mask.getNode()))
	    return SDValue();

	  SDLoc Loc(ML);
	  EVT VecVT = ML->getValueType(0);

	  // If we are loading the first and last elements of a vector, it is safe and
	  // always faster to load the whole vector. Replace the masked load with a
	  // vector load and select.
	  const unsigned LastElt = VecVT.getVectorNumElements() - 1;
	  BuildVectorSDNode *ConstMask = cast<BuildVectorSDNode>(Mask);
	  const bool ReadsFirst = !isNullConstant(ConstMask->getOperand(0));
	  const bool ReadsLast = !isNullConstant(ConstMask->getOperand(LastElt));
	  if (ReadsFirst && ReadsLast) {
	    SDValue FullLd = DAG.getLoad(VecVT, Loc, ML->getChain(), ML->getBasePtr(),
	                                 ML->getMemOperand());
	    SDValue Sel = DAG.getSelect(Loc, VecVT, Mask, FullLd, ML->getSrc0());
	    return DCI.CombineTo(ML, Sel, FullLd.getValue(1), true);
	  }

	  // Convert a masked load with a constant mask into a masked load and a select.
	  // This allows the select operation to use a faster kind of select instruction
	  // (for example, vblendvps -> vblendps).

	  // An undef pass-through is exactly what this rewrite produces, so applying
	  // it again would loop forever.
	  if (ML->getSrc0().isUndef())
	    return SDValue();

	  // New masked load with undef pass-through; the select re-applies the
	  // original pass-through operand.
	  SDValue UndefPassThruLd =
	      DAG.getMaskedLoad(VecVT, Loc, ML->getChain(), ML->getBasePtr(), Mask,
	                        DAG.getUNDEF(VecVT), ML->getMemoryVT(),
	                        ML->getMemOperand(), ML->getExtensionType());
	  SDValue Sel =
	      DAG.getSelect(Loc, VecVT, Mask, UndefPassThruLd, ML->getSrc0());

	  return DCI.CombineTo(ML, Sel, UndefPassThruLd.getValue(1), true);
	}

	/// DAG combine for masked loads.
	///
	/// Expanding loads are left alone. Non-extending loads are offered to
	/// reduceMaskedLoadToScalarLoad() and, without AVX512, to
	/// combineMaskedLoadConstantMask(). Sign-extending loads are rewritten as a
	/// non-extending masked load of a wider-element-count vector (with mask and
	/// pass-through widened to match) followed by X86ISD::VSEXT. Every other
	/// case returns an empty SDValue.
	static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

	  // TODO: Expanding load with constant mask may be optimized as well.
	  if (Mld->isExpandingLoad())
	    return SDValue();

	  const ISD::LoadExtType ExtTy = Mld->getExtensionType();
	  if (ExtTy == ISD::NON_EXTLOAD) {
	    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
	      return ScalarLoad;
	    // TODO: Do some AVX512 subsets benefit from this transform?
	    if (!Subtarget.hasAVX512())
	      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
	        return Blend;
	  }

	  // Only sign-extending loads are resolved below.
	  if (ExtTy != ISD::SEXTLOAD)
	    return SDValue();

	  EVT VT = Mld->getValueType(0);
	  EVT LdVT = Mld->getMemoryVT();
	  SDLoc dl(Mld);
	  const unsigned NumElems = VT.getVectorNumElements();

	  assert(LdVT != VT && "Cannot extend to the same type");
	  const unsigned ToSz = VT.getScalarSizeInBits();
	  const unsigned FromSz = LdVT.getScalarSizeInBits();
	  // From/To sizes and ElemCount must be pow of two.
	  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
	          "Unexpected size for extending masked load");

	  const unsigned SizeRatio = ToSz / FromSz;
	  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
	  const unsigned WideNumElts = NumElems * SizeRatio;

	  // Vector of memory-sized elements occupying as many bits as VT; the
	  // shuffles and the new load operate on this type.
	  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	                                   LdVT.getScalarType(), WideNumElts);
	  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	  // Pass-through: reinterpret as WideVecVT and, unless it is undef, move
	  // narrow element i * SizeRatio down to position i.
	  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
	  if (!Mld->getSrc0().isUndef()) {
	    SmallVector<int, 16> Src0Shuffle(WideNumElts, -1);
	    for (unsigned Elt = 0; Elt < NumElems; ++Elt)
	      Src0Shuffle[Elt] = Elt * SizeRatio;

	    // Can't shuffle using an illegal type.
	    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
	           "WideVecVT should be legal");
	    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
	                                    DAG.getUNDEF(WideVecVT), Src0Shuffle);
	  }

	  // Widen the mask to WideNumElts elements.
	  SDValue Mask = Mld->getMask();
	  SDValue NewMask;
	  if (Mask.getValueType() != VT) {
	    // i1-element mask: concatenate the original mask with all-zero vectors.
	    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
	    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
	                                     WideNumElts);

	    const unsigned NumConcat = WideNumElts / VT.getVectorNumElements();
	    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
	    SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
	    Ops[0] = Mask;
	    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
	  } else {
	    // Mask has the value type: bitcast it, compact the used elements to the
	    // front and take every remaining lane from element 0 of the zero vector
	    // (shuffle index WideNumElts).
	    NewMask = DAG.getBitcast(WideVecVT, Mask);
	    SmallVector<int, 16> MaskShuffle(WideNumElts, -1);
	    for (unsigned Elt = 0; Elt < NumElems; ++Elt)
	      MaskShuffle[Elt] = Elt * SizeRatio;
	    for (unsigned Elt = NumElems; Elt < WideNumElts; ++Elt)
	      MaskShuffle[Elt] = WideNumElts;
	    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
	                                   DAG.getConstant(0, dl, WideVecVT),
	                                   MaskShuffle);
	  }

	  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
	                                     Mld->getBasePtr(), NewMask, WideSrc0,
	                                     Mld->getMemoryVT(), Mld->getMemOperand(),
	                                     ISD::NON_EXTLOAD);
	  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
	  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
	}

	/// Turn a non-truncating masked store with exactly one enabled mask element
	/// into an EXTRACT_VECTOR_ELT of that element followed by a scalar store.
	/// Returns an empty SDValue when getParamsForOneTrueMaskedElt() does not find
	/// a single enabled element.
	/// Note: all-zeros and all-ones masks are expected to have been optimized in
	/// IR already, so they are not special-cased here.
	static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
	                                              SelectionDAG &DAG) {
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue EltAddr, EltIdx;
	  unsigned EltAlign;
	  const bool HasSingleElt =
	      getParamsForOneTrueMaskedElt(MS, DAG, EltAddr, EltIdx, EltAlign);
	  if (!HasSingleElt)
	    return SDValue();

	  SDLoc Loc(MS);
	  SDValue StoredVec = MS->getValue();
	  EVT ScalarVT = StoredVec.getValueType().getVectorElementType();

	  // Pull out the one lane that is actually written.
	  SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, ScalarVT,
	                            StoredVec, EltIdx);

	  // Write it at the element address computed above.
	  return DAG.getStore(MS->getChain(), Loc, Elt, EltAddr, MS->getPointerInfo(),
	                      EltAlign, MS->getMemOperand()->getFlags());
	}

	/// DAG combine for masked stores.
	///
	/// Compressing stores are left alone. For non-truncating stores this tries
	/// reduceMaskedStoreToScalarStore() and then strips a (pcmpgt 0, X) mask down
	/// to X. Truncating stores that are not already legal are rewritten as a
	/// non-truncating masked store of a shuffled, bitcast value with a mask
	/// widened to match. Returns an empty SDValue when nothing applies.
	static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {
	  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

	  if (Mst->isCompressingStore())
	    return SDValue();

	  if (!Mst->isTruncatingStore()) {
	    if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
	      return ScalarStore;

	    // If the mask is checking (0 > X), we're creating a vector with all-zeros
	    // or all-ones elements based on the sign bits of X. AVX1 masked store only
	    // cares about the sign bit of each mask element, so eliminate the compare:
	    // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
	    // Note that by waiting to match an x86-specific PCMPGT node, we're
	    // eliminating potentially more complex matching of a setcc node which has
	    // a full range of predicates.
	    SDValue CmpMask = Mst->getMask();
	    const bool IsSignTest =
	        CmpMask.getOpcode() == X86ISD::PCMPGT &&
	        ISD::isBuildVectorAllZeros(CmpMask.getOperand(0).getNode());
	    if (IsSignTest) {
	      assert(CmpMask.getValueType() == CmpMask.getOperand(1).getValueType() &&
	             "Unexpected type for PCMPGT");
	      return DAG.getMaskedStore(
	          Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
	          CmpMask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
	    }

	    // TODO: AVX512 targets should also be able to simplify something like the
	    // pattern above, but that pattern will be different. It will either need to
	    // match setcc more generally or match PCMPGTM later (in tablegen?).

	    return SDValue();
	  }

	  // From here on: truncating masked stores.
	  EVT VT = Mst->getValue().getValueType();
	  EVT StVT = Mst->getMemoryVT();
	  SDLoc dl(Mst);
	  const unsigned NumElems = VT.getVectorNumElements();

	  assert(StVT != VT && "Cannot truncate to the same type");
	  const unsigned FromSz = VT.getScalarSizeInBits();
	  const unsigned ToSz = StVT.getScalarSizeInBits();

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // The truncating store is legal in some cases. For example
	  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
	  // are designated for truncate store.
	  // In this case we don't need any further transformations.
	  if (TLI.isTruncStoreLegal(VT, StVT))
	    return SDValue();

	  // From/To sizes and ElemCount must be pow of two.
	  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
	          "Unexpected size for truncating masked store");
	  // We are going to use the original vector elt for storing.
	  // Accumulated smaller vector elements must be a multiple of the store size.
	  assert (((NumElems * FromSz) % ToSz) == 0 &&
	          "Unexpected ratio for truncating masked store");

	  const unsigned SizeRatio = FromSz / ToSz;
	  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
	  const unsigned WideNumElts = NumElems * SizeRatio;

	  // Vector of store-sized elements occupying as many bits as VT; the
	  // shuffles and the new store operate on this type.
	  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	                                   StVT.getScalarType(), WideNumElts);

	  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	  // Reinterpret the stored value and move narrow element i * SizeRatio down
	  // to position i; the remaining lanes are left undefined.
	  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
	  SmallVector<int, 16> ValueShuffle(WideNumElts, -1);
	  for (unsigned Elt = 0; Elt < NumElems; ++Elt)
	    ValueShuffle[Elt] = Elt * SizeRatio;

	  // Can't shuffle using an illegal type.
	  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
	         "WideVecVT should be legal");

	  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
	                                              DAG.getUNDEF(WideVecVT),
	                                              ValueShuffle);

	  // Widen the mask to WideNumElts elements.
	  SDValue Mask = Mst->getMask();
	  SDValue NewMask;
	  if (Mask.getValueType() != VT) {
	    // i1-element mask: concatenate the original mask with all-zero vectors.
	    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
	    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
	                                     WideNumElts);

	    const unsigned NumConcat = WideNumElts / VT.getVectorNumElements();
	    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
	    SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
	    Ops[0] = Mask;
	    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
	  } else {
	    // Mask has the value type: bitcast it, compact the used elements to the
	    // front and take every remaining lane from element 0 of the zero vector
	    // (shuffle index WideNumElts).
	    NewMask = DAG.getBitcast(WideVecVT, Mask);
	    SmallVector<int, 16> MaskShuffle(WideNumElts, -1);
	    for (unsigned Elt = 0; Elt < NumElems; ++Elt)
	      MaskShuffle[Elt] = Elt * SizeRatio;
	    for (unsigned Elt = NumElems; Elt < WideNumElts; ++Elt)
	      MaskShuffle[Elt] = WideNumElts;
	    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
	                                   DAG.getConstant(0, dl, WideVecVT),
	                                   MaskShuffle);
	  }

	  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
	                            Mst->getBasePtr(), NewMask, StVT,
	                            Mst->getMemOperand(), false);
	}

	/// Target-specific DAG combines for a plain (non-masked) store. Tried in
	/// order:
	///   1. A 256-bit vector store that allowsMemoryAccess() accepts but reports
	///      as not fast is split into two 128-bit stores.
	///   2. A truncating vector store becomes an AVG store, an unsigned
	///      saturating truncating store, or a shuffle followed by one or more
	///      wide scalar stores.
	///   3. A 64-bit load->store pair (vector type, or i64 on a 32-bit target
	///      where f64 is usable) is re-typed to i64/f64 or split into two i32
	///      load/store pairs.
	///   4. An i64 store of an EXTRACT_VECTOR_ELT on a 32-bit target where f64 is
	///      usable is re-typed to an f64 extract and store.
	/// Returns an empty SDValue when nothing applies.
	static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  StoreSDNode *St = cast<StoreSDNode>(N);
	  EVT VT = St->getValue().getValueType();
	  EVT StVT = St->getMemoryVT();
	  SDLoc dl(St);
	  SDValue StoredVal = St->getOperand(1);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // If we are saving a concatenation of two XMM registers and 32-byte stores
	  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
	  bool Fast;
	  unsigned AddressSpace = St->getAddressSpace();
	  unsigned Alignment = St->getAlignment();
	  if (VT.is256BitVector() && StVT == VT &&
	      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
	                             AddressSpace, Alignment, &Fast) &&
	      !Fast) {
	    unsigned NumElems = VT.getVectorNumElements();
	    if (NumElems < 2)
	      return SDValue();

	    SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
	    SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

	    SDValue Ptr0 = St->getBasePtr();
	    SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

	    SDValue Ch0 =
	        DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
	                     Alignment, St->getMemOperand()->getFlags());
	    SDValue Ch1 =
	        DAG.getStore(St->getChain(), dl, Value1, Ptr1,
	                     St->getPointerInfo().getWithOffset(16),
	                     MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
	  }

	  // Optimize trunc store (of multiple scalars) to shuffle and store.
	  // First, pack all of the elements in one place. Next, store to memory
	  // in fewer chunks.
	  if (St->isTruncatingStore() && VT.isVector()) {
	    // Check if we can detect an AVG pattern from the truncation. If yes,
	    // replace the trunc store by a normal store with the result of X86ISD::AVG
	    // instruction.
	    if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
	                                       Subtarget, dl))
	      return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
	                          St->getPointerInfo(), St->getAlignment(),
	                          St->getMemOperand()->getFlags());

	    if (SDValue Val =
	        detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
	      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
	                             dl, Val, St->getBasePtr(),
	                             St->getMemoryVT(), St->getMemOperand(), DAG);

	    unsigned NumElems = VT.getVectorNumElements();
	    assert(StVT != VT && "Cannot truncate to the same type");
	    unsigned FromSz = VT.getScalarSizeInBits();
	    unsigned ToSz = StVT.getScalarSizeInBits();

	    // The truncating store is legal in some cases. For example
	    // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
	    // are designated for truncate store.
	    // In this case we don't need any further transformations.
	    if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
	      return SDValue();

	    // From, To sizes and ElemCount must be pow of two
	    if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
	    // We are going to use the original vector elt for storing.
	    // Accumulated smaller vector elements must be a multiple of the store size.
	    if (0 != (NumElems * FromSz) % ToSz) return SDValue();

	    unsigned SizeRatio = FromSz / ToSz;

	    assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

	    // Create a type on which we perform the shuffle
	    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	                                     StVT.getScalarType(), NumElems*SizeRatio);

	    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	    SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
	    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
	    for (unsigned i = 0; i != NumElems; ++i)
	      ShuffleVec[i] = i * SizeRatio;

	    // Can't shuffle using an illegal type.
	    if (!TLI.isTypeLegal(WideVecVT))
	      return SDValue();

	    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
	                                         DAG.getUNDEF(WideVecVT),
	                                         ShuffleVec);
	    // At this point all of the data is stored at the bottom of the
	    // register. We now need to save it to mem.

	    // Find the largest store unit
	    MVT StoreType = MVT::i8;
	    for (MVT Tp : MVT::integer_valuetypes()) {
	      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
	        StoreType = Tp;
	    }

	    // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
	    if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
	        (64 <= NumElems * ToSz))
	      StoreType = MVT::f64;

	    // Bitcast the original vector into a vector of store-size units
	    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
	            StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
	    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
	    SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
	    SmallVector<SDValue, 8> Chains;
	    SDValue Ptr = St->getBasePtr();

	    // Perform one or more big stores into memory.
	    const unsigned StoreBytes = StoreType.getStoreSize();
	    for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
	      // Chunk i is written i * StoreBytes past the base pointer, so its
	      // pointer info and alignment must describe that offset, the same way
	      // the split-store paths in this function do.
	      unsigned Offset = i * StoreBytes;
	      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
	                                   StoreType, ShuffWide,
	                                   DAG.getIntPtrConstant(i, dl));
	      SDValue Ch =
	          DAG.getStore(St->getChain(), dl, SubVec, Ptr,
	                       St->getPointerInfo().getWithOffset(Offset),
	                       MinAlign(St->getAlignment(), Offset),
	                       St->getMemOperand()->getFlags());
	      Ptr = DAG.getMemBasePlusOffset(Ptr, StoreBytes, dl);
	      Chains.push_back(Ch);
	    }

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
	  }

	  // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
	  // the FP state in cases where an emms may be missing.
	  // A preferable solution to the general problem is to figure out the right
	  // places to insert EMMS. This qualifies as a quick hack.

	  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
	  if (VT.getSizeInBits() != 64)
	    return SDValue();

	  const Function &F = DAG.getMachineFunction().getFunction();
	  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
	  bool F64IsLegal =
	      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
	  if ((VT.isVector() ||
	       (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
	      isa<LoadSDNode>(St->getValue()) &&
	      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
	      St->getChain().hasOneUse() && !St->isVolatile()) {
	    LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());

	    if (!ISD::isNormalLoad(Ld))
	      return SDValue();

	    // If this is not the MMX case, i.e. we are just turning i64 load/store
	    // into f64 load/store, avoid the transformation if there are multiple
	    // uses of the loaded value.
	    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
	      return SDValue();

	    SDLoc LdDL(Ld);
	    SDLoc StDL(N);
	    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
	    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
	    // pair instead.
	    if (Subtarget.is64Bit() || F64IsLegal) {
	      MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
	      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
	                                  Ld->getMemOperand());

	      // Make sure new load is placed in same chain order.
	      DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
	      return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
	                          St->getMemOperand());
	    }

	    // Otherwise, lower to two pairs of 32-bit loads / stores.
	    SDValue LoAddr = Ld->getBasePtr();
	    SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

	    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
	                               Ld->getPointerInfo(), Ld->getAlignment(),
	                               Ld->getMemOperand()->getFlags());
	    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
	                               Ld->getPointerInfo().getWithOffset(4),
	                               MinAlign(Ld->getAlignment(), 4),
	                               Ld->getMemOperand()->getFlags());
	    // Make sure new loads are placed in same chain order.
	    DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
	    DAG.makeEquivalentMemoryOrdering(Ld, HiLd);

	    LoAddr = St->getBasePtr();
	    HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

	    SDValue LoSt =
	        DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
	                     St->getAlignment(), St->getMemOperand()->getFlags());
	    SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
	                                St->getPointerInfo().getWithOffset(4),
	                                MinAlign(St->getAlignment(), 4),
	                                St->getMemOperand()->getFlags());
	    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
	  }

	  // This is similar to the above case, but here we handle a scalar 64-bit
	  // integer store that is extracted from a vector on a 32-bit target.
	  // If we have SSE2, then we can treat it like a floating-point double
	  // to get past legalization. The execution dependencies fixup pass will
	  // choose the optimal machine instruction for the store if this really is
	  // an integer or v2f32 rather than an f64.
	  if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
	      St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	    SDValue OldExtract = St->getOperand(1);
	    SDValue ExtOp0 = OldExtract.getOperand(0);
	    unsigned VecSize = ExtOp0.getValueSizeInBits();
	    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
	    SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
	    SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	                                     BitCast, OldExtract.getOperand(1));
	    return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
	                        St->getPointerInfo(), St->getAlignment(),
	                        St->getMemOperand()->getFlags());
	  }

	  return SDValue();
	}

	/// Return 'true' if this vector operation is "horizontal"
	/// and return the operands for the horizontal operation in LHS and RHS. A
	/// horizontal operation performs the binary operation on successive elements
	/// of its first operand, then on successive elements of its second operand,
	/// returning the resulting values in a vector. For example, if
	/// A = < float a0, float a1, float a2, float a3 >
	/// and
	/// B = < float b0, float b1, float b2, float b3 >
	/// then the result of doing a horizontal operation on A and B is
	/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
	/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
	/// A horizontal-op B, for some already available A and B, and if so then LHS is
	/// set to A, RHS to B, and the routine returns 'true'.
	/// Note that the binary operation should have the property that if one of the
	/// operands is UNDEF then the result is UNDEF.
	///
	/// \p IsCommutative additionally accepts each element pair in swapped order.
	/// LHS and RHS are only modified when the function returns true.
	static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
	  // Look for the following pattern: if
	  // A = < float a0, float a1, float a2, float a3 >
	  // B = < float b0, float b1, float b2, float b3 >
	  // and
	  // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
	  // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
	  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
	  // which is A horizontal-op B.

	  // At least one of the operands should be a vector shuffle.
	  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
	      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
	    return false;

	  MVT VT = LHS.getSimpleValueType();

	  assert((VT.is128BitVector() || VT.is256BitVector()) &&
	         "Unsupported vector type for horizontal add/sub");

	  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
	  // operate independently on 128-bit lanes.
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned NumLanes = VT.getSizeInBits()/128;
	  unsigned NumLaneElts = NumElts / NumLanes;
	  assert((NumLaneElts % 2 == 0) &&
	         "Vector type should have an even number of elements in each lane");
	  unsigned HalfLaneElts = NumLaneElts/2;

	  // View LHS in the form
	  // LHS = VECTOR_SHUFFLE A, B, LMask
	  // If LHS is not a shuffle then pretend it is the shuffle
	  // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
	  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
	  // type VT.
	  SDValue A, B;
	  SmallVector<int, 16> LMask(NumElts);
	  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
	    if (!LHS.getOperand(0).isUndef())
	      A = LHS.getOperand(0);
	    if (!LHS.getOperand(1).isUndef())
	      B = LHS.getOperand(1);
	    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
	    std::copy(Mask.begin(), Mask.end(), LMask.begin());
	  } else {
	    if (!LHS.isUndef())
	      A = LHS;
	    for (unsigned i = 0; i != NumElts; ++i)
	      LMask[i] = i;
	  }

	  // Likewise, view RHS in the form
	  // RHS = VECTOR_SHUFFLE C, D, RMask
	  SDValue C, D;
	  SmallVector<int, 16> RMask(NumElts);
	  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
	    if (!RHS.getOperand(0).isUndef())
	      C = RHS.getOperand(0);
	    if (!RHS.getOperand(1).isUndef())
	      D = RHS.getOperand(1);
	    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
	    std::copy(Mask.begin(), Mask.end(), RMask.begin());
	  } else {
	    if (!RHS.isUndef())
	      C = RHS;
	    for (unsigned i = 0; i != NumElts; ++i)
	      RMask[i] = i;
	  }

	  // Check that the shuffles are both shuffling the same vectors.
	  if (!(A == C && B == D) && !(A == D && B == C))
	    return false;

	  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
	  if (!A.getNode() && !B.getNode())
	    return false;

	  // If A and B occur in reverse order in RHS, then "swap" them (which means
	  // rewriting the mask).
	  if (A != C)
	    ShuffleVectorSDNode::commuteMask(RMask);

	  // At this point LHS and RHS are equivalent to
	  // LHS = VECTOR_SHUFFLE A, B, LMask
	  // RHS = VECTOR_SHUFFLE A, B, RMask
	  // Check that the masks correspond to performing a horizontal operation.
	  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
	    for (unsigned i = 0; i != NumLaneElts; ++i) {
	      int LIdx = LMask[i+l], RIdx = RMask[i+l];

	      // Ignore any UNDEF components.
	      if (LIdx < 0 || RIdx < 0 ||
	          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
	          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
	        continue;

	      // Check that successive elements are being operated on. If not, this is
	      // not a horizontal operation.
	      // Src is 0 for the first half of the lane (elements taken from A) and 1
	      // for the second half (elements taken from B, whose shuffle indices
	      // start at NumElts). Index is the even member of the expected pair.
	      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
	      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
	      if (!(LIdx == Index && RIdx == Index + 1) &&
	          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
	        return false;
	    }
	  }

	  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
	  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
	  return true;
	}

	/// Do target-specific dag combines on floating-point adds/subs.
	///
	/// For v4f32/v2f64 with SSE3, or v8f32/v4f64 when Subtarget.hasFp256(), an
	/// FADD/FSUB whose operands pass isHorizontalBinOp() is replaced by
	/// X86ISD::FHADD / X86ISD::FHSUB on the operands that function returns.
	/// FADD is treated as commutative for the match, FSUB is not. Returns an
	/// empty SDValue when no horizontal form is found.
	static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  bool IsFadd = N->getOpcode() == ISD::FADD;
	  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

	  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
	  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
	       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
	      isHorizontalBinOp(LHS, RHS, IsFadd)) {
	    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
	    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
	  }
	  return SDValue();
	}

	/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
	/// the codegen.
	/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
	///
	/// Only vector truncations whose source node has no other user are
	/// considered. AND/XOR/OR/ADD/MUL sources are handled; MUL with i64 source
	/// elements is always pre-truncated when the narrow MUL is legal and the
	/// subtarget lacks DQI. Returns an empty SDValue when nothing applies.
	static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget,
	                                          SDLoc &DL) {
	  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
	  SDValue Src = N->getOperand(0);
	  unsigned Opcode = Src.getOpcode();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();

	  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
	    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

	    // Repeated operand, so we are only trading one output truncation for
	    // one input truncation.
	    if (Op0 == Op1)
	      return true;

	    // See if either operand has been extended from a smaller/equal size to
	    // the truncation size, allowing a truncation to combine with the extend.
	    auto IsExtendFromNarrow = [TruncSizeInBits](SDValue Op) {
	      unsigned ExtOpcode = Op.getOpcode();
	      return (ExtOpcode == ISD::ANY_EXTEND || ExtOpcode == ISD::SIGN_EXTEND ||
	              ExtOpcode == ISD::ZERO_EXTEND) &&
	             Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits;
	    };
	    if (IsExtendFromNarrow(Op0) || IsExtendFromNarrow(Op1))
	      return true;

	    // See if either operand is a single use constant which can be constant
	    // folded.
	    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
	    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
	    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
	           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
	  };

	  // Build Opcode(TRUNC(N0), TRUNC(N1)) in the narrow type.
	  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
	    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
	    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
	    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
	  };

	  // Don't combine if the operation has other uses.
	  if (!N->isOnlyUserOf(Src.getNode()))
	    return SDValue();

	  // Only support vector truncation for now.
	  // TODO: i64 scalar math would benefit as well.
	  if (!VT.isVector())
	    return SDValue();

	  // In most cases its only worth pre-truncating if we're only facing the cost
	  // of one truncation.
	  // i.e. if one of the inputs will constant fold or the input is repeated.
	  switch (Opcode) {
	  case ISD::AND:
	  case ISD::XOR:
	  case ISD::OR: {
	    SDValue Op0 = Src.getOperand(0);
	    SDValue Op1 = Src.getOperand(1);
	    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
	        IsRepeatedOpOrFreeTruncation(Op0, Op1))
	      return TruncateArithmetic(Op0, Op1);
	    break;
	  }

	  case ISD::MUL:
	    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
	    // better to truncate if we have the chance.
	    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
	        !Subtarget.hasDQI())
	      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
	    LLVM_FALLTHROUGH;
	  case ISD::ADD: {
	    // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
	    SDValue Op0 = Src.getOperand(0);
	    SDValue Op1 = Src.getOperand(1);
	    if (TLI.isOperationLegal(Opcode, VT) &&
	        IsRepeatedOpOrFreeTruncation(Op0, Op1))
	      return TruncateArithmetic(Op0, Op1);
	    break;
	  }

	  default:
	    // Every other source opcode is left untouched.
	    break;
	  }

	  return SDValue();
	}

	/// Truncate a group of v4i32 or v2i64 registers into a vector of i8 or i16
	/// elements using X86ISD::PACKUS.
	///
	/// \p Regs holds the 128-bit input pieces and is modified in place. Each
	/// element is first ANDed with a low-bits mask of the output element width,
	/// then pairs of registers are packed repeatedly, halving the element width
	/// each round. The result is Regs[0], a CONCAT_VECTORS of the surviving
	/// registers, or (for a v8i8 result) a subvector extracted after one more
	/// pack.
	static SDValue
	combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
	                                  SmallVector<SDValue, 8> &Regs) {
	  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
	                             Regs[0].getValueType() == MVT::v2i64));
	  EVT OutVT = N->getValueType(0);
	  EVT OutSVT = OutVT.getVectorElementType();
	  EVT InVT = Regs[0].getValueType();
	  EVT InSVT = InVT.getVectorElementType();
	  SDLoc DL(N);

	  // First, use mask to unset all bits that won't appear in the result.
	  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
	         "OutSVT can only be either i8 or i16.");
	  APInt Mask =
	      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
	  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
	  for (auto &Reg : Regs)
	    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

	  // Input/output types of each PACKUS step, chosen by the final element size.
	  MVT UnpackedVT, PackedVT;
	  if (OutSVT == MVT::i8) {
	    UnpackedVT = MVT::v8i16;
	    PackedVT = MVT::v16i8;
	  } else {
	    UnpackedVT = MVT::v4i32;
	    PackedVT = MVT::v8i16;
	  }

	  // In each iteration, truncate the type by a half size.
	  auto RegNum = Regs.size();
	  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
	       j < e; j *= 2, RegNum /= 2) {
	    for (unsigned i = 0; i < RegNum; i++)
	      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
	    for (unsigned i = 0; i < RegNum / 2; i++)
	      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
	                            Regs[i * 2 + 1]);
	  }

	  // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
	  // then extract a subvector as the result since v8i8 is not a legal type.
	  if (OutVT == MVT::v8i8) {
	    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
	    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
	                          DAG.getIntPtrConstant(0, DL));
	    return Regs[0];
	  } else if (RegNum > 1) {
	    // Drop the stale tail left behind by the in-place packing.
	    Regs.resize(RegNum);
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
	  } else
	    return Regs[0];
	}

	/// Narrow a group of v4i32 registers to v8i16 pieces with X86ISD::PACKSS.
	///
	/// \p Regs is modified in place: every register is shifted left and then
	/// arithmetic-shifted right by 16 bits, after which adjacent pairs are
	/// packed into the front half of the list. With more than two input
	/// registers the packed pieces are concatenated into the node's result
	/// type; otherwise Regs[0] is returned as is.
	static SDValue
	combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG,
	                                  SmallVector<SDValue, 8> &Regs) {
	  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
	  EVT ResultVT = N->getValueType(0);
	  SDLoc Loc(N);

	  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
	  SDValue Sixteen = DAG.getConstant(16, Loc, MVT::i32);
	  for (unsigned Idx = 0, End = Regs.size(); Idx != End; ++Idx) {
	    SDValue Shl = getTargetVShiftNode(X86ISD::VSHLI, Loc, MVT::v4i32,
	                                      Regs[Idx], Sixteen, Subtarget, DAG);
	    Regs[Idx] = getTargetVShiftNode(X86ISD::VSRAI, Loc, MVT::v4i32, Shl,
	                                    Sixteen, Subtarget, DAG);
	  }

	  // Pack neighbouring registers; results land in the front half of Regs.
	  const unsigned NumPairs = Regs.size() / 2;
	  for (unsigned Pair = 0; Pair < NumPairs; ++Pair)
	    Regs[Pair] = DAG.getNode(X86ISD::PACKSS, Loc, MVT::v8i16,
	                             Regs[2 * Pair], Regs[2 * Pair + 1]);

	  if (Regs.size() <= 2)
	    return Regs[0];

	  // Keep only the packed pieces and glue them together.
	  Regs.resize(Regs.size() / 2);
	  return DAG.getNode(ISD::CONCAT_VECTORS, Loc, ResultVT, Regs);
	}

	/// Transform a truncation from vXi32/vXi64 to vXi8/vXi16 into
	/// X86ISD::PACKUS/X86ISD::PACKSS operations.
	///
	/// This is done here because after type legalization the truncation will be
	/// translated into a BUILD_VECTOR with each element that is extracted from a
	/// vector and then truncated, and it is difficult to do this optimization
	/// based on them.
	///
	/// \param N         The truncate node.
	/// \param DAG       The DAG in which the new nodes are created.
	/// \param Subtarget Used to query the available SSE/AVX feature level.
	/// \returns The replacement value, or an empty SDValue when the pattern or
	///          the subtarget does not qualify.
	static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  EVT OutVT = N->getValueType(0);
	  if (!OutVT.isVector())
	    return SDValue();

	  SDValue In = N->getOperand(0);
	  EVT InVT = In.getValueType();
	  if (!InVT.isSimple())
	    return SDValue();

	  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
	  // SSE2, and we need to take care of it specially.
	  // AVX512 provides vpmovdb.
	  if (!Subtarget.hasSSE2())
	    return SDValue();
	  if (Subtarget.hasAVX2())
	    return SDValue();

	  EVT OutSVT = OutVT.getVectorElementType();
	  EVT InSVT = InVT.getVectorElementType();
	  const unsigned NumElems = OutVT.getVectorNumElements();

	  // Only i32/i64 -> i8/i16 with a power-of-two element count of at least 8.
	  const bool WideSource = InSVT == MVT::i32 || InSVT == MVT::i64;
	  const bool NarrowResult = OutSVT == MVT::i8 || OutSVT == MVT::i16;
	  if (!WideSource || !NarrowResult || !isPowerOf2_32(NumElems) ||
	      NumElems < 8)
	    return SDValue();

	  // SSSE3's pshufb results in less instructions in the cases below.
	  if (Subtarget.hasSSSE3() && NumElems == 8) {
	    if (OutSVT == MVT::i8 && InSVT != MVT::i64)
	      return SDValue();
	    if (InSVT == MVT::i32 && OutSVT == MVT::i16)
	      return SDValue();
	  }

	  SDLoc DL(N);

	  // Split a long vector into vectors of legal (128-bit) type.
	  const unsigned EltsPerReg = 128 / InSVT.getSizeInBits();
	  const unsigned RegNum = InVT.getSizeInBits() / 128;
	  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, EltsPerReg);
	  SmallVector<SDValue, 8> SubVec(RegNum);
	  unsigned FirstElt = 0;
	  for (SDValue &Piece : SubVec) {
	    Piece = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
	                        DAG.getIntPtrConstant(FirstElt, DL));
	    FirstElt += EltsPerReg;
	  }

	  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
	  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
	  // truncate 2 x v4i32 to v8i16.
	  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
	    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
	  if (InSVT == MVT::i32)
	    return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
	  return SDValue();
	}

	/// Transform a vector truncation of 'extended sign-bits' or 'extended
	/// zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
	/// X86ISD::PACKSS/PACKUS operations.
	///
	/// \param N         The truncate node.
	/// \param DL        Debug location used for the new nodes.
	/// \param DAG       The DAG in which the new nodes are created.
	/// \param Subtarget Used to query SSE2/SSE4.1/AVX512 availability.
	/// \returns The result of truncateVectorWithPACK when the input's known
	///          sign/zero bits allow it, otherwise an empty SDValue.
	static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
	                                               SelectionDAG &DAG,
	                                               const X86Subtarget &Subtarget) {
	  // Requires SSE2 but AVX512 has fast truncate.
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
	    return SDValue();

	  EVT ResVT = N->getValueType(0);
	  if (!ResVT.isVector() || !ResVT.isSimple())
	    return SDValue();

	  SDValue In = N->getOperand(0);
	  EVT SrcVT = In.getValueType();
	  if (!SrcVT.isSimple())
	    return SDValue();

	  MVT VT = ResVT.getSimpleVT();
	  MVT SVT = VT.getScalarType();
	  MVT InSVT = SrcVT.getSimpleVT().getScalarType();

	  // Check we have a truncation suited for PACKSS.
	  if (!VT.is128BitVector() && !VT.is256BitVector())
	    return SDValue();
	  switch (SVT.SimpleTy) {
	  case MVT::i8:
	  case MVT::i16:
	  case MVT::i32:
	    break;
	  default:
	    return SDValue();
	  }
	  switch (InSVT.SimpleTy) {
	  case MVT::i16:
	  case MVT::i32:
	  case MVT::i64:
	    break;
	  default:
	    return SDValue();
	  }

	  const unsigned InEltBits = InSVT.getSizeInBits();

	  // Use PACKSS if the input has sign-bits that extend all the way to the
	  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
	  unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
	  if (DAG.ComputeNumSignBits(In) > InEltBits - NumPackedBits)
	    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);

	  // Use PACKUS if the input has zero-bits that extend all the way to the
	  // packed/truncated value. e.g. masks, zext_in_reg, etc.
	  // Without SSE4.1 only 8 packed bits are assumed for the PACKUS path.
	  KnownBits Known;
	  DAG.computeKnownBits(In, Known);
	  if (!Subtarget.hasSSE41())
	    NumPackedBits = 8;
	  if (Known.countMinLeadingZeros() >= InEltBits - NumPackedBits)
	    return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);

	  return SDValue();
	}

	/// Do target-specific dag combines on truncate nodes.
	///
	/// The folds are attempted in a fixed order and the first one that produces
	/// a value wins: truncated arithmetic, AVG pattern, unsigned saturation,
	/// the MMX bitcast special case, sign/zero-bit PACK truncation and finally
	/// the generic PACKUS/PACKSS vector truncation.
	///
	/// \returns The replacement value, or an empty SDValue when nothing applied.
	static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  // Attempt to pre-truncate inputs to arithmetic ops instead.
	  SDValue Folded = combineTruncatedArithmetic(N, DAG, Subtarget, DL);
	  // Try to detect AVG pattern first.
	  if (!Folded)
	    Folded = detectAVGPattern(Src, VT, DAG, Subtarget, DL);
	  // Try to combine truncation with unsigned saturation.
	  if (!Folded)
	    Folded = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget);
	  if (Folded)
	    return Folded;

	  // The bitcast source is a direct mmx result.
	  // Detect bitcasts between i32 to x86mmx
	  if (VT == MVT::i32 && Src.getOpcode() == ISD::BITCAST) {
	    SDValue MMXVal = Src.getOperand(0);
	    if (MMXVal.getValueType() == MVT::x86mmx)
	      return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, MMXVal);
	  }

	  // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
	  SDValue Packed = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget);
	  if (Packed)
	    return Packed;

	  return combineVectorTruncation(N, DAG, Subtarget);
	}

	/// Returns the negated value if the node \p N flips sign of FP value.
	///
	/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
	/// AVX512F does not have FXOR, so FNEG is lowered as
	/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
	/// In this case we go though all bitcasts.
	///
	/// \param N The node to inspect.
	/// \returns The operand being negated (with bitcasts peeled off for the
	///          XOR/FXOR forms), or an empty SDValue when \p N is not recognised
	///          as a negation. Constants that are not a ConstantFP are treated as
	///          "not a sign mask" instead of being cast unconditionally.
	static SDValue isFNEG(SDNode *N) {
	  if (N->getOpcode() == ISD::FNEG)
	    return N->getOperand(0);

	  SDValue Op = peekThroughBitcasts(SDValue(N, 0));
	  if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
	    return SDValue();

	  SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
	  if (!Op1.getValueType().isFloatingPoint())
	    return SDValue();

	  SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));

	  unsigned EltBits = Op1.getScalarValueSizeInBits();
	  // Accepts null so that a failed dyn_cast simply reports "no match".
	  auto isSignMask = [&](const ConstantFP *C) {
	    return C &&
	           C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
	  };

	  // There is more than one way to represent the same constant on
	  // the different X86 targets. The type of the node may also depend on size.
	  // - load scalar value and broadcast
	  // - BUILD_VECTOR node
	  // - load from a constant pool.
	  // We check all variants here.
	  if (Op1.getOpcode() == X86ISD::VBROADCAST) {
	    // The constant behind the broadcast is not guaranteed to be a ConstantFP,
	    // so use dyn_cast rather than an asserting cast.
	    if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
	      if (isSignMask(dyn_cast<ConstantFP>(C)))
	        return Op0;

	  } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
	    if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
	      if (isSignMask(CN->getConstantFPValue()))
	        return Op0;

	  } else if (auto *C = getTargetConstantFromNode(Op1)) {
	    if (C->getType()->isVectorTy()) {
	      // The splat element may be something other than a ConstantFP.
	      if (auto *SplatV = C->getSplatValue())
	        if (isSignMask(dyn_cast<ConstantFP>(SplatV)))
	          return Op0;
	    } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
	      if (isSignMask(FPConst))
	        return Op0;
	  }
	  return SDValue();
	}

	/// Do target-specific dag combines on floating point negations.
	///
	/// \param N         A node for which isFNEG() returns a value (asserted).
	/// \param DAG       The DAG in which the new nodes are created.
	/// \param Subtarget Used to query FMA availability.
	/// \returns A value of N's original type that folds the negation into an
	///          FMUL or FMA-family operand, or an empty SDValue.
	static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  EVT OrigVT = N->getValueType(0);
	  SDValue Arg = isFNEG(N);
	  assert(Arg.getNode() && "N is expected to be an FNEG node");

	  SDLoc DL(N);
	  EVT VT = Arg.getValueType();
	  EVT SVT = VT.getScalarType();

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // If we're negating a FMUL node on a target with FMA, then we can avoid the
	  // use of a constant by performing (-0 - A*B) instead.
	  // FIXME: Check rounding control flags as well once it becomes available.
	  const bool IsF32OrF64 = SVT == MVT::f32 || SVT == MVT::f64;
	  if (Arg.getOpcode() == ISD::FMUL && IsF32OrF64 &&
	      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
	    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
	    SDValue NegMul = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
	                                 Arg.getOperand(1), Zero);
	    return DAG.getBitcast(OrigVT, NegMul);
	  }

	  // If we're negating an FMA node, then we can adjust the
	  // instruction to include the extra negation. Only done when the FMA has
	  // no other users.
	  if (!Arg.hasOneUse())
	    return SDValue();

	  unsigned NegatedOpcode;
	  switch (Arg.getOpcode()) {
	  default:
	    // We can't handle scalar intrinsic node here because it would only
	    // invert one element and not the whole vector. But we could try to handle
	    // a negation of the lower element only.
	    return SDValue();
	  case ISD::FMA:
	    NegatedOpcode = X86ISD::FNMSUB;
	    break;
	  case X86ISD::FMSUB:
	    NegatedOpcode = X86ISD::FNMADD;
	    break;
	  case X86ISD::FNMADD:
	    NegatedOpcode = X86ISD::FMSUB;
	    break;
	  case X86ISD::FNMSUB:
	    NegatedOpcode = ISD::FMA;
	    break;
	  case X86ISD::FMADD_RND:
	    NegatedOpcode = X86ISD::FNMSUB_RND;
	    break;
	  case X86ISD::FMSUB_RND:
	    NegatedOpcode = X86ISD::FNMADD_RND;
	    break;
	  case X86ISD::FNMADD_RND:
	    NegatedOpcode = X86ISD::FMSUB_RND;
	    break;
	  case X86ISD::FNMSUB_RND:
	    NegatedOpcode = X86ISD::FMADD_RND;
	    break;
	  }

	  SDValue Negated = DAG.getNode(NegatedOpcode, DL, VT, Arg.getNode()->ops());
	  return DAG.getBitcast(OrigVT, Negated);
	}

	/// Rewrite a vector X86ISD::FOR/FXOR/FAND/FANDN node as the matching integer
	/// logic op on a vXi64 bitcast of its operands.
	///
	/// \returns The integer op bitcast back to N's type, or an empty SDValue when
	///          N is not a vector or SSE2 is unavailable.
	static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget) {
	  MVT VT = N->getSimpleValueType(0);

	  // If we have integer vector types available, use the integer opcodes.
	  if (!VT.isVector() || !Subtarget.hasSSE2())
	    return SDValue();

	  SDLoc dl(N);
	  const unsigned NumI64Lanes = VT.getSizeInBits() / 64;
	  MVT IntVT = MVT::getVectorVT(MVT::i64, NumI64Lanes);

	  SDValue LHS = DAG.getBitcast(IntVT, N->getOperand(0));
	  SDValue RHS = DAG.getBitcast(IntVT, N->getOperand(1));

	  unsigned IntOpcode;
	  switch (N->getOpcode()) {
	  case X86ISD::FOR:
	    IntOpcode = ISD::OR;
	    break;
	  case X86ISD::FXOR:
	    IntOpcode = ISD::XOR;
	    break;
	  case X86ISD::FAND:
	    IntOpcode = ISD::AND;
	    break;
	  case X86ISD::FANDN:
	    IntOpcode = X86ISD::ANDNP;
	    break;
	  default:
	    llvm_unreachable("Unexpected FP logic op");
	  }

	  return DAG.getBitcast(VT, DAG.getNode(IntOpcode, dl, IntVT, LHS, RHS));
	}


	/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
	///
	/// \returns The SETCC with the opposite condition code, or an empty SDValue
	///          when N is not an XOR of an X86ISD::SETCC with the constant 1.
	static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() != ISD::XOR)
	    return SDValue();

	  SDValue SetCC = N->getOperand(0);
	  auto *One = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!One)
	    return SDValue();
	  if (One->getZExtValue() != 1)
	    return SDValue();
	  if (SetCC->getOpcode() != X86ISD::SETCC)
	    return SDValue();

	  // Operand 0 of the SETCC holds the condition code, operand 1 the flags.
	  X86::CondCode OldCC = X86::CondCode(SetCC->getConstantOperandVal(0));
	  X86::CondCode NewCC = X86::GetOppositeBranchCondition(OldCC);
	  SDLoc DL(N);
	  return getSETCC(NewCC, SetCC->getOperand(1), DL, DAG);
	}

	/// Do target-specific dag combines on ISD::XOR nodes.
	///
	/// \returns The replacement value from the first fold that applies, or an
	///          empty SDValue. Folds after foldVectorXorShiftIntoCmp are skipped
	///          while DCI reports that op legalization has not happened yet.
	static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  // If this is SSE1 only convert to FXOR to avoid scalarization.
	  const bool SSE1Only = Subtarget.hasSSE1() && !Subtarget.hasSSE2();
	  if (SSE1Only && N->getValueType(0) == MVT::v4i32) {
	    SDValue LHS = DAG.getBitcast(MVT::v4f32, N->getOperand(0));
	    SDValue RHS = DAG.getBitcast(MVT::v4f32, N->getOperand(1));
	    SDValue FXor = DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, LHS, RHS);
	    return DAG.getBitcast(MVT::v4i32, FXor);
	  }

	  SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
	  if (Cmp)
	    return Cmp;

	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue Res = foldXor1SetCC(N, DAG);
	  if (!Res)
	    Res = foldXorTruncShiftIntoCmp(N, DAG);
	  if (!Res)
	    Res = convertIntLogicToFPLogic(N, DAG, Subtarget);
	  if (Res)
	    return Res;

	  // An XOR with a sign-mask constant is a floating point negation.
	  if (!isFNEG(N))
	    return SDValue();
	  return combineFneg(N, DAG, Subtarget);
	}


	/// Return true if \p V is a null FP constant (per isNullFPConstant) or an
	/// all-zeros build vector (per ISD::isBuildVectorAllZeros).
	static bool isNullFPScalarOrVectorConst(SDValue V) {
	  if (isNullFPConstant(V))
	    return true;
	  return ISD::isBuildVectorAllZeros(V.getNode());
	}

	/// If a value is a scalar FP zero or a vector FP zero (potentially including
	/// undefined elements), return a zero constant that may be used to fold away
	/// that value. In the case of a vector, the returned constant will not contain
	/// undefined elements even if the input parameter does. This makes it suitable
	/// to be used as a replacement operand with operations (eg, bitwise-and) where
	/// an undef should not propagate.
	///
	/// \returns \p V itself for a scalar zero, a fresh zero vector for a vector
	///          zero, or an empty SDValue when \p V is not a zero.
	static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  if (!isNullFPScalarOrVectorConst(V))
	    return SDValue();

	  // A scalar zero can be reused as-is.
	  if (!V.getValueType().isVector())
	    return V;

	  return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
	}

	/// Fold an X86ISD::FAND whose operand is an FXOR with all-ones into an
	/// X86ISD::FANDN:
	///   fand (fxor X, -1), Y --> fandn X, Y
	///   fand X, (fxor Y, -1) --> fandn Y, X
	///
	/// Only f32 (SSE1), f64 (SSE2) and v4f32 on SSE1-only targets are handled.
	///
	/// \returns The FANDN node, or an empty SDValue when nothing matched.
	static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
	  const bool ScalarF32 = VT == MVT::f32 && Subtarget.hasSSE1();
	  const bool ScalarF64 = VT == MVT::f64 && Subtarget.hasSSE2();
	  const bool SSE1OnlyV4F32 =
	      VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2();
	  if (!ScalarF32 && !ScalarF64 && !SSE1OnlyV4F32)
	    return SDValue();

	  auto IsAllOnesFP = [](SDValue V) {
	    if (V.getSimpleValueType().isVector())
	      return ISD::isBuildVectorAllOnes(V.getNode());
	    auto *C = dyn_cast<ConstantFPSDNode>(V);
	    return C && C->getConstantFPValue()->isAllOnesValue();
	  };

	  // Try to treat NotSide as (fxor X, -1); on success emit fandn X, Other.
	  auto TryFold = [&](SDValue NotSide, SDValue Other) -> SDValue {
	    if (NotSide.getOpcode() != X86ISD::FXOR)
	      return SDValue();
	    if (!IsAllOnesFP(NotSide.getOperand(1)))
	      return SDValue();
	    return DAG.getNode(X86ISD::FANDN, DL, VT, NotSide.getOperand(0), Other);
	  };

	  if (SDValue Folded = TryFold(N0, N1))
	    return Folded;
	  return TryFold(N1, N0);
	}

	/// Do target-specific dag combines on X86ISD::FAND nodes.
	///
	/// \returns A zero constant when either operand is an FP zero, the FANDN fold
	///          when it applies, and otherwise whatever lowerX86FPLogicOp yields.
	static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  // FAND(0.0, x) -> 0.0 and FAND(x, 0.0) -> 0.0
	  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx) {
	    SDValue Zero =
	        getNullFPConstForNullVal(N->getOperand(OpIdx), DAG, Subtarget);
	    if (Zero)
	      return Zero;
	  }

	  SDValue AndN = combineFAndFNotToFAndn(N, DAG, Subtarget);
	  if (AndN)
	    return AndN;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FANDN nodes.
	///
	/// \returns Operand 1 when operand 0 is an FP zero, a zero constant when
	///          operand 1 is an FP zero, and otherwise whatever
	///          lowerX86FPLogicOp yields.
	static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  SDValue Inverted = N->getOperand(0);
	  SDValue Other = N->getOperand(1);

	  // FANDN(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(Inverted))
	    return Other;

	  // FANDN(x, 0.0) -> 0.0
	  SDValue Zero = getNullFPConstForNullVal(Other, DAG, Subtarget);
	  if (Zero)
	    return Zero;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
	///
	/// \returns The non-zero operand when the other one is an FP zero, the
	///          combineFneg result when N is a negation and that fold succeeds,
	///          and otherwise whatever lowerX86FPLogicOp yields.
	static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // F[X]OR(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(LHS))
	    return RHS;

	  // F[X]OR(x, 0.0) -> x
	  if (isNullFPScalarOrVectorConst(RHS))
	    return LHS;

	  if (isFNEG(N)) {
	    SDValue NewVal = combineFneg(N, DAG, Subtarget);
	    if (NewVal)
	      return NewVal;
	  }

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
	///
	/// \returns In unsafe-math mode, the same operation expressed with the
	///          commutative FMINC/FMAXC opcode; otherwise an empty SDValue.
	static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

	  // Only perform optimizations if UnsafeMath is used.
	  if (!DAG.getTarget().Options.UnsafeFPMath)
	    return SDValue();

	  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
	  // into FMINC and FMAXC, which are Commutative operations.
	  const unsigned Opc = N->getOpcode();
	  unsigned CommutativeOpc = 0;
	  if (Opc == X86ISD::FMIN)
	    CommutativeOpc = X86ISD::FMINC;
	  else if (Opc == X86ISD::FMAX)
	    CommutativeOpc = X86ISD::FMAXC;
	  else
	    llvm_unreachable("unknown opcode");

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  return DAG.getNode(CommutativeOpc, SDLoc(N), N->getValueType(0), LHS, RHS);
	}

	/// Lower ISD::FMINNUM/ISD::FMAXNUM to an X86ISD::FMIN/FMAX plus a select that
	/// filters out a NaN in operand 0.
	///
	/// \returns The select node, or an empty SDValue for soft-float targets,
	///          unsupported types, or scalars when optimizing for minimum size.
	static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (Subtarget.useSoftFloat())
	    return SDValue();

	  // TODO: Check for global or instruction-level "nnan". In that case, we
	  // should be able to lower to FMAX/FMIN alone.
	  // TODO: If an operand is already known to be a NaN or not a NaN, this
	  // should be an optional swap and FMAX/FMIN.

	  EVT VT = N->getValueType(0);
	  const bool SSE1Type =
	      Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32);
	  const bool SSE2Type =
	      Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64);
	  const bool AVXType =
	      Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64);
	  if (!SSE1Type && !SSE2Type && !AVXType)
	    return SDValue();

	  // This takes at least 3 instructions, so favor a library call when operating
	  // on a scalar and minimizing code size.
	  if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
	    return SDValue();

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  SDLoc DL(N);
	  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
	      DAG.getDataLayout(), *DAG.getContext(), VT);

	  // There are 4 possibilities involving NaN inputs, and these are the required
	  // outputs:
	  //                   Op1
	  //               Num     NaN
	  //            ----------------
	  //       Num  |  Max  |  Op0 |
	  // Op0        ----------------
	  //       NaN  |  Op1  |  NaN |
	  //            ----------------
	  //
	  // The SSE FP max/min instructions were not designed for this case, but rather
	  // to implement:
	  //   Min = Op1 < Op0 ? Op1 : Op0
	  //   Max = Op1 > Op0 ? Op1 : Op0
	  //
	  // So they always return Op0 if either input is a NaN. However, we can still
	  // use those instructions for fmaxnum by selecting away a NaN input.

	  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
	  unsigned MinMaxOp = X86ISD::FMIN;
	  if (N->getOpcode() == ISD::FMAXNUM)
	    MinMaxOp = X86ISD::FMAX;
	  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);

	  // SETUO of Op0 against itself is true exactly when Op0 is a NaN.
	  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

	  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
	  // are NaN, the NaN value of Op1 is the result.
	  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
	}

	/// Do target-specific dag combines on X86ISD::ANDNP nodes.
	///
	/// \param N         The ANDNP node.
	/// \param DAG       The DAG in which the new nodes are created.
	/// \param DCI       Combiner state; used to install a shuffle-combine result.
	/// \param Subtarget Forwarded to getZeroVector and the shuffle combiner.
	/// \returns Operand 1 when operand 0 is all zeros, a zero vector when operand
	///          1 is all zeros, and an empty SDValue otherwise (including when
	///          the node has already been replaced through \p DCI).
	static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  // ANDNP(0, x) -> x
	  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
	    return N->getOperand(1);

	  // ANDNP(x, 0) -> 0
	  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
	    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

	  EVT VT = N->getValueType(0);

	  // Attempt to recursively combine a bitmask ANDNP with shuffles.
	  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	    SDValue Op(N, 0);
	    if (SDValue Res = combineX86ShufflesRecursively(
	            {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
	            /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
	      // The replacement is installed through DCI, so report "no further
	      // change" to the caller by returning an empty value.
	      DCI.CombineTo(N, Res);
	      return SDValue();
	    }
	  }

	  return SDValue();
	}

	/// Do target-specific dag combines on X86ISD::BT nodes: simplify the bit
	/// index operand using only the low bits that BT actually reads.
	///
	/// \returns A new BT node using the simplified index, or an empty SDValue
	///          when DAG.GetDemandedBits finds no simplification.
	static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
	                         TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Src = N->getOperand(0);
	  SDValue BitNo = N->getOperand(1);

	  // BT ignores high bits in the bit index operand.
	  const unsigned BitWidth = BitNo.getValueSizeInBits();
	  APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));

	  SDValue NarrowedBitNo = DAG.GetDemandedBits(BitNo, DemandedMask);
	  if (!NarrowedBitNo)
	    return SDValue();

	  return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, Src, NarrowedBitNo);
	}

	/// Do target-specific dag combines on vector SIGN_EXTEND_INREG nodes.
	///
	/// Rewrites
	///   (sext_in_reg (v4i64 anyext/sext (v4i32 x)), ExtraVT)
	/// as
	///   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
	///
	/// \returns The rewritten value, or an empty SDValue when the pattern does
	///          not match.
	static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
	  SDLoc dl(N);

	  // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
	  // both SSE and AVX2 since there is no sign-extended shift right
	  // operation on a vector with 64-bit elements.
	  if (VT != MVT::v4i64)
	    return SDValue();

	  const unsigned ExtOpc = N0.getOpcode();
	  if (ExtOpc != ISD::ANY_EXTEND && ExtOpc != ISD::SIGN_EXTEND)
	    return SDValue();

	  SDValue N00 = N0.getOperand(0);

	  // EXTLOAD has a better solution on AVX2,
	  // it may be replaced with X86ISD::VSEXT node.
	  if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256() &&
	      !ISD::isNormalLoad(N00.getNode()))
	    return SDValue();

	  if (N00.getValueType() != MVT::v4i32)
	    return SDValue();
	  if (ExtraVT.getSizeInBits() >= 128)
	    return SDValue();

	  SDValue NarrowSExt =
	      DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
	  return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, NarrowSExt);
	}

	/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
	/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
	/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
	/// opportunities to combine math ops, use an LEA, or use a complex addressing
	/// mode. This can eliminate extend, add, and shift instructions.
	///
	/// \param Ext The SIGN_EXTEND or ZERO_EXTEND node (only i64 results qualify).
	/// \returns The widened add, or an empty SDValue when the pattern does not
	///          match or no user of \p Ext is an ADD or SHL.
	static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  const unsigned ExtOpc = Ext->getOpcode();
	  const bool Sext = ExtOpc == ISD::SIGN_EXTEND;
	  if (!Sext && ExtOpc != ISD::ZERO_EXTEND)
	    return SDValue();

	  // TODO: This should be valid for other integer types.
	  EVT VT = Ext->getValueType(0);
	  if (VT != MVT::i64)
	    return SDValue();

	  SDValue Add = Ext->getOperand(0);
	  if (Add.getOpcode() != ISD::ADD)
	    return SDValue();

	  const bool NSW = Add->getFlags().hasNoSignedWrap();
	  const bool NUW = Add->getFlags().hasNoUnsignedWrap();

	  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
	  // into the 'zext'
	  const bool HasMatchingNoWrap = Sext ? NSW : NUW;
	  if (!HasMatchingNoWrap)
	    return SDValue();

	  // Having a constant operand to the 'add' ensures that we are not increasing
	  // the instruction count because the constant is extended for free below.
	  // A constant operand can also become the displacement field of an LEA.
	  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
	  if (!AddOp1)
	    return SDValue();

	  // Don't make the 'add' bigger if there's no hope of combining it with some
	  // other 'add' or 'shl' instruction.
	  // TODO: It may be profitable to generate simpler LEA instructions in place
	  // of single 'add' instructions, but the cost model for selecting an LEA
	  // currently has a high threshold.
	  auto IsAddOrShl = [](const SDNode *User) {
	    const unsigned Opc = User->getOpcode();
	    return Opc == ISD::ADD || Opc == ISD::SHL;
	  };
	  bool HasLEAPotential = false;
	  for (auto UI = Ext->use_begin(), UE = Ext->use_end();
	       UI != UE && !HasLEAPotential; ++UI)
	    HasLEAPotential = IsAddOrShl(*UI);
	  if (!HasLEAPotential)
	    return SDValue();

	  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
	  int64_t AddConstant;
	  if (Sext)
	    AddConstant = AddOp1->getSExtValue();
	  else
	    AddConstant = AddOp1->getZExtValue();
	  SDValue AddOp0 = Add.getOperand(0);
	  SDValue NewExt = DAG.getNode(ExtOpc, SDLoc(Ext), VT, AddOp0);
	  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

	  // The wider add is guaranteed to not wrap because both operands are
	  // sign-extended.
	  SDNodeFlags Flags;
	  Flags.setNoSignedWrap(NSW);
	  Flags.setNoUnsignedWrap(NUW);
	  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
	}

	/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
	/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
	/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
	/// extends from AH (which we otherwise need to do contortions to access).
	///
	/// \param N A SIGN_EXTEND of an SDIVREM remainder or a ZERO_EXTEND of a
	///          UDIVREM remainder (result number 1, type i8), extended to i32 or
	///          i64.
	/// \returns The extended remainder taken from the new node (extended once
	///          more for i64), or an empty SDValue when the pattern does not
	///          match. Uses of the old quotient are redirected to the new node.
	static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
	  SDValue N0 = N->getOperand(0);
	  const unsigned ExtOpc = N->getOpcode();
	  const unsigned DivRemOpc = N0.getOpcode();

	  const bool IsSigned =
	      ExtOpc == ISD::SIGN_EXTEND && DivRemOpc == ISD::SDIVREM;
	  const bool IsUnsigned =
	      ExtOpc == ISD::ZERO_EXTEND && DivRemOpc == ISD::UDIVREM;
	  if (!IsSigned && !IsUnsigned)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  EVT InVT = N0.getValueType();
	  if (N0.getResNo() != 1)
	    return SDValue();
	  if (InVT != MVT::i8)
	    return SDValue();
	  if (VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();

	  SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
	  const unsigned HRegOpc = IsSigned ? X86ISD::SDIVREM8_SEXT_HREG
	                                    : X86ISD::UDIVREM8_ZEXT_HREG;
	  SDValue R = DAG.getNode(HRegOpc, SDLoc(N), NodeTys, N0.getOperand(0),
	                          N0.getOperand(1));

	  // Keep users of the quotient working by pointing them at the new node.
	  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));

	  // If this was a 64-bit extend, complete it.
	  if (VT == MVT::i64)
	    return DAG.getNode(ExtOpc, SDLoc(N), VT, R.getValue(1));
	  return R.getValue(1);
	}

	// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
	// operands and the result of CMOV is not used anywhere else - promote CMOV
	// itself instead of promoting its result. This could be beneficial, because:
	// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
	//    (or more) pseudo-CMOVs only when they go one-after-another and
	//    getting rid of result extension code after CMOV will help that.
	// 2) Promotion of constant CMOV arguments is free, hence the
	//    {ANY,SIGN,ZERO}_EXTEND will just be deleted.
	// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
	//    promotion is also good in terms of code-size.
	//    (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
	//    promotion).
	//
	// Returns the promoted CMOV, or an empty SDValue when the operand is not a
	// single-use i16 CMOV of two constants being extended to i32/i64.
	static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
	  SDValue CMovN = Extend->getOperand(0);
	  if (CMovN.getOpcode() != X86ISD::CMOV)
	    return SDValue();

	  EVT TargetVT = Extend->getValueType(0);
	  const unsigned ExtendOpcode = Extend->getOpcode();
	  SDLoc DL(Extend);

	  EVT VT = CMovN.getValueType();
	  SDValue CMovOp0 = CMovN.getOperand(0);
	  SDValue CMovOp1 = CMovN.getOperand(1);

	  // Only i16 -> i32/i64 is promoted.
	  if (VT != MVT::i16)
	    return SDValue();
	  if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
	    return SDValue();
	  // The CMOV must have no other users and both values must be constants.
	  if (!CMovN.hasOneUse())
	    return SDValue();
	  if (!isa<ConstantSDNode>(CMovOp0.getNode()))
	    return SDValue();
	  if (!isa<ConstantSDNode>(CMovOp1.getNode()))
	    return SDValue();

	  SDValue WideOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
	  SDValue WideOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);

	  return DAG.getNode(X86ISD::CMOV, DL, TargetVT, WideOp0, WideOp1,
	                     CMovN.getOperand(2), CMovN.getOperand(3));
	}

	// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
	// This is more or less the reverse of combineBitcastvxi1.
	//
	// N is a SIGN_EXTEND, ZERO_EXTEND or ANY_EXTEND node. The combine only runs
	// before op legalization (per DCI) and only on SSE2 targets without AVX512.
	// Returns the replacement vector, or an empty SDValue when N does not extend
	// a vXi1 bitcast of a scalar integer to i8/i16/i32/i64 elements.
	static SDValue
	combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned Opcode = N->getOpcode();
	if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
	Opcode != ISD::ANY_EXTEND)
	return SDValue();
	if (!DCI.isBeforeLegalizeOps())
	return SDValue();
	if (!Subtarget.hasSSE2() \|\| Subtarget.hasAVX512())
	return SDValue();

	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT SVT = VT.getScalarType();
	EVT InSVT = N0.getValueType().getScalarType();
	unsigned EltSizeInBits = SVT.getSizeInBits();

	// Input type must be extending a bool vector (bit-casted from a scalar
	// integer) to legal integer types.
	if (!VT.isVector())
	return SDValue();
	if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
	return SDValue();
	if (InSVT != MVT::i1 \|\| N0.getOpcode() != ISD::BITCAST)
	return SDValue();

	// N00 is the value that was bitcast to the bool vector; it must be a scalar
	// integer.
	SDValue N00 = N0.getOperand(0);
	EVT SclVT = N0.getOperand(0).getValueType();
	if (!SclVT.isScalarInteger())
	return SDValue();

	SDLoc DL(N);
	SDValue Vec;
	SmallVector<int, 32> ShuffleMask;
	unsigned NumElts = VT.getVectorNumElements();
	assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");

	// Broadcast the scalar integer to the vector elements.
	if (NumElts > EltSizeInBits) {
	// If the scalar integer is greater than the vector element size, then we
	// must split it down into sub-sections for broadcasting. For example:
	// i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
	// i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
	assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
	unsigned Scale = NumElts / EltSizeInBits;
	EVT BroadcastVT =
	EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
	Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
	Vec = DAG.getBitcast(VT, Vec);

	// Build the mask as Scale runs of EltSizeInBits entries: run i repeats
	// source element i.
	for (unsigned i = 0; i != Scale; ++i)
	ShuffleMask.append(EltSizeInBits, i);
	} else {
	// For smaller scalar integers, we can simply any-extend it to the vector
	// element size (we don't care about the upper bits) and broadcast it to all
	// elements.
	SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
	Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
	ShuffleMask.append(NumElts, 0);
	}
	Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

	// Now, mask the relevant bit in each element.
	// Element i gets a constant with only bit (i % EltSizeInBits) set.
	SmallVector<SDValue, 32> Bits;
	for (unsigned i = 0; i != NumElts; ++i) {
	int BitIdx = (i % EltSizeInBits);
	APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
	Bits.push_back(DAG.getConstant(Bit, DL, SVT));
	}
	SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
	Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

	// Compare against the bitmask and extend the result.
	// The compare yields a vXi1 value which is then sign-extended (or
	// truncated) to VT by getSExtOrTrunc.
	EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
	Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
	Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

	// For SEXT, this is now done, otherwise shift the result down for
	// zero-extension.
	// Note that ANY_EXTEND takes the same logical-shift path as ZERO_EXTEND.
	if (Opcode == ISD::SIGN_EXTEND)
	return Vec;
	return DAG.getNode(ISD::SRL, DL, VT, Vec,
	DAG.getConstant(EltSizeInBits - 1, DL, VT));
	}

	/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
	/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
	/// with UNDEFs) of the input to vectors of the same size as the target type
	/// which then extends the lowest elements.
	///
	/// \param N         The SIGN_EXTEND or ZERO_EXTEND node.
	/// \param DAG       The DAG in which the new nodes are created.
	/// \param DCI       Combiner state; the combine only runs before op
	///                  legalization.
	/// \param Subtarget Used to query SSE2/SSE4.1/AVX2/AVX512 availability.
	/// \returns The replacement value, or an empty SDValue when the node does not
	///          qualify or none of the size-based strategies below applies.
	static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned Opcode = N->getOpcode();
	if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
	return SDValue();
	if (!DCI.isBeforeLegalizeOps())
	return SDValue();
	if (!Subtarget.hasSSE2())
	return SDValue();

	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT SVT = VT.getScalarType();
	EVT InVT = N0.getValueType();
	EVT InSVT = InVT.getScalarType();

	// Input type must be a vector and we must be extending legal integer types.
	if (!VT.isVector())
	return SDValue();
	if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
	return SDValue();
	if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
	return SDValue();

	// On AVX2+ targets, if the input/output types are both legal then we will be
	// able to use SIGN_EXTEND/ZERO_EXTEND directly.
	if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
	DAG.getTargetLoweringInfo().isTypeLegal(InVT))
	return SDValue();

	SDLoc DL(N);

	// Helper: widen N to a vector of Size bits with the same element type by
	// concatenating it with UNDEF vectors; N becomes the lowest part.
	auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
	EVT InVT = N.getValueType();
	EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
	Size / InVT.getScalarSizeInBits());
	SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
	DAG.getUNDEF(InVT));
	Opnds[0] = N;
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
	};

	// If target-size is less than 128-bits, extend to a type that would extend
	// to 128 bits, extend that and extract the original target vector.
	if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
	unsigned Scale = 128 / VT.getSizeInBits();
	EVT ExVT =
	EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
	SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
	// Despite its name, SExt is built with the original Opcode, so it is a
	// zero-extension when N is a ZERO_EXTEND.
	SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
	DAG.getIntPtrConstant(0, DL));
	}

	// If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
	// ISD::_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::VEXT.
	// Also use this if we don't have SSE41 to allow the legalizer do its job.
	if (!Subtarget.hasSSE41() \|\| VT.is128BitVector() \|\|
	(VT.is256BitVector() && Subtarget.hasInt256()) \|\|
	(VT.is512BitVector() && Subtarget.hasAVX512())) {
	SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
	return Opcode == ISD::SIGN_EXTEND
	? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
	: DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
	}

	// Helper: cut the result into SplitSize-bit pieces. For each piece, extract
	// the matching input elements, widen them to SplitSize bits with UNDEF,
	// extend in-register, and finally concatenate all pieces back into VT.
	auto SplitAndExtendInReg = [&](unsigned SplitSize) {
	unsigned NumVecs = VT.getSizeInBits() / SplitSize;
	unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
	EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
	EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

	SmallVector<SDValue, 8> Opnds;
	for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
	SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
	DAG.getIntPtrConstant(Offset, DL));
	SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
	SrcVec = Opcode == ISD::SIGN_EXTEND
	? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
	: DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
	Opnds.push_back(SrcVec);
	}
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
	};

	// On pre-AVX2 targets, split into 128-bit nodes of
	// ISD::*_EXTEND_VECTOR_INREG.
	if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
	return SplitAndExtendInReg(128);

	// On pre-AVX512 targets, split into 256-bit nodes of
	// ISD::*_EXTEND_VECTOR_INREG.
	if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
	return SplitAndExtendInReg(256);

	return SDValue();
	}

	/// Target combine for a sign-extension node \c N.
	///
	/// The folds below are tried in a fixed order and the first one that yields a
	/// value is returned. An empty SDValue means nothing applied.
	static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue Src = N->getOperand(0);
	  EVT ResVT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();

	  // These two folds are attempted in every combine phase.
	  SDValue Folded = getDivRem8(N, DAG);
	  if (Folded)
	    return Folded;

	  Folded = combineToExtendCMOV(N, DAG);
	  if (Folded)
	    return Folded;

	  // The remaining folds only run before operation legalization.
	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // sext (xor Bool, -1) --> sub (zext Bool), 1
	  // Inverting and then sign-extending a boolean maps 0 to -1 and 1 to 0, which
	  // is exactly zero-extend followed by subtracting 1 (select Bool, 0, -1). The
	  // subtract is efficiently lowered with an LEA or a DEC.
	  const bool IsInvertedBool =
	      SrcVT == MVT::i1 && Src.getOpcode() == ISD::XOR &&
	      isAllOnesConstant(Src.getOperand(1)) && Src.hasOneUse();
	  if (IsInvertedBool) {
	    SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, DL, ResVT, Src.getOperand(0));
	    SDValue One = DAG.getConstant(1, DL, ResVT);
	    return DAG.getNode(ISD::SUB, DL, ResVT, Wide, One);
	  }

	  Folded = combineToExtendVectorInReg(N, DAG, DCI, Subtarget);
	  if (Folded)
	    return Folded;

	  Folded = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget);
	  if (Folded)
	    return Folded;

	  // Mask-arithmetic widening is only attempted for vector results.
	  if (ResVT.isVector()) {
	    Folded = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
	    if (Folded)
	      return Folded;
	  }

	  // Last candidate: its result (possibly empty) is the final answer.
	  return promoteExtBeforeAdd(N, DAG, Subtarget);
	}

	/// Fold FNEG operands of an FMA-family node into the node's opcode.
	///
	/// Each operand of \c N that is a negation (as reported by isFNEG) is replaced
	/// by the value being negated, and the node is rebuilt with the FMADD / FMSUB /
	/// FNMADD / FNMSUB flavour of the original opcode that compensates for the
	/// removed negations. Returns an empty SDValue when the type is not legal, the
	/// scalar type is neither f32 nor f64, the subtarget has no FMA support, or
	/// nothing would change.
	///
	/// TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
	static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);

	  // Illegal types are left for the legalizer to expand first.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  EVT EltVT = VT.getScalarType();
	  if (EltVT != MVT::f32 && EltVT != MVT::f64)
	    return SDValue();
	  if (!Subtarget.hasAnyFMA())
	    return SDValue();

	  const unsigned OrigOpc = N->getOpcode();
	  SDValue A = N->getOperand(0);
	  SDValue B = N->getOperand(1);
	  SDValue C = N->getOperand(2);

	  // If V is a negation, replace it with the negated value and report true.
	  auto StripFNeg = [](SDValue &V) {
	    SDValue Inner = isFNEG(V.getNode());
	    if (!Inner)
	      return false;
	    V = Inner;
	    return true;
	  };

	  // Do not convert the passthru input of scalar intrinsics (operand 0 for the
	  // S1 forms, operand 2 for the S3 forms).
	  // FIXME: We could allow negations of the lower element only.
	  bool NegA = false;
	  if (OrigOpc != X86ISD::FMADDS1 && OrigOpc != X86ISD::FMADDS1_RND)
	    NegA = StripFNeg(A);
	  const bool NegB = StripFNeg(B);
	  bool NegC = false;
	  if (OrigOpc != X86ISD::FMADDS3 && OrigOpc != X86ISD::FMADDS3_RND)
	    NegC = StripFNeg(C);

	  // The product is negated exactly when one of its two factors was.
	  const bool NegMul = NegA != NegB;
	  const bool NothingNegated = !NegA && !NegB && !NegC;

	  // Index into {fmadd, fmsub, fnmadd, fnmsub}.
	  const unsigned Form = (NegMul ? 2u : 0u) + (NegC ? 1u : 0u);
	  auto Pick = [Form](unsigned MAdd, unsigned MSub, unsigned NMAdd,
	                     unsigned NMSub) -> unsigned {
	    const unsigned Choices[] = {MAdd, MSub, NMAdd, NMSub};
	    return Choices[Form];
	  };

	  unsigned NewOpcode;
	  switch (OrigOpc) {
	  case ISD::FMA:
	    NewOpcode = Pick(ISD::FMA, X86ISD::FMSUB, X86ISD::FNMADD, X86ISD::FNMSUB);
	    break;
	  case X86ISD::FMADD_RND:
	    NewOpcode = Pick(X86ISD::FMADD_RND, X86ISD::FMSUB_RND,
	                     X86ISD::FNMADD_RND, X86ISD::FNMSUB_RND);
	    break;
	  case X86ISD::FMADDS1:
	    NewOpcode = Pick(X86ISD::FMADDS1, X86ISD::FMSUBS1, X86ISD::FNMADDS1,
	                     X86ISD::FNMSUBS1);
	    break;
	  case X86ISD::FMADDS3:
	    NewOpcode = Pick(X86ISD::FMADDS3, X86ISD::FMSUBS3, X86ISD::FNMADDS3,
	                     X86ISD::FNMSUBS3);
	    break;
	  case X86ISD::FMADDS1_RND:
	    NewOpcode = Pick(X86ISD::FMADDS1_RND, X86ISD::FMSUBS1_RND,
	                     X86ISD::FNMADDS1_RND, X86ISD::FNMSUBS1_RND);
	    break;
	  case X86ISD::FMADDS3_RND:
	    NewOpcode = Pick(X86ISD::FMADDS3_RND, X86ISD::FMSUBS3_RND,
	                     X86ISD::FNMADDS3_RND, X86ISD::FNMSUBS3_RND);
	    break;
	  case X86ISD::FMADD4S:
	    NewOpcode = Pick(X86ISD::FMADD4S, X86ISD::FMSUB4S, X86ISD::FNMADD4S,
	                     X86ISD::FNMSUB4S);
	    break;
	  default:
	    llvm_unreachable("Unexpected opcode!");
	  }

	  // Only build a node if an operand was negated or the opcode changed;
	  // otherwise we would just recreate the node we started with.
	  if (NothingNegated && NewOpcode == OrigOpc)
	    return SDValue();

	  // Non-FMA forms may carry a fourth operand, which is forwarded unchanged.
	  if (OrigOpc != ISD::FMA && N->getNumOperands() == 4)
	    return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
	  return DAG.getNode(NewOpcode, dl, VT, A, B, C);
	}

	// Fold a negated third operand into the opcode:
	//   FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
	//   FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
	// and likewise for the _RND forms. Returns an empty SDValue when the third
	// operand is not a negation.
	static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDValue Addend = isFNEG(N->getOperand(2).getNode());
	  if (!Addend)
	    return SDValue();

	  // Swap the add/sub roles to compensate for the removed negation.
	  unsigned FlippedOpc;
	  switch (N->getOpcode()) {
	  case X86ISD::FMADDSUB:
	    FlippedOpc = X86ISD::FMSUBADD;
	    break;
	  case X86ISD::FMADDSUB_RND:
	    FlippedOpc = X86ISD::FMSUBADD_RND;
	    break;
	  case X86ISD::FMSUBADD:
	    FlippedOpc = X86ISD::FMADDSUB;
	    break;
	  case X86ISD::FMSUBADD_RND:
	    FlippedOpc = X86ISD::FMADDSUB_RND;
	    break;
	  default:
	    llvm_unreachable("Unexpected opcode!");
	  }

	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);
	  SDValue Mul0 = N->getOperand(0);
	  SDValue Mul1 = N->getOperand(1);

	  // Three-operand form.
	  if (N->getNumOperands() != 4)
	    return DAG.getNode(FlippedOpc, dl, VT, Mul0, Mul1, Addend);

	  // Four-operand form: the trailing operand is forwarded unchanged.
	  return DAG.getNode(FlippedOpc, dl, VT, Mul0, Mul1, Addend,
	                     N->getOperand(3));
	}

	/// Target combine for a zero-extension node \c N.
	///
	/// First tries to eliminate the extension of a masked or truncated
	/// X86ISD::SETCC_CARRY by producing the carry directly in the wide type:
	///   (zext (and (setcc_carry), 1)) -> (and (wide setcc_carry), 1)
	///   (zext (trunc (setcc_carry)))  -> (and (wide setcc_carry), 1)
	/// This is necessary because ISD::SETCC is always legalized to i8.
	/// After that, a fixed sequence of further folds is tried; the first one that
	/// yields a value is returned.
	static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);

	  // Re-emit Carry as a SETCC_CARRY of the result type and mask it with 1.
	  auto WidenCarry = [&](SDValue Carry) {
	    SDValue Wide = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
	                               Carry.getOperand(0), Carry.getOperand(1));
	    return DAG.getNode(ISD::AND, dl, VT, Wide, DAG.getConstant(1, dl, VT));
	  };

	  switch (Src.getOpcode()) {
	  case ISD::AND:
	  case ISD::TRUNCATE: {
	    if (!Src.hasOneUse())
	      break;
	    SDValue Carry = Src.getOperand(0);
	    if (!Carry.hasOneUse())
	      break;
	    if (Carry.getOpcode() != X86ISD::SETCC_CARRY)
	      break;
	    // For the AND form the mask must be exactly 1; otherwise give up on the
	    // whole combine rather than trying the remaining folds.
	    if (Src.getOpcode() == ISD::AND && !isOneConstant(Src.getOperand(1)))
	      return SDValue();
	    return WidenCarry(Carry);
	  }
	  default:
	    break;
	  }

	  SDValue Folded = combineToExtendCMOV(N, DAG);
	  if (Folded)
	    return Folded;

	  Folded = combineToExtendVectorInReg(N, DAG, DCI, Subtarget);
	  if (Folded)
	    return Folded;

	  Folded = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget);
	  if (Folded)
	    return Folded;

	  // Mask-arithmetic widening is only attempted for vector results.
	  if (VT.isVector()) {
	    Folded = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
	    if (Folded)
	      return Folded;
	  }

	  Folded = getDivRem8(N, DAG);
	  if (Folded)
	    return Folded;

	  Folded = promoteExtBeforeAdd(N, DAG, Subtarget);
	  if (Folded)
	    return Folded;

	  // Last candidate: its result (possibly empty) is the final answer.
	  return combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget);
	}

	/// Try to map a 128-bit or larger integer comparison to vector instructions
	/// before type legalization splits it up into chunks.
	///
	/// SetCC must be an equality comparison (SETEQ or SETNE; this is asserted).
	/// The rewrite only fires for scalar-integer operands of exactly 128 bits when
	/// the subtarget has SSE2, or exactly 256 bits when it has AVX2. The result is
	/// a setcc, with the original predicate, that compares a MOVMSK of a byte-wise
	/// PCMPEQ against an all-ones bitmask. Returns an empty SDValue otherwise.
	static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
	assert((CC == ISD::SETNE \|\| CC == ISD::SETEQ) && "Bad comparison predicate");

	// We're looking for an oversized integer equality comparison.
	SDValue X = SetCC->getOperand(0);
	SDValue Y = SetCC->getOperand(1);
	EVT OpVT = X.getValueType();
	unsigned OpSize = OpVT.getSizeInBits();
	if (!OpVT.isScalarInteger() \|\| OpSize < 128)
	return SDValue();

	// Ignore a comparison with zero because that gets special treatment in
	// EmitTest(). But make an exception for the special case of a pair of
	// logically-combined vector-sized operands compared to zero. This pattern may
	// be generated by the memcmp expansion pass with oversized integer compares
	// (see PR33325).
	bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
	X.getOperand(0).getOpcode() == ISD::XOR &&
	X.getOperand(1).getOpcode() == ISD::XOR;
	if (isNullConstant(Y) && !IsOrXorXorCCZero)
	return SDValue();

	// Bail out if we know that this is not really just an oversized integer.
	if (peekThroughBitcasts(X).getValueType() == MVT::f128 \|\|
	peekThroughBitcasts(Y).getValueType() == MVT::f128)
	return SDValue();

	// TODO: Use PXOR + PTEST for SSE4.1 or later?
	// TODO: Add support for AVX-512.
	EVT VT = SetCC->getValueType(0);
	SDLoc DL(SetCC);
	if ((OpSize == 128 && Subtarget.hasSSE2()) \|\|
	(OpSize == 256 && Subtarget.hasAVX2())) {
	// Compare byte-wise: 16 lanes for a 128-bit operand, 32 lanes for 256 bits.
	EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
	SDValue Cmp;
	if (IsOrXorXorCCZero) {
	// This is a bitwise-combined equality comparison of 2 pairs of vectors:
	// setcc i128 (or (xor A, B), (xor C, D)), 0, eq\|ne
	// Use 2 vector equality compares and 'and' the results before doing a
	// MOVMSK.
	SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
	SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
	SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
	SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
	SDValue Cmp1 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, A, B);
	SDValue Cmp2 = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, C, D);
	Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
	} else {
	// Plain case: reinterpret both operands as byte vectors and compare them.
	SDValue VecX = DAG.getBitcast(VecVT, X);
	SDValue VecY = DAG.getBitcast(VecVT, Y);
	Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
	}
	// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
	// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
	// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
	// setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
	// setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
	SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
	SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
	MVT::i32);
	return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
	}

	// Unsupported operand size / subtarget combination: leave the node alone.
	return SDValue();
	}

	/// Target combine for a setcc node N (operand 0 = LHS, operand 1 = RHS,
	/// operand 2 = condition code).
	///
	/// Tries, in order:
	///  - for eq/ne: folding a one-use "0 - x" operand into an add compared with
	///    zero, then the oversized-integer equality rewrite;
	///  - for vXi1 results: simplifying a compare of a sign-extended i1 vector
	///    against an all-zeros build vector;
	///  - on SSE1-only targets: early lowering of a v4f32 compare producing v4i32.
	/// Returns an empty SDValue when nothing applies.
	static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
	SDValue LHS = N->getOperand(0);
	SDValue RHS = N->getOperand(1);
	EVT VT = N->getValueType(0);
	SDLoc DL(N);

	if (CC == ISD::SETNE \|\| CC == ISD::SETEQ) {
	EVT OpVT = LHS.getValueType();
	// 0-x == y --> x+y == 0
	// 0-x != y --> x+y != 0
	if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
	LHS.hasOneUse()) {
	SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
	return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
	}
	// x == 0-y --> x+y == 0
	// x != 0-y --> x+y != 0
	if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
	RHS.hasOneUse()) {
	SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
	return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
	}

	// Otherwise see whether this is an oversized integer equality compare.
	if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
	return V;
	}

	if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
	(CC == ISD::SETNE \|\| CC == ISD::SETEQ \|\| ISD::isSignedIntSetCC(CC))) {
	// Put build_vectors on the right.
	if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
	std::swap(LHS, RHS);
	CC = ISD::getSetCCSwappedOperands(CC);
	}

	// Match (sext vXi1 X) compared against an all-zeros build vector.
	bool IsSEXT0 =
	(LHS.getOpcode() == ISD::SIGN_EXTEND) &&
	(LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
	bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

	if (IsSEXT0 && IsVZero1) {
	assert(VT == LHS.getOperand(0).getValueType() &&
	"Uexpected operand type");
	// GT folds to constant 0 and LE to constant 1; EQ/GE give NOT X, and
	// NE/LT give X itself.
	if (CC == ISD::SETGT)
	return DAG.getConstant(0, DL, VT);
	if (CC == ISD::SETLE)
	return DAG.getConstant(1, DL, VT);
	if (CC == ISD::SETEQ \|\| CC == ISD::SETGE)
	return DAG.getNOT(DL, LHS.getOperand(0), VT);

	assert((CC == ISD::SETNE \|\| CC == ISD::SETLT) &&
	"Unexpected condition code!");
	return LHS.getOperand(0);
	}
	}

	// For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
	// to avoid scalarization via legalization because v4i32 is not a legal type.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
	LHS.getValueType() == MVT::v4f32)
	return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

	return SDValue();
	}

	/// Target combine for a MOVMSK node: since only the most significant bit of
	/// each source element is consumed, ask the generic demanded-bits machinery
	/// to simplify the source with just the sign bit demanded.
	///
	/// Returns \c N itself when the source was simplified, otherwise an empty
	/// SDValue.
	static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Input = N->getOperand(0);
	  const unsigned EltBits = Input.getSimpleValueType().getScalarSizeInBits();

	  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                        !DCI.isBeforeLegalizeOps());

	  // Demand only the sign bit of each element.
	  APInt SignBitOnly(APInt::getSignMask(EltBits));
	  KnownBits Known;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.SimplifyDemandedBits(Input, SignBitOnly, Known, TLO))
	    return SDValue();

	  // Revisit the source, then commit the pending replacement.
	  DCI.AddToWorklist(Input.getNode());
	  DCI.CommitTargetLoweringOpt(TLO);
	  return SDValue(N, 0);
	}

	/// Target combine for a gather/scatter node \c N, whose operand 2 is the mask
	/// and operand 4 is the index.
	///
	/// Before operation legalization the index is canonicalized: redundant sign or
	/// zero extensions are peeled off and the element type is forced to i32 or
	/// i64. Afterwards the mask is simplified. Every successful rewrite updates
	/// \c N in place and returns it; an empty SDValue means nothing changed.
	static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    const X86Subtarget &Subtarget) {
	  SDLoc DL(N);

	  // Replace operand OpNo of N in place, keeping every other operand.
	  auto SetOperand = [&](unsigned OpNo, SDValue NewVal) {
	    SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
	    Ops[OpNo] = NewVal;
	    DAG.UpdateNodeOperands(N, Ops);
	  };

	  if (DCI.isBeforeLegalizeOps()) {
	    SDValue Index = N->getOperand(4);
	    const unsigned IndexOpc = Index.getOpcode();
	    const unsigned IndexBits = Index.getScalarValueSizeInBits();

	    // Remove any sign extends from 32 or smaller to larger than 32. Only done
	    // before LegalizeOps in case the sign extend is needed for legalization.
	    if (IndexOpc == ISD::SIGN_EXTEND && IndexBits > 32 &&
	        Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
	      SetOperand(4, Index.getOperand(0));
	      // The sign extend lost a user; revisit it in case it can be removed.
	      DCI.AddToWorklist(Index.getNode());
	      DCI.AddToWorklist(N);
	      return SDValue(N, 0);
	    }

	    // Make sure the index element type is either i32 or i64.
	    if (IndexBits != 32 && IndexBits != 64) {
	      MVT EltVT = IndexBits > 32 ? MVT::i64 : MVT::i32;
	      EVT IndexVT =
	          EVT::getVectorVT(*DAG.getContext(), EltVT,
	                           Index.getValueType().getVectorNumElements());
	      SetOperand(4, DAG.getSExtOrTrunc(Index, DL, IndexVT));
	      DCI.AddToWorklist(N);
	      return SDValue(N, 0);
	    }

	    // Try to remove zero extends from 32->64 if the sign bit of the input is
	    // known to be zero.
	    if (IndexOpc == ISD::ZERO_EXTEND && IndexBits == 64 &&
	        Index.getOperand(0).getScalarValueSizeInBits() == 32 &&
	        DAG.SignBitIsZero(Index.getOperand(0))) {
	      SetOperand(4, Index.getOperand(0));
	      // The zero extend lost a user; revisit it in case it can be removed.
	      DCI.AddToWorklist(Index.getNode());
	      DCI.AddToWorklist(N);
	      return SDValue(N, 0);
	    }
	  }

	  SDValue Mask = N->getOperand(2);

	  if (Subtarget.hasAVX512()) {
	    // Gather and scatter instructions use k-registers for masks. The mask type
	    // is v*i1, so the mask will be truncated anyway and a SIGN_EXTEND_INREG on
	    // it may be dropped.
	    if (Mask.getOpcode() != ISD::SIGN_EXTEND_INREG)
	      return SDValue();
	    SetOperand(2, Mask.getOperand(0));
	    return SDValue(N, 0);
	  }

	  // Without AVX512 only the upper bit of each mask element is demanded.
	  TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                        !DCI.isBeforeLegalizeOps());
	  APInt SignBitOnly(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
	  KnownBits Known;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.SimplifyDemandedBits(Mask, SignBitOnly, Known, TLO))
	    return SDValue();

	  DCI.AddToWorklist(Mask.getNode());
	  DCI.CommitTargetLoweringOpt(TLO);
	  return SDValue(N, 0);
	}

	// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
	//
	// Operand 0 of N is the condition code and operand 1 the EFLAGS input. If the
	// pair can be simplified, a new SETCC is built from the simplified flags and
	// the (possibly updated) condition code; otherwise an empty SDValue is
	// returned.
	static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue FlagsIn = N->getOperand(1);
	  X86::CondCode Cond = X86::CondCode(N->getConstantOperandVal(0));

	  SDValue FlagsOut = combineSetCCEFLAGS(FlagsIn, Cond, DAG, Subtarget);
	  if (!FlagsOut)
	    return SDValue();

	  return getSETCC(Cond, FlagsOut, DL, DAG);
	}

	/// Optimize branch condition evaluation.
	///
	/// Operand 2 of \c N is the condition code and operand 3 the EFLAGS input. If
	/// the pair can be simplified, the X86ISD::BRCOND is rebuilt with the new
	/// condition code and flags; otherwise an empty SDValue is returned.
	static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
	                             const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue FlagsIn = N->getOperand(3);
	  X86::CondCode Cond = X86::CondCode(N->getConstantOperandVal(2));

	  SDValue FlagsOut = combineSetCCEFLAGS(FlagsIn, Cond, DAG, Subtarget);
	  if (!FlagsOut)
	    return SDValue();

	  // Operands 0 and 1 are deliberately read only now: combineSetCCEFLAGS can
	  // RAUW operands, so no references to them are kept across that call.
	  SDValue CondOp = DAG.getConstant(Cond, DL, MVT::i8);
	  return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
	                     N->getOperand(1), CondOp, FlagsOut);
	}

	/// Fold a unary operation applied to a masked vector comparison into the mask.
	///
	/// Vector comparisons produce 0 or -1 in each lane, so
	///   UNARYOP(AND(VECTOR_CMP(x, y), constant))
	/// can be rewritten as
	///   AND(VECTOR_CMP(x, y), constant2)   with constant2 = UNARYOP(constant)
	/// which removes the operation when it is applied to a constant.
	/// Returns an empty SDValue when the pattern does not match.
	static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
	                                                  SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);

	  // Must be a vector operation ...
	  if (!VT.isVector())
	    return SDValue();

	  // ... whose operand is a bitwise AND ...
	  SDValue Masked = N->getOperand(0);
	  if (Masked.getOpcode() != ISD::AND)
	    return SDValue();

	  // ... of a SETCC ...
	  SDValue Cmp = Masked.getOperand(0);
	  if (Cmp.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // ... with the same total size as the result.
	  if (VT.getSizeInBits() != Masked.getValueSizeInBits())
	    return SDValue();

	  // The other AND operand has to be a constant build vector. Non-constant
	  // splats could be handled too, but that would not eliminate any operation;
	  // it would only perform one more step in scalar code before moving to the
	  // vector unit.
	  auto *MaskBV = dyn_cast<BuildVectorSDNode>(Masked.getOperand(1));
	  if (!MaskBV)
	    return SDValue();
	  if (!MaskBV->isConstant())
	    return SDValue();

	  // Everything checks out. Apply the unary op to the constant, then AND the
	  // comparison with it in the integer domain, bitcasting in and out.
	  SDLoc DL(N);
	  EVT IntVT = MaskBV->getValueType(0);
	  SDValue OpOfConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(MaskBV, 0));
	  SDValue NewMask = DAG.getBitcast(IntVT, OpOfConst);
	  SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Cmp, NewMask);
	  return DAG.getBitcast(VT, NewAnd);
	}

	/// Target combine for an unsigned-integer-to-FP conversion node \c N.
	///
	/// Rewrites the node as a SINT_TO_FP when that is equivalent: either after
	/// zero-extending i8/i16 vector elements to i32, or directly when the sign
	/// bit of the input is known to be zero. Returns an empty SDValue otherwise.
	static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcEltVT = SrcVT.getScalarType();

	  auto IsI8OrI16 = [](EVT EltVT) {
	    if (EltVT == MVT::i8)
	      return true;
	    return EltVT == MVT::i16;
	  };

	  // UINT_TO_FP(vXi8)  -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
	  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
	  if (SrcVT.isVector() && IsI8OrI16(SrcEltVT)) {
	    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                  SrcVT.getVectorNumElements());
	    SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Src);

	    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
	    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Wide);
	  }

	  // UINT_TO_FP is marked custom (hence legal), so the generic DAG combiner
	  // will not turn it into SINT_TO_FP when the sign bit is known zero. Do that
	  // here.
	  if (!DAG.SignBitIsZero(Src))
	    return SDValue();

	  return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Src);
	}

	/// Target combine for a signed-integer-to-FP conversion node \c N.
	///
	/// Tries, in order: folding the conversion into a masked vector compare,
	/// sign-extending narrow vector elements to i32, truncating a wide input whose
	/// upper bits are all sign bits, and finally turning a conversion of a loaded
	/// i64 on a 32-bit target into a FILD. Returns an empty SDValue when nothing
	/// applies.
	static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  // First try to optimize away the conversion entirely when it's
	  // conditionally from a constant. Vectors only.
	  SDValue Folded = combineVectorCompareAndMaskUnaryOp(N, DAG);
	  if (Folded)
	    return Folded;

	  // Now move on to more general possibilities.
	  SDLoc dl(N);
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcEltVT = SrcVT.getScalarType();

	  // SINT_TO_FP(vXi1)  -> SINT_TO_FP(SEXT(vXi1 to vXi32))  (illegal vXi1 only)
	  // SINT_TO_FP(vXi8)  -> SINT_TO_FP(SEXT(vXi8 to vXi32))
	  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
	  auto WantsSExtToI32 = [&]() {
	    if (!SrcVT.isVector())
	      return false;
	    if (SrcEltVT == MVT::i8)
	      return true;
	    if (SrcEltVT == MVT::i16)
	      return true;
	    return SrcEltVT == MVT::i1 &&
	           !DAG.getTargetLoweringInfo().isTypeLegal(SrcVT);
	  };
	  if (WantsSExtToI32()) {
	    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                  SrcVT.getVectorNumElements());
	    SDValue Wide = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Src);
	    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Wide);
	  }

	  // Without AVX512DQ we only support i64 to float scalar conversion. For both
	  // vectors and scalars, see if the upper bits are all the sign bit, in which
	  // case the input can be truncated to i32 and converted from that.
	  if (SrcVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
	    unsigned BitWidth = SrcVT.getScalarSizeInBits();
	    unsigned NumSignBits = DAG.ComputeNumSignBits(Src);
	    if (NumSignBits >= (BitWidth - 31)) {
	      EVT I32VT = EVT::getIntegerVT(*DAG.getContext(), 32);
	      EVT TruncVT = SrcVT.isVector()
	                        ? EVT::getVectorVT(*DAG.getContext(), I32VT,
	                                           SrcVT.getVectorNumElements())
	                        : I32VT;
	      SDValue Narrow = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Src);
	      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Narrow);
	    }
	  }

	  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
	  // a 32-bit target where SSE doesn't support i64->FP operations.
	  if (Subtarget.useSoftFloat())
	    return SDValue();
	  if (Src.getOpcode() != ISD::LOAD)
	    return SDValue();

	  // This transformation is not supported if the result type is f16 or f128.
	  if (VT == MVT::f16)
	    return SDValue();
	  if (VT == MVT::f128)
	    return SDValue();

	  LoadSDNode *Ld = cast<LoadSDNode>(Src.getNode());
	  EVT LdVT = Ld->getValueType(0);

	  // Only a non-volatile, non-extending, single-use scalar i64 load on a
	  // 32-bit target qualifies.
	  const bool CanUseFILD = !Ld->isVolatile() && !VT.isVector() &&
	                          ISD::isNON_EXTLoad(Src.getNode()) &&
	                          Src.hasOneUse() && !Subtarget.is64Bit() &&
	                          LdVT == MVT::i64;
	  if (!CanUseFILD)
	    return SDValue();

	  SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
	      SDValue(N, 0), LdVT, Ld->getChain(), Src, DAG);
	  // Redirect users of the load's second result to the FILD's second result.
	  DAG.ReplaceAllUsesOfValueWith(Src.getValue(1), FILDChain.getValue(1));
	  return FILDChain;
	}

	/// Target combine for an X86ISD::SBB node: if the carry input (operand 2) can
	/// be simplified by combineCarryThroughADD, rebuild the SBB with the
	/// simplified flags. Returns an empty SDValue otherwise.
	static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
	  SDValue NewFlags = combineCarryThroughADD(N->getOperand(2));
	  if (!NewFlags)
	    return SDValue();

	  // Results: the value type of N plus an i32 flags result.
	  MVT VT = N->getSimpleValueType(0);
	  SDVTList ResultVTs = DAG.getVTList(VT, MVT::i32);
	  return DAG.getNode(X86ISD::SBB, SDLoc(N), ResultVTs, N->getOperand(0),
	                     N->getOperand(1), NewFlags);
	}

	// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
	//
	// Two folds are tried: "ADC 0, 0, flags" with a dead flags result becomes
	// "SETCC_CARRY & 1", and otherwise the carry input is simplified through
	// combineCarryThroughADD. Returns an empty SDValue when neither applies.
	static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI) {
	  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
	  // the result is either zero or one (depending on the input carry bit).
	  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
	  const bool AddsZeroes = X86::isZeroNode(N->getOperand(0)) &&
	                          X86::isZeroNode(N->getOperand(1));
	  // We don't have a good way to replace an EFLAGS use, so only do this when
	  // the flags result is dead right now.
	  if (AddsZeroes && SDValue(N, 1).use_empty()) {
	    SDLoc DL(N);
	    EVT VT = N->getValueType(0);
	    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
	    SDValue CondB = DAG.getConstant(X86::COND_B, DL, MVT::i8);
	    SDValue SetOnCarry =
	        DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, CondB, N->getOperand(2));
	    SDValue Res = DAG.getNode(ISD::AND, DL, VT, SetOnCarry,
	                              DAG.getConstant(1, DL, VT));
	    return DCI.CombineTo(N, Res, CarryOut);
	  }

	  SDValue NewFlags = combineCarryThroughADD(N->getOperand(2));
	  if (!NewFlags)
	    return SDValue();

	  // Results: the value type of N plus an i32 flags result.
	  MVT VT = N->getSimpleValueType(0);
	  SDVTList ResultVTs = DAG.getVTList(VT, MVT::i32);
	  return DAG.getNode(X86ISD::ADC, SDLoc(N), ResultVTs, N->getOperand(0),
	                     N->getOperand(1), NewFlags);
	}

	/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
	/// which is more useful than 0/1 in some cases.
	///
	/// Builds an i8 X86ISD::SETCC_CARRY on \p EFLAGS with condition COND_B. When
	/// \p N produces i8 the result is masked with 1; otherwise \p N must produce
	/// i1 and the result is truncated to i1.
	static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
	  SDLoc DL(N);

	  // "Condition code B" is also known as "the carry flag" (CF).
	  SDValue CarryCond = DAG.getConstant(X86::COND_B, DL, MVT::i8);
	  SDValue AllOnesOnCarry =
	      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CarryCond, EFLAGS);

	  MVT VT = N->getSimpleValueType(0);
	  if (VT != MVT::i8) {
	    assert(VT == MVT::i1 && "Unexpected type for SETCC node");
	    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, AllOnesOnCarry);
	  }

	  return DAG.getNode(ISD::AND, DL, VT, AllOnesOnCarry,
	                     DAG.getConstant(1, DL, VT));
	}

	/// If this is an add or subtract where one operand is produced by a cmp+setcc,
	/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
	/// with CMP+{ADC, SBB}.
	///
	/// N is an ISD::ADD or ISD::SUB node (anything that is not SUB is treated as
	/// an add). In the code below X is the plain operand and Y is the one-use
	/// X86ISD::SETCC, possibly found under a one-use zero extend. Returns the
	/// replacement value, or an empty SDValue when no pattern matches.
	static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
	bool IsSub = N->getOpcode() == ISD::SUB;
	SDValue X = N->getOperand(0);
	SDValue Y = N->getOperand(1);

	// If this is an add, canonicalize a zext operand to the RHS.
	// TODO: Incomplete? What if both sides are zexts?
	if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
	Y.getOpcode() != ISD::ZERO_EXTEND)
	std::swap(X, Y);

	// Look through a one-use zext.
	bool PeekedThroughZext = false;
	if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
	Y = Y.getOperand(0);
	PeekedThroughZext = true;
	}

	// If this is an add, canonicalize a setcc operand to the RHS.
	// TODO: Incomplete? What if both sides are setcc?
	// TODO: Should we allow peeking through a zext of the other operand?
	if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
	Y.getOpcode() != X86ISD::SETCC)
	std::swap(X, Y);

	// From here on Y must be a single-use X86ISD::SETCC.
	if (Y.getOpcode() != X86ISD::SETCC \|\| !Y.hasOneUse())
	return SDValue();

	SDLoc DL(N);
	EVT VT = N->getValueType(0);
	// Operand 0 of the SETCC is its condition code; operand 1 is the flags input.
	X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

	// If X is -1 or 0, then we have an opportunity to avoid constants required in
	// the general case below.
	auto *ConstantX = dyn_cast<ConstantSDNode>(X);
	if (ConstantX) {
	if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) \|\|
	(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
	// This is a complicated way to get -1 or 0 from the carry flag:
	// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	Y.getOperand(1));
	}

	if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) \|\|
	(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
	SDValue EFLAGS = Y->getOperand(1);
	// Only a single-use integer X86ISD::SUB whose second operand is not a
	// constant is commuted.
	if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
	EFLAGS.getValueType().isInteger() &&
	!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	// Swap the operands of a SUB, and we have the same pattern as above.
	// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
	// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
	SDValue NewSub = DAG.getNode(
	X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
	EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	// Use the same result number of the new SUB that the old flags value used.
	SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	NewEFLAGS);
	}
	}
	}

	if (CC == X86::COND_B) {
	// X + SETB Z --> X + (mask SBB Z, Z)
	// X - SETB Z --> X - (mask SBB Z, Z)
	// TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
	SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
	// Bring the materialized value to the width of the add/sub if it differs.
	if (SBB.getValueSizeInBits() != VT.getSizeInBits())
	SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
	return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
	}

	if (CC == X86::COND_A) {
	SDValue EFLAGS = Y->getOperand(1);
	// Try to convert COND_A into COND_B in an attempt to facilitate
	// materializing "setb reg".
	//
	// Do not flip "e > c", where "c" is a constant, because Cmp instruction
	// cannot take an immediate as its first operand.
	//
	if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
	EFLAGS.getValueType().isInteger() &&
	!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
	EFLAGS.getNode()->getVTList(),
	EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
	SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
	if (SBB.getValueSizeInBits() != VT.getSizeInBits())
	SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
	return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
	}
	}

	// Everything below handles only equality conditions.
	if (CC != X86::COND_E && CC != X86::COND_NE)
	return SDValue();

	// The flags must come from a single-use integer compare against zero.
	SDValue Cmp = Y.getOperand(1);
	if (Cmp.getOpcode() != X86ISD::CMP \|\| !Cmp.hasOneUse() \|\|
	!X86::isZeroNode(Cmp.getOperand(1)) \|\|
	!Cmp.getOperand(0).getValueType().isInteger())
	return SDValue();

	SDValue Z = Cmp.getOperand(0);
	EVT ZVT = Z.getValueType();

	// If X is -1 or 0, then we have an opportunity to avoid constants required in
	// the general case below.
	if (ConstantX) {
	// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
	// fake operands:
	// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
	// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
	if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) \|\|
	(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
	SDValue Zero = DAG.getConstant(0, DL, ZVT);
	SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
	SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
	// Result 1 of the X86ISD::SUB is its i32 flags value.
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	SDValue(Neg.getNode(), 1));
	}

	// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
	// with fake operands:
	// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
	// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
	if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) \|\|
	(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
	SDValue One = DAG.getConstant(1, DL, ZVT);
	SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
	}
	}

	// (cmp Z, 1) sets the carry flag if Z is 0.
	SDValue One = DAG.getConstant(1, DL, ZVT);
	SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);

	// Add the flags type for ADC/SBB nodes.
	SDVTList VTs = DAG.getVTList(VT, MVT::i32);

	// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
	// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
	if (CC == X86::COND_NE)
	return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
	DAG.getConstant(-1ULL, DL, VT), Cmp1);

	// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
	// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
	return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
	DAG.getConstant(0, DL, VT), Cmp1);
	}

	/// Try to rewrite an add of a multiply and a second operand (treated as the
	/// reduction accumulator) so that the multiply is done by VPMADDWD.
	///
	/// The multiply operands are truncated to i16 lanes, VPMADDWD produces half as
	/// many i32 lanes, that result is padded with zeros back to the width of the
	/// add, and the padded value is added to the accumulator. Returns an empty
	/// SDValue when the subtarget or the operands do not qualify.
	static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // Exactly one way round must put the multiply first; the remaining operand
	  // is the accumulator.
	  SDValue Product = N->getOperand(0);
	  SDValue Accum = N->getOperand(1);
	  if (Product.getOpcode() != ISD::MUL) {
	    std::swap(Product, Accum);
	    if (Product.getOpcode() != ISD::MUL)
	      return SDValue();
	  }

	  // The multiply must be shrinkable, and not in the MULU16 mode.
	  ShrinkMode Mode;
	  if (!canReduceVMulWidth(Product.getNode(), DAG, Mode))
	    return SDValue();
	  if (Mode == MULU16)
	    return SDValue();

	  EVT VT = N->getValueType(0);

	  // Widest register the subtarget lets us use for this transform.
	  const unsigned MaxBits =
	      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);

	  // Width of the operands once every lane has been narrowed to 16 bits. Do not
	  // use PMADD when that is below 128 bits or above the supported width.
	  const unsigned NumElts = VT.getVectorNumElements();
	  const unsigned NarrowBits = NumElts * 16;
	  if (NarrowBits < 128)
	    return SDValue();
	  if (NarrowBits > MaxBits)
	    return SDValue();

	  SDLoc DL(N);
	  EVT NarrowVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
	  EVT PairSumVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);

	  // Narrow both multiply operands to i16 lanes.
	  SDValue NarrowLHS =
	      DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, Product->getOperand(0));
	  SDValue NarrowRHS =
	      DAG.getNode(ISD::TRUNCATE, DL, NarrowVT, Product->getOperand(1));

	  // The multiply-add result has half as many lanes as the original add...
	  SDValue PairSums =
	      DAG.getNode(X86ISD::VPMADDWD, DL, PairSumVT, NarrowLHS, NarrowRHS);
	  // ...so pad the other half with zeros before accumulating.
	  SDValue ZeroHalf =
	      getZeroVector(PairSums.getSimpleValueType(), Subtarget, DAG, DL);
	  SDValue Widened =
	      DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PairSums, ZeroHalf);
	  return DAG.getNode(ISD::ADD, DL, VT, Widened, Accum);
	}

	/// Try to rewrite an add of a vector select and a second operand (treated as
	/// the reduction accumulator) into a PSADBW-based sum of absolute differences
	/// added to that accumulator. Returns an empty SDValue when the subtarget, the
	/// type, or the operand pattern does not qualify.
	static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // TODO: There's nothing special about i32, any integer type above i16 should
	  // work just as well.
	  if (!VT.isVector())
	    return SDValue();
	  if (!VT.isSimple())
	    return SDValue();
	  if (VT.getVectorElementType() != MVT::i32)
	    return SDValue();

	  // Widest register the subtarget lets us use for this transform.
	  const unsigned MaxBits =
	      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);

	  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
	  // TODO: We should be able to handle larger vectors by splitting them before
	  // feeding them into several SADs, and then reducing over those.
	  if (VT.getSizeInBits() / 4 > MaxBits)
	    return SDValue();

	  // N is a reduction add, so one operand is the phi. To match SAD the other
	  // operand has to be a vector select; the left operand is preferred when both
	  // are selects.
	  const bool SelectOnLHS = LHS.getOpcode() == ISD::VSELECT;
	  if (!SelectOnLHS && RHS.getOpcode() != ISD::VSELECT)
	    return SDValue();
	  SDValue SelectOp = SelectOnLHS ? LHS : RHS;
	  SDValue Phi = SelectOnLHS ? RHS : LHS;

	  // Check whether an abs-diff pattern feeds the select. LHS/RHS are handed to
	  // the matcher and then used as the PSADBW inputs below (presumably the
	  // matcher overwrites them with the abs-diff operands - confirm against
	  // detectZextAbsDiff).
	  if (!detectZextAbsDiff(SelectOp, LHS, RHS))
	    return SDValue();

	  // SAD pattern detected. Build the SAD and then the reduction add. The SAD
	  // result has fewer elements than its input, so only part of the reduction
	  // vector is updated.
	  SDValue Sad = createPSADBW(DAG, LHS, RHS, DL);

	  // PSADBW yields a vector of i64, but we need i32 lanes. When the reduction
	  // vector is at least as wide as the PSADBW result a bitcast is enough;
	  // otherwise truncate - the high i32 of each i64 is zero anyway.
	  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
	  const unsigned AccBits = VT.getSizeInBits();
	  const unsigned SadBits = ResVT.getSizeInBits();
	  if (AccBits < SadBits)
	    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
	  else
	    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);

	  if (AccBits > SadBits) {
	    // Place the SAD in the low lanes of a zero vector of the add's width.
	    SDValue ZeroVec = DAG.getConstant(0, DL, VT);
	    Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, ZeroVec, Sad,
	                      DAG.getIntPtrConstant(0, DL));
	  }

	  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
	}

	/// Replace a vector add/sub of a splat-1 constant with the opposite operation
	/// on an all-ones vector:
	///   add X, <1, 1...> --> sub X, <-1, -1...>
	///   sub X, <1, 1...> --> add X, <-1, -1...>
	/// An all-ones vector can be materialized with a pcmpeq instruction, which is
	/// commonly recognized as an idiom (no register dependency), so this is
	/// better/smaller than loading a splat 1 constant.
	static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
	  const unsigned Opc = N->getOpcode();
	  assert((Opc == ISD::ADD || Opc == ISD::SUB) &&
	         "Unexpected opcode for increment/decrement transform");

	  // Pseudo-legality check: getOnesVector() expects a 128/256/512-bit vector, so
	  // bail out and wait for legalization on any other vector length.
	  EVT VT = N->getValueType(0);
	  const bool HasSupportedWidth =
	      VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector();
	  if (!HasSupportedWidth)
	    return SDValue();

	  // The right-hand operand must be a constant splat of the value 1.
	  APInt Splat;
	  if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), Splat))
	    return SDValue();
	  if (!Splat.isOneValue())
	    return SDValue();

	  SDLoc DL(N);
	  SDValue MinusOnes = getOnesVector(VT, DAG, DL);
	  const unsigned Flipped = (Opc == ISD::ADD) ? ISD::SUB : ISD::ADD;
	  return DAG.getNode(Flipped, DL, VT, N->getOperand(0), MinusOnes);
	}

	/// X86 combine for ISD::ADD. Tries, in order: the SAD and MADD loop-reduction
	/// patterns (only when the node carries the vector-reduction flag), horizontal
	/// add formation, the increment-to-sub-of-all-ones rewrite, and finally the
	/// ADC/SBB fold.
	static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  if (N->getFlags().hasVectorReduction()) {
	    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
	      return Sad;
	    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
	      return MAdd;
	  }

	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // Try to synthesize horizontal adds from adds of shuffles. The 128-bit types
	  // need SSSE3, the 256-bit types need Int256.
	  const bool Is128BitHAddVT = VT == MVT::v8i16 || VT == MVT::v4i32;
	  const bool Is256BitHAddVT = VT == MVT::v16i16 || VT == MVT::v8i32;
	  const bool HAddAvailable = (Is128BitHAddVT && Subtarget.hasSSSE3()) ||
	                             (Is256BitHAddVT && Subtarget.hasInt256());
	  if (HAddAvailable && isHorizontalBinOp(LHS, RHS, true))
	    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, LHS, RHS);

	  if (SDValue IncDec = combineIncDecVector(N, DAG))
	    return IncDec;

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Try to turn a subtraction into an unsigned saturating subtract
	/// (X86ISD::SUBUS) when one operand is a umax/umin involving the other:
	///   umax(a, b) - b   or   a - umin(a, b)   -->   subus(a, b)
	/// For v8i32/v16i32/v8i64 the operation is performed on a narrower element
	/// type and the result is extended back. Returns an empty SDValue when the
	/// type/subtarget combination is not handled or the pattern does not match.
	static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	SDValue Op0 = N->getOperand(0);
	SDValue Op1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	// PSUBUS is supported, starting from SSE2, but special preprocessing
	// for v8i32 requires umin, which appears in SSE41.
	if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 \|\| VT == MVT::v8i16)) &&
	!(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
	!(Subtarget.hasAVX2() && (VT == MVT::v32i8 \|\| VT == MVT::v16i16)) &&
	!(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
	(VT == MVT::v64i8 \|\| VT == MVT::v32i16 \|\| VT == MVT::v16i32 \|\|
	VT == MVT::v8i64)))
	return SDValue();

	SDValue SubusLHS, SubusRHS;
	// Try to find umax(a,b) - b or a - umin(a,b) patterns
	// they may be converted to subus(a,b).
	// TODO: Need to add IR canonicalization for this code.
	if (Op0.getOpcode() == ISD::UMAX) {
	// umax(a, b) - b: the subtrahend must be one of the umax operands; the
	// other umax operand becomes the SUBUS left-hand side.
	SubusRHS = Op1;
	SDValue MaxLHS = Op0.getOperand(0);
	SDValue MaxRHS = Op0.getOperand(1);
	if (MaxLHS == Op1)
	SubusLHS = MaxRHS;
	else if (MaxRHS == Op1)
	SubusLHS = MaxLHS;
	else
	return SDValue();
	} else if (Op1.getOpcode() == ISD::UMIN) {
	// a - umin(a, b): the minuend must be one of the umin operands; the
	// other umin operand becomes the SUBUS right-hand side.
	SubusLHS = Op0;
	SDValue MinLHS = Op1.getOperand(0);
	SDValue MinRHS = Op1.getOperand(1);
	if (MinLHS == Op0)
	SubusRHS = MinRHS;
	else if (MinRHS == Op0)
	SubusRHS = MinLHS;
	else
	return SDValue();
	} else
	return SDValue();

	// PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
	// special preprocessing in some cases.
	if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
	return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);

	// Special preprocessing case can be only applied
	// if the value was zero extended from 16 bit,
	// so we require first 16 bits to be zeros for 32 bit
	// values, or first 48 bits for 64 bit values.
	KnownBits Known;
	DAG.computeKnownBits(SubusLHS, Known);
	unsigned NumZeros = Known.countMinLeadingZeros();
	if ((VT == MVT::v8i64 && NumZeros < 48) \|\| NumZeros < 16)
	return SDValue();

	EVT ExtType = SubusLHS.getValueType();
	EVT ShrinkedType;
	if (VT == MVT::v8i32 \|\| VT == MVT::v8i64)
	ShrinkedType = MVT::v8i16;
	else
	// v16i32: use i8 elements when at least 24 leading bits are known zero,
	// otherwise i16 elements.
	ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

	// If SubusLHS is zeroextended - truncate SubusRHS to its
	// size SubusRHS = umin(0xFFF.., SubusRHS).
	SDValue SaturationConst =
	DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
	ShrinkedType.getScalarSizeInBits()),
	SDLoc(SubusLHS), ExtType);
	SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
	SaturationConst);
	SDValue NewSubusLHS =
	DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
	SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
	SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
	NewSubusLHS, NewSubusRHS);
	// Zero extend the result, it may be used somewhere as 32 bit,
	// if not zext and following trunc will shrink.
	return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
	}

	/// X86 combine for ISD::SUB. Tries, in order: folding a constant left-hand
	/// side into a one-use XOR-with-constant on the right, horizontal sub
	/// formation, the decrement-to-add-of-all-ones rewrite, PSUBUS formation, and
	/// finally the ADC/SBB fold.
	static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // X86 can't encode an immediate LHS of a sub. See if we can push the
	  // negation into a preceding instruction: when the RHS is a one-use XOR with
	  // a constant, invert that immediate and add one to the LHS constant, turning
	  // X-Y into X+~Y+1 and saving one register.
	  auto *LHSConst = dyn_cast<ConstantSDNode>(LHS);
	  if (LHSConst && RHS->hasOneUse() && RHS.getOpcode() == ISD::XOR) {
	    if (auto *XorConst = dyn_cast<ConstantSDNode>(RHS.getOperand(1))) {
	      EVT VT = LHS.getValueType();
	      SDLoc XorDL(RHS);
	      SDValue InvertedImm =
	          DAG.getConstant(~XorConst->getAPIntValue(), XorDL, VT);
	      SDValue NewXor =
	          DAG.getNode(ISD::XOR, XorDL, VT, RHS.getOperand(0), InvertedImm);

	      SDLoc DL(N);
	      SDValue BumpedLHS =
	          DAG.getConstant(LHSConst->getAPIntValue() + 1, DL, VT);
	      return DAG.getNode(ISD::ADD, DL, VT, NewXor, BumpedLHS);
	    }
	  }

	  // Try to synthesize horizontal subs from subs of shuffles. The 128-bit types
	  // need SSSE3, the 256-bit types need Int256.
	  EVT VT = N->getValueType(0);
	  const bool Is128BitHSubVT = VT == MVT::v8i16 || VT == MVT::v4i32;
	  const bool Is256BitHSubVT = VT == MVT::v16i16 || VT == MVT::v8i32;
	  const bool HSubAvailable = (Is128BitHSubVT && Subtarget.hasSSSE3()) ||
	                             (Is256BitHSubVT && Subtarget.hasInt256());
	  if (HSubAvailable && isHorizontalBinOp(LHS, RHS, false))
	    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, LHS, RHS);

	  if (SDValue IncDec = combineIncDecVector(N, DAG))
	    return IncDec;

	  // Try to create PSUBUS if SUB's argument is max/min.
	  if (SDValue Subus = combineSubToSubus(N, DAG, Subtarget))
	    return Subus;

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Combine a vector sign/zero extension node. Does nothing before type
	/// legalization. Otherwise it:
	///  - constant-folds the extension when the operand's bits are known
	///    constants,
	///  - merges a VZEXT of a (bitcast of a) VZEXT into a single VZEXT,
	///  - for VZEXT, bypasses a scalar_to_vector of an extract of element 0 by
	///    using the extracted-from vector directly.
	/// Returns an empty SDValue when none of these apply.
	static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalize())
	return SDValue();

	SDLoc DL(N);
	unsigned Opcode = N->getOpcode();
	MVT VT = N->getSimpleValueType(0);
	MVT SVT = VT.getVectorElementType();
	unsigned NumElts = VT.getVectorNumElements();
	unsigned EltSizeInBits = SVT.getSizeInBits();

	SDValue Op = N->getOperand(0);
	MVT OpVT = Op.getSimpleValueType();
	MVT OpEltVT = OpVT.getVectorElementType();
	unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
	// Number of operand bits that feed the result: one source element per
	// result element.
	unsigned InputBits = OpEltSizeInBits * NumElts;

	// Perform any constant folding.
	// FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
	APInt Undefs(NumElts, 0);
	SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
	// Everything that is not a zero-extending opcode is folded as a sign
	// extension.
	bool IsZEXT =
	(Opcode == X86ISD::VZEXT) \|\| (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
	for (unsigned i = 0; i != NumElts; ++i) {
	// Undef source elements stay undef in the folded constant.
	if (UndefElts[i]) {
	Undefs.setBit(i);
	continue;
	}
	Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
	: EltBits[i].sextOrTrunc(EltSizeInBits);
	}
	return getConstVector(Vals, Undefs, VT, DAG, DL);
	}

	// (vzext (bitcast (vzext (x)) -> (vzext x)
	// TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
	SDValue V = peekThroughBitcasts(Op);
	if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
	MVT InnerVT = V.getSimpleValueType();
	MVT InnerEltVT = InnerVT.getVectorElementType();

	// If the element sizes match exactly, we can just do one larger vzext. This
	// is always an exact type match as vzext operates on integer types.
	if (OpEltVT == InnerEltVT) {
	assert(OpVT == InnerVT && "Types must match for vzext!");
	return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
	}

	// The only other way we can combine them is if only a single element of the
	// inner vzext is used in the input to the outer vzext.
	if (InnerEltVT.getSizeInBits() < InputBits)
	return SDValue();

	// In this case, the inner vzext is completely dead because we're going to
	// only look at bits inside of the low element. Just do the outer vzext on
	// a bitcast of the input to the inner.
	return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
	}

	// Check if we can bypass extracting and re-inserting an element of an input
	// vector. Essentially:
	// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
	// TODO: Add X86ISD::VSEXT support
	if (Opcode == X86ISD::VZEXT &&
	V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
	SDValue ExtractedV = V.getOperand(0);
	SDValue OrigV = ExtractedV.getOperand(0);
	// Only an extract of element 0 can be bypassed.
	if (isNullConstant(ExtractedV.getOperand(1))) {
	MVT OrigVT = OrigV.getSimpleValueType();
	// Extract a subvector if necessary...
	if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
	int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
	OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
	OrigVT.getVectorNumElements() / Ratio);
	OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
	DAG.getIntPtrConstant(0, DL));
	}
	Op = DAG.getBitcast(OpVT, OrigV);
	return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
	}
	}

	return SDValue();
	}

	/// Combine X86ISD::TESTM: strip a redundant AND when both operands are the
	/// same AND node, and fold to a zero vector when either operand is an
	/// all-zeros build vector.
	static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  MVT VT = N->getSimpleValueType(0);
	  SDLoc DL(N);

	  // TEST (AND a, b), (AND a, b) -> TEST a, b
	  const bool BothSameAnd = LHS == RHS && RHS->getOpcode() == ISD::AND;
	  if (BothSameAnd)
	    return DAG.getNode(X86ISD::TESTM, DL, VT, LHS->getOperand(0),
	                       LHS->getOperand(1));

	  // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
	  if (ISD::isBuildVectorAllZeros(LHS.getNode()))
	    return getZeroVector(VT, Subtarget, DAG, DL);

	  // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
	  if (ISD::isBuildVectorAllZeros(RHS.getNode()))
	    return getZeroVector(VT, Subtarget, DAG, DL);

	  return SDValue();
	}

	/// Fold a vector compare of a value with itself: PCMPEQ x, x becomes an
	/// all-ones vector and PCMPGT x, x becomes a zero vector. Anything else is
	/// left alone.
	static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT VT = N->getSimpleValueType(0);
	  SDLoc DL(N);

	  // Only identical operands can be folded.
	  if (N->getOperand(0) != N->getOperand(1))
	    return SDValue();

	  switch (N->getOpcode()) {
	  case X86ISD::PCMPEQ:
	    return getOnesVector(VT, DAG, DL);
	  case X86ISD::PCMPGT:
	    return getZeroVector(VT, Subtarget, DAG, DL);
	  default:
	    return SDValue();
	  }
	}

	/// Combine ISD::INSERT_SUBVECTOR. Does nothing before operation legalization
	/// or for i1 (mask) vectors. Otherwise it tries, in order:
	///  - simplifications of inserts into an all-zeros vector,
	///  - turning an insert of an extract (from a vector of the result type, at a
	///    non-zero extract index) into a vector shuffle,
	///  - for an insert into the upper half on top of an insert into the lower
	///    half: merging two loads into one wider load, forming a subvector
	///    broadcast, re-basing on a zero vector when the upper half is zeros, and
	///    re-basing on undef when both halves are overwritten.
	/// Returns an empty SDValue when nothing applies.
	static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	MVT OpVT = N->getSimpleValueType(0);

	// Early out for mask vectors.
	if (OpVT.getVectorElementType() == MVT::i1)
	return SDValue();

	SDLoc dl(N);
	SDValue Vec = N->getOperand(0);
	SDValue SubVec = N->getOperand(1);

	// Element index at which SubVec is inserted into Vec.
	unsigned IdxVal = N->getConstantOperandVal(2);
	MVT SubVecVT = SubVec.getSimpleValueType();

	if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
	// Inserting zeros into zeros is a nop.
	if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	return Vec;

	// If we're inserting into a zero vector and then into a larger zero vector,
	// just insert into the larger zero vector directly.
	if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
	ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
	unsigned Idx2Val = SubVec.getConstantOperandVal(2);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
	SubVec.getOperand(1),
	DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
	}

	// If we're inserting a bitcast into zeros, rewrite the insert and move the
	// bitcast to the other side. This helps with detecting zero extending
	// during isel.
	// TODO: Is this useful for other indices than 0?
	if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
	MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
	unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
	MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
	SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
	DAG.getBitcast(NewVT, Vec),
	SubVec.getOperand(0), N->getOperand(2));
	return DAG.getBitcast(OpVT, Insert);
	}
	}

	// If this is an insert of an extract, combine to a shuffle. Don't do this
	// if the insert or extract can be represented with a subregister operation.
	if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	SubVec.getOperand(0).getSimpleValueType() == OpVT &&
	(IdxVal != 0 \|\| !Vec.isUndef())) {
	int ExtIdxVal = SubVec.getConstantOperandVal(1);
	if (ExtIdxVal != 0) {
	int VecNumElts = OpVT.getVectorNumElements();
	int SubVecNumElts = SubVecVT.getVectorNumElements();
	SmallVector<int, 64> Mask(VecNumElts);
	// First create an identity shuffle mask.
	for (int i = 0; i != VecNumElts; ++i)
	Mask[i] = i;
	// Now insert the extracted portion. Mask values are offset by VecNumElts
	// so that they select from the second shuffle operand (the extract's
	// source vector).
	for (int i = 0; i != SubVecNumElts; ++i)
	Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

	return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
	}
	}

	// Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
	// load:
	// (insert_subvector (insert_subvector undef, (load16 addr), 0),
	// (load16 addr + 16), Elts/2)
	// --> load32 addr
	// or:
	// (insert_subvector (insert_subvector undef, (load32 addr), 0),
	// (load32 addr + 32), Elts/2)
	// --> load64 addr
	// or a 16-byte or 32-byte broadcast:
	// (insert_subvector (insert_subvector undef, (load16 addr), 0),
	// (load16 addr), Elts/2)
	// --> X86SubVBroadcast(load16 addr)
	// or:
	// (insert_subvector (insert_subvector undef, (load32 addr), 0),
	// (load32 addr), Elts/2)
	// --> X86SubVBroadcast(load32 addr)
	if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
	Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
	OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
	// The inner insert must have a constant index of 0, i.e. it fills the
	// lower half.
	auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
	if (Idx2 && Idx2->getZExtValue() == 0) {
	// SubVec2 is the value placed in the lower half by the inner insert.
	SDValue SubVec2 = Vec.getOperand(1);
	// If needed, look through bitcasts to get to the load.
	if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
	bool Fast;
	unsigned Alignment = FirstLd->getAlignment();
	unsigned AS = FirstLd->getAddressSpace();
	const X86TargetLowering *TLI = Subtarget.getTargetLowering();
	// Only merge when a full-width access with the first load's alignment
	// and address space is both allowed and reported as fast.
	if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
	OpVT, AS, Alignment, &Fast) && Fast) {
	SDValue Ops[] = {SubVec2, SubVec};
	if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
	Subtarget, false))
	return Ld;
	}
	}
	// If lower/upper loads are the same and the only users of the load, then
	// lower to a VBROADCASTF128/VBROADCASTI128/etc.
	if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
	if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
	SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
	return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);

	// If this is subv_broadcast insert into both halves, use a larger
	// subv_broadcast.
	if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
	return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
	SubVec.getOperand(0));

	// If we're inserting all zeros into the upper half, change this to
	// an insert into an all zeros vector. We will match this to a move
	// with implicit upper bit zeroing during isel.
	if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
	getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
	Vec.getOperand(2));

	// If we are inserting into both halves of the vector, the starting
	// vector should be undef. If it isn't, make it so. Only do this if the
	// early insert has no other uses.
	// TODO: Should this be a generic DAG combine?
	if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
	Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
	SubVec2, Vec.getOperand(2));
	// Queue the rebuilt inner insert for the combiner as well.
	DCI.AddToWorklist(Vec.getNode());
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
	N->getOperand(2));

	}
	}
	}

	return SDValue();
	}

	/// Combine ISD::EXTRACT_SUBVECTOR once operations have been legalized:
	/// extracting from an all-zeros or all-ones build vector yields the
	/// corresponding narrower constant, and extracting from any other
	/// BUILD_VECTOR yields a BUILD_VECTOR of the selected operands.
	static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const X86Subtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  MVT ResVT = N->getSimpleValueType(0);
	  SDValue Src = N->getOperand(0);
	  // Index of the first extracted element.
	  unsigned FirstElt = N->getConstantOperandVal(1);
	  SDLoc DL(N);

	  if (ISD::isBuildVectorAllZeros(Src.getNode()))
	    return getZeroVector(ResVT, Subtarget, DAG, DL);

	  if (ISD::isBuildVectorAllOnes(Src.getNode())) {
	    // i1 vectors get a constant 1 rather than the getOnesVector() form.
	    if (ResVT.getScalarType() == MVT::i1)
	      return DAG.getConstant(1, DL, ResVT);
	    return getOnesVector(ResVT, DAG, DL);
	  }

	  if (Src.getOpcode() != ISD::BUILD_VECTOR)
	    return SDValue();

	  // Re-build the vector from just the operands covered by the extract.
	  return DAG.getBuildVector(
	      ResVT, DL,
	      Src.getNode()->ops().slice(FirstElt, ResVT.getVectorNumElements()));
	}

	/// Entry point for X86-specific DAG combines: dispatches on the opcode of N to
	/// the matching combine helper and returns whatever it returns. Opcodes with no
	/// case fall out of the switch and yield an empty SDValue.
	SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	switch (N->getOpcode()) {
	default: break;
	case ISD::EXTRACT_VECTOR_ELT:
	case X86ISD::PEXTRW:
	case X86ISD::PEXTRB:
	return combineExtractVectorElt(N, DAG, DCI, Subtarget);
	case ISD::INSERT_SUBVECTOR:
	return combineInsertSubvector(N, DAG, DCI, Subtarget);
	case ISD::EXTRACT_SUBVECTOR:
	return combineExtractSubvector(N, DAG, DCI, Subtarget);
	case ISD::VSELECT:
	case ISD::SELECT:
	case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
	case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
	case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
	case ISD::ADD: return combineAdd(N, DAG, Subtarget);
	case ISD::SUB: return combineSub(N, DAG, Subtarget);
	case X86ISD::SBB: return combineSBB(N, DAG);
	case X86ISD::ADC: return combineADC(N, DAG, DCI);
	case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
	case ISD::SHL:
	case ISD::SRA:
	case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
	case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
	case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
	case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
	case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
	case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
	case ISD::STORE: return combineStore(N, DAG, Subtarget);
	case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
	case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
	case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
	case ISD::FADD:
	case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
	case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
	case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
	case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
	case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
	case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
	case X86ISD::FXOR:
	case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
	case X86ISD::FMIN:
	case X86ISD::FMAX: return combineFMinFMax(N, DAG);
	case ISD::FMINNUM:
	case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
	case X86ISD::BT: return combineBT(N, DAG, DCI);
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
	case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
	case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
	case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
	case X86ISD::PACKSS:
	case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
	case X86ISD::VSHLI:
	case X86ISD::VSRAI:
	case X86ISD::VSRLI:
	return combineVectorShiftImm(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case X86ISD::VSEXT:
	case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
	case X86ISD::PINSRB:
	case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
	case X86ISD::SHUFP: // Handle all target specific shuffles
	case X86ISD::INSERTPS:
	case X86ISD::EXTRQI:
	case X86ISD::INSERTQI:
	case X86ISD::PALIGNR:
	case X86ISD::VSHLDQ:
	case X86ISD::VSRLDQ:
	case X86ISD::BLENDI:
	case X86ISD::UNPCKH:
	case X86ISD::UNPCKL:
	case X86ISD::MOVHLPS:
	case X86ISD::MOVLHPS:
	case X86ISD::PSHUFB:
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFHW:
	case X86ISD::PSHUFLW:
	case X86ISD::MOVSHDUP:
	case X86ISD::MOVSLDUP:
	case X86ISD::MOVDDUP:
	case X86ISD::MOVSS:
	case X86ISD::MOVSD:
	case X86ISD::VBROADCAST:
	case X86ISD::VPPERM:
	case X86ISD::VPERMI:
	case X86ISD::VPERMV:
	case X86ISD::VPERMV3:
	case X86ISD::VPERMIV3:
	case X86ISD::VPERMIL2:
	case X86ISD::VPERMILPI:
	case X86ISD::VPERMILPV:
	case X86ISD::VPERM2X128:
	case X86ISD::VZEXT_MOVL:
	case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
	case X86ISD::FMADD_RND:
	case X86ISD::FMADDS1_RND:
	case X86ISD::FMADDS3_RND:
	case X86ISD::FMADDS1:
	case X86ISD::FMADDS3:
	case X86ISD::FMADD4S:
	case ISD::FMA: return combineFMA(N, DAG, Subtarget);
	case X86ISD::FMADDSUB_RND:
	case X86ISD::FMSUBADD_RND:
	case X86ISD::FMADDSUB:
	case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
	case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
	case X86ISD::MGATHER:
	case X86ISD::MSCATTER:
	case ISD::MGATHER:
	case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
	case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget);
	case X86ISD::PCMPEQ:
	case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
	}

	// No X86-specific combine is registered for this opcode.
	return SDValue();
	}

	/// Return true if \p VT is legal for the target and is also a 'desirable' type
	/// for a node with opcode \p Opc. On x86, i16 is legal but undesirable for a
	/// fixed set of opcodes, since i16 instruction encodings are longer and some
	/// i16 instructions are slow; every other legal type is always desirable.
	bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
	  if (!isTypeLegal(VT))
	    return false;

	  // Only i16 is ever rejected below.
	  if (VT != MVT::i16)
	    return true;

	  bool Desirable = true;
	  switch (Opc) {
	  case ISD::LOAD:
	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::ANY_EXTEND:
	  case ISD::SHL:
	  case ISD::SRL:
	  case ISD::SUB:
	  case ISD::ADD:
	  case ISD::MUL:
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	    Desirable = false;
	    break;
	  default:
	    break;
	  }
	  return Desirable;
	}

	/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
	/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
	/// we don't adjust the stack we clobber the first frame index.
	/// See X86InstrInfo::copyPhysReg.
	static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
	const MachineRegisterInfo &MRI = MF.getRegInfo();
	return any_of(MRI.reg_instructions(X86::EFLAGS),
	[](const MachineInstr &RI) { return RI.isCopy(); });
	}

	/// Record on the frame info whether the function contains a COPY of EFLAGS
	/// (which implies a stack adjustment), then run the generic finalization.
	void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
	  // The flag is only ever set here, never cleared.
	  if (hasCopyImplyingStackAdjustment(MF))
	    MF.getFrameInfo().setHasCopyImplyingStackAdjustment(true);

	  TargetLoweringBase::finalizeLowering(MF);
	}

	/// Query whether it is beneficial for the DAG combiner to promote the given
	/// node. Only i16 nodes are ever promoted, and the desired promotion type
	/// written to \p PVT is always i32. Note that \p PVT is also written when the
	/// function returns false for an i16 node, but left untouched for non-i16
	/// nodes.
	bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
	EVT VT = Op.getValueType();
	if (VT != MVT::i16)
	return false;

	bool Promote = false;
	// Set for opcodes whose case labels precede the fallthrough into ISD::SUB
	// (add/mul/and/or/xor).
	bool Commute = false;
	switch (Op.getOpcode()) {
	default: break;
	case ISD::SIGN_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::ANY_EXTEND:
	Promote = true;
	break;
	case ISD::SHL:
	case ISD::SRL: {
	SDValue N0 = Op.getOperand(0);
	// Look out for (store (shl (load), x)).
	if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
	return false;
	Promote = true;
	break;
	}
	case ISD::ADD:
	case ISD::MUL:
	case ISD::AND:
	case ISD::OR:
	case ISD::XOR:
	Commute = true;
	LLVM_FALLTHROUGH;
	case ISD::SUB: {
	SDValue N0 = Op.getOperand(0);
	SDValue N1 = Op.getOperand(1);
	// For SUB (Commute is false), never promote when the second operand is a
	// foldable load.
	if (!Commute && MayFoldLoad(N1))
	return false;
	// Avoid disabling potential load folding opportunities.
	if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) \|\| MayFoldIntoStore(Op)))
	return false;
	if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) \|\| MayFoldIntoStore(Op)))
	return false;
	Promote = true;
	}
	}

	PVT = MVT::i32;
	return Promote;
	}

	/// Decide whether a build_vector should be combined into a shuffle followed by
	/// a truncate. \p ShuffleMask must have one entry per element of \p SrcVT and
	/// must be a legal shuffle mask for it. The combine is rejected for 32-bit
	/// source elements, without AVX2, and for masks that cross 128-bit lanes.
	bool X86TargetLowering::
	isDesirableToCombineBuildVectorToShuffleTruncate(
	    ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {

	  assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
	         "Element count mismatch");
	  assert(
	      Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
	      "Shuffle Mask expected to be legal");

	  // For 32-bit elements VPERMD is better than shuffle+truncate.
	  // TODO: After we improve lowerBuildVector, add exception for VPERMW.
	  if (SrcVT.getScalarSizeInBits() == 32)
	    return false;
	  if (!Subtarget.hasAVX2())
	    return false;

	  // Only masks that stay within their 128-bit lanes are worth combining.
	  return !is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask);
	}

	//===----------------------------------------------------------------------===//
	// X86 Inline Assembly Support
	//===----------------------------------------------------------------------===//

	// Helper to match a string separated by whitespace.
	/// Return true if \p S consists of exactly the tokens in \p Pieces, in order,
	/// with optional leading blanks and with each token followed either by at
	/// least one blank (space or tab) or by the end of the string.
	static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
	  const char *const Blanks = " \t";

	  // Skip leading whitespace.
	  StringRef Rest = S.substr(S.find_first_not_of(Blanks));

	  for (size_t I = 0, E = Pieces.size(); I != E; ++I) {
	    StringRef Token = Pieces[I];

	    // The next token must appear right here.
	    if (!Rest.startswith(Token))
	      return false;
	    Rest = Rest.substr(Token.size());

	    // A non-blank character directly after the token means we only matched a
	    // prefix of a longer word.
	    StringRef::size_type NextPos = Rest.find_first_not_of(Blanks);
	    if (NextPos == 0)
	      return false;

	    // Drop the separating blanks (or everything, when only blanks remain).
	    Rest = Rest.substr(NextPos);
	  }

	  // Anything left over is unmatched text.
	  return Rest.empty();
	}

	static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

	if (AsmPieces.size() == 3 \|\| AsmPieces.size() == 4) {
	if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

	if (AsmPieces.size() == 3)
	return true;
	else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
	return true;
	}
	}
	return false;
	}

	/// Try to recognize a byte-swap idiom written as inline asm in the call
	/// \p CI and hand it to IntrinsicLowering::LowerToByteSwap.
	///
	/// Only calls whose result is an integer type with a bit width that is a
	/// multiple of 16 are considered. The asm string is split with the delimiter
	/// set ";\n" and compared against a fixed set of one- and three-piece
	/// sequences.
	///
	/// \returns the result of IntrinsicLowering::LowerToByteSwap when a pattern
	/// matches, false otherwise.
	bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
	InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

	const std::string &AsmStr = IA->getAsmString();

	// Reject non-integer results and widths that are not a multiple of 16.
	IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
	if (!Ty \|\| Ty->getBitWidth() % 16 != 0)
	return false;

	// TODO: should remove alternatives from the asmstring: "foo {a\|b}" -> "foo a"
	SmallVector<StringRef, 4> AsmPieces;
	SplitString(AsmStr, AsmPieces, ";\n");

	switch (AsmPieces.size()) {
	default: return false;
	case 1:
	// FIXME: this should verify that we are targeting a 486 or better. If not,
	// we will turn this bswap into something that will be lowered to logical
	// ops instead of emitting the bswap asm. For now, we don't support 486 or
	// lower so don't worry about this.
	// bswap $0
	if (matchAsm(AsmPieces[0], {"bswap", "$0"}) \|\|
	matchAsm(AsmPieces[0], {"bswapl", "$0"}) \|\|
	matchAsm(AsmPieces[0], {"bswapq", "$0"}) \|\|
	matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) \|\|
	matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) \|\|
	matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
	// No need to check constraints, nothing other than the equivalent of
	// "=r,0" would be valid here.
	return IntrinsicLowering::LowerToByteSwap(CI);
	}

	// rorw $$8, ${0:w} --> llvm.bswap.i16
	if (CI->getType()->isIntegerTy(16) &&
	IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
	(matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) \|\|
	matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
	// Reuse AsmPieces for the constraint string: skip the leading "=r,0,",
	// split the rest on ',' and sort it before checking the entries.
	AsmPieces.clear();
	StringRef ConstraintsStr = IA->getConstraintString();
	SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
	array_pod_sort(AsmPieces.begin(), AsmPieces.end());
	if (clobbersFlagRegisters(AsmPieces))
	return IntrinsicLowering::LowerToByteSwap(CI);
	}
	break;
	case 3:
	// 32-bit result: rorw $$8, ${0:w} ; rorl $$16, $0 ; rorw $$8, ${0:w}
	if (CI->getType()->isIntegerTy(32) &&
	IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
	matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
	matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
	matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
	// From here on AsmPieces holds the pieces of the constraint string, not
	// the asm lines. The 64-bit check below is not affected because it only
	// runs for an i64 result, while this branch requires i32.
	AsmPieces.clear();
	StringRef ConstraintsStr = IA->getConstraintString();
	SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
	array_pod_sort(AsmPieces.begin(), AsmPieces.end());
	if (clobbersFlagRegisters(AsmPieces))
	return IntrinsicLowering::LowerToByteSwap(CI);
	}

	if (CI->getType()->isIntegerTy(64)) {
	// Require the first two parsed constraints to be exactly "A" and "0".
	InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
	if (Constraints.size() >= 2 &&
	Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
	Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
	// bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
	if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
	matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
	matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
	return IntrinsicLowering::LowerToByteSwap(CI);
	}
	}
	break;
	}
	return false;
	}

	/// Classify an X86 inline-asm constraint string.
	///
	/// One-letter constraints and two-letter constraints that start with 'Y' are
	/// classified here; every other constraint is deferred to the TargetLowering
	/// base implementation.
	X86TargetLowering::ConstraintType
	X86TargetLowering::getConstraintType(StringRef Constraint) const {
	  const size_t Len = Constraint.size();

	  if (Len == 1) {
	    switch (Constraint[0]) {
	    // Letters reported as a register class.
	    case 'R': case 'q': case 'Q':
	    case 'f': case 't': case 'u':
	    case 'y': case 'x': case 'v': case 'Y':
	    case 'l':
	    case 'k': // AVX512 masking registers.
	      return C_RegisterClass;
	    // Letters reported as a specific register.
	    case 'a': case 'b': case 'c': case 'd':
	    case 'S': case 'D': case 'A':
	      return C_Register;
	    // Remaining letters recognised here.
	    case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
	    case 'G': case 'C': case 'e': case 'Z':
	      return C_Other;
	    default:
	      break;
	    }
	  } else if (Len == 2 && Constraint[0] == 'Y') {
	    switch (Constraint[1]) {
	    case 'z': case '0':
	      return C_Register;
	    case 'i': case 'm': case 'k': case 't': case '2':
	      return C_RegisterClass;
	    default:
	      break;
	    }
	  }

	  // Not one of ours: let the generic implementation decide.
	  return TargetLowering::getConstraintType(Constraint);
	}

	/// Examine constraint type and operand type and determine a weight value.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// \param info        operand description; only info.CallOperandVal is read
	///                    here.
	/// \param constraint  constraint string; the first character selects the
	///                    case, and for 'Y' the second character is inspected too.
	/// \returns CW_Default when there is no operand value, otherwise the weight
	///          chosen by the matching case (CW_Invalid if nothing matched).
	TargetLowering::ConstraintWeight
	X86TargetLowering::getSingleConstraintMatchWeight(
	AsmOperandInfo &info, const char *constraint) const {
	ConstraintWeight weight = CW_Invalid;
	Value *CallOperandVal = info.CallOperandVal;
	// If we don't have a value, we can't do a match,
	// but allow it at the lowest weight.
	if (!CallOperandVal)
	return CW_Default;
	Type *type = CallOperandVal->getType();
	// Look at the constraint type.
	switch (*constraint) {
	default:
	weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	// NOTE(review): this falls through into the register-letter cases below, so
	// an integer operand ends up with CW_SpecificReg even for constraints that
	// were just weighed by the base class - confirm this is intended.
	LLVM_FALLTHROUGH;
	case 'R':
	case 'q':
	case 'Q':
	case 'a':
	case 'b':
	case 'c':
	case 'd':
	case 'S':
	case 'D':
	case 'A':
	if (CallOperandVal->getType()->isIntegerTy())
	weight = CW_SpecificReg;
	break;
	case 'f':
	case 't':
	case 'u':
	if (type->isFloatingPointTy())
	weight = CW_SpecificReg;
	break;
	case 'y':
	if (type->isX86_MMXTy() && Subtarget.hasMMX())
	weight = CW_SpecificReg;
	break;
	case 'Y': {
	unsigned Size = StringRef(constraint).size();
	// Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
	char NextChar = Size == 2 ? constraint[1] : 'i';
	// Strings longer than two characters are left at the current weight.
	if (Size > 2)
	break;
	switch (NextChar) {
	default:
	return CW_Invalid;
	// XMM0
	case 'z':
	case '0':
	if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
	return CW_SpecificReg;
	return CW_Invalid;
	// Conditional OpMask regs (AVX512)
	case 'k':
	if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
	return CW_Register;
	return CW_Invalid;
	// Any MMX reg
	case 'm':
	// NOTE(review): weight is still its initial CW_Invalid at this point, so
	// both outcomes of this case return CW_Invalid - confirm this is intended.
	if (type->isX86_MMXTy() && Subtarget.hasMMX())
	return weight;
	return CW_Invalid;
	// Any SSE reg when ISA >= SSE2, same as 'Y'
	case 'i':
	case 't':
	case '2':
	if (!Subtarget.hasSSE2())
	return CW_Invalid;
	break;
	}
	// Fall through (handle "Y" constraint).
	LLVM_FALLTHROUGH;
	}
	case 'v':
	// 512-bit operands need AVX512; then continue with the 128/256-bit checks.
	if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
	weight = CW_Register;
	LLVM_FALLTHROUGH;
	case 'x':
	if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) \|\|
	((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
	weight = CW_Register;
	break;
	case 'k':
	// Enable conditional vector operations using %k<#> registers.
	if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
	weight = CW_Register;
	break;
	// The remaining cases accept constant operands within a fixed range.
	case 'I':
	// Constant integer, zero-extended value <= 31.
	if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
	if (C->getZExtValue() <= 31)
	weight = CW_Constant;
	}
	break;
	case 'J':
	// Constant integer, zero-extended value <= 63.
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 63)
	weight = CW_Constant;
	}
	break;
	case 'K':
	// Constant integer, sign-extended value in [-0x80, 0x7f].
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
	weight = CW_Constant;
	}
	break;
	case 'L':
	// Constant integer equal to 0xff or 0xffff.
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if ((C->getZExtValue() == 0xff) \|\| (C->getZExtValue() == 0xffff))
	weight = CW_Constant;
	}
	break;
	case 'M':
	// Constant integer, zero-extended value <= 3.
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 3)
	weight = CW_Constant;
	}
	break;
	case 'N':
	// Constant integer, zero-extended value <= 0xff.
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 0xff)
	weight = CW_Constant;
	}
	break;
	case 'G':
	case 'C':
	// Any floating-point constant.
	if (isa<ConstantFP>(CallOperandVal)) {
	weight = CW_Constant;
	}
	break;
	case 'e':
	// Constant integer whose sign-extended value fits in 32 signed bits.
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if ((C->getSExtValue() >= -0x80000000LL) &&
	(C->getSExtValue() <= 0x7fffffffLL))
	weight = CW_Constant;
	}
	break;
	case 'Z':
	// Constant integer whose zero-extended value fits in 32 unsigned bits.
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 0xffffffff)
	weight = CW_Constant;
	}
	break;
	}
	return weight;
	}

	/// Replace the catch-all 'X' constraint with a more specific one chosen from
	/// the operand's value type.
	///
	/// Floating-point operands become "Y" when SSE2 is available and "x" when
	/// only SSE1 is; every other case is handled by the TargetLowering base
	/// implementation.
	const char *X86TargetLowering::
	LowerXConstraint(EVT ConstraintVT) const {
	  // Non-FP types get no X86-specific treatment.
	  if (!ConstraintVT.isFloatingPoint())
	    return TargetLowering::LowerXConstraint(ConstraintVT);

	  if (Subtarget.hasSSE2())
	    return "Y";

	  if (Subtarget.hasSSE1())
	    return "x";

	  // FP type but no SSE: fall back to the generic choice.
	  return TargetLowering::LowerXConstraint(ConstraintVT);
	}

	/// Lower the specified operand into the Ops vector.
	/// If it is invalid, don't add anything to Ops.
	///
	/// Constraints longer than one character are ignored. For the letters
	/// handled below, \p Op must satisfy the check of its case; if it does not,
	/// the function returns without touching \p Ops. Letters that are not
	/// handled here are forwarded to the TargetLowering base implementation.
	void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
	std::string &Constraint,
	std::vector<SDValue>&Ops,
	SelectionDAG &DAG) const {
	// Stays null unless one of the cases below produces a target node.
	SDValue Result;

	// Only support length 1 constraints for now.
	if (Constraint.length() > 1) return;

	char ConstraintLetter = Constraint[0];
	switch (ConstraintLetter) {
	default: break;
	case 'I':
	// Constant with zero-extended value <= 31.
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 31) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'J':
	// Constant with zero-extended value <= 63.
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 63) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'K':
	// Constant whose sign-extended value fits in 8 signed bits.
	// NOTE(review): the range check uses the sign-extended value but the
	// emitted constant uses the zero-extended one - confirm this is intended.
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (isInt<8>(C->getSExtValue())) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'L':
	// 0xff or 0xffff; additionally 0xffffffff on 64-bit subtargets.
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() == 0xff \|\| C->getZExtValue() == 0xffff \|\|
	(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
	Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'M':
	// Constant with zero-extended value <= 3.
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 3) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'N':
	// Constant with zero-extended value <= 255.
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 255) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'O':
	// Constant with zero-extended value <= 127.
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 127) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'e': {
	// 32-bit signed value
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	C->getSExtValue())) {
	// Widen to 64 bits here to get it sign extended.
	Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
	break;
	}
	// FIXME gcc accepts some relocatable values here too, but only in certain
	// memory models; it's complicated.
	}
	return;
	}
	case 'Z': {
	// 32-bit unsigned value
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	C->getZExtValue())) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	// FIXME gcc accepts some relocatable values here too, but only in certain
	// memory models; it's complicated.
	return;
	}
	case 'i': {
	// Literal immediates are always ok.
	if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
	// Widen to 64 bits here to get it sign extended.
	Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
	break;
	}

	// In any sort of PIC mode addresses need to be computed at runtime by
	// adding in a register or some sort of table lookup. These can't
	// be used as immediates.
	if (Subtarget.isPICStyleGOT() \|\| Subtarget.isPICStyleStubPIC())
	return;

	// If we are in non-pic codegen mode, we allow the address of a global (with
	// an optional displacement) to be used with 'i'.
	GlobalAddressSDNode *GA = nullptr;
	int64_t Offset = 0;

	// Match either (GA), (GA+C), (GA+C1+C2), etc.
	// Each iteration peels one constant ADD/SUB off Op and accumulates it into
	// Offset; the loop ends once the global address itself is reached.
	while (1) {
	if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
	Offset += GA->getOffset();
	break;
	} else if (Op.getOpcode() == ISD::ADD) {
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	Offset += C->getZExtValue();
	Op = Op.getOperand(0);
	continue;
	}
	} else if (Op.getOpcode() == ISD::SUB) {
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	Offset += -C->getZExtValue();
	Op = Op.getOperand(0);
	continue;
	}
	}

	// Otherwise, this isn't something we can handle, reject it.
	return;
	}

	const GlobalValue *GV = GA->getGlobal();
	// If we require an extra load to get this address, as in PIC mode, we
	// can't accept it.
	if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
	return;

	Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
	GA->getValueType(0), Offset);
	break;
	}
	}

	// A case above produced a node: record it and stop.
	if (Result.getNode()) {
	Ops.push_back(Result);
	return;
	}
	// Unhandled letter: defer to the generic lowering.
	return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	/// Check if \p RC is a general purpose register class.
	/// I.e., GR* or one of their variant.
	static bool isGRClass(const TargetRegisterClass &RC) {
	return RC.hasSuperClassEq(&X86::GR8RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR16RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR32RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR64RegClass) \|\|
	RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
	}

	/// Check if \p RC is a vector register class.
	/// I.e., FR* / VR* or one of their variant.
	static bool isFRClass(const TargetRegisterClass &RC) {
	return RC.hasSuperClassEq(&X86::FR32XRegClass) \|\|
	RC.hasSuperClassEq(&X86::FR64XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR128XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR256XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR512RegClass);
	}

	/// Map an inline-asm register constraint and value type to a register and/or
	/// register class.
	///
	/// One-letter GCC constraints and two-letter 'Y?' constraints are resolved
	/// directly. Everything else goes through the TargetLowering base
	/// implementation, whose result is then patched up for a few x87/flags names,
	/// the "A" constraint, and register classes that do not match \p VT.
	///
	/// \returns a (register, class) pair. The register is 0 when only a class was
	/// chosen; the class is null when nothing matched.
	std::pair<unsigned, const TargetRegisterClass *>
	X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint,
	MVT VT) const {
	// First, see if this is a constraint that directly corresponds to an LLVM
	// register class.
	if (Constraint.size() == 1) {
	// GCC Constraint Letters
	switch (Constraint[0]) {
	default: break;
	// TODO: Slight differences here in allocation order and leaving
	// RIP in the class. Do they matter any more here than they do
	// in the normal allocation?
	case 'k':
	if (Subtarget.hasAVX512()) {
	// Only supported in AVX512 or later.
	switch (VT.SimpleTy) {
	default: break;
	case MVT::i32:
	return std::make_pair(0U, &X86::VK32RegClass);
	case MVT::i16:
	return std::make_pair(0U, &X86::VK16RegClass);
	case MVT::i8:
	return std::make_pair(0U, &X86::VK8RegClass);
	case MVT::i1:
	return std::make_pair(0U, &X86::VK1RegClass);
	case MVT::i64:
	return std::make_pair(0U, &X86::VK64RegClass);
	}
	}
	break;
	case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
	if (Subtarget.is64Bit()) {
	if (VT == MVT::i32 \|\| VT == MVT::f32)
	return std::make_pair(0U, &X86::GR32RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i64 \|\| VT == MVT::f64)
	return std::make_pair(0U, &X86::GR64RegClass);
	break;
	}
	LLVM_FALLTHROUGH;
	// 32-bit fallthrough
	case 'Q': // Q_REGS
	if (VT == MVT::i32 \|\| VT == MVT::f32)
	return std::make_pair(0U, &X86::GR32_ABCDRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_ABCDRegClass);
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
	if (VT == MVT::i64)
	return std::make_pair(0U, &X86::GR64_ABCDRegClass);
	break;
	case 'r': // GENERAL_REGS
	case 'l': // INDEX_REGS
	// Any type not matched by the first three checks gets GR64.
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	if (VT == MVT::i32 \|\| VT == MVT::f32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32RegClass);
	return std::make_pair(0U, &X86::GR64RegClass);
	case 'R': // LEGACY_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_NOREXRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_NOREXRegClass);
	if (VT == MVT::i32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32_NOREXRegClass);
	return std::make_pair(0U, &X86::GR64_NOREXRegClass);
	case 'f': // FP Stack registers.
	// If SSE is enabled for this VT, use f80 to ensure the isel moves the
	// value to the correct fpstack register class.
	if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP32RegClass);
	if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP64RegClass);
	return std::make_pair(0U, &X86::RFP80RegClass);
	case 'y': // MMX_REGS if MMX allowed.
	if (!Subtarget.hasMMX()) break;
	return std::make_pair(0U, &X86::VR64RegClass);
	case 'Y': // SSE_REGS if SSE2 allowed
	if (!Subtarget.hasSSE2()) break;
	LLVM_FALLTHROUGH;
	case 'v':
	case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
	if (!Subtarget.hasSSE1()) break;
	// Only 'v' may select the extended (X-suffixed) classes below.
	bool VConstraint = (Constraint[0] == 'v');

	switch (VT.SimpleTy) {
	default: break;
	// Scalar SSE types.
	case MVT::f32:
	case MVT::i32:
	if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR32XRegClass);
	return std::make_pair(0U, &X86::FR32RegClass);
	case MVT::f64:
	case MVT::i64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR64XRegClass);
	return std::make_pair(0U, &X86::FR64RegClass);
	// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
	// Vector types.
	case MVT::v16i8:
	case MVT::v8i16:
	case MVT::v4i32:
	case MVT::v2i64:
	case MVT::v4f32:
	case MVT::v2f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR128XRegClass);
	return std::make_pair(0U, &X86::VR128RegClass);
	// AVX types.
	case MVT::v32i8:
	case MVT::v16i16:
	case MVT::v8i32:
	case MVT::v4i64:
	case MVT::v8f32:
	case MVT::v4f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR256XRegClass);
	return std::make_pair(0U, &X86::VR256RegClass);
	case MVT::v8f64:
	case MVT::v16f32:
	case MVT::v16i32:
	case MVT::v8i64:
	return std::make_pair(0U, &X86::VR512RegClass);
	}
	break;
	}
	} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
	switch (Constraint[1]) {
	default:
	break;
	case 'i':
	case 't':
	case '2':
	// These are resolved exactly like a plain "Y".
	return getRegForInlineAsmConstraint(TRI, "Y", VT);
	case 'm':
	if (!Subtarget.hasMMX()) break;
	return std::make_pair(0U, &X86::VR64RegClass);
	case 'z':
	case '0':
	// Pins the operand to XMM0.
	if (!Subtarget.hasSSE1()) break;
	return std::make_pair(X86::XMM0, &X86::VR128RegClass);
	case 'k':
	// This register class doesn't allocate k0 for masked vector operation.
	if (Subtarget.hasAVX512()) { // Only supported in AVX512.
	switch (VT.SimpleTy) {
	default: break;
	case MVT::i32:
	return std::make_pair(0U, &X86::VK32WMRegClass);
	case MVT::i16:
	return std::make_pair(0U, &X86::VK16WMRegClass);
	case MVT::i8:
	return std::make_pair(0U, &X86::VK8WMRegClass);
	case MVT::i1:
	return std::make_pair(0U, &X86::VK1WMRegClass);
	case MVT::i64:
	return std::make_pair(0U, &X86::VK64WMRegClass);
	}
	}
	break;
	}
	}

	// Use the default implementation in TargetLowering to convert the register
	// constraint into a member of a register class.
	std::pair<unsigned, const TargetRegisterClass*> Res;
	Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	// Not found as a standard register?
	if (!Res.second) {
	// Map st(0) -> st(7) -> ST0
	// Matches "{st(N)}" with N in 0..7, with "st" compared case-insensitively.
	if (Constraint.size() == 7 && Constraint[0] == '{' &&
	tolower(Constraint[1]) == 's' &&
	tolower(Constraint[2]) == 't' &&
	Constraint[3] == '(' &&
	(Constraint[4] >= '0' && Constraint[4] <= '7') &&
	Constraint[5] == ')' &&
	Constraint[6] == '}') {

	Res.first = X86::FP0+Constraint[4]-'0';
	Res.second = &X86::RFP80RegClass;
	return Res;
	}

	// GCC allows "st(0)" to be called just plain "st".
	if (StringRef("{st}").equals_lower(Constraint)) {
	Res.first = X86::FP0;
	Res.second = &X86::RFP80RegClass;
	return Res;
	}

	// flags -> EFLAGS
	if (StringRef("{flags}").equals_lower(Constraint)) {
	Res.first = X86::EFLAGS;
	Res.second = &X86::CCRRegClass;
	return Res;
	}

	// 'A' means [ER]AX + [ER]DX.
	if (Constraint == "A") {
	if (Subtarget.is64Bit()) {
	Res.first = X86::RAX;
	Res.second = &X86::GR64_ADRegClass;
	} else {
	assert((Subtarget.is32Bit() \|\| Subtarget.is16Bit()) &&
	"Expecting 64, 32 or 16 bit subtarget");
	Res.first = X86::EAX;
	Res.second = &X86::GR32_ADRegClass;
	}
	return Res;
	}
	// Nothing matched: Res.second is still null here.
	return Res;
	}

	// Otherwise, check to see if this is a register class of the wrong value
	// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
	// turn into {ax},{dx}.
	// MVT::Other is used to specify clobber names.
	if (TRI->isTypeLegalForClass(*Res.second, VT) \|\| VT == MVT::Other)
	return Res; // Correct type already, nothing to do.

	// Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
	// return "eax". This should even work for things like getting 64bit integer
	// registers when given an f64 type.
	const TargetRegisterClass *Class = Res.second;
	// The generic code will match the first register class that contains the
	// given register. Thus, based on the ordering of the tablegened file,
	// the "plain" GR classes might not come first.
	// Therefore, use a helper method.
	if (isGRClass(*Class)) {
	unsigned Size = VT.getSizeInBits();
	// A 1-bit type is looked up as an 8-bit register.
	if (Size == 1) Size = 8;
	unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
	if (DestReg > 0) {
	bool is64Bit = Subtarget.is64Bit();
	// Pick the GR class for this size; outside 64-bit mode the NOREX variants
	// are used for 8/16/32-bit sizes.
	const TargetRegisterClass *RC =
	Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
	: Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
	: Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
	: &X86::GR64RegClass;
	// If the chosen class does not contain DestReg, Res is left unchanged.
	if (RC->contains(DestReg))
	Res = std::make_pair(DestReg, RC);
	} else {
	// No register found/type mismatch.
	Res.first = 0;
	Res.second = nullptr;
	}
	} else if (isFRClass(*Class)) {
	// Handle references to XMM physical registers that got mapped into the
	// wrong class. This can happen with constraints like {xmm0} where the
	// target independent register mapper will just pick the first match it can
	// find, ignoring the required type.

	// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
	if (VT == MVT::f32 \|\| VT == MVT::i32)
	Res.second = &X86::FR32RegClass;
	else if (VT == MVT::f64 \|\| VT == MVT::i64)
	Res.second = &X86::FR64RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
	Res.second = &X86::VR128RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
	Res.second = &X86::VR256RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
	Res.second = &X86::VR512RegClass;
	else {
	// Type mismatch and not a clobber: Return an error;
	Res.first = 0;
	Res.second = nullptr;
	}
	}

	return Res;
	}

	/// Return the cost of the scaling factor used by addressing mode \p AM for
	/// type \p Ty in address space \p AS.
	///
	/// \returns -1 if the addressing mode is not legal, 1 if it is legal and
	/// AM.Scale is non-zero, 0 otherwise.
	int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
	const AddrMode &AM, Type *Ty,
	unsigned AS) const {
	  // Scaling factors are not free. An indexed folded instruction, i.e.
	  // inst (reg1, reg2, scale), takes 2 allocations in the out-of-order engine
	  // where plain addressing, i.e. inst (reg1), takes 1. E.g.
	  //   vaddps (%rsi,%rdx), %ymm0, %ymm1
	  // needs two allocations (one for the load, one for the computation),
	  // whereas
	  //   vaddps (%rsi), %ymm0, %ymm1
	  // needs just one, leaving allocations free for other operations and
	  // executing fewer micro operations.
	  //
	  // On some X86 architectures it is even worse: for stores, the complex
	  // addressing mode forces the instruction onto the "load" ports instead of
	  // the dedicated "store" port. E.g., on Haswell:
	  //   vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
	  //   vmovaps %ymm1, (%r8)       can use port 2, 3, or 7.
	  if (!isLegalAddressingMode(DL, AM, Ty, AS))
	    return -1;

	  // Scale represents reg2 * scale, so charge 1 as soon as a second register
	  // is involved.
	  if (AM.Scale != 0)
	    return 1;
	  return 0;
	}

	/// Return true only for a non-vector \p VT when \p Attr carries the MinSize
	/// function attribute.
	bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
	  // Integer division is expensive on x86, so it is normally not "cheap".
	  // When aggressively optimizing for size, however, a div instruction is
	  // usually smaller than the alternative sequence and is preferred.
	  if (!Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize))
	    return false;

	  // Vector division is the exception: x86 has no vector integer division, so
	  // keeping the division means scalarizing it, which loses even on size,
	  // while the alternative sequence can stay in vector form.
	  if (VT.isVector())
	    return false;

	  return true;
	}

	/// Mark the function that contains \p Entry as using split CSR. Nothing is
	/// done on subtargets that are not 64-bit.
	void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  if (Subtarget.is64Bit()) {
	    // Record IsSplitCSR in the function's X86MachineFunctionInfo.
	    auto *FuncInfo = Entry->getParent()->getInfo<X86MachineFunctionInfo>();
	    FuncInfo->setIsSplitCSR(true);
	  }
	}

	/// Insert the copies that preserve the callee-saved registers handled "via
	/// copy" in a split-CSR function.
	///
	/// For every register in the list returned by
	/// X86RegisterInfo::getCalleeSavedRegsViaCopy, a COPY into a fresh virtual
	/// register is built at the beginning of \p Entry, and a COPY back into the
	/// physical register is built before the first terminator of every block in
	/// \p Exits. Nothing is done when that list is null.
	///
	/// \param Entry entry block of the function.
	/// \param Exits blocks that receive the copy-back instructions.
	void X86TargetLowering::insertCopiesSplitCSR(
	MachineBasicBlock *Entry,
	const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // IStart points at an array of physical registers; walk it by pointer and
	  // stop at the first zero entry (the list is expected to be
	  // zero-terminated).
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    // Only 64-bit GPRs are expected in this list.
	    const TargetRegisterClass *RC = nullptr;
	    if (X86::GR64RegClass.contains(*I))
	      RC = &X86::GR64RegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    unsigned NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(Entry->getParent()->getFunction().hasFnAttribute(
	               Attribute::NoUnwind) &&
	           "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	/// Swift error support is reported only for 64-bit subtargets.
	bool X86TargetLowering::supportSwiftError() const {
	  const bool Is64Bit = Subtarget.is64Bit();
	  return Is64Bit;
	}

	/// Return the name of the symbol used to emit stack probes for \p MF, or an
	/// empty string when no stack probe applies.
	StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
	  // An explicit "probe-stack" function attribute always wins.
	  const auto &Fn = MF.getFunction();
	  if (Fn.hasFnAttribute("probe-stack"))
	    return Fn.getFnAttribute("probe-stack").getValueAsString();

	  // Outside Windows the platform ABI generally has no stack probes, so none
	  // are emitted; MachO targets are excluded as well.
	  const bool WantsProbe = Subtarget.isOSWindows() && !Subtarget.isTargetMachO();
	  if (!WantsProbe)
	    return "";

	  // The Windows ABI requires a probe; pick the symbol by bitness and
	  // environment.
	  if (Subtarget.is64Bit()) {
	    if (Subtarget.isTargetCygMing())
	      return "___chkstk_ms";
	    return "__chkstk";
	  }

	  if (Subtarget.isTargetCygMing())
	    return "_alloca";
	  return "_chkstk";
	}
	Index: head/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h (revision 329409)
	+++ head/contrib/llvm/lib/Target/X86/X86IntrinsicsInfo.h (revision 329410)
	@@ -1,1781 +1,1784 @@
	//===-- X86IntrinsicsInfo.h - X86 Intrinsics ------------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the details for lowering X86 intrinsics
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H
	#define LLVM_LIB_TARGET_X86_X86INTRINSICSINFO_H

	#include "X86ISelLowering.h"
	#include "X86InstrInfo.h"

	namespace llvm {

	// Classification tag for an X86 intrinsic; stored in IntrinsicData::Type.
	// NOTE(review): the lines below that start with '-' and '+' are diff markers
	// of this revision (the '+' line adds KUNPCK after BROADCASTM); they are not
	// part of the enum text itself.
	enum IntrinsicType : uint16_t {
	INTR_NO_TYPE,
	GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASS, FPCLASSS,
	INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
	CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
	CVTPD2PS, CVTPD2PS_MASK,
	INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
	INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM, INTR_TYPE_2OP_IMM8_MASK,
	INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
	FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
	FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
	IFMA_OP_MASK, IFMA_OP_MASKZ,
	VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
	INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
	COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
	TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
	EXPAND_FROM_MEM,
	- TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
	+ TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
	FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
	ROUNDP, ROUNDS
	};

	// One table entry describing an X86 intrinsic. Entries are ordered and
	// compared by Id only; the remaining fields do not take part in comparisons.
	struct IntrinsicData {
	  uint16_t Id;        // Intrinsic ID; the sole comparison key.
	  IntrinsicType Type; // Classification tag of the intrinsic.
	  uint16_t Opc0;      // First opcode field.
	  uint16_t Opc1;      // Second opcode field.

	  // Order entries by intrinsic ID.
	  bool operator<(const IntrinsicData &RHS) const { return Id < RHS.Id; }

	  // Two entries are equal when their intrinsic IDs are equal.
	  bool operator==(const IntrinsicData &RHS) const { return Id == RHS.Id; }
	};

	// Expands to a braced initializer for one IntrinsicData entry. `id` is
	// token-pasted onto Intrinsic::x86_ to form the ID; `type`, `op0` and `op1`
	// are passed through unchanged, in member order.
	#define X86_INTRINSIC_DATA(id, type, op0, op1) \
	{ Intrinsic::x86_##id, type, op0, op1 }

	/*
	* IntrinsicsWithChain - the table should be sorted by Intrinsic ID - in
	* the alphabetical order.
	*/
	static const IntrinsicData IntrinsicsWithChain[] = {
	X86_INTRINSIC_DATA(addcarry_u32, ADX, X86ISD::ADC, 0),
	X86_INTRINSIC_DATA(addcarry_u64, ADX, X86ISD::ADC, 0),
	X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
	X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),

	X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0),
	X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0),

	X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
	X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
	X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
	X86_INTRINSIC_DATA(avx512_gather_dps_512, GATHER, X86::VGATHERDPSZrm, 0),
	X86_INTRINSIC_DATA(avx512_gather_qpd_512, GATHER, X86::VGATHERQPDZrm, 0),
	X86_INTRINSIC_DATA(avx512_gather_qpi_512, GATHER, X86::VPGATHERQDZrm, 0),
	X86_INTRINSIC_DATA(avx512_gather_qpq_512, GATHER, X86::VPGATHERQQZrm, 0),
	X86_INTRINSIC_DATA(avx512_gather_qps_512, GATHER, X86::VGATHERQPSZrm, 0),
	X86_INTRINSIC_DATA(avx512_gather3div2_df, GATHER, X86::VGATHERQPDZ128rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3div2_di, GATHER, X86::VPGATHERQQZ128rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3div4_df, GATHER, X86::VGATHERQPDZ256rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3div4_di, GATHER, X86::VPGATHERQQZ256rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3div4_sf, GATHER, X86::VGATHERQPSZ128rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3div4_si, GATHER, X86::VPGATHERQDZ128rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3div8_sf, GATHER, X86::VGATHERQPSZ256rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3div8_si, GATHER, X86::VPGATHERQDZ256rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3siv2_df, GATHER, X86::VGATHERDPDZ128rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3siv2_di, GATHER, X86::VPGATHERDQZ128rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3siv4_df, GATHER, X86::VGATHERDPDZ256rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3siv4_di, GATHER, X86::VPGATHERDQZ256rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3siv4_sf, GATHER, X86::VGATHERDPSZ128rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3siv4_si, GATHER, X86::VPGATHERDDZ128rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3siv8_sf, GATHER, X86::VGATHERDPSZ256rm, 0),
	X86_INTRINSIC_DATA(avx512_gather3siv8_si, GATHER, X86::VPGATHERDDZ256rm, 0),

	X86_INTRINSIC_DATA(avx512_gatherpf_dpd_512, PREFETCH,
	X86::VGATHERPF0DPDm, X86::VGATHERPF1DPDm),
	X86_INTRINSIC_DATA(avx512_gatherpf_dps_512, PREFETCH,
	X86::VGATHERPF0DPSm, X86::VGATHERPF1DPSm),
	X86_INTRINSIC_DATA(avx512_gatherpf_qpd_512, PREFETCH,
	X86::VGATHERPF0QPDm, X86::VGATHERPF1QPDm),
	X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
	X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),

	X86_INTRINSIC_DATA(avx512_mask_compress_store_b_128,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_b_256,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_b_512,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_d_512,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_128,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_256,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_pd_512,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_128,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_256,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_ps_512,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_q_128,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_q_256,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_w_128,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_w_256,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_store_w_512,
	COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_b_128,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_b_256,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_b_512,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_d_512,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_128,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_256,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_pd_512,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_128,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_256,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_ps_512,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_q_128,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_q_256,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_w_128,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_w_256,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_load_w_512,
	EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_512, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_128, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_256, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_dw_mem_512, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_128, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_256, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qb_mem_512, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_128, TRUNCATE_TO_MEM_VI32,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_256, TRUNCATE_TO_MEM_VI32,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qd_mem_512, TRUNCATE_TO_MEM_VI32,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_128, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_256, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qw_mem_512, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_128, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_256, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_128, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_256, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_db_mem_512, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_128, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_256, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_mem_512, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_128, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_256, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_mem_512, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_128, TRUNCATE_TO_MEM_VI32,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_256, TRUNCATE_TO_MEM_VI32,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_mem_512, TRUNCATE_TO_MEM_VI32,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_128, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_256, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_mem_512, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_128, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_256, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_mem_512, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_128, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_256, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_db_mem_512, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_128, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_256, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_mem_512, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_128, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_256, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_mem_512, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_128, TRUNCATE_TO_MEM_VI32,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_256, TRUNCATE_TO_MEM_VI32,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_mem_512, TRUNCATE_TO_MEM_VI32,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_128, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_256, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_mem_512, TRUNCATE_TO_MEM_VI16,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_128, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_256, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_mem_512, TRUNCATE_TO_MEM_VI8,
	X86ISD::VTRUNCUS, 0),

	X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
	X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
	X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),
	X86_INTRINSIC_DATA(avx512_scatter_dps_512, SCATTER, X86::VSCATTERDPSZmr, 0),
	X86_INTRINSIC_DATA(avx512_scatter_qpd_512, SCATTER, X86::VSCATTERQPDZmr, 0),
	X86_INTRINSIC_DATA(avx512_scatter_qpi_512, SCATTER, X86::VPSCATTERQDZmr, 0),
	X86_INTRINSIC_DATA(avx512_scatter_qpq_512, SCATTER, X86::VPSCATTERQQZmr, 0),
	X86_INTRINSIC_DATA(avx512_scatter_qps_512, SCATTER, X86::VSCATTERQPSZmr, 0),
	X86_INTRINSIC_DATA(avx512_scatterdiv2_df, SCATTER, X86::VSCATTERQPDZ128mr, 0),
	X86_INTRINSIC_DATA(avx512_scatterdiv2_di, SCATTER, X86::VPSCATTERQQZ128mr, 0),
	X86_INTRINSIC_DATA(avx512_scatterdiv4_df, SCATTER, X86::VSCATTERQPDZ256mr, 0),
	X86_INTRINSIC_DATA(avx512_scatterdiv4_di, SCATTER, X86::VPSCATTERQQZ256mr, 0),
	X86_INTRINSIC_DATA(avx512_scatterdiv4_sf, SCATTER, X86::VSCATTERQPSZ128mr, 0),
	X86_INTRINSIC_DATA(avx512_scatterdiv4_si, SCATTER, X86::VPSCATTERQDZ128mr, 0),
	X86_INTRINSIC_DATA(avx512_scatterdiv8_sf, SCATTER, X86::VSCATTERQPSZ256mr, 0),
	X86_INTRINSIC_DATA(avx512_scatterdiv8_si, SCATTER, X86::VPSCATTERQDZ256mr, 0),
	X86_INTRINSIC_DATA(avx512_scatterpf_dpd_512, PREFETCH, X86::VSCATTERPF0DPDm,
	X86::VSCATTERPF1DPDm),
	X86_INTRINSIC_DATA(avx512_scatterpf_dps_512, PREFETCH, X86::VSCATTERPF0DPSm,
	X86::VSCATTERPF1DPSm),
	X86_INTRINSIC_DATA(avx512_scatterpf_qpd_512, PREFETCH, X86::VSCATTERPF0QPDm,
	X86::VSCATTERPF1QPDm),
	X86_INTRINSIC_DATA(avx512_scatterpf_qps_512, PREFETCH, X86::VSCATTERPF0QPSm,
	X86::VSCATTERPF1QPSm),
	X86_INTRINSIC_DATA(avx512_scattersiv2_df, SCATTER, X86::VSCATTERDPDZ128mr, 0),
	X86_INTRINSIC_DATA(avx512_scattersiv2_di, SCATTER, X86::VPSCATTERDQZ128mr, 0),
	X86_INTRINSIC_DATA(avx512_scattersiv4_df, SCATTER, X86::VSCATTERDPDZ256mr, 0),
	X86_INTRINSIC_DATA(avx512_scattersiv4_di, SCATTER, X86::VPSCATTERDQZ256mr, 0),
	X86_INTRINSIC_DATA(avx512_scattersiv4_sf, SCATTER, X86::VSCATTERDPSZ128mr, 0),
	X86_INTRINSIC_DATA(avx512_scattersiv4_si, SCATTER, X86::VPSCATTERDDZ128mr, 0),
	X86_INTRINSIC_DATA(avx512_scattersiv8_sf, SCATTER, X86::VSCATTERDPSZ256mr, 0),
	X86_INTRINSIC_DATA(avx512_scattersiv8_si, SCATTER, X86::VPSCATTERDDZ256mr, 0),
	X86_INTRINSIC_DATA(rdpmc, RDPMC, X86ISD::RDPMC_DAG, 0),
	X86_INTRINSIC_DATA(rdrand_16, RDRAND, X86ISD::RDRAND, 0),
	X86_INTRINSIC_DATA(rdrand_32, RDRAND, X86ISD::RDRAND, 0),
	X86_INTRINSIC_DATA(rdrand_64, RDRAND, X86ISD::RDRAND, 0),
	X86_INTRINSIC_DATA(rdseed_16, RDSEED, X86ISD::RDSEED, 0),
	X86_INTRINSIC_DATA(rdseed_32, RDSEED, X86ISD::RDSEED, 0),
	X86_INTRINSIC_DATA(rdseed_64, RDSEED, X86ISD::RDSEED, 0),
	X86_INTRINSIC_DATA(rdtsc, RDTSC, X86ISD::RDTSC_DAG, 0),
	X86_INTRINSIC_DATA(rdtscp, RDTSC, X86ISD::RDTSCP_DAG, 0),

	X86_INTRINSIC_DATA(subborrow_u32, ADX, X86ISD::SBB, 0),
	X86_INTRINSIC_DATA(subborrow_u64, ADX, X86ISD::SBB, 0),
	X86_INTRINSIC_DATA(xgetbv, XGETBV, X86::XGETBV, 0),
	X86_INTRINSIC_DATA(xtest, XTEST, X86ISD::XTEST, 0),
	};

	/*
	 * Look up the IntrinsicsWithChain entry for the intrinsic ID IntNo.
	 *
	 * The table is binary-searched with std::lower_bound, so its entries must
	 * already be ordered consistently with IntrinsicData's operator<.
	 * Returns a pointer to the entry that compares equal to the search key,
	 * or nullptr when the table holds no such entry.
	 */
	static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) {
	  const IntrinsicData *TableBegin = std::begin(IntrinsicsWithChain);
	  const IntrinsicData *TableEnd = std::end(IntrinsicsWithChain);
	  // Search key carrying IntNo; the remaining fields are placeholders
	  // (presumably ignored by the comparison operators - confirm against
	  // the IntrinsicData definition).
	  IntrinsicData Key = {IntNo, INTR_NO_TYPE, 0, 0 };
	  const IntrinsicData *Candidate = std::lower_bound(TableBegin, TableEnd, Key);
	  // lower_bound yields the first entry not ordered before Key; that is
	  // either the end of the table or an entry that may or may not match.
	  if (Candidate == TableEnd)
	    return nullptr;
	  if (*Candidate == Key)
	    return Candidate;
	  return nullptr;
	}

	/*
	* IntrinsicsWithoutChain - the table should be sorted by Intrinsic ID, i.e. in
	* alphabetical order.
	*/
	static const IntrinsicData IntrinsicsWithoutChain[] = {
	X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
	X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
	X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
	X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
	X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
	X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
	X86_INTRINSIC_DATA(avx_cvtdq2_ps_256, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
	X86_INTRINSIC_DATA(avx_cvtt_pd2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
	X86_INTRINSIC_DATA(avx_cvtt_ps2dq_256,INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
	X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
	X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
	X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
	X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
	X86_INTRINSIC_DATA(avx_max_pd_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
	X86_INTRINSIC_DATA(avx_max_ps_256, INTR_TYPE_2OP, X86ISD::FMAX, 0),
	X86_INTRINSIC_DATA(avx_min_pd_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
	X86_INTRINSIC_DATA(avx_min_ps_256, INTR_TYPE_2OP, X86ISD::FMIN, 0),
	X86_INTRINSIC_DATA(avx_movmsk_pd_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
	X86_INTRINSIC_DATA(avx_movmsk_ps_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
	X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0),
	X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0),
	X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0),
	X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
	X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
	X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
	X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
	X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
	X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
	X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
	X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
	X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
	X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
	X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
	X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
	X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
	X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
	X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
	X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
	X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
	X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
	X86_INTRINSIC_DATA(avx2_phsub_w, INTR_TYPE_2OP, X86ISD::HSUB, 0),
	X86_INTRINSIC_DATA(avx2_pmadd_ub_sw, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
	X86_INTRINSIC_DATA(avx2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
	X86_INTRINSIC_DATA(avx2_pmovmskb, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
	X86_INTRINSIC_DATA(avx2_pmul_dq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
	X86_INTRINSIC_DATA(avx2_pmul_hr_sw, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
	X86_INTRINSIC_DATA(avx2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
	X86_INTRINSIC_DATA(avx2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
	X86_INTRINSIC_DATA(avx2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
	X86_INTRINSIC_DATA(avx2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
	X86_INTRINSIC_DATA(avx2_pshuf_b, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
	X86_INTRINSIC_DATA(avx2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
	X86_INTRINSIC_DATA(avx2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
	X86_INTRINSIC_DATA(avx2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
	X86_INTRINSIC_DATA(avx2_pslli_d, VSHIFT, X86ISD::VSHLI, 0),
	X86_INTRINSIC_DATA(avx2_pslli_q, VSHIFT, X86ISD::VSHLI, 0),
	X86_INTRINSIC_DATA(avx2_pslli_w, VSHIFT, X86ISD::VSHLI, 0),
	X86_INTRINSIC_DATA(avx2_psllv_d, INTR_TYPE_2OP, ISD::SHL, 0),
	X86_INTRINSIC_DATA(avx2_psllv_d_256, INTR_TYPE_2OP, ISD::SHL, 0),
	X86_INTRINSIC_DATA(avx2_psllv_q, INTR_TYPE_2OP, ISD::SHL, 0),
	X86_INTRINSIC_DATA(avx2_psllv_q_256, INTR_TYPE_2OP, ISD::SHL, 0),
	X86_INTRINSIC_DATA(avx2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0),
	X86_INTRINSIC_DATA(avx2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
	X86_INTRINSIC_DATA(avx2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
	X86_INTRINSIC_DATA(avx2_psrai_w, VSHIFT, X86ISD::VSRAI, 0),
	X86_INTRINSIC_DATA(avx2_psrav_d, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
	X86_INTRINSIC_DATA(avx2_psrav_d_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
	X86_INTRINSIC_DATA(avx2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0),
	X86_INTRINSIC_DATA(avx2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0),
	X86_INTRINSIC_DATA(avx2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0),
	X86_INTRINSIC_DATA(avx2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
	X86_INTRINSIC_DATA(avx2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
	X86_INTRINSIC_DATA(avx2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
	X86_INTRINSIC_DATA(avx2_psrlv_d, INTR_TYPE_2OP, ISD::SRL, 0),
	X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
	X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
	X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
	X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
	X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
	X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
	X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
	X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_cvtsi2sd64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvtsi2ss32, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvtsi2ss64, INTR_TYPE_3OP, X86ISD::SCALAR_SINT_TO_FP_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvttsd2si, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvttsd2si64, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvttsd2usi, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvttsd2usi64, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvttss2si, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvttss2si64, INTR_TYPE_2OP, X86ISD::CVTTS2SI_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvttss2usi, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvttss2usi64, INTR_TYPE_2OP, X86ISD::CVTTS2UI_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvtusi2ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvtusi642sd, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvtusi642ss, INTR_TYPE_3OP, X86ISD::SCALAR_UINT_TO_FP_RND, 0),
	X86_INTRINSIC_DATA(avx512_cvtw2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_cvtw2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
	X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
	X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
	X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
	X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
	+ X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
	+ X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
	+ X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
	X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
	X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
	X86ISD::FADD_RND),
	X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
	X86ISD::FADD_RND),
	X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FADDS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FADDS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
	X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
	X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,
	X86ISD::CMPM_RND),
	X86_INTRINSIC_DATA(avx512_mask_cmp_ps_128, CMP_MASK_CC, X86ISD::CMPM, 0),
	X86_INTRINSIC_DATA(avx512_mask_cmp_ps_256, CMP_MASK_CC, X86ISD::CMPM, 0),
	X86_INTRINSIC_DATA(avx512_mask_cmp_ps_512, CMP_MASK_CC, X86ISD::CMPM,
	X86ISD::CMPM_RND),
	X86_INTRINSIC_DATA(avx512_mask_cmp_sd, CMP_MASK_SCALAR_CC,
	X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
	X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
	X86ISD::FSETCCM, X86ISD::FSETCCM_RND),

	X86_INTRINSIC_DATA(avx512_mask_compress_b_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_b_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_b_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_d_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_pd_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_pd_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_pd_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_ps_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_ps_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_ps_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_q_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_q_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_w_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_w_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_compress_w_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::COMPRESS, 0),
	X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK,
	X86ISD::CONFLICT, 0),
	X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK,
	X86ISD::CONFLICT, 0),
	X86_INTRINSIC_DATA(avx512_mask_conflict_d_512, INTR_TYPE_1OP_MASK,
	X86ISD::CONFLICT, 0),
	X86_INTRINSIC_DATA(avx512_mask_conflict_q_128, INTR_TYPE_1OP_MASK,
	X86ISD::CONFLICT, 0),
	X86_INTRINSIC_DATA(avx512_mask_conflict_q_256, INTR_TYPE_1OP_MASK,
	X86ISD::CONFLICT, 0),
	X86_INTRINSIC_DATA(avx512_mask_conflict_q_512, INTR_TYPE_1OP_MASK,
	X86ISD::CONFLICT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_128, INTR_TYPE_1OP_MASK,
	ISD::SINT_TO_FP, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_256, INTR_TYPE_1OP_MASK,
	ISD::SINT_TO_FP, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtdq2ps_512, INTR_TYPE_1OP_MASK,
	ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND), //er
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_256, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2dq_512, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps, INTR_TYPE_1OP_MASK,
	X86ISD::VFPROUND, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_256, CVTPD2PS_MASK,
	ISD::FP_ROUND, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2ps_512, CVTPD2PS_MASK,
	ISD::FP_ROUND, X86ISD::VFPROUND_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_256, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2qq_512, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_256, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2udq_512, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_256, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtpd2uqq_512, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_256, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2dq_512, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_128, INTR_TYPE_1OP_MASK,
	X86ISD::VFPEXT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_256, INTR_TYPE_1OP_MASK,
	ISD::FP_EXTEND, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2pd_512, INTR_TYPE_1OP_MASK,
	ISD::FP_EXTEND, X86ISD::VFPEXT_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_256, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2qq_512, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2SI, X86ISD::CVTP2SI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_256, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2udq_512, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_256, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtps2uqq_512, INTR_TYPE_1OP_MASK,
	X86ISD::CVTP2UI, X86ISD::CVTP2UI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_128, INTR_TYPE_1OP_MASK,
	ISD::SINT_TO_FP, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_256, INTR_TYPE_1OP_MASK,
	ISD::SINT_TO_FP, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtqq2pd_512, INTR_TYPE_1OP_MASK,
	ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTSI2P, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_256, INTR_TYPE_1OP_MASK,
	ISD::SINT_TO_FP, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtqq2ps_512, INTR_TYPE_1OP_MASK,
	ISD::SINT_TO_FP, X86ISD::SINT_TO_FP_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtsd2ss_round, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::VFPROUNDS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtss2sd_round, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::VFPEXTS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTTP2SI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_256, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_SINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2dq_512, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_128, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_SINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_256, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_SINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2qq_512, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTTP2UI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_256, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_UINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2udq_512, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_128, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_UINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_256, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_UINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttpd2uqq_512, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_128, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_SINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_256, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_SINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2dq_512, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTTP2SI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_256, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_SINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2qq_512, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_SINT, X86ISD::CVTTP2SI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_128, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_UINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_256, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_UINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2udq_512, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTTP2UI, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_256, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_UINT, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvttps2uqq_512, INTR_TYPE_1OP_MASK,
	ISD::FP_TO_UINT, X86ISD::CVTTP2UI_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_128, INTR_TYPE_1OP_MASK,
	ISD::UINT_TO_FP, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_256, INTR_TYPE_1OP_MASK,
	ISD::UINT_TO_FP, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtudq2ps_512, INTR_TYPE_1OP_MASK,
	ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_128, INTR_TYPE_1OP_MASK,
	ISD::UINT_TO_FP, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_256, INTR_TYPE_1OP_MASK,
	ISD::UINT_TO_FP, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtuqq2pd_512, INTR_TYPE_1OP_MASK,
	ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
	X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTUI2P, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_256, INTR_TYPE_1OP_MASK,
	ISD::UINT_TO_FP, 0),
	X86_INTRINSIC_DATA(avx512_mask_cvtuqq2ps_512, INTR_TYPE_1OP_MASK,
	ISD::UINT_TO_FP, X86ISD::UINT_TO_FP_RND),
	X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_128, INTR_TYPE_3OP_IMM8_MASK,
	X86ISD::DBPSADBW, 0),
	X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_256, INTR_TYPE_3OP_IMM8_MASK,
	X86ISD::DBPSADBW, 0),
	X86_INTRINSIC_DATA(avx512_mask_dbpsadbw_512, INTR_TYPE_3OP_IMM8_MASK,
	X86ISD::DBPSADBW, 0),
	X86_INTRINSIC_DATA(avx512_mask_div_pd_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
	X86ISD::FDIV_RND),
	X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
	X86ISD::FDIV_RND),
	X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FDIVS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FDIVS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_b_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_b_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_b_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_d_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_pd_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_pd_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_pd_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_ps_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_ps_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_ps_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_q_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_q_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_w_128, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_w_256, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_expand_w_512, COMPRESS_EXPAND_IN_REG,
	X86ISD::EXPAND, 0),
	X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_mask_fixupimm_ps_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_mask_fixupimm_sd, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
	X86_INTRINSIC_DATA(avx512_mask_fixupimm_ss, FIXUPIMMS, X86ISD::VFIXUPIMMS, 0),
	X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_128, FPCLASS, X86ISD::VFPCLASS, 0),
	X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_256, FPCLASS, X86ISD::VFPCLASS, 0),
	X86_INTRINSIC_DATA(avx512_mask_fpclass_pd_512, FPCLASS, X86ISD::VFPCLASS, 0),
	X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_128, FPCLASS, X86ISD::VFPCLASS, 0),
	X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_256, FPCLASS, X86ISD::VFPCLASS, 0),
	X86_INTRINSIC_DATA(avx512_mask_fpclass_ps_512, FPCLASS, X86ISD::VFPCLASS, 0),
	X86_INTRINSIC_DATA(avx512_mask_fpclass_sd, FPCLASSS, X86ISD::VFPCLASSS, 0),
	X86_INTRINSIC_DATA(avx512_mask_fpclass_ss, FPCLASSS, X86ISD::VFPCLASSS, 0),
	X86_INTRINSIC_DATA(avx512_mask_getexp_pd_128, INTR_TYPE_1OP_MASK_RM,
	X86ISD::FGETEXP_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_getexp_pd_256, INTR_TYPE_1OP_MASK_RM,
	X86ISD::FGETEXP_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_getexp_pd_512, INTR_TYPE_1OP_MASK_RM,
	X86ISD::FGETEXP_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_getexp_ps_128, INTR_TYPE_1OP_MASK_RM,
	X86ISD::FGETEXP_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_getexp_ps_256, INTR_TYPE_1OP_MASK_RM,
	X86ISD::FGETEXP_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_getexp_ps_512, INTR_TYPE_1OP_MASK_RM,
	X86ISD::FGETEXP_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_getexp_sd, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FGETEXPS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FGETEXPS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK,
	X86ISD::VGETMANT, 0),
	X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK,
	X86ISD::VGETMANT, 0),
	X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK,
	X86ISD::VGETMANT, X86ISD::VGETMANT_RND),
	X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK,
	X86ISD::VGETMANT, 0),
	X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK,
	X86ISD::VGETMANT, 0),
	X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK,
	X86ISD::VGETMANT, X86ISD::VGETMANT_RND),
	X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK,
	X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
	X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK,
	X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
	X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
	X86ISD::FMAX_RND),
	X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
	X86ISD::FMAX_RND),
	X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK,
	X86ISD::FMAXS, X86ISD::FMAXS_RND),
	X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK,
	X86ISD::FMAXS, X86ISD::FMAXS_RND),
	X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
	X86ISD::FMIN_RND),
	X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
	X86ISD::FMIN_RND),
	X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK,
	X86ISD::FMINS, X86ISD::FMINS_RND),
	X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK,
	X86ISD::FMINS, X86ISD::FMINS_RND),
	X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
	X86ISD::FMUL_RND),
	X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
	X86ISD::FMUL_RND),
	X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FMULS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FMULS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
	X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
	X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
	X86_INTRINSIC_DATA(avx512_mask_padds_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
	X86_INTRINSIC_DATA(avx512_mask_padds_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
	X86_INTRINSIC_DATA(avx512_mask_padds_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
	X86_INTRINSIC_DATA(avx512_mask_paddus_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_paddus_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_paddus_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_di_256, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_di_512, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_hi_128, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_hi_256, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_hi_512, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_qi_128, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_qi_256, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_qi_512, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_sf_256, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_sf_512, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_si_256, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_permvar_si_512, VPERM_2OP_MASK,
	X86ISD::VPERMV, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_128, INTR_TYPE_2OP_MASK,
	X86ISD::VPMADDUBSW, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_256, INTR_TYPE_2OP_MASK,
	X86ISD::VPMADDUBSW, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmaddubs_w_512, INTR_TYPE_2OP_MASK,
	X86ISD::VPMADDUBSW, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_128, INTR_TYPE_2OP_MASK,
	X86ISD::VPMADDWD, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_256, INTR_TYPE_2OP_MASK,
	X86ISD::VPMADDWD, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmaddw_d_512, INTR_TYPE_2OP_MASK,
	X86ISD::VPMADDWD, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_db_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_db_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_db_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_dw_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_dw_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_dw_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qb_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qb_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qb_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qd_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qd_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qd_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qw_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qw_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_qw_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_wb_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_wb_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmov_wb_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNC, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_db_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_db_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_db_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_dw_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_qw_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovs_wb_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_db_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_db_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_db_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_dw_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qb_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qd_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_qw_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_128, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_256, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmovus_wb_512, INTR_TYPE_1OP_MASK,
	X86ISD::VTRUNCUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_128, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_256, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmul_hr_sw_512, INTR_TYPE_2OP_MASK, X86ISD::MULHRS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmulh_w_128, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmulh_w_256, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmulh_w_512, INTR_TYPE_2OP_MASK, ISD::MULHS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_128, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_256, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmulhu_w_512, INTR_TYPE_2OP_MASK, ISD::MULHU, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_128, INTR_TYPE_2OP_MASK,
	X86ISD::MULTISHIFT, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_256, INTR_TYPE_2OP_MASK,
	X86ISD::MULTISHIFT, 0),
	X86_INTRINSIC_DATA(avx512_mask_pmultishift_qb_512, INTR_TYPE_2OP_MASK,
	X86ISD::MULTISHIFT, 0),
	X86_INTRINSIC_DATA(avx512_mask_prol_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
	X86_INTRINSIC_DATA(avx512_mask_prol_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
	X86_INTRINSIC_DATA(avx512_mask_prol_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
	X86_INTRINSIC_DATA(avx512_mask_prol_q_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
	X86_INTRINSIC_DATA(avx512_mask_prol_q_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
	X86_INTRINSIC_DATA(avx512_mask_prol_q_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTLI, 0),
	X86_INTRINSIC_DATA(avx512_mask_prolv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
	X86_INTRINSIC_DATA(avx512_mask_prolv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
	X86_INTRINSIC_DATA(avx512_mask_prolv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
	X86_INTRINSIC_DATA(avx512_mask_prolv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
	X86_INTRINSIC_DATA(avx512_mask_prolv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
	X86_INTRINSIC_DATA(avx512_mask_prolv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTL, 0),
	X86_INTRINSIC_DATA(avx512_mask_pror_d_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
	X86_INTRINSIC_DATA(avx512_mask_pror_d_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
	X86_INTRINSIC_DATA(avx512_mask_pror_d_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
	X86_INTRINSIC_DATA(avx512_mask_pror_q_128, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
	X86_INTRINSIC_DATA(avx512_mask_pror_q_256, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
	X86_INTRINSIC_DATA(avx512_mask_pror_q_512, INTR_TYPE_2OP_IMM8_MASK, X86ISD::VROTRI, 0),
	X86_INTRINSIC_DATA(avx512_mask_prorv_d_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
	X86_INTRINSIC_DATA(avx512_mask_prorv_d_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
	X86_INTRINSIC_DATA(avx512_mask_prorv_d_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
	X86_INTRINSIC_DATA(avx512_mask_prorv_q_128, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
	X86_INTRINSIC_DATA(avx512_mask_prorv_q_256, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
	X86_INTRINSIC_DATA(avx512_mask_prorv_q_512, INTR_TYPE_2OP_MASK, ISD::ROTR, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubs_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubs_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubs_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubs_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubs_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubs_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBS, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubus_b_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubus_b_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubus_b_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubus_w_128, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubus_w_256, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_psubus_w_512, INTR_TYPE_2OP_MASK, X86ISD::SUBUS, 0),
	X86_INTRINSIC_DATA(avx512_mask_pternlog_d_128, TERLOG_OP_MASK,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_mask_pternlog_d_256, TERLOG_OP_MASK,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_mask_pternlog_d_512, TERLOG_OP_MASK,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_mask_pternlog_q_128, TERLOG_OP_MASK,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_mask_pternlog_q_256, TERLOG_OP_MASK,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
	X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
	X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
	X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
	X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
	X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
	X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND),
	X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND),
	X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
	X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
	X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND),
	X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
	X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
	X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND),
	X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND),
	X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND),
	X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
	X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
	X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND),
	X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
	X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
	X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND),
	X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK,
	X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND),
	X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK,
	X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND),
	X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM,
	X86ISD::SCALEF, 0),
	X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM,
	X86ISD::SCALEF, 0),
	X86_INTRINSIC_DATA(avx512_mask_scalef_pd_512, INTR_TYPE_2OP_MASK_RM,
	X86ISD::SCALEF, 0),
	X86_INTRINSIC_DATA(avx512_mask_scalef_ps_128, INTR_TYPE_2OP_MASK_RM,
	X86ISD::SCALEF, 0),
	X86_INTRINSIC_DATA(avx512_mask_scalef_ps_256, INTR_TYPE_2OP_MASK_RM,
	X86ISD::SCALEF, 0),
	X86_INTRINSIC_DATA(avx512_mask_scalef_ps_512, INTR_TYPE_2OP_MASK_RM,
	X86ISD::SCALEF, 0),
	X86_INTRINSIC_DATA(avx512_mask_scalef_sd, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::SCALEFS, 0),
	X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::SCALEFS, 0),
	X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
	X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
	X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
	X86ISD::FSQRT_RND),
	X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
	X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
	X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
	X86ISD::FSQRT_RND),
	X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FSQRTS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FSQRTS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
	X86ISD::FSUB_RND),
	X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
	X86ISD::FSUB_RND),
	X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FSUBS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
	X86ISD::FSUBS_RND, 0),
	X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK,
	X86ISD::CVTPH2PS, 0),
	X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK,
	X86ISD::CVTPH2PS, 0),
	X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK,
	X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND),
	X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK,
	X86ISD::CVTPS2PH, 0),
	X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK,
	X86ISD::CVTPS2PH, 0),
	X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK,
	X86ISD::CVTPS2PH, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, ISD::FMA,
	X86ISD::FMADD_RND),
	X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, ISD::FMA,
	X86ISD::FMADD_RND),

	X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
	X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
	X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
	X86ISD::FMADDSUB_RND),
	X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_ps_512, FMA_OP_MASK, X86ISD::FMADDSUB,
	X86ISD::FMADDSUB_RND),

	X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfnmadd_pd_512, FMA_OP_MASK, X86ISD::FNMADD,
	X86ISD::FNMADD_RND),
	X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_128, FMA_OP_MASK, X86ISD::FNMADD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_256, FMA_OP_MASK, X86ISD::FNMADD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfnmadd_ps_512, FMA_OP_MASK, X86ISD::FNMADD,
	X86ISD::FNMADD_RND),

	X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfnmsub_pd_512, FMA_OP_MASK, X86ISD::FNMSUB,
	X86ISD::FNMSUB_RND),
	X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_128, FMA_OP_MASK, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_256, FMA_OP_MASK, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
	X86ISD::FNMSUB_RND),

	X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_128, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_256, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_512, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_128, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_256, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_512, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_128, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_256, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_512, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_128, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_256, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_512, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),

	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_512, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_128, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_256, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_512, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_128, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_256, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_512, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_128, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_256, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_512, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_128, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_256, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_128, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_256, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_512, VPERM_3OP_MASK,
	X86ISD::VPERMIV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_128, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_256, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , IFMA_OP_MASK,
	X86ISD::VPMADD52H, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , IFMA_OP_MASK,
	X86ISD::VPMADD52H, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , IFMA_OP_MASK,
	X86ISD::VPMADD52H, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , IFMA_OP_MASK,
	X86ISD::VPMADD52L, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , IFMA_OP_MASK,
	X86ISD::VPMADD52L, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , IFMA_OP_MASK,
	X86ISD::VPMADD52L, 0),

	X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshld_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshld_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshld_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshld_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshld_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshld_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshld_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshldv_q_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshldv_q_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshldv_q_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_128, FMA_OP_MASK, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_256, FMA_OP_MASK, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_512, FMA_OP_MASK, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrdv_q_128, FMA_OP_MASK, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrdv_q_256, FMA_OP_MASK, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrdv_q_512, FMA_OP_MASK, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrdv_w_128, FMA_OP_MASK, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrdv_w_256, FMA_OP_MASK, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshrdv_w_512, FMA_OP_MASK, X86ISD::VSHRDV, 0),

	X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_128, CMP_MASK,
	X86ISD::VPSHUFBITQMB, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_256, CMP_MASK,
	X86ISD::VPSHUFBITQMB, 0),
	X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_512, CMP_MASK,
	X86ISD::VPSHUFBITQMB, 0),

	X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, ISD::FMA,
	X86ISD::FMADD_RND),
	X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, ISD::FMA,
	X86ISD::FMADD_RND),

	X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
	X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
	X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
	X86ISD::FMADDSUB_RND),
	X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_ps_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
	X86ISD::FMADDSUB_RND),

	X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsub_pd_512, FMA_OP_MASK3, X86ISD::FMSUB,
	X86ISD::FMSUB_RND),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_128, FMA_OP_MASK3, X86ISD::FMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
	X86ISD::FMSUB_RND),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),

	X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
	X86ISD::FMSUBADD_RND),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_ps_512, FMA_OP_MASK3, X86ISD::FMSUBADD,
	X86ISD::FMSUBADD_RND),

	X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_pd_512, FMA_OP_MASK3, X86ISD::FNMSUB,
	X86ISD::FNMSUB_RND),
	X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_128, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
	X86ISD::FNMSUB_RND),
	X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
	X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
	X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
	X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
	X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_512, FIXUPIMM_MASKZ,
	X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_128, FIXUPIMM_MASKZ,
	X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_256, FIXUPIMM_MASKZ,
	X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ps_512, FIXUPIMM_MASKZ,
	X86ISD::VFIXUPIMM, 0),
	X86_INTRINSIC_DATA(avx512_maskz_fixupimm_sd, FIXUPIMMS_MASKZ,
	X86ISD::VFIXUPIMMS, 0),
	X86_INTRINSIC_DATA(avx512_maskz_fixupimm_ss, FIXUPIMMS_MASKZ,
	X86ISD::VFIXUPIMMS, 0),
	X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_128, TERLOG_OP_MASKZ,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_256, TERLOG_OP_MASKZ,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_maskz_pternlog_d_512, TERLOG_OP_MASKZ,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_128, TERLOG_OP_MASKZ,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_256, TERLOG_OP_MASKZ,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ,
	X86ISD::VPTERNLOG, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, ISD::FMA,
	X86ISD::FMADD_RND),
	X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, ISD::FMA, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, ISD::FMA,
	X86ISD::FMADD_RND),

	X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
	X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
	X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
	X86ISD::FMADDSUB_RND),
	X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
	X86ISD::FMADDSUB_RND),

	X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_128, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_256, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_512, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_128, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_256, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_512, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_128, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_256, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_512, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_128, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_256, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_512, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),

	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_512, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_128, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_256, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_512, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_128, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_256, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_512, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_128, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_256, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_512, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_128, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_256, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_128, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_256, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
	X86ISD::VPERMV3, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, IFMA_OP_MASKZ,
	X86ISD::VPMADD52H, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, IFMA_OP_MASKZ,
	X86ISD::VPMADD52H, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, IFMA_OP_MASKZ,
	X86ISD::VPMADD52H, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, IFMA_OP_MASKZ,
	X86ISD::VPMADD52L, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, IFMA_OP_MASKZ,
	X86ISD::VPMADD52L, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, IFMA_OP_MASKZ,
	X86ISD::VPMADD52L, 0),

	X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshldv_q_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshldv_q_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshldv_q_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshldv_w_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshldv_w_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshldv_w_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_d_128, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_d_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_d_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_q_128, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_q_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_q_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_128, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
	X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),

	X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
	X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
	X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
	X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
	X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
	X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
	X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
	X86_INTRINSIC_DATA(avx512_pshuf_b_512, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
	X86_INTRINSIC_DATA(avx512_psll_d_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
	X86_INTRINSIC_DATA(avx512_psll_q_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
	X86_INTRINSIC_DATA(avx512_psll_w_512, INTR_TYPE_2OP, X86ISD::VSHL, 0),
	X86_INTRINSIC_DATA(avx512_pslli_d_512, VSHIFT, X86ISD::VSHLI, 0),
	X86_INTRINSIC_DATA(avx512_pslli_q_512, VSHIFT, X86ISD::VSHLI, 0),
	X86_INTRINSIC_DATA(avx512_pslli_w_512, VSHIFT, X86ISD::VSHLI, 0),
	X86_INTRINSIC_DATA(avx512_psllv_d_512, INTR_TYPE_2OP, ISD::SHL, 0),
	X86_INTRINSIC_DATA(avx512_psllv_q_512, INTR_TYPE_2OP, ISD::SHL, 0),
	X86_INTRINSIC_DATA(avx512_psllv_w_128, INTR_TYPE_2OP, ISD::SHL, 0),
	X86_INTRINSIC_DATA(avx512_psllv_w_256, INTR_TYPE_2OP, ISD::SHL, 0),
	X86_INTRINSIC_DATA(avx512_psllv_w_512, INTR_TYPE_2OP, ISD::SHL, 0),
	X86_INTRINSIC_DATA(avx512_psra_d_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
	X86_INTRINSIC_DATA(avx512_psra_q_128, INTR_TYPE_2OP, X86ISD::VSRA, 0),
	X86_INTRINSIC_DATA(avx512_psra_q_256, INTR_TYPE_2OP, X86ISD::VSRA, 0),
	X86_INTRINSIC_DATA(avx512_psra_q_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
	X86_INTRINSIC_DATA(avx512_psra_w_512, INTR_TYPE_2OP, X86ISD::VSRA, 0),
	X86_INTRINSIC_DATA(avx512_psrai_d_512, VSHIFT, X86ISD::VSRAI, 0),
	X86_INTRINSIC_DATA(avx512_psrai_q_128, VSHIFT, X86ISD::VSRAI, 0),
	X86_INTRINSIC_DATA(avx512_psrai_q_256, VSHIFT, X86ISD::VSRAI, 0),
	X86_INTRINSIC_DATA(avx512_psrai_q_512, VSHIFT, X86ISD::VSRAI, 0),
	X86_INTRINSIC_DATA(avx512_psrai_w_512, VSHIFT, X86ISD::VSRAI, 0),
	X86_INTRINSIC_DATA(avx512_psrav_d_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
	X86_INTRINSIC_DATA(avx512_psrav_q_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
	X86_INTRINSIC_DATA(avx512_psrav_q_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
	X86_INTRINSIC_DATA(avx512_psrav_q_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
	X86_INTRINSIC_DATA(avx512_psrav_w_128, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
	X86_INTRINSIC_DATA(avx512_psrav_w_256, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
	X86_INTRINSIC_DATA(avx512_psrav_w_512, INTR_TYPE_2OP, X86ISD::VSRAV, 0),
	X86_INTRINSIC_DATA(avx512_psrl_d_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
	X86_INTRINSIC_DATA(avx512_psrl_q_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
	X86_INTRINSIC_DATA(avx512_psrl_w_512, INTR_TYPE_2OP, X86ISD::VSRL, 0),
	X86_INTRINSIC_DATA(avx512_psrli_d_512, VSHIFT, X86ISD::VSRLI, 0),
	X86_INTRINSIC_DATA(avx512_psrli_q_512, VSHIFT, X86ISD::VSRLI, 0),
	X86_INTRINSIC_DATA(avx512_psrli_w_512, VSHIFT, X86ISD::VSRLI, 0),
	X86_INTRINSIC_DATA(avx512_psrlv_d_512, INTR_TYPE_2OP, ISD::SRL, 0),
	X86_INTRINSIC_DATA(avx512_psrlv_q_512, INTR_TYPE_2OP, ISD::SRL, 0),
	X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
	X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
	X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
	X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
	X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
	X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
	X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
	X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
	X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
	X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
	X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
	X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
	X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
	X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
	X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
	X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
	X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
	X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
	X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
	X86_INTRINSIC_DATA(avx512_vcvtsd2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
	X86_INTRINSIC_DATA(avx512_vcvtsd2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
	X86_INTRINSIC_DATA(avx512_vcvtsd2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
	X86_INTRINSIC_DATA(avx512_vcvtss2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
	X86_INTRINSIC_DATA(avx512_vcvtss2si64, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
	X86_INTRINSIC_DATA(avx512_vcvtss2usi32, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
	X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
	X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
	X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
	X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, ISD::FMA, 0),
	X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, ISD::FMA, 0),
	X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, ISD::FMA, 0),
	X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, ISD::FMA, 0),
	X86_INTRINSIC_DATA(fma_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
	X86_INTRINSIC_DATA(fma_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
	X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
	X86_INTRINSIC_DATA(fma_vfmsub_pd, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
	X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
	X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
	X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
	X86_INTRINSIC_DATA(fma_vfmsub_sd, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
	X86_INTRINSIC_DATA(fma_vfmsub_ss, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
	X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
	X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
	X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
	X86_INTRINSIC_DATA(fma_vfmsubadd_ps_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
	X86_INTRINSIC_DATA(fma_vfnmadd_pd, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
	X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
	X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
	X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
	X86_INTRINSIC_DATA(fma_vfnmadd_sd, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
	X86_INTRINSIC_DATA(fma_vfnmadd_ss, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
	X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
	X86_INTRINSIC_DATA(fma_vfnmsub_sd, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
	X86_INTRINSIC_DATA(fma_vfnmsub_ss, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
	X86_INTRINSIC_DATA(fma4_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
	X86_INTRINSIC_DATA(fma4_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
	X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
	X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
	X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
	X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
	X86_INTRINSIC_DATA(sse_comile_ss, COMI, X86ISD::COMI, ISD::SETLE),
	X86_INTRINSIC_DATA(sse_comilt_ss, COMI, X86ISD::COMI, ISD::SETLT),
	X86_INTRINSIC_DATA(sse_comineq_ss, COMI, X86ISD::COMI, ISD::SETNE),
	X86_INTRINSIC_DATA(sse_max_ps, INTR_TYPE_2OP, X86ISD::FMAX, 0),
	X86_INTRINSIC_DATA(sse_max_ss, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
	X86_INTRINSIC_DATA(sse_min_ps, INTR_TYPE_2OP, X86ISD::FMIN, 0),
	X86_INTRINSIC_DATA(sse_min_ss, INTR_TYPE_2OP, X86ISD::FMINS, 0),
	X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
	X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
	X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
	X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
	X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
	X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
	X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
	X86_INTRINSIC_DATA(sse_ucomile_ss, COMI, X86ISD::UCOMI, ISD::SETLE),
	X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT),
	X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE),
	X86_INTRINSIC_DATA(sse2_cmp_pd, INTR_TYPE_3OP, X86ISD::CMPP, 0),
	X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ),
	X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE),
	X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT),
	X86_INTRINSIC_DATA(sse2_comile_sd, COMI, X86ISD::COMI, ISD::SETLE),
	X86_INTRINSIC_DATA(sse2_comilt_sd, COMI, X86ISD::COMI, ISD::SETLT),
	X86_INTRINSIC_DATA(sse2_comineq_sd, COMI, X86ISD::COMI, ISD::SETNE),
	X86_INTRINSIC_DATA(sse2_cvtdq2ps, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
	X86_INTRINSIC_DATA(sse2_cvtpd2dq, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
	X86_INTRINSIC_DATA(sse2_cvtpd2ps, INTR_TYPE_1OP, X86ISD::VFPROUND, 0),
	X86_INTRINSIC_DATA(sse2_cvttpd2dq, INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
	X86_INTRINSIC_DATA(sse2_cvttps2dq, INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
	X86_INTRINSIC_DATA(sse2_max_pd, INTR_TYPE_2OP, X86ISD::FMAX, 0),
	X86_INTRINSIC_DATA(sse2_max_sd, INTR_TYPE_2OP, X86ISD::FMAXS, 0),
	X86_INTRINSIC_DATA(sse2_min_pd, INTR_TYPE_2OP, X86ISD::FMIN, 0),
	X86_INTRINSIC_DATA(sse2_min_sd, INTR_TYPE_2OP, X86ISD::FMINS, 0),
	X86_INTRINSIC_DATA(sse2_movmsk_pd, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
	X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
	X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
	X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
	X86_INTRINSIC_DATA(sse2_padds_b, INTR_TYPE_2OP, X86ISD::ADDS, 0),
	X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
	X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
	X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
	X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
	X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
	X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
	X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0),
	X86_INTRINSIC_DATA(sse2_pmulu_dq, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
	X86_INTRINSIC_DATA(sse2_psad_bw, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
	X86_INTRINSIC_DATA(sse2_psll_d, INTR_TYPE_2OP, X86ISD::VSHL, 0),
	X86_INTRINSIC_DATA(sse2_psll_q, INTR_TYPE_2OP, X86ISD::VSHL, 0),
	X86_INTRINSIC_DATA(sse2_psll_w, INTR_TYPE_2OP, X86ISD::VSHL, 0),
	X86_INTRINSIC_DATA(sse2_pslli_d, VSHIFT, X86ISD::VSHLI, 0),
	X86_INTRINSIC_DATA(sse2_pslli_q, VSHIFT, X86ISD::VSHLI, 0),
	X86_INTRINSIC_DATA(sse2_pslli_w, VSHIFT, X86ISD::VSHLI, 0),
	X86_INTRINSIC_DATA(sse2_psra_d, INTR_TYPE_2OP, X86ISD::VSRA, 0),
	X86_INTRINSIC_DATA(sse2_psra_w, INTR_TYPE_2OP, X86ISD::VSRA, 0),
	X86_INTRINSIC_DATA(sse2_psrai_d, VSHIFT, X86ISD::VSRAI, 0),
	X86_INTRINSIC_DATA(sse2_psrai_w, VSHIFT, X86ISD::VSRAI, 0),
	X86_INTRINSIC_DATA(sse2_psrl_d, INTR_TYPE_2OP, X86ISD::VSRL, 0),
	X86_INTRINSIC_DATA(sse2_psrl_q, INTR_TYPE_2OP, X86ISD::VSRL, 0),
	X86_INTRINSIC_DATA(sse2_psrl_w, INTR_TYPE_2OP, X86ISD::VSRL, 0),
	X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
	X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
	X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
	X86_INTRINSIC_DATA(sse2_psubs_b, INTR_TYPE_2OP, X86ISD::SUBS, 0),
	X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
	X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
	X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
	X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
	X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
	X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
	X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),
	X86_INTRINSIC_DATA(sse2_ucomile_sd, COMI, X86ISD::UCOMI, ISD::SETLE),
	X86_INTRINSIC_DATA(sse2_ucomilt_sd, COMI, X86ISD::UCOMI, ISD::SETLT),
	X86_INTRINSIC_DATA(sse2_ucomineq_sd, COMI, X86ISD::UCOMI, ISD::SETNE),
	X86_INTRINSIC_DATA(sse3_addsub_pd, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
	X86_INTRINSIC_DATA(sse3_addsub_ps, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
	X86_INTRINSIC_DATA(sse3_hadd_pd, INTR_TYPE_2OP, X86ISD::FHADD, 0),
	X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0),
	X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
	X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
	X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
	X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
	X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
	X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
	X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
	X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
	X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0),
	X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0),
	X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
	X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
	X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
	X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
	X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
	X86_INTRINSIC_DATA(ssse3_phsub_w_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
	X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
	X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
	X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
	X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
	X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
	X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
	X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),

	X86_INTRINSIC_DATA(vgf2p8affineinvqb_128, INTR_TYPE_3OP,
	X86ISD::GF2P8AFFINEINVQB, 0),
	X86_INTRINSIC_DATA(vgf2p8affineinvqb_256, INTR_TYPE_3OP,
	X86ISD::GF2P8AFFINEINVQB, 0),
	X86_INTRINSIC_DATA(vgf2p8affineinvqb_512, INTR_TYPE_3OP,
	X86ISD::GF2P8AFFINEINVQB, 0),
	X86_INTRINSIC_DATA(vgf2p8affineqb_128, INTR_TYPE_3OP,
	X86ISD::GF2P8AFFINEQB, 0),
	X86_INTRINSIC_DATA(vgf2p8affineqb_256, INTR_TYPE_3OP,
	X86ISD::GF2P8AFFINEQB, 0),
	X86_INTRINSIC_DATA(vgf2p8affineqb_512, INTR_TYPE_3OP,
	X86ISD::GF2P8AFFINEQB, 0),
	X86_INTRINSIC_DATA(vgf2p8mulb_128, INTR_TYPE_2OP,
	X86ISD::GF2P8MULB, 0),
	X86_INTRINSIC_DATA(vgf2p8mulb_256, INTR_TYPE_2OP,
	X86ISD::GF2P8MULB, 0),
	X86_INTRINSIC_DATA(vgf2p8mulb_512, INTR_TYPE_2OP,
	X86ISD::GF2P8MULB, 0),

	X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
	X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
	X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
	X86_INTRINSIC_DATA(xop_vpcomub, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
	X86_INTRINSIC_DATA(xop_vpcomud, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
	X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
	X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0),
	X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
	X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
	X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
	X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
	X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
	X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0),
	X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, ISD::ROTL, 0),
	X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VROTLI, 0),
	X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, ISD::ROTL, 0),
	X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VROTLI, 0),
	X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, ISD::ROTL, 0),
	X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VROTLI, 0),
	X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, ISD::ROTL, 0),
	X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VROTLI, 0),
	X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
	X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
	X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
	X86_INTRINSIC_DATA(xop_vpshaw, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
	X86_INTRINSIC_DATA(xop_vpshlb, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
	X86_INTRINSIC_DATA(xop_vpshld, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
	X86_INTRINSIC_DATA(xop_vpshlq, INTR_TYPE_2OP, X86ISD::VPSHL, 0),
	X86_INTRINSIC_DATA(xop_vpshlw, INTR_TYPE_2OP, X86ISD::VPSHL, 0)
	};

	/// Look up the table entry for a chain-less intrinsic.
	///
	/// Searches IntrinsicsWithoutChain with std::lower_bound, so the table must
	/// already be sorted (verifyIntrinsicTables asserts this in debug builds).
	///
	/// \param IntNo intrinsic number to search for.
	/// \returns a pointer to the matching table entry, or nullptr when the
	/// table holds no entry that compares equal to the probe built from IntNo.
	static const IntrinsicData* getIntrinsicWithoutChain(uint16_t IntNo) {
	  // Probe record: only the intrinsic number is meaningful; the remaining
	  // fields are placeholders.
	  IntrinsicData Key = { IntNo, INTR_NO_TYPE, 0, 0 };

	  const IntrinsicData *TableEnd = std::end(IntrinsicsWithoutChain);
	  const IntrinsicData *Pos =
	      std::lower_bound(std::begin(IntrinsicsWithoutChain), TableEnd, Key);

	  // lower_bound returns the first entry that is not less than Key; that is
	  // a hit only if it lies inside the table and actually compares equal.
	  const bool Found = (Pos != TableEnd) && (*Pos == Key);
	  return Found ? Pos : nullptr;
	}

	// Sanity-check the two intrinsic tables, IntrinsicsWithoutChain and
	// IntrinsicsWithChain. Both checks are written as assert() expressions, so
	// this function does nothing when assertions are compiled out (NDEBUG).
	static void verifyIntrinsicTables() {
	// Sortedness matters because getIntrinsicWithoutChain searches its table
	// with std::lower_bound, which requires sorted input.
	assert(std::is_sorted(std::begin(IntrinsicsWithoutChain),
	std::end(IntrinsicsWithoutChain)) &&
	std::is_sorted(std::begin(IntrinsicsWithChain),
	std::end(IntrinsicsWithChain)) &&
	"Intrinsic data tables should be sorted by Intrinsic ID");
	// adjacent_find returns end() when no two neighbouring entries compare
	// equal; in a sorted table that means no entry is duplicated.
	assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain),
	std::end(IntrinsicsWithoutChain)) ==
	std::end(IntrinsicsWithoutChain)) &&
	(std::adjacent_find(std::begin(IntrinsicsWithChain),
	std::end(IntrinsicsWithChain)) ==
	std::end(IntrinsicsWithChain)) &&
	"Intrinsic data tables should have unique entries");
	}
	} // End llvm namespace

	#endif
	Index: head/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp (revision 329409)
	+++ head/contrib/llvm/lib/Target/X86/X86RetpolineThunks.cpp (revision 329410)
	@@ -1,311 +1,265 @@
	//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	/// \file
	///
	/// Pass that injects an MI thunk implementing a "retpoline". This is
	/// a RET-implemented trampoline that is used to lower indirect calls in a way
	/// that prevents speculation on some x86 processors and can be used to mitigate
	/// security vulnerabilities due to targeted speculative execution and side
	/// channels such as CVE-2017-5715.
	///
	/// TODO(chandlerc): All of this code could use better comments and
	/// documentation.
	///
	//===----------------------------------------------------------------------===//

	#include "X86.h"
	#include "X86InstrBuilder.h"
	#include "X86Subtarget.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/Passes.h"
	#include "llvm/CodeGen/TargetPassConfig.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Module.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"

	using namespace llvm;

	#define DEBUG_TYPE "x86-retpoline-thunks"

	// Symbol names of the retpoline thunk functions handled by this pass. Every
	// thunk name begins with ThunkNamePrefix; the suffix names the register the
	// thunk reads its target from (the removed "push" variant took no register).
	static const char ThunkNamePrefix[] = "__llvm_retpoline_";
	static const char R11ThunkName[] = "__llvm_retpoline_r11";
	static const char EAXThunkName[] = "__llvm_retpoline_eax";
	static const char ECXThunkName[] = "__llvm_retpoline_ecx";
	static const char EDXThunkName[] = "__llvm_retpoline_edx";
	-static const char PushThunkName[] = "__llvm_retpoline_push";
	+static const char EDIThunkName[] = "__llvm_retpoline_edi";

	namespace {
	// Machine function pass that adds the retpoline thunk functions to the
	// module and later fills each of them in with machine instructions.
	class X86RetpolineThunks : public MachineFunctionPass {
	public:
	static char ID;

	X86RetpolineThunks() : MachineFunctionPass(ID) {}

	StringRef getPassName() const override { return "X86 Retpoline Thunks"; }

	bool doInitialization(Module &M) override;
	bool runOnMachineFunction(MachineFunction &F) override;

	// MachineModuleInfo is both required and preserved by this pass.
	void getAnalysisUsage(AnalysisUsage &AU) const override {
	MachineFunctionPass::getAnalysisUsage(AU);
	AU.addRequired<MachineModuleInfo>();
	AU.addPreserved<MachineModuleInfo>();
	}

	private:
	// State refreshed at the top of every runOnMachineFunction() call.
	MachineModuleInfo *MMI;
	const TargetMachine *TM;
	bool Is64Bit;
	const X86Subtarget *STI;
	const X86InstrInfo *TII;

	// True once the thunk functions have been created; reset to false in
	// doInitialization().
	bool InsertedThunks;

	// Helpers that create an empty thunk function and emit its machine code.
	void createThunkFunction(Module &M, StringRef Name);
	void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
	- void insert32BitPushReturnAddrClobber(MachineBasicBlock &MBB);
	void populateThunk(MachineFunction &MF, Optional<unsigned> Reg = None);
	};

	} // end anonymous namespace

	// Factory function: hands back a newly allocated X86RetpolineThunks pass.
	FunctionPass *llvm::createX86RetpolineThunksPass() {
	  FunctionPass *ThunkPass = new X86RetpolineThunks();
	  return ThunkPass;
	}

	char X86RetpolineThunks::ID = 0;

	// Clears the "thunks already inserted" flag for the module being started.
	// Always returns false: the module itself is not touched here.
	bool X86RetpolineThunks::doInitialization(Module &M) {
	  const bool ChangedModule = false;
	  InsertedThunks = false;
	  return ChangedModule;
	}

	// Pass entry point, with two modes selected by the function's name:
	//  * An ordinary function (name does not start with ThunkNamePrefix): if no
	//    thunks were inserted yet and the subtarget uses retpolines without
	//    external thunks, create the empty thunk functions in the module.
	//  * A thunk function: populate it with the retpoline machine code for the
	//    register encoded in its name.
	// Returns true when thunks were created or a thunk was populated.
	bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
	DEBUG(dbgs() << getPassName() << '\n');

	TM = &MF.getTarget();;
	STI = &MF.getSubtarget<X86Subtarget>();
	TII = STI->getInstrInfo();
	Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64;

	MMI = &getAnalysis<MachineModuleInfo>();
	Module &M = const_cast<Module &>(*MMI->getModule());

	// If this function is not a thunk, check to see if we need to insert
	// a thunk.
	if (!MF.getName().startswith(ThunkNamePrefix)) {
	// If we've already inserted a thunk, nothing else to do.
	if (InsertedThunks)
	return false;

	// Only add a thunk if one of the functions has the retpoline feature
	// enabled in its subtarget, and doesn't enable external thunks.
	// FIXME: Conditionalize on indirect calls so we don't emit a thunk when
	// nothing will end up calling it.
	// FIXME: It's a little silly to look at every function just to enumerate
	// the subtargets, but eventually we'll want to look at them for indirect
	// calls, so maybe this is OK.
	if (!STI->useRetpoline() || STI->useRetpolineExternalThunk())
	return false;

	// Otherwise, we need to insert the thunk.
	// WARNING: This is not really a well behaving thing to do in a function
	// pass. We extract the module and insert a new function (and machine
	// function) directly into the module.
	if (Is64Bit)
	createThunkFunction(M, R11ThunkName);
	else
	for (StringRef Name :
	- {EAXThunkName, ECXThunkName, EDXThunkName, PushThunkName})
	+ {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName})
	createThunkFunction(M, Name);
	InsertedThunks = true;
	return true;
	}

	// If this is a thunk function, we need to populate it with the correct MI.
	if (Is64Bit) {
	assert(MF.getName() == "__llvm_retpoline_r11" &&
	"Should only have an r11 thunk on 64-bit targets");

	// __llvm_retpoline_r11:
	// callq .Lr11_call_target
	// .Lr11_capture_spec:
	// pause
	// lfence
	// jmp .Lr11_capture_spec
	// .align 16
	// .Lr11_call_target:
	// movq %r11, (%rsp)
	// retq
	populateThunk(MF, X86::R11);
	} else {
	// For 32-bit targets we need to emit a collection of thunks for various
	- // possible scratch registers as well as a fallback that is used when
	- // there are no scratch registers and assumes the retpoline target has
	- // been pushed.
	+ // possible scratch registers as well as a fallback that uses EDI, which is
	+ // normally callee saved.
	// __llvm_retpoline_eax:
	// calll .Leax_call_target
	// .Leax_capture_spec:
	// pause
	// jmp .Leax_capture_spec
	// .align 16
	// .Leax_call_target:
	// movl %eax, (%esp) # Clobber return addr
	// retl
	//
	// __llvm_retpoline_ecx:
	// ... # Same setup
	// movl %ecx, (%esp)
	// retl
	//
	// __llvm_retpoline_edx:
	// ... # Same setup
	// movl %edx, (%esp)
	// retl
	//
	- // This last one is a bit more special and so needs a little extra
	- // handling.
	- // __llvm_retpoline_push:
	- // calll .Lpush_call_target
	- // .Lpush_capture_spec:
	- // pause
	- // lfence
	- // jmp .Lpush_capture_spec
	- // .align 16
	- // .Lpush_call_target:
	- // # Clear pause_loop return address.
	- // addl $4, %esp
	- // # Top of stack words are: Callee, RA. Exchange Callee and RA.
	- // pushl 4(%esp) # Push callee
	- // pushl 4(%esp) # Push RA
	- // popl 8(%esp) # Pop RA to final RA
	- // popl (%esp) # Pop callee to next top of stack
	- // retl # Ret to callee
	+ // __llvm_retpoline_edi:
	+ // ... # Same setup
	+ // movl %edi, (%esp)
	+ // retl
	if (MF.getName() == EAXThunkName)
	populateThunk(MF, X86::EAX);
	else if (MF.getName() == ECXThunkName)
	populateThunk(MF, X86::ECX);
	else if (MF.getName() == EDXThunkName)
	populateThunk(MF, X86::EDX);
	- else if (MF.getName() == PushThunkName)
	- populateThunk(MF);
	+ else if (MF.getName() == EDIThunkName)
	+ populateThunk(MF, X86::EDI);
	else
	llvm_unreachable("Invalid thunk name on x86-32!");
	}

	return true;
	}

	// Adds an IR function called Name to M: type void(), linkonce_odr linkage,
	// hidden visibility, placed in a comdat of the same name, and marked
	// nounwind + naked. Its IR body is a single `ret void`.
	void X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) {
	  assert(Name.startswith(ThunkNamePrefix) &&
	         "Created a thunk with an unexpected prefix!");

	  LLVMContext &Context = M.getContext();
	  FunctionType *ThunkTy = FunctionType::get(Type::getVoidTy(Context), false);
	  Function *Thunk =
	      Function::Create(ThunkTy, GlobalValue::LinkOnceODRLinkage, Name, &M);
	  Thunk->setVisibility(GlobalValue::HiddenVisibility);
	  Thunk->setComdat(M.getOrInsertComdat(Name));

	  // These attributes keep the thunk free of a frame and of unwind
	  // information, and stop it from being inlined.
	  AttrBuilder ThunkAttrs;
	  ThunkAttrs.addAttribute(llvm::Attribute::NoUnwind);
	  ThunkAttrs.addAttribute(llvm::Attribute::Naked);
	  Thunk->addAttributes(llvm::AttributeList::FunctionIndex, ThunkAttrs);

	  // A minimal body is enough for the function to pass verification.
	  BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Thunk);
	  IRBuilder<> IRB(EntryBB);
	  IRB.CreateRetVoid();
	}

	// Appends to MBB a store of Reg to the memory at the stack pointer
	// (offset 0), i.e. over the return address sitting on top of the stack.
	// Uses the 64-bit opcode and RSP when Is64Bit is set, else the 32-bit
	// opcode and ESP.
	void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
	                                                    unsigned Reg) {
	  unsigned StoreOpc = X86::MOV32mr;
	  unsigned StackPtrReg = X86::ESP;
	  if (Is64Bit) {
	    StoreOpc = X86::MOV64mr;
	    StackPtrReg = X86::RSP;
	  }

	  MachineInstrBuilder Store = BuildMI(&MBB, DebugLoc(), TII->get(StoreOpc));
	  addRegOffset(Store, StackPtrReg, false, 0).addReg(Reg);
	}

	-void X86RetpolineThunks::insert32BitPushReturnAddrClobber(
	- MachineBasicBlock &MBB) {
	- // The instruction sequence we use to replace the return address without
	- // a scratch register is somewhat complicated:
	- // # Clear capture_spec from return address.
	- // addl $4, %esp
	- // # Top of stack words are: Callee, RA. Exchange Callee and RA.
	- // pushl 4(%esp) # Push callee
	- // pushl 4(%esp) # Push RA
	- // popl 8(%esp) # Pop RA to final RA
	- // popl (%esp) # Pop callee to next top of stack
	- // retl # Ret to callee
	- BuildMI(&MBB, DebugLoc(), TII->get(X86::ADD32ri), X86::ESP)
	- .addReg(X86::ESP)
	- .addImm(4);
	- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
	- false, 4);
	- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::PUSH32rmm)), X86::ESP,
	- false, 4);
	- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
	- false, 8);
	- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(X86::POP32rmm)), X86::ESP,
	- false, 0);
	-}
	-
	// Replaces the contents of thunk function MF with the retpoline sequence:
	//   entry:        call CallTarget
	//   CaptureSpec:  pause; lfence; jmp CaptureSpec
	//   CallTarget:   store Reg over the return address; ret
	// After this change Reg is dereferenced unconditionally, so it must hold a
	// value on every call.
	void X86RetpolineThunks::populateThunk(MachineFunction &MF,
	Optional<unsigned> Reg) {
	// Set MF properties. We never use vregs...
	MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);

	MachineBasicBlock *Entry = &MF.front();
	Entry->clear();

	MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
	MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
	MF.push_back(CaptureSpec);
	MF.push_back(CallTarget);

	const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
	const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;

	BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addMBB(CallTarget);
	Entry->addSuccessor(CallTarget);
	Entry->addSuccessor(CaptureSpec);
	CallTarget->setHasAddressTaken();

	// In the capture loop for speculation, we want to stop the processor from
	// speculating as fast as possible. On Intel processors, the PAUSE instruction
	// will block speculation without consuming any execution resources. On AMD
	// processors, the PAUSE instruction is (essentially) a nop, so we also use an
	// LFENCE instruction which they have advised will stop speculation as well
	// with minimal resource utilization. We still end the capture with a jump to
	// form an infinite loop to fully guarantee that no matter what implementation
	// of the x86 ISA, speculating this code path never escapes.
	BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE));
	BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE));
	BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec);
	CaptureSpec->setHasAddressTaken();
	CaptureSpec->addSuccessor(CaptureSpec);

	CallTarget->setAlignment(4);
	- if (Reg) {
	- insertRegReturnAddrClobber(*CallTarget, *Reg);
	- } else {
	- assert(!Is64Bit && "We only support non-reg thunks on 32-bit x86!");
	- insert32BitPushReturnAddrClobber(*CallTarget);
	- }
	+ insertRegReturnAddrClobber(*CallTarget, *Reg);
	BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
	}
	Index: head/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
	===================================================================
	--- head/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp (revision 329409)
	+++ head/contrib/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp (revision 329410)
	@@ -1,4391 +1,4403 @@
	//===- InstCombineCalls.cpp -----------------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the visitCall and visitInvoke functions.
	//
	//===----------------------------------------------------------------------===//

	#include "InstCombineInternal.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/Analysis/AssumptionCache.h"
	#include "llvm/Analysis/InstructionSimplify.h"
	#include "llvm/Analysis/MemoryBuiltins.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Statepoint.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/User.h"
	#include "llvm/IR/Value.h"
	#include "llvm/IR/ValueHandle.h"
	#include "llvm/Support/AtomicOrdering.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
	#include "llvm/Transforms/Utils/Local.h"
	#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <cstring>
	#include <utility>
	#include <vector>

	using namespace llvm;
	using namespace PatternMatch;

	#define DEBUG_TYPE "instcombine"

	STATISTIC(NumSimplified, "Number of library calls simplified");

	// Command-line option -unfold-element-atomic-memcpy-max-elements (default
	// 16). SimplifyElementUnorderedAtomicMemCpy gives up when the element count
	// is greater than or equal to this value, so only copies with strictly
	// fewer elements are unfolded.
	static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
	"unfold-element-atomic-memcpy-max-elements",
	cl::init(16),
	cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
	"allowed to unfold"));

	/// Return the specified type promoted as it would be to pass through a
	/// va_arg area: integer types narrower than 32 bits become i32, every other
	/// type is returned unchanged.
	static Type *getPromotedType(Type *Ty) {
	  if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
	    if (ITy->getBitWidth() < 32)
	      return Type::getInt32Ty(Ty->getContext());
	  }
	  return Ty;
	}

	/// Return a constant boolean vector that has true elements in all positions
	/// where the input constant data vector has an element with the sign bit set.
	/// The result has one i1 element per element of \p V; elements of \p V must
	/// be integer or floating-point constants.
	static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
	  SmallVector<Constant *, 32> BoolVec;
	  IntegerType *BoolTy = Type::getInt1Ty(V->getContext());
	  for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
	    Constant *Elt = V->getElementAsConstant(I);
	    assert((isa<ConstantInt>(Elt) || isa<ConstantFP>(Elt)) &&
	           "Unexpected constant data vector element type");
	    // Pick the isNegative() query that matches the element kind.
	    bool Sign = V->getElementType()->isIntegerTy()
	                    ? cast<ConstantInt>(Elt)->isNegative()
	                    : cast<ConstantFP>(Elt)->isNegative();
	    BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
	  }
	  return ConstantVector::get(BoolVec);
	}

	/// Unfold an element-wise atomic memcpy whose length is a compile-time
	/// constant into explicit unordered atomic load/store pairs, one per element.
	///
	/// Returns nullptr (no change) when the length is not a ConstantInt, when the
	/// element count is not below -unfold-element-atomic-memcpy-max-elements, or
	/// when there are elements to copy but the element width is not a legal
	/// integer for the data layout. Otherwise the intrinsic's length is set to
	/// zero and the intrinsic itself is returned.
	Instruction *
	InstCombiner::SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI) {
	// Try to unfold this intrinsic into sequence of explicit atomic loads and
	// stores.
	// First check that number of elements is compile time constant.
	auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
	if (!LengthCI)
	return nullptr;

	// Check that there are not too many elements.
	uint64_t LengthInBytes = LengthCI->getZExtValue();
	uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
	uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
	if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
	return nullptr;

	// Only expand if there are elements to copy.
	if (NumElements > 0) {
	// Don't unfold into illegal integers
	uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
	if (!getDataLayout().isLegalInteger(ElementSizeInBits))
	return nullptr;

	// Cast source and destination to the correct type. Intrinsic input
	// arguments are usually represented as i8*. Often operands will be
	// explicitly casted to i8* and we can just strip those casts instead of
	// inserting new ones. However it's easier to rely on other InstCombine
	// rules which will cover trivial cases anyway.
	Value *Src = AMI->getRawSource();
	Value *Dst = AMI->getRawDest();
	// NOTE(review): the element pointer type is built with the source's
	// address space and then used to cast the destination as well.
	Type *ElementPointerType =
	Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
	Src->getType()->getPointerAddressSpace());

	Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType,
	"memcpy_unfold.src_casted");
	Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType,
	"memcpy_unfold.dst_casted");

	for (uint64_t i = 0; i < NumElements; ++i) {
	// Get current element addresses
	ConstantInt *ElementIdxCI =
	ConstantInt::get(AMI->getContext(), APInt(64, i));
	Value *SrcElementAddr =
	Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
	Value *DstElementAddr =
	Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");

	// Load from the source. Transfer alignment information and mark load as
	// unordered atomic.
	LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val");
	Load->setOrdering(AtomicOrdering::Unordered);
	// We know alignment of the first element. It is also guaranteed by the
	// verifier that element size is less or equal than first element
	// alignment and both of this values are powers of two. This means that
	// all subsequent accesses are at least element size aligned.
	// TODO: We can infer better alignment but there is no evidence that this
	// will matter.
	Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
	: ElementSizeInBytes);
	Load->setDebugLoc(AMI->getDebugLoc());

	// Store loaded value via unordered atomic store.
	StoreInst *Store = Builder.CreateStore(Load, DstElementAddr);
	Store->setOrdering(AtomicOrdering::Unordered);
	Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
	: ElementSizeInBytes);
	Store->setDebugLoc(AMI->getDebugLoc());
	}
	}

	// Set the number of elements of the copy to 0, it will be deleted on the
	// next iteration.
	AMI->setLength(Constant::getNullValue(LengthCI->getType()));
	return AMI;
	}

	/// Simplify a memory transfer intrinsic in one of two ways:
	///  1. If the intrinsic's alignment is lower than what is known for both
	///     pointers, raise it to the smaller of the two known alignments and
	///     return the intrinsic.
	///  2. If the length is a constant 1, 2, 4 or 8 bytes, emit one integer
	///     load/store pair of that width, carrying over volatility, a matching
	///     tbaa.struct tag and parallel-loop metadata, then set the length to
	///     zero and return the intrinsic.
	/// Returns nullptr when neither applies.
	Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
	  unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
	  unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
	  unsigned MinAlign = std::min(DstAlign, SrcAlign);
	  unsigned CopyAlign = MI->getAlignment();

	  if (CopyAlign < MinAlign) {
	    MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), MinAlign, false));
	    return MI;
	  }

	  // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
	  // load/store.
	  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
	  if (!MemOpLength) return nullptr;

	  // Source and destination pointer types are always "i8*" for intrinsic. See
	  // if the size is something we can handle with a single primitive load/store.
	  // A single load+store correctly handles overlapping memory in the memmove
	  // case.
	  uint64_t Size = MemOpLength->getLimitedValue();
	  assert(Size && "0-sized memory transferring should be removed already.");

	  if (Size > 8 || (Size&(Size-1)))
	    return nullptr; // If not 1/2/4/8 bytes, exit.

	  // Use an integer load+store unless we can find something better.
	  unsigned SrcAddrSp =
	      cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
	  unsigned DstAddrSp =
	      cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();

	  IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
	  Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
	  Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);

	  // If the memcpy has metadata describing the members, see if we can get the
	  // TBAA tag describing our copy. Only a single-member description that
	  // starts at offset 0 and spans exactly Size bytes is used.
	  MDNode *CopyMD = nullptr;
	  if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
	    if (M->getNumOperands() == 3 && M->getOperand(0) &&
	        mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
	        mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
	        M->getOperand(1) &&
	        mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
	        mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
	        Size &&
	        M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
	      CopyMD = cast<MDNode>(M->getOperand(2));
	  }

	  // If the memcpy/memmove provides better alignment info than we can
	  // infer, use it.
	  SrcAlign = std::max(SrcAlign, CopyAlign);
	  DstAlign = std::max(DstAlign, CopyAlign);

	  Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
	  Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
	  LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile());
	  L->setAlignment(SrcAlign);
	  if (CopyMD)
	    L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
	  MDNode *LoopMemParallelMD =
	      MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
	  if (LoopMemParallelMD)
	    L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);

	  StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile());
	  S->setAlignment(DstAlign);
	  if (CopyMD)
	    S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
	  if (LoopMemParallelMD)
	    S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);

	  // Set the size of the copy to 0, it will be deleted on the next iteration.
	  MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
	  return MI;
	}

	/// Simplify a memset intrinsic in one of two ways:
	///  1. If the destination's known alignment exceeds the intrinsic's
	///     alignment, raise the intrinsic's alignment and return it.
	///  2. If the length is a constant 1, 2, 4 or 8 and the fill value is a
	///     constant i8, emit a single integer store of the replicated fill byte,
	///     set the length to zero and return the intrinsic.
	/// Returns nullptr when neither applies.
	Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
	  unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
	  if (MI->getAlignment() < Alignment) {
	    MI->setAlignment(ConstantInt::get(MI->getAlignmentType(),
	                                      Alignment, false));
	    return MI;
	  }

	  // Extract the length and alignment and fill if they are constant.
	  ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
	  ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
	  if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
	    return nullptr;
	  uint64_t Len = LenC->getLimitedValue();
	  Alignment = MI->getAlignment();
	  assert(Len && "0-sized memory setting should be removed already.");

	  // memset(s,c,n) -> store s, c (for n=1,2,4,8)
	  if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
	    Type *ITy = IntegerType::get(MI->getContext(), Len*8);  // n=1 -> i8.

	    Value *Dest = MI->getDest();
	    unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
	    Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
	    Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);

	    // Alignment 0 is identity for alignment 1 for memset, but not store.
	    if (Alignment == 0) Alignment = 1;

	    // Extract the fill value and store. Multiplying by 0x0101...01 copies
	    // the fill byte into every byte of a 64-bit word.
	    uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
	    StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
	                                       MI->isVolatile());
	    S->setAlignment(Alignment);

	    // Set the size of the copy to 0, it will be deleted on the next iteration.
	    MI->setLength(Constant::getNullValue(LenC->getType()));
	    return MI;
	  }

	  return nullptr;
	}

	// Attempt to simplify an x86 vector shift intrinsic whose shift count is a
	// single constant amount (operand 1) into a generic IR shift by a splat.
	// The count operand may be a zero aggregate, a constant data vector (its
	// low 64 bits form the count) or a constant integer. Returns nullptr if the
	// count is none of these.
	//
	// Out-of-range counts (>= element width) are handled here because generic
	// IR shifts cannot express them: logical shifts fold to zero, arithmetic
	// shifts are clamped to (element width - 1).
	static Value *simplifyX86immShift(const IntrinsicInst &II,
	                                  InstCombiner::BuilderTy &Builder) {
	  bool LogicalShift = false;
	  bool ShiftLeft = false;

	  switch (II.getIntrinsicID()) {
	  default: llvm_unreachable("Unexpected intrinsic!");
	  case Intrinsic::x86_sse2_psra_d:
	  case Intrinsic::x86_sse2_psra_w:
	  case Intrinsic::x86_sse2_psrai_d:
	  case Intrinsic::x86_sse2_psrai_w:
	  case Intrinsic::x86_avx2_psra_d:
	  case Intrinsic::x86_avx2_psra_w:
	  case Intrinsic::x86_avx2_psrai_d:
	  case Intrinsic::x86_avx2_psrai_w:
	  case Intrinsic::x86_avx512_psra_q_128:
	  case Intrinsic::x86_avx512_psrai_q_128:
	  case Intrinsic::x86_avx512_psra_q_256:
	  case Intrinsic::x86_avx512_psrai_q_256:
	  case Intrinsic::x86_avx512_psra_d_512:
	  case Intrinsic::x86_avx512_psra_q_512:
	  case Intrinsic::x86_avx512_psra_w_512:
	  case Intrinsic::x86_avx512_psrai_d_512:
	  case Intrinsic::x86_avx512_psrai_q_512:
	  case Intrinsic::x86_avx512_psrai_w_512:
	    LogicalShift = false; ShiftLeft = false;
	    break;
	  case Intrinsic::x86_sse2_psrl_d:
	  case Intrinsic::x86_sse2_psrl_q:
	  case Intrinsic::x86_sse2_psrl_w:
	  case Intrinsic::x86_sse2_psrli_d:
	  case Intrinsic::x86_sse2_psrli_q:
	  case Intrinsic::x86_sse2_psrli_w:
	  case Intrinsic::x86_avx2_psrl_d:
	  case Intrinsic::x86_avx2_psrl_q:
	  case Intrinsic::x86_avx2_psrl_w:
	  case Intrinsic::x86_avx2_psrli_d:
	  case Intrinsic::x86_avx2_psrli_q:
	  case Intrinsic::x86_avx2_psrli_w:
	  case Intrinsic::x86_avx512_psrl_d_512:
	  case Intrinsic::x86_avx512_psrl_q_512:
	  case Intrinsic::x86_avx512_psrl_w_512:
	  case Intrinsic::x86_avx512_psrli_d_512:
	  case Intrinsic::x86_avx512_psrli_q_512:
	  case Intrinsic::x86_avx512_psrli_w_512:
	    LogicalShift = true; ShiftLeft = false;
	    break;
	  case Intrinsic::x86_sse2_psll_d:
	  case Intrinsic::x86_sse2_psll_q:
	  case Intrinsic::x86_sse2_psll_w:
	  case Intrinsic::x86_sse2_pslli_d:
	  case Intrinsic::x86_sse2_pslli_q:
	  case Intrinsic::x86_sse2_pslli_w:
	  case Intrinsic::x86_avx2_psll_d:
	  case Intrinsic::x86_avx2_psll_q:
	  case Intrinsic::x86_avx2_psll_w:
	  case Intrinsic::x86_avx2_pslli_d:
	  case Intrinsic::x86_avx2_pslli_q:
	  case Intrinsic::x86_avx2_pslli_w:
	  case Intrinsic::x86_avx512_psll_d_512:
	  case Intrinsic::x86_avx512_psll_q_512:
	  case Intrinsic::x86_avx512_psll_w_512:
	  case Intrinsic::x86_avx512_pslli_d_512:
	  case Intrinsic::x86_avx512_pslli_q_512:
	  case Intrinsic::x86_avx512_pslli_w_512:
	    LogicalShift = true; ShiftLeft = true;
	    break;
	  }
	  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

	  // Simplify if count is constant.
	  auto Arg1 = II.getArgOperand(1);
	  auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
	  auto CDV = dyn_cast<ConstantDataVector>(Arg1);
	  auto CInt = dyn_cast<ConstantInt>(Arg1);
	  if (!CAZ && !CDV && !CInt)
	    return nullptr;

	  // A zero aggregate leaves Count at 0, which is handled as shift-by-zero
	  // below.
	  APInt Count(64, 0);
	  if (CDV) {
	    // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
	    // operand to compute the shift amount.
	    auto VT = cast<VectorType>(CDV->getType());
	    unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits();
	    assert((64 % BitWidth) == 0 && "Unexpected packed shift size");
	    unsigned NumSubElts = 64 / BitWidth;

	    // Concatenate the sub-elements to create the 64-bit value, starting
	    // with the highest sub-element so element 0 ends up in the low bits.
	    for (unsigned i = 0; i != NumSubElts; ++i) {
	      unsigned SubEltIdx = (NumSubElts - 1) - i;
	      auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
	      Count <<= BitWidth;
	      Count |= SubElt->getValue().zextOrTrunc(64);
	    }
	  }
	  else if (CInt)
	    Count = CInt->getValue();

	  auto Vec = II.getArgOperand(0);
	  auto VT = cast<VectorType>(Vec->getType());
	  auto SVT = VT->getElementType();
	  unsigned VWidth = VT->getNumElements();
	  unsigned BitWidth = SVT->getPrimitiveSizeInBits();

	  // If shift-by-zero then just return the original value.
	  if (Count.isNullValue())
	    return Vec;

	  // Handle cases when Shift >= BitWidth.
	  if (Count.uge(BitWidth)) {
	    // If LogicalShift - just return zero.
	    if (LogicalShift)
	      return ConstantAggregateZero::get(VT);

	    // If ArithmeticShift - clamp Shift to (BitWidth - 1).
	    Count = APInt(64, BitWidth - 1);
	  }

	  // Get a constant vector of the same type as the first operand.
	  auto ShiftAmt = ConstantInt::get(SVT, Count.zextOrTrunc(BitWidth));
	  auto ShiftVec = Builder.CreateVectorSplat(VWidth, ShiftAmt);

	  if (ShiftLeft)
	    return Builder.CreateShl(Vec, ShiftVec);

	  if (LogicalShift)
	    return Builder.CreateLShr(Vec, ShiftVec);

	  return Builder.CreateAShr(Vec, ShiftVec);
	}

	// Attempt to simplify AVX2 per-element shift intrinsics to a generic IR shift.
	// Unlike the generic IR shifts, the intrinsics have defined behaviour for out
	// of range shift amounts (logical - set to zero, arithmetic - splat sign bit).
	//
	// Operand 1 must be a Constant whose elements are each undef or a
	// ConstantInt; otherwise nullptr is returned. nullptr is also returned for
	// a logical shift where some, but not all, amounts are out of range.
	static Value *simplifyX86varShift(const IntrinsicInst &II,
	                                  InstCombiner::BuilderTy &Builder) {
	  bool LogicalShift = false;
	  bool ShiftLeft = false;

	  switch (II.getIntrinsicID()) {
	  default: llvm_unreachable("Unexpected intrinsic!");
	  case Intrinsic::x86_avx2_psrav_d:
	  case Intrinsic::x86_avx2_psrav_d_256:
	  case Intrinsic::x86_avx512_psrav_q_128:
	  case Intrinsic::x86_avx512_psrav_q_256:
	  case Intrinsic::x86_avx512_psrav_d_512:
	  case Intrinsic::x86_avx512_psrav_q_512:
	  case Intrinsic::x86_avx512_psrav_w_128:
	  case Intrinsic::x86_avx512_psrav_w_256:
	  case Intrinsic::x86_avx512_psrav_w_512:
	    LogicalShift = false;
	    ShiftLeft = false;
	    break;
	  case Intrinsic::x86_avx2_psrlv_d:
	  case Intrinsic::x86_avx2_psrlv_d_256:
	  case Intrinsic::x86_avx2_psrlv_q:
	  case Intrinsic::x86_avx2_psrlv_q_256:
	  case Intrinsic::x86_avx512_psrlv_d_512:
	  case Intrinsic::x86_avx512_psrlv_q_512:
	  case Intrinsic::x86_avx512_psrlv_w_128:
	  case Intrinsic::x86_avx512_psrlv_w_256:
	  case Intrinsic::x86_avx512_psrlv_w_512:
	    LogicalShift = true;
	    ShiftLeft = false;
	    break;
	  case Intrinsic::x86_avx2_psllv_d:
	  case Intrinsic::x86_avx2_psllv_d_256:
	  case Intrinsic::x86_avx2_psllv_q:
	  case Intrinsic::x86_avx2_psllv_q_256:
	  case Intrinsic::x86_avx512_psllv_d_512:
	  case Intrinsic::x86_avx512_psllv_q_512:
	  case Intrinsic::x86_avx512_psllv_w_128:
	  case Intrinsic::x86_avx512_psllv_w_256:
	  case Intrinsic::x86_avx512_psllv_w_512:
	    LogicalShift = true;
	    ShiftLeft = true;
	    break;
	  }
	  assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");

	  // Simplify if all shift amounts are constant/undef.
	  auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
	  if (!CShift)
	    return nullptr;

	  auto Vec = II.getArgOperand(0);
	  auto VT = cast<VectorType>(II.getType());
	  auto SVT = VT->getVectorElementType();
	  int NumElts = VT->getNumElements();
	  int BitWidth = SVT->getIntegerBitWidth();

	  // Collect each element's shift amount.
	  // We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
	  bool AnyOutOfRange = false;
	  SmallVector<int, 8> ShiftAmts;
	  for (int I = 0; I < NumElts; ++I) {
	    auto *CElt = CShift->getAggregateElement(I);
	    if (CElt && isa<UndefValue>(CElt)) {
	      ShiftAmts.push_back(-1);
	      continue;
	    }

	    auto *COp = dyn_cast_or_null<ConstantInt>(CElt);
	    if (!COp)
	      return nullptr;

	    // Handle out of range shifts.
	    // If LogicalShift - set to BitWidth (special case).
	    // If ArithmeticShift - set to (BitWidth - 1) (sign splat).
	    APInt ShiftVal = COp->getValue();
	    if (ShiftVal.uge(BitWidth)) {
	      // LogicalShift is fixed for the whole call, so this never resets a
	      // previously recorded out-of-range element.
	      AnyOutOfRange = LogicalShift;
	      ShiftAmts.push_back(LogicalShift ? BitWidth : BitWidth - 1);
	      continue;
	    }

	    ShiftAmts.push_back((int)ShiftVal.getZExtValue());
	  }

	  // If all elements out of range or UNDEF, return vector of zeros/undefs.
	  // ArithmeticShift should only hit this if they are all UNDEF.
	  auto OutOfRange = [&](int Idx) { return (Idx < 0) || (BitWidth <= Idx); };
	  if (llvm::all_of(ShiftAmts, OutOfRange)) {
	    SmallVector<Constant *, 8> ConstantVec;
	    for (int Idx : ShiftAmts) {
	      if (Idx < 0) {
	        ConstantVec.push_back(UndefValue::get(SVT));
	      } else {
	        assert(LogicalShift && "Logical shift expected");
	        ConstantVec.push_back(ConstantInt::getNullValue(SVT));
	      }
	    }
	    return ConstantVector::get(ConstantVec);
	  }

	  // We can't handle only some out of range values with generic logical shifts.
	  if (AnyOutOfRange)
	    return nullptr;

	  // Build the shift amount constant vector.
	  SmallVector<Constant *, 8> ShiftVecAmts;
	  for (int Idx : ShiftAmts) {
	    if (Idx < 0)
	      ShiftVecAmts.push_back(UndefValue::get(SVT));
	    else
	      ShiftVecAmts.push_back(ConstantInt::get(SVT, Idx));
	  }
	  auto ShiftVec = ConstantVector::get(ShiftVecAmts);

	  if (ShiftLeft)
	    return Builder.CreateShl(Vec, ShiftVec);

	  if (LogicalShift)
	    return Builder.CreateLShr(Vec, ShiftVec);

	  return Builder.CreateAShr(Vec, ShiftVec);
	}

	// Fold the x86 PMULDQ/PMULUDQ intrinsics. If either operand is undef the
	// result is a zero vector. If both operands are constants the multiply is
	// rewritten as shuffle + sext/zext + mul so that it can be constant folded.
	// Returns nullptr when an operand is neither undef nor constant.
	static Value *simplifyX86muldq(const IntrinsicInst &II,
	                               InstCombiner::BuilderTy &Builder) {
	  Value *Arg0 = II.getArgOperand(0);
	  Value *Arg1 = II.getArgOperand(1);
	  Type *ResTy = II.getType();
	  assert(Arg0->getType()->getScalarSizeInBits() == 32 &&
	         Arg1->getType()->getScalarSizeInBits() == 32 &&
	         ResTy->getScalarSizeInBits() == 64 && "Unexpected muldq/muludq types");

	  // muldq/muludq with an undef operand -> zero (matches generic mul behavior)
	  if (isa<UndefValue>(Arg0) || isa<UndefValue>(Arg1))
	    return ConstantAggregateZero::get(ResTy);

	  // Constant folding.
	  // PMULDQ  = (mul(vXi64 sext(shuffle<0,2,..>(Arg0)),
	  //                vXi64 sext(shuffle<0,2,..>(Arg1))))
	  // PMULUDQ = (mul(vXi64 zext(shuffle<0,2,..>(Arg0)),
	  //                vXi64 zext(shuffle<0,2,..>(Arg1))))
	  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
	    return nullptr;

	  unsigned NumElts = ResTy->getVectorNumElements();
	  assert(Arg0->getType()->getVectorNumElements() == (2 * NumElts) &&
	         Arg1->getType()->getVectorNumElements() == (2 * NumElts) &&
	         "Unexpected muldq/muludq types");

	  unsigned IntrinsicID = II.getIntrinsicID();
	  bool IsSigned = (Intrinsic::x86_sse41_pmuldq == IntrinsicID ||
	                   Intrinsic::x86_avx2_pmul_dq == IntrinsicID ||
	                   Intrinsic::x86_avx512_pmul_dq_512 == IntrinsicID);

	  // Select the even-indexed 32-bit elements (0, 2, 4, ...) of each operand.
	  SmallVector<unsigned, 16> ShuffleMask;
	  for (unsigned i = 0; i != NumElts; ++i)
	    ShuffleMask.push_back(i * 2);

	  auto *LHS = Builder.CreateShuffleVector(Arg0, Arg0, ShuffleMask);
	  auto *RHS = Builder.CreateShuffleVector(Arg1, Arg1, ShuffleMask);

	  if (IsSigned) {
	    LHS = Builder.CreateSExt(LHS, ResTy);
	    RHS = Builder.CreateSExt(RHS, ResTy);
	  } else {
	    LHS = Builder.CreateZExt(LHS, ResTy);
	    RHS = Builder.CreateZExt(RHS, ResTy);
	  }

	  return Builder.CreateMul(LHS, RHS);
	}

	/// Constant fold the x86 PACKSS/PACKUS intrinsics.
	///
	/// \param II       The pack intrinsic call.
	/// \param IsSigned True for signed saturation (PACKSS), false for unsigned
	///                 saturation (PACKUS).
	/// \returns undef if both operands are undef, a constant vector if both
	///          operands are constants whose elements are all ConstantInt or
	///          undef, and nullptr otherwise.
	static Value *simplifyX86pack(IntrinsicInst &II, bool IsSigned) {
	  Value *Arg0 = II.getArgOperand(0);
	  Value *Arg1 = II.getArgOperand(1);
	  Type *ResTy = II.getType();

	  // Fast all undef handling.
	  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
	    return UndefValue::get(ResTy);

	  Type *ArgTy = Arg0->getType();
	  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
	  unsigned NumDstElts = ResTy->getVectorNumElements();
	  unsigned NumSrcElts = ArgTy->getVectorNumElements();
	  assert(NumDstElts == (2 * NumSrcElts) && "Unexpected packing types");

	  unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
	  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
	  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
	  assert(ArgTy->getScalarSizeInBits() == (2 * DstScalarSizeInBits) &&
	         "Unexpected packing types");

	  // Constant folding.
	  auto *Cst0 = dyn_cast<Constant>(Arg0);
	  auto *Cst1 = dyn_cast<Constant>(Arg1);
	  if (!Cst0 || !Cst1)
	    return nullptr;

	  SmallVector<Constant *, 32> Vals;
	  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
	    for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
	      // Within each lane, the low half of the destination elements comes
	      // from Arg0 and the high half from Arg1.
	      unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
	      auto *Cst = (Elt >= NumSrcEltsPerLane) ? Cst1 : Cst0;
	      auto *COp = Cst->getAggregateElement(SrcIdx);
	      if (COp && isa<UndefValue>(COp)) {
	        Vals.push_back(UndefValue::get(ResTy->getScalarType()));
	        continue;
	      }

	      auto *CInt = dyn_cast_or_null<ConstantInt>(COp);
	      if (!CInt)
	        return nullptr;

	      APInt Val = CInt->getValue();
	      assert(Val.getBitWidth() == ArgTy->getScalarSizeInBits() &&
	             "Unexpected constant bitwidth");

	      if (IsSigned) {
	        // PACKSS: Truncate signed value with signed saturation.
	        // Source values less than dst minint are saturated to minint.
	        // Source values greater than dst maxint are saturated to maxint.
	        if (Val.isSignedIntN(DstScalarSizeInBits))
	          Val = Val.trunc(DstScalarSizeInBits);
	        else if (Val.isNegative())
	          Val = APInt::getSignedMinValue(DstScalarSizeInBits);
	        else
	          Val = APInt::getSignedMaxValue(DstScalarSizeInBits);
	      } else {
	        // PACKUS: Truncate signed value with unsigned saturation.
	        // Source values less than zero are saturated to zero.
	        // Source values greater than dst maxuint are saturated to maxuint.
	        if (Val.isIntN(DstScalarSizeInBits))
	          Val = Val.trunc(DstScalarSizeInBits);
	        else if (Val.isNegative())
	          Val = APInt::getNullValue(DstScalarSizeInBits);
	        else
	          Val = APInt::getAllOnesValue(DstScalarSizeInBits);
	      }

	      Vals.push_back(ConstantInt::get(ResTy->getScalarType(), Val));
	    }
	  }

	  return ConstantVector::get(Vals);
	}

	/// Constant fold the x86 MOVMSK intrinsics by collecting the sign bit of each
	/// vector element into the integer result.
	///
	/// \returns zero for an undef argument, the folded constant when the argument
	///          is a constant vector of integer/FP/undef elements, and nullptr
	///          otherwise (including non-vector argument types).
	static Value *simplifyX86movmsk(const IntrinsicInst &II) {
	  Value *Arg = II.getArgOperand(0);
	  Type *ResTy = II.getType();
	  Type *ArgTy = Arg->getType();

	  // movmsk(undef) -> zero as we must ensure the upper bits are zero.
	  if (isa<UndefValue>(Arg))
	    return Constant::getNullValue(ResTy);

	  // We can't easily peek through x86_mmx types.
	  if (!ArgTy->isVectorTy())
	    return nullptr;

	  auto *C = dyn_cast<Constant>(Arg);
	  if (!C)
	    return nullptr;

	  // Extract signbits of the vector input and pack into integer result.
	  APInt Result(ResTy->getPrimitiveSizeInBits(), 0);
	  for (unsigned I = 0, E = ArgTy->getVectorNumElements(); I != E; ++I) {
	    auto *COp = C->getAggregateElement(I);
	    if (!COp)
	      return nullptr;
	    // Undef elements leave their result bit clear.
	    if (isa<UndefValue>(COp))
	      continue;

	    auto *CInt = dyn_cast<ConstantInt>(COp);
	    auto *CFp = dyn_cast<ConstantFP>(COp);
	    if (!CInt && !CFp)
	      return nullptr;

	    if ((CInt && CInt->isNegative()) || (CFp && CFp->isNegative()))
	      Result.setBit(I);
	  }

	  return Constant::getIntegerValue(ResTy, Result);
	}

	/// Attempt to convert an INSERTPS intrinsic with a constant control byte into
	/// a zero vector or a generic shufflevector.
	///
	/// \returns the replacement value, or nullptr if the control byte is not a
	///          constant or the zero mask cannot be modelled as a single shuffle.
	static Value *simplifyX86insertps(const IntrinsicInst &II,
	                                  InstCombiner::BuilderTy &Builder) {
	  auto *CInt = dyn_cast<ConstantInt>(II.getArgOperand(2));
	  if (!CInt)
	    return nullptr;

	  VectorType *VecTy = cast<VectorType>(II.getType());
	  assert(VecTy->getNumElements() == 4 && "insertps with wrong vector type");

	  // The immediate permute control byte looks like this:
	  //    [3:0] - zero mask for each 32-bit lane
	  //    [5:4] - select one 32-bit destination lane
	  //    [7:6] - select one 32-bit source lane

	  uint8_t Imm = CInt->getZExtValue();
	  uint8_t ZMask = Imm & 0xf;
	  uint8_t DestLane = (Imm >> 4) & 0x3;
	  uint8_t SourceLane = (Imm >> 6) & 0x3;

	  ConstantAggregateZero *ZeroVector = ConstantAggregateZero::get(VecTy);

	  // If all zero mask bits are set, this was just a weird way to
	  // generate a zero vector.
	  if (ZMask == 0xf)
	    return ZeroVector;

	  // Initialize by passing all of the first source bits through.
	  uint32_t ShuffleMask[4] = { 0, 1, 2, 3 };

	  // We may replace the second operand with the zero vector.
	  Value *V1 = II.getArgOperand(1);

	  if (ZMask) {
	    // If the zero mask is being used with a single input or the zero mask
	    // overrides the destination lane, this is a shuffle with the zero vector.
	    if ((II.getArgOperand(0) == II.getArgOperand(1)) ||
	        (ZMask & (1 << DestLane))) {
	      V1 = ZeroVector;
	      // We may still move 32-bits of the first source vector from one lane
	      // to another.
	      ShuffleMask[DestLane] = SourceLane;
	      // The zero mask may override the previous insert operation.
	      for (unsigned i = 0; i < 4; ++i)
	        if ((ZMask >> i) & 0x1)
	          ShuffleMask[i] = i + 4;
	    } else {
	      // TODO: Model this case as 2 shuffles or a 'logical and' plus shuffle?
	      return nullptr;
	    }
	  } else {
	    // Replace the selected destination lane with the selected source lane.
	    ShuffleMask[DestLane] = SourceLane + 4;
	  }

	  return Builder.CreateShuffleVector(II.getArgOperand(0), V1, ShuffleMask);
	}

	/// Attempt to simplify SSE4A EXTRQ/EXTRQI instructions using constant folding
	/// or conversion to a shuffle vector.
	///
	/// \param II       The EXTRQ/EXTRQI intrinsic call.
	/// \param Op0      The vector operand being extracted from.
	/// \param CILength Constant field length, or null if it is not a constant.
	/// \param CIIndex  Constant bit index, or null if it is not a constant.
	/// \param Builder  Builder used to emit the replacement IR.
	/// \returns the replacement value, or nullptr if nothing can be simplified.
	static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
	                               ConstantInt *CILength, ConstantInt *CIIndex,
	                               InstCombiner::BuilderTy &Builder) {
	  // Builds the <2 x i64> constant { Val, undef }.
	  auto LowConstantHighUndef = [&](uint64_t Val) {
	    Type *IntTy64 = Type::getInt64Ty(II.getContext());
	    Constant *Args[] = {ConstantInt::get(IntTy64, Val),
	                        UndefValue::get(IntTy64)};
	    return ConstantVector::get(Args);
	  };

	  // See if we're dealing with constant values.
	  Constant *C0 = dyn_cast<Constant>(Op0);
	  ConstantInt *CI0 =
	      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
	         : nullptr;

	  // Attempt to constant fold.
	  if (CILength && CIIndex) {
	    // From AMD documentation: "The bit index and field length are each six
	    // bits in length other bits of the field are ignored."
	    APInt APIndex = CIIndex->getValue().zextOrTrunc(6);
	    APInt APLength = CILength->getValue().zextOrTrunc(6);

	    unsigned Index = APIndex.getZExtValue();

	    // From AMD documentation: "a value of zero in the field length is
	    // defined as length of 64".
	    unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

	    // From AMD documentation: "If the sum of the bit index + length field
	    // is greater than 64, the results are undefined".
	    unsigned End = Index + Length;

	    // Both fields were truncated to six bits above, so Index is at most 63
	    // and Length at most 64; their unsigned sum can never wrap around.
	    if (End > 64)
	      return UndefValue::get(II.getType());

	    // If we are extracting whole bytes, we can convert this to a shuffle.
	    // Lowering can recognize EXTRQI shuffle masks.
	    if ((Length % 8) == 0 && (Index % 8) == 0) {
	      // Convert bit indices to byte indices.
	      Length /= 8;
	      Index /= 8;

	      Type *IntTy8 = Type::getInt8Ty(II.getContext());
	      Type *IntTy32 = Type::getInt32Ty(II.getContext());
	      VectorType *ShufTy = VectorType::get(IntTy8, 16);

	      SmallVector<Constant *, 16> ShuffleMask;
	      // Extracted bytes come from the first shuffle operand (Op0).
	      for (int i = 0; i != (int)Length; ++i)
	        ShuffleMask.push_back(
	            Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
	      // The rest of the low 8 bytes come from the zero vector operand.
	      for (int i = Length; i != 8; ++i)
	        ShuffleMask.push_back(
	            Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
	      // The upper 8 bytes are undefined.
	      for (int i = 8; i != 16; ++i)
	        ShuffleMask.push_back(UndefValue::get(IntTy32));

	      Value *SV = Builder.CreateShuffleVector(
	          Builder.CreateBitCast(Op0, ShufTy),
	          ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask));
	      return Builder.CreateBitCast(SV, II.getType());
	    }

	    // Constant Fold - shift Index'th bit to lowest position and mask off
	    // Length bits.
	    if (CI0) {
	      APInt Elt = CI0->getValue();
	      Elt.lshrInPlace(Index);
	      Elt = Elt.zextOrTrunc(Length);
	      return LowConstantHighUndef(Elt.getZExtValue());
	    }

	    // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
	    if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
	      Value *Args[] = {Op0, CILength, CIIndex};
	      Module *M = II.getModule();
	      Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi);
	      return Builder.CreateCall(F, Args);
	    }
	  }

	  // Constant Fold - extraction from zero is always {zero, undef}.
	  if (CI0 && CI0->isZero())
	    return LowConstantHighUndef(0);

	  return nullptr;
	}

	/// Attempt to simplify SSE4A INSERTQ/INSERTQI instructions using constant
	/// folding or conversion to a shuffle vector.
	///
	/// \param II       The INSERTQ/INSERTQI intrinsic call.
	/// \param Op0      The vector operand being inserted into.
	/// \param Op1      The vector operand supplying the inserted bits.
	/// \param APLength Field length; only the low six bits are used.
	/// \param APIndex  Bit index; only the low six bits are used.
	/// \param Builder  Builder used to emit the replacement IR.
	/// \returns the replacement value, or nullptr if nothing can be simplified.
	static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
	                                 APInt APLength, APInt APIndex,
	                                 InstCombiner::BuilderTy &Builder) {
	  // From AMD documentation: "The bit index and field length are each six bits
	  // in length other bits of the field are ignored."
	  APIndex = APIndex.zextOrTrunc(6);
	  APLength = APLength.zextOrTrunc(6);

	  // Attempt to constant fold.
	  unsigned Index = APIndex.getZExtValue();

	  // From AMD documentation: "a value of zero in the field length is
	  // defined as length of 64".
	  unsigned Length = APLength == 0 ? 64 : APLength.getZExtValue();

	  // From AMD documentation: "If the sum of the bit index + length field
	  // is greater than 64, the results are undefined".
	  unsigned End = Index + Length;

	  // Both fields were truncated to six bits above, so Index is at most 63
	  // and Length at most 64; their unsigned sum can never wrap around.
	  if (End > 64)
	    return UndefValue::get(II.getType());

	  // If we are inserting whole bytes, we can convert this to a shuffle.
	  // Lowering can recognize INSERTQI shuffle masks.
	  if ((Length % 8) == 0 && (Index % 8) == 0) {
	    // Convert bit indices to byte indices.
	    Length /= 8;
	    Index /= 8;

	    Type *IntTy8 = Type::getInt8Ty(II.getContext());
	    Type *IntTy32 = Type::getInt32Ty(II.getContext());
	    VectorType *ShufTy = VectorType::get(IntTy8, 16);

	    SmallVector<Constant *, 16> ShuffleMask;
	    // Bytes below the insertion point come from Op0.
	    for (int i = 0; i != (int)Index; ++i)
	      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
	    // The inserted bytes come from the low bytes of Op1.
	    for (int i = 0; i != (int)Length; ++i)
	      ShuffleMask.push_back(
	          Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
	    // Remaining bytes of the low 8 come from Op0 again.
	    for (int i = Index + Length; i != 8; ++i)
	      ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
	    // The upper 8 bytes are undefined.
	    for (int i = 8; i != 16; ++i)
	      ShuffleMask.push_back(UndefValue::get(IntTy32));

	    Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
	                                            Builder.CreateBitCast(Op1, ShufTy),
	                                            ConstantVector::get(ShuffleMask));
	    return Builder.CreateBitCast(SV, II.getType());
	  }

	  // See if we're dealing with constant values.
	  Constant *C0 = dyn_cast<Constant>(Op0);
	  Constant *C1 = dyn_cast<Constant>(Op1);
	  ConstantInt *CI00 =
	      C0 ? dyn_cast_or_null<ConstantInt>(C0->getAggregateElement((unsigned)0))
	         : nullptr;
	  ConstantInt *CI10 =
	      C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
	         : nullptr;

	  // Constant Fold - insert bottom Length bits starting at the Index'th bit.
	  if (CI00 && CI10) {
	    APInt V00 = CI00->getValue();
	    APInt V10 = CI10->getValue();
	    APInt Mask = APInt::getLowBitsSet(64, Length).shl(Index);
	    V00 = V00 & ~Mask;
	    V10 = V10.zextOrTrunc(Length).zextOrTrunc(64).shl(Index);
	    APInt Val = V00 | V10;
	    Type *IntTy64 = Type::getInt64Ty(II.getContext());
	    Constant *Args[] = {ConstantInt::get(IntTy64, Val.getZExtValue()),
	                        UndefValue::get(IntTy64)};
	    return ConstantVector::get(Args);
	  }

	  // If we were an INSERTQ call, we'll save demanded elements if we convert to
	  // INSERTQI.
	  if (II.getIntrinsicID() == Intrinsic::x86_sse4a_insertq) {
	    Type *IntTy8 = Type::getInt8Ty(II.getContext());
	    Constant *CILength = ConstantInt::get(IntTy8, Length, false);
	    Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);

	    Value *Args[] = {Op0, Op1, CILength, CIIndex};
	    Module *M = II.getModule();
	    Value *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi);
	    return Builder.CreateCall(F, Args);
	  }

	  return nullptr;
	}

	/// Attempt to convert pshufb* to shufflevector if the mask is constant.
	///
	/// \returns a shufflevector of the first operand with a zero vector, or
	///          nullptr if the mask is not a constant made up of ConstantInt or
	///          undef elements.
	static Value *simplifyX86pshufb(const IntrinsicInst &II,
	                                InstCombiner::BuilderTy &Builder) {
	  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
	  if (!V)
	    return nullptr;

	  auto *VecTy = cast<VectorType>(II.getType());
	  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
	  unsigned NumElts = VecTy->getNumElements();
	  assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
	         "Unexpected number of elements in shuffle mask!");

	  // Construct a shuffle mask from constant integers or UNDEFs.
	  Constant *Indexes[64] = {nullptr};

	  // Each byte in the shuffle control mask forms an index to permute the
	  // corresponding byte in the destination operand.
	  for (unsigned I = 0; I < NumElts; ++I) {
	    Constant *COp = V->getAggregateElement(I);
	    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
	      return nullptr;

	    if (isa<UndefValue>(COp)) {
	      Indexes[I] = UndefValue::get(MaskEltTy);
	      continue;
	    }

	    // Held as a signed byte so that a set bit[7] reads as a negative value.
	    int8_t Index = cast<ConstantInt>(COp)->getValue().getZExtValue();

	    // If the most significant bit (bit[7]) of each byte of the shuffle
	    // control mask is set, then zero is written in the result byte.
	    // The zero vector is in the right-hand side of the resulting
	    // shufflevector.

	    // The value of each index for the high 128-bit lane is the least
	    // significant 4 bits of the respective shuffle control byte.
	    Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
	    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
	  }

	  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
	  auto V1 = II.getArgOperand(0);
	  auto V2 = Constant::getNullValue(VecTy);
	  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
	}

	/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
	///
	/// \returns a single-source shufflevector of the first operand, or nullptr
	///          if the mask is not a constant made up of ConstantInt or undef
	///          elements.
	static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
	                                    InstCombiner::BuilderTy &Builder) {
	  Constant *V = dyn_cast<Constant>(II.getArgOperand(1));
	  if (!V)
	    return nullptr;

	  auto *VecTy = cast<VectorType>(II.getType());
	  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
	  unsigned NumElts = VecTy->getVectorNumElements();
	  bool IsPD = VecTy->getScalarType()->isDoubleTy();
	  unsigned NumLaneElts = IsPD ? 2 : 4;
	  assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);

	  // Construct a shuffle mask from constant integers or UNDEFs.
	  Constant *Indexes[16] = {nullptr};

	  // The intrinsics only read one or two bits, clear the rest.
	  for (unsigned I = 0; I < NumElts; ++I) {
	    Constant *COp = V->getAggregateElement(I);
	    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
	      return nullptr;

	    if (isa<UndefValue>(COp)) {
	      Indexes[I] = UndefValue::get(MaskEltTy);
	      continue;
	    }

	    APInt Index = cast<ConstantInt>(COp)->getValue();
	    Index = Index.zextOrTrunc(32).getLoBits(2);

	    // The PD variants uses bit 1 to select per-lane element index, so
	    // shift down to convert to generic shuffle mask index.
	    if (IsPD)
	      Index.lshrInPlace(1);

	    // The _256 variants are a bit trickier since the mask bits always index
	    // into the corresponding 128 half. In order to convert to a generic
	    // shuffle, we have to make that explicit.
	    Index += APInt(32, (I / NumLaneElts) * NumLaneElts);

	    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
	  }

	  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
	  auto V1 = II.getArgOperand(0);
	  auto V2 = UndefValue::get(V1->getType());
	  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
	}

	/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
	///
	/// \returns a single-source shufflevector of the first operand, or nullptr
	///          if the mask is not a constant made up of ConstantInt or undef
	///          elements.
	static Value *simplifyX86vpermv(const IntrinsicInst &II,
	                                InstCombiner::BuilderTy &Builder) {
	  auto *V = dyn_cast<Constant>(II.getArgOperand(1));
	  if (!V)
	    return nullptr;

	  auto *VecTy = cast<VectorType>(II.getType());
	  auto *MaskEltTy = Type::getInt32Ty(II.getContext());
	  unsigned Size = VecTy->getNumElements();
	  assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
	         "Unexpected shuffle mask size");

	  // Construct a shuffle mask from constant integers or UNDEFs.
	  Constant *Indexes[64] = {nullptr};

	  for (unsigned I = 0; I < Size; ++I) {
	    Constant *COp = V->getAggregateElement(I);
	    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
	      return nullptr;

	    if (isa<UndefValue>(COp)) {
	      Indexes[I] = UndefValue::get(MaskEltTy);
	      continue;
	    }

	    // Size is asserted to be a power of two, so this mask keeps only the
	    // index bits that can address an element of the vector.
	    uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
	    Index &= Size - 1;
	    Indexes[I] = ConstantInt::get(MaskEltTy, Index);
	  }

	  auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size));
	  auto V1 = II.getArgOperand(0);
	  auto V2 = UndefValue::get(VecTy);
	  return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
	}

	/// Decode XOP integer vector comparison intrinsics.
	///
	/// When the immediate (third operand) is a constant, its low three bits pick
	/// the comparison: 0-3 are LT/LE/GT/GE (signed or unsigned per \p IsSigned),
	/// 4 is EQ, 5 is NE, 6 is always-false and 7 is always-true. The i1 compare
	/// result is then sign-extended (or truncated) to the intrinsic's vector type.
	/// \returns the replacement value, or nullptr if the immediate is not a
	///          constant.
	static Value *simplifyX86vpcom(const IntrinsicInst &II,
	                               InstCombiner::BuilderTy &Builder,
	                               bool IsSigned) {
	  auto *ImmOp = dyn_cast<ConstantInt>(II.getArgOperand(2));
	  if (!ImmOp)
	    return nullptr;

	  VectorType *ResultTy = cast<VectorType>(II.getType());
	  CmpInst::Predicate Predicate = ICmpInst::BAD_ICMP_PREDICATE;

	  // Only the low three bits of the immediate select the comparison.
	  switch (ImmOp->getZExtValue() & 0x7) {
	  case 0x0:
	    Predicate = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
	    break;
	  case 0x1:
	    Predicate = IsSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
	    break;
	  case 0x2:
	    Predicate = IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
	    break;
	  case 0x3:
	    Predicate = IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
	    break;
	  case 0x4:
	    Predicate = ICmpInst::ICMP_EQ;
	    break;
	  case 0x5:
	    Predicate = ICmpInst::ICMP_NE;
	    break;
	  case 0x6:
	    // Always FALSE: all bits clear.
	    return ConstantInt::getSigned(ResultTy, 0);
	  case 0x7:
	    // Always TRUE: all bits set.
	    return ConstantInt::getSigned(ResultTy, -1);
	  }

	  Value *Compare =
	      Builder.CreateICmp(Predicate, II.getArgOperand(0), II.getArgOperand(1));
	  if (!Compare)
	    return nullptr;
	  return Builder.CreateSExtOrTrunc(Compare, ResultTy);
	}

	// Emit a select instruction and appropriate bitcasts to help simplify
	// masked intrinsics.
	//
	// Mask is an integer bit mask with one bit per element; Op0 is chosen for
	// elements whose bit is set and Op1 for the others. Returns Op0 directly
	// when the mask is a constant with every used bit set.
	static Value *emitX86MaskSelect(Value *Mask, Value *Op0, Value *Op1,
	                                InstCombiner::BuilderTy &Builder) {
	  unsigned VWidth = Op0->getType()->getVectorNumElements();

	  // If the mask is all ones we don't need the select. But we need to check
	  // only the bits that will be used in case VWidth is less than 8.
	  if (auto *C = dyn_cast<ConstantInt>(Mask))
	    if (C->getValue().zextOrTrunc(VWidth).isAllOnesValue())
	      return Op0;

	  // Reinterpret the integer mask as a vector of i1, one per mask bit.
	  auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
	                         cast<IntegerType>(Mask->getType())->getBitWidth());
	  Mask = Builder.CreateBitCast(Mask, MaskTy);

	  // If we have less than 8 elements, then the starting mask was an i8 and
	  // we need to extract down to the right number of elements.
	  if (VWidth < 8) {
	    uint32_t Indices[4];
	    for (unsigned i = 0; i != VWidth; ++i)
	      Indices[i] = i;
	    Mask = Builder.CreateShuffleVector(Mask, Mask,
	                                       makeArrayRef(Indices, VWidth),
	                                       "extract");
	  }

	  return Builder.CreateSelect(Mask, Op0, Op1);
	}

	/// Simplify a call to the minnum or maxnum intrinsic to one of its operands
	/// where possible.
	///
	/// \returns the operand that the call is equivalent to, or nullptr if no
	///          simplification applies.
	static Value *simplifyMinnumMaxnum(const IntrinsicInst &II) {
	  Value *Arg0 = II.getArgOperand(0);
	  Value *Arg1 = II.getArgOperand(1);

	  // fmin(x, x) -> x
	  if (Arg0 == Arg1)
	    return Arg0;

	  const auto *C1 = dyn_cast<ConstantFP>(Arg1);

	  // fmin(x, nan) -> x
	  if (C1 && C1->isNaN())
	    return Arg0;

	  // This is the value because if undef were NaN, we would return the other
	  // value and cannot return a NaN unless both operands are.
	  //
	  // fmin(undef, x) -> x
	  if (isa<UndefValue>(Arg0))
	    return Arg1;

	  // fmin(x, undef) -> x
	  if (isa<UndefValue>(Arg1))
	    return Arg0;

	  Value *X = nullptr;
	  Value *Y = nullptr;
	  if (II.getIntrinsicID() == Intrinsic::minnum) {
	    // fmin(x, fmin(x, y)) -> fmin(x, y)
	    // fmin(y, fmin(x, y)) -> fmin(x, y)
	    if (match(Arg1, m_FMin(m_Value(X), m_Value(Y)))) {
	      if (Arg0 == X || Arg0 == Y)
	        return Arg1;
	    }

	    // fmin(fmin(x, y), x) -> fmin(x, y)
	    // fmin(fmin(x, y), y) -> fmin(x, y)
	    if (match(Arg0, m_FMin(m_Value(X), m_Value(Y)))) {
	      if (Arg1 == X || Arg1 == Y)
	        return Arg0;
	    }

	    // TODO: fmin(nnan x, inf) -> x
	    // TODO: fmin(nnan ninf x, flt_max) -> x
	    if (C1 && C1->isInfinity()) {
	      // fmin(x, -inf) -> -inf
	      if (C1->isNegative())
	        return Arg1;
	    }
	  } else {
	    assert(II.getIntrinsicID() == Intrinsic::maxnum);
	    // fmax(x, fmax(x, y)) -> fmax(x, y)
	    // fmax(y, fmax(x, y)) -> fmax(x, y)
	    if (match(Arg1, m_FMax(m_Value(X), m_Value(Y)))) {
	      if (Arg0 == X || Arg0 == Y)
	        return Arg1;
	    }

	    // fmax(fmax(x, y), x) -> fmax(x, y)
	    // fmax(fmax(x, y), y) -> fmax(x, y)
	    if (match(Arg0, m_FMax(m_Value(X), m_Value(Y)))) {
	      if (Arg1 == X || Arg1 == Y)
	        return Arg0;
	    }

	    // TODO: fmax(nnan x, -inf) -> x
	    // TODO: fmax(nnan ninf x, -flt_max) -> x
	    if (C1 && C1->isInfinity()) {
	      // fmax(x, inf) -> inf
	      if (!C1->isNegative())
	        return Arg1;
	    }
	  }
	  return nullptr;
	}

	/// Returns true if \p Mask is a constant whose elements are each either
	/// all-ones or undef; returns false for non-constant masks.
	static bool maskIsAllOneOrUndef(Value *Mask) {
	  auto *ConstMask = dyn_cast<Constant>(Mask);
	  if (!ConstMask)
	    return false;
	  // Fast path: the whole constant is all-ones or undef.
	  if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
	    return true;
	  // Otherwise inspect each element; an element that cannot be retrieved
	  // counts as a failure.
	  for (unsigned I = 0, E = ConstMask->getType()->getVectorNumElements(); I != E;
	       ++I) {
	    if (auto *MaskElt = ConstMask->getAggregateElement(I))
	      if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
	        continue;
	    return false;
	  }
	  return true;
	}

	/// Turn a masked load into an ordinary aligned vector load when every mask
	/// element is all-ones or undef.
	///
	/// Operand 0 is the pointer, operand 1 the constant alignment and operand 2
	/// the mask. \returns the new load, or nullptr if the mask does not qualify.
	static Value *simplifyMaskedLoad(const IntrinsicInst &II,
	                                 InstCombiner::BuilderTy &Builder) {
	  // Anything other than an all-ones/undef mask must stay a masked load.
	  if (!maskIsAllOneOrUndef(II.getArgOperand(2)))
	    return nullptr;

	  Value *Address = II.getArgOperand(0);
	  auto *AlignOperand = cast<ConstantInt>(II.getArgOperand(1));
	  unsigned Align = AlignOperand->getZExtValue();
	  return Builder.CreateAlignedLoad(Address, Align, "unmaskedload");
	}

	/// Simplify a masked store whose mask (operand 3) is a constant.
	///
	/// An all-zero mask erases the instruction; an all-ones mask yields a plain
	/// aligned store of operand 0 to operand 1 using the alignment in operand 2.
	/// \returns the result of the erase, the new (not yet inserted) StoreInst, or
	///          nullptr when the mask is neither constant zero nor all-ones.
	static Instruction *simplifyMaskedStore(IntrinsicInst &II, InstCombiner &IC) {
	  auto *MaskC = dyn_cast<Constant>(II.getArgOperand(3));
	  if (!MaskC)
	    return nullptr;

	  // No lane is written: the store is dead.
	  if (MaskC->isNullValue())
	    return IC.eraseInstFromFunction(II);

	  // Partially set masks are left untouched.
	  if (!MaskC->isAllOnesValue())
	    return nullptr;

	  // Every lane is written: emit an unmasked, non-volatile store.
	  Value *Address = II.getArgOperand(1);
	  auto *AlignOperand = cast<ConstantInt>(II.getArgOperand(2));
	  unsigned Align = AlignOperand->getZExtValue();
	  return new StoreInst(II.getArgOperand(0), Address, false, Align);
	}

	/// Replace a masked gather whose mask (operand 2) is constant zero with its
	/// pass-through value (operand 3). \returns nullptr otherwise.
	static Instruction *simplifyMaskedGather(IntrinsicInst &II, InstCombiner &IC) {
	  // With no lane enabled, only the "passthru" argument can be observed.
	  if (auto *MaskC = dyn_cast<Constant>(II.getArgOperand(2)))
	    if (MaskC->isNullValue())
	      return IC.replaceInstUsesWith(II, II.getArgOperand(3));

	  return nullptr;
	}

	/// Erase a masked scatter whose mask (operand 3) is constant zero.
	/// \returns the result of the erase, or nullptr if the mask is not a
	///          constant zero.
	static Instruction *simplifyMaskedScatter(IntrinsicInst &II, InstCombiner &IC) {
	  // With no lane enabled the scatter writes nothing.
	  if (auto *MaskC = dyn_cast<Constant>(II.getArgOperand(3)))
	    if (MaskC->isNullValue())
	      return IC.eraseInstFromFunction(II);

	  return nullptr;
	}

	/// Fold or annotate a cttz/ctlz intrinsic call using known-bits information.
	///
	/// In order, this tries to: replace the call with a constant when the count
	/// is fully determined, set the second operand to true when the input is
	/// known non-zero, and attach !range metadata bounding the result.
	/// \returns the changed/replacement instruction, or nullptr if nothing
	///          changed.
	static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
	  assert((II.getIntrinsicID() == Intrinsic::cttz ||
	          II.getIntrinsicID() == Intrinsic::ctlz) &&
	         "Expected cttz or ctlz intrinsic");
	  Value *Op0 = II.getArgOperand(0);

	  KnownBits Known = IC.computeKnownBits(Op0, 0, &II);

	  // Create a mask for bits above (ctlz) or below (cttz) the first known one.
	  bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
	  unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
	                                : Known.countMaxLeadingZeros();
	  unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
	                                : Known.countMinLeadingZeros();

	  // If all bits above (ctlz) or below (cttz) the first known one are known
	  // zero, this value is constant.
	  // FIXME: This should be in InstSimplify because we're replacing an
	  // instruction with a constant.
	  if (PossibleZeros == DefiniteZeros) {
	    auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
	    return IC.replaceInstUsesWith(II, C);
	  }

	  // If the input to cttz/ctlz is known to be non-zero,
	  // then change the 'ZeroIsUndef' parameter to 'true'
	  // because we know the zero behavior can't affect the result.
	  if (!Known.One.isNullValue() ||
	      isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
	                     &IC.getDominatorTree())) {
	    if (!match(II.getArgOperand(1), m_One())) {
	      II.setOperand(1, IC.Builder.getTrue());
	      return &II;
	    }
	  }

	  // Add range metadata since known bits can't completely reflect what we know.
	  // TODO: Handle splat vectors.
	  auto *IT = dyn_cast<IntegerType>(Op0->getType());
	  if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
	    Metadata *LowAndHigh[] = {
	        ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
	        ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
	    II.setMetadata(LLVMContext::MD_range,
	                   MDNode::get(II.getContext(), LowAndHigh));
	    return &II;
	  }

	  return nullptr;
	}

	/// Attach !range metadata to a scalar ctpop call, bounded by the minimum and
	/// maximum population counts that known-bits analysis allows.
	///
	/// \returns &II if metadata was added, nullptr otherwise (vector operand,
	///          i1 operand, or range metadata already present).
	static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
	  assert(II.getIntrinsicID() == Intrinsic::ctpop &&
	         "Expected ctpop intrinsic");
	  Value *Operand = II.getArgOperand(0);

	  // FIXME: Try to simplify vectors of integers.
	  auto *IntTy = dyn_cast<IntegerType>(Operand->getType());
	  if (!IntTy)
	    return nullptr;

	  KnownBits Known(IntTy->getBitWidth());
	  IC.computeKnownBits(Operand, Known, 0, &II);

	  unsigned FewestOnes = Known.countMinPopulation();
	  unsigned MostOnes = Known.countMaxPopulation();

	  // Nothing to add for i1, or when a range has already been recorded.
	  if (IntTy->getBitWidth() == 1 || II.getMetadata(LLVMContext::MD_range))
	    return nullptr;

	  // Known bits alone can't express this bound, so record it as metadata
	  // with low bound FewestOnes and high bound MostOnes + 1.
	  Metadata *Bounds[] = {
	      ConstantAsMetadata::get(ConstantInt::get(IntTy, FewestOnes)),
	      ConstantAsMetadata::get(ConstantInt::get(IntTy, MostOnes + 1))};
	  II.setMetadata(LLVMContext::MD_range, MDNode::get(II.getContext(), Bounds));
	  return &II;
	}

	// TODO: If the x86 backend knew how to convert a bool vector mask back to an
	// XMM register mask efficiently, we could transform all x86 masked intrinsics
	// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
	//
	// Simplify an x86 masked load whose mask (operand 1) is constant: a zero mask
	// yields a zero vector, and any other ConstantDataVector mask is rewritten as
	// the generic LLVM masked load. Returns nullptr for other masks.
	static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
	  Value *MaskOp = II.getOperand(1);
	  // An x86 masked load produces zero in disabled lanes, so the zero vector
	  // doubles as the pass-through operand of the generic intrinsic.
	  Constant *ZeroResult = Constant::getNullValue(II.getType());

	  // A zero mask is not a ConstantDataVector, so handle it up front: the load
	  // simply produces a zero vector.
	  if (isa<ConstantAggregateZero>(MaskOp))
	    return IC.replaceInstUsesWith(II, ZeroResult);

	  auto *MaskCDV = dyn_cast<ConstantDataVector>(MaskOp);
	  if (!MaskCDV)
	    return nullptr;

	  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
	  // to allow target-independent optimizations.

	  // Step 1: the x86 intrinsic takes a scalar pointer, while the LLVM
	  // intrinsic wants a pointer to the vector type in the same address space.
	  Value *Address = II.getOperand(0);
	  unsigned AS = cast<PointerType>(Address->getType())->getAddressSpace();
	  PointerType *VectorPtrTy = PointerType::get(II.getType(), AS);
	  Value *VectorPtr = IC.Builder.CreateBitCast(Address, VectorPtrTy, "castvec");

	  // Step 2: turn the XMM integer mask into a vector of bools keyed on each
	  // element's most significant bit (the sign bit).
	  Constant *LaneEnabled = getNegativeIsTrueBoolVec(MaskCDV);

	  CallInst *GenericLoad =
	      IC.Builder.CreateMaskedLoad(VectorPtr, 1, LaneEnabled, ZeroResult);
	  return IC.replaceInstUsesWith(II, GenericLoad);
	}

	// TODO: If the x86 backend knew how to convert a bool vector mask back to an
	// XMM register mask efficiently, we could transform all x86 masked intrinsics
	// to LLVM masked intrinsics and remove the x86 masked intrinsic defs.
	//
	// Simplify an x86 masked store whose mask (operand 1) is constant: a zero
	// mask erases the store, and any other ConstantDataVector mask is rewritten
	// as the generic LLVM masked store (except for the SSE2 maskmov_dqu form).
	// Returns true if the original instruction was erased.
	static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
	  Value *MaskOp = II.getOperand(1);

	  // A zero mask is not a ConstantDataVector, so handle it up front: the
	  // store writes nothing and can be dropped.
	  if (isa<ConstantAggregateZero>(MaskOp)) {
	    IC.eraseInstFromFunction(II);
	    return true;
	  }

	  // The SSE2 version is too weird (eg, unaligned but non-temporal) to do
	  // anything else at this level.
	  if (II.getIntrinsicID() == Intrinsic::x86_sse2_maskmov_dqu)
	    return false;

	  auto *MaskCDV = dyn_cast<ConstantDataVector>(MaskOp);
	  if (!MaskCDV)
	    return false;

	  // The mask is constant. Convert this x86 intrinsic to the LLVM intrinsic
	  // to allow target-independent optimizations.

	  // Step 1: the x86 intrinsic takes a scalar pointer, while the LLVM
	  // intrinsic wants a pointer to the stored vector type in the same address
	  // space.
	  Value *Address = II.getOperand(0);
	  Value *StoredVec = II.getOperand(2);
	  unsigned AS = cast<PointerType>(Address->getType())->getAddressSpace();
	  PointerType *VectorPtrTy = PointerType::get(StoredVec->getType(), AS);
	  Value *VectorPtr = IC.Builder.CreateBitCast(Address, VectorPtrTy, "castvec");

	  // Step 2: turn the XMM integer mask into a vector of bools keyed on each
	  // element's most significant bit (the sign bit).
	  Constant *LaneEnabled = getNegativeIsTrueBoolVec(MaskCDV);

	  IC.Builder.CreateMaskedStore(StoredVec, VectorPtr, 1, LaneEnabled);

	  // 'Replace uses' doesn't work for stores. Erase the original masked store.
	  IC.eraseInstFromFunction(II);
	  return true;
	}

	// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
	//
	// The median is found by locating which source equals the overall maximum
	// and returning the larger of the other two sources.
	//
	// A single NaN input is folded to minnum, so we rely on that folding for
	// handling NaNs.
	static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
	                           const APFloat &Src2) {
	  APFloat Largest = maxnum(maxnum(Src0, Src1), Src2);

	  // Src0 is the maximum: the median is the larger of the remaining pair.
	  APFloat::cmpResult AgainstSrc0 = Largest.compare(Src0);
	  assert(AgainstSrc0 != APFloat::cmpUnordered && "nans handled separately");
	  if (AgainstSrc0 == APFloat::cmpEqual)
	    return maxnum(Src1, Src2);

	  // Src1 is the maximum; otherwise it must be Src2.
	  APFloat::cmpResult AgainstSrc1 = Largest.compare(Src1);
	  assert(AgainstSrc1 != APFloat::cmpUnordered && "nans handled separately");
	  return AgainstSrc1 == APFloat::cmpEqual ? maxnum(Src0, Src2)
	                                          : maxnum(Src0, Src1);
	}

	// Returns true iff the first NumOperands argument operands of the two
	// intrinsic calls are pairwise identical (compared by pointer). Both calls
	// must have at least NumOperands arguments.
	static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
	                             unsigned NumOperands) {
	  assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
	  assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");

	  // Advance past the common prefix; equality holds iff it spans all operands.
	  unsigned Matched = 0;
	  while (Matched != NumOperands &&
	         I.getArgOperand(Matched) == E.getArgOperand(Matched))
	    ++Matched;
	  return Matched == NumOperands;
	}

	// Remove trivially empty start/end intrinsic ranges, i.e. a start
	// immediately followed by an end (ignoring debuginfo or other
	// start/end intrinsics in between). As this handles only the most trivial
	// cases, tracking the nesting level is not needed:
	//
	//   call @llvm.foo.start(i1 0) ; &I
	//   call @llvm.foo.start(i1 0)
	//   call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed
	//   call @llvm.foo.end(i1 0)
	//
	// Returns true if both the start intrinsic I and its matching end intrinsic
	// were erased, false if nothing was changed.
	static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID,
	                                      unsigned EndID, InstCombiner &IC) {
	  assert(I.getIntrinsicID() == StartID &&
	         "Start intrinsic does not have expected ID");
	  BasicBlock::iterator BI(I), BE(I.getParent()->end());
	  for (++BI; BI != BE; ++BI) {
	    if (auto *E = dyn_cast<IntrinsicInst>(BI)) {
	      // Debug info and further start intrinsics do not end the scan.
	      if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID)
	        continue;
	      if (E->getIntrinsicID() == EndID &&
	          haveSameOperands(I, *E, E->getNumArgOperands())) {
	        IC.eraseInstFromFunction(*E);
	        IC.eraseInstFromFunction(I);
	        return true;
	      }
	    }
	    // Any other instruction means the range is not trivially empty.
	    break;
	  }

	  return false;
	}

	// Convert NVVM intrinsics to target-generic LLVM code where possible.
	//
	// II is the intrinsic call to inspect. Returns a newly created replacement
	// instruction (created here without an insertion position and handed back to
	// the caller), or nullptr when II is not an NVVM intrinsic handled below or
	// when the enclosing function's "nvptx-f32ftz" attribute does not match what
	// the transformation requires. IC is currently unused.
	static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
	  // Each NVVM intrinsic we can simplify can be replaced with one of:
	  //
	  //  * an LLVM intrinsic,
	  //  * an LLVM cast operation,
	  //  * an LLVM binary operation, or
	  //  * ad-hoc LLVM IR for the particular operation.

	  // Some transformations are only valid when the module's
	  // flush-denormals-to-zero (ftz) setting is true/false, whereas other
	  // transformations are valid regardless of the module's ftz setting.
	  enum FtzRequirementTy {
	    FTZ_Any,       // Any ftz setting is ok.
	    FTZ_MustBeOn,  // Transformation is valid only if ftz is on.
	    FTZ_MustBeOff, // Transformation is valid only if ftz is off.
	  };
	  // Classes of NVVM intrinsics that can't be replaced one-to-one with a
	  // target-generic intrinsic, cast op, or binary op but that we can nonetheless
	  // simplify.
	  enum SpecialCase {
	    SPC_Reciprocal,
	  };

	  // SimplifyAction is a poor-man's variant (plus an additional flag) that
	  // represents how to replace an NVVM intrinsic with target-generic LLVM IR.
	  struct SimplifyAction {
	    // Invariant: At most one of these Optionals has a value.
	    Optional<Intrinsic::ID> IID;
	    Optional<Instruction::CastOps> CastOp;
	    Optional<Instruction::BinaryOps> BinaryOp;
	    Optional<SpecialCase> Special;

	    FtzRequirementTy FtzRequirement = FTZ_Any;

	    SimplifyAction() = default;

	    SimplifyAction(Intrinsic::ID IID, FtzRequirementTy FtzReq)
	        : IID(IID), FtzRequirement(FtzReq) {}

	    // Cast operations don't have anything to do with FTZ, so we skip that
	    // argument.
	    SimplifyAction(Instruction::CastOps CastOp) : CastOp(CastOp) {}

	    SimplifyAction(Instruction::BinaryOps BinaryOp, FtzRequirementTy FtzReq)
	        : BinaryOp(BinaryOp), FtzRequirement(FtzReq) {}

	    SimplifyAction(SpecialCase Special, FtzRequirementTy FtzReq)
	        : Special(Special), FtzRequirement(FtzReq) {}
	  };

	  // Try to generate a SimplifyAction describing how to replace our
	  // IntrinsicInst with target-generic LLVM IR.
	  const SimplifyAction Action = [II]() -> SimplifyAction {
	    switch (II->getIntrinsicID()) {
	    // NVVM intrinsics that map directly to LLVM intrinsics.
	    case Intrinsic::nvvm_ceil_d:
	      return {Intrinsic::ceil, FTZ_Any};
	    case Intrinsic::nvvm_ceil_f:
	      return {Intrinsic::ceil, FTZ_MustBeOff};
	    case Intrinsic::nvvm_ceil_ftz_f:
	      return {Intrinsic::ceil, FTZ_MustBeOn};
	    case Intrinsic::nvvm_fabs_d:
	      return {Intrinsic::fabs, FTZ_Any};
	    case Intrinsic::nvvm_fabs_f:
	      return {Intrinsic::fabs, FTZ_MustBeOff};
	    case Intrinsic::nvvm_fabs_ftz_f:
	      return {Intrinsic::fabs, FTZ_MustBeOn};
	    case Intrinsic::nvvm_floor_d:
	      return {Intrinsic::floor, FTZ_Any};
	    case Intrinsic::nvvm_floor_f:
	      return {Intrinsic::floor, FTZ_MustBeOff};
	    case Intrinsic::nvvm_floor_ftz_f:
	      return {Intrinsic::floor, FTZ_MustBeOn};
	    case Intrinsic::nvvm_fma_rn_d:
	      return {Intrinsic::fma, FTZ_Any};
	    case Intrinsic::nvvm_fma_rn_f:
	      return {Intrinsic::fma, FTZ_MustBeOff};
	    case Intrinsic::nvvm_fma_rn_ftz_f:
	      return {Intrinsic::fma, FTZ_MustBeOn};
	    case Intrinsic::nvvm_fmax_d:
	      return {Intrinsic::maxnum, FTZ_Any};
	    case Intrinsic::nvvm_fmax_f:
	      return {Intrinsic::maxnum, FTZ_MustBeOff};
	    case Intrinsic::nvvm_fmax_ftz_f:
	      return {Intrinsic::maxnum, FTZ_MustBeOn};
	    case Intrinsic::nvvm_fmin_d:
	      return {Intrinsic::minnum, FTZ_Any};
	    case Intrinsic::nvvm_fmin_f:
	      return {Intrinsic::minnum, FTZ_MustBeOff};
	    case Intrinsic::nvvm_fmin_ftz_f:
	      return {Intrinsic::minnum, FTZ_MustBeOn};
	    case Intrinsic::nvvm_round_d:
	      return {Intrinsic::round, FTZ_Any};
	    case Intrinsic::nvvm_round_f:
	      return {Intrinsic::round, FTZ_MustBeOff};
	    case Intrinsic::nvvm_round_ftz_f:
	      return {Intrinsic::round, FTZ_MustBeOn};
	    case Intrinsic::nvvm_sqrt_rn_d:
	      return {Intrinsic::sqrt, FTZ_Any};
	    case Intrinsic::nvvm_sqrt_f:
	      // nvvm_sqrt_f is a special case. For most intrinsics, foo_ftz_f is the
	      // ftz version, and foo_f is the non-ftz version. But nvvm_sqrt_f adopts
	      // the ftz-ness of the surrounding code. sqrt_rn_f and sqrt_rn_ftz_f are
	      // the versions with explicit ftz-ness.
	      return {Intrinsic::sqrt, FTZ_Any};
	    case Intrinsic::nvvm_sqrt_rn_f:
	      return {Intrinsic::sqrt, FTZ_MustBeOff};
	    case Intrinsic::nvvm_sqrt_rn_ftz_f:
	      return {Intrinsic::sqrt, FTZ_MustBeOn};
	    case Intrinsic::nvvm_trunc_d:
	      return {Intrinsic::trunc, FTZ_Any};
	    case Intrinsic::nvvm_trunc_f:
	      return {Intrinsic::trunc, FTZ_MustBeOff};
	    case Intrinsic::nvvm_trunc_ftz_f:
	      return {Intrinsic::trunc, FTZ_MustBeOn};

	    // NVVM intrinsics that map to LLVM cast operations.
	    //
	    // Note that llvm's target-generic conversion operators correspond to the rz
	    // (round to zero) versions of the nvvm conversion intrinsics, even though
	    // most everything else here uses the rn (round to nearest even) nvvm ops.
	    case Intrinsic::nvvm_d2i_rz:
	    case Intrinsic::nvvm_f2i_rz:
	    case Intrinsic::nvvm_d2ll_rz:
	    case Intrinsic::nvvm_f2ll_rz:
	      return {Instruction::FPToSI};
	    case Intrinsic::nvvm_d2ui_rz:
	    case Intrinsic::nvvm_f2ui_rz:
	    case Intrinsic::nvvm_d2ull_rz:
	    case Intrinsic::nvvm_f2ull_rz:
	      return {Instruction::FPToUI};
	    case Intrinsic::nvvm_i2d_rz:
	    case Intrinsic::nvvm_i2f_rz:
	    case Intrinsic::nvvm_ll2d_rz:
	    case Intrinsic::nvvm_ll2f_rz:
	      return {Instruction::SIToFP};
	    case Intrinsic::nvvm_ui2d_rz:
	    case Intrinsic::nvvm_ui2f_rz:
	    case Intrinsic::nvvm_ull2d_rz:
	    case Intrinsic::nvvm_ull2f_rz:
	      return {Instruction::UIToFP};

	    // NVVM intrinsics that map to LLVM binary ops.
	    case Intrinsic::nvvm_add_rn_d:
	      return {Instruction::FAdd, FTZ_Any};
	    case Intrinsic::nvvm_add_rn_f:
	      return {Instruction::FAdd, FTZ_MustBeOff};
	    case Intrinsic::nvvm_add_rn_ftz_f:
	      return {Instruction::FAdd, FTZ_MustBeOn};
	    case Intrinsic::nvvm_mul_rn_d:
	      return {Instruction::FMul, FTZ_Any};
	    case Intrinsic::nvvm_mul_rn_f:
	      return {Instruction::FMul, FTZ_MustBeOff};
	    case Intrinsic::nvvm_mul_rn_ftz_f:
	      return {Instruction::FMul, FTZ_MustBeOn};
	    case Intrinsic::nvvm_div_rn_d:
	      return {Instruction::FDiv, FTZ_Any};
	    case Intrinsic::nvvm_div_rn_f:
	      return {Instruction::FDiv, FTZ_MustBeOff};
	    case Intrinsic::nvvm_div_rn_ftz_f:
	      return {Instruction::FDiv, FTZ_MustBeOn};

	    // The remainder of cases are NVVM intrinsics that map to LLVM idioms, but
	    // need special handling.
	    //
	    // We seem to be missing intrinsics for rcp.approx.{ftz.}f32, which is just
	    // as well.
	    case Intrinsic::nvvm_rcp_rn_d:
	      return {SPC_Reciprocal, FTZ_Any};
	    case Intrinsic::nvvm_rcp_rn_f:
	      return {SPC_Reciprocal, FTZ_MustBeOff};
	    case Intrinsic::nvvm_rcp_rn_ftz_f:
	      return {SPC_Reciprocal, FTZ_MustBeOn};

	    // We do not currently simplify intrinsics that give an approximate answer.
	    // These include:
	    //
	    //   - nvvm_cos_approx_{f,ftz_f}
	    //   - nvvm_ex2_approx_{d,f,ftz_f}
	    //   - nvvm_lg2_approx_{d,f,ftz_f}
	    //   - nvvm_sin_approx_{f,ftz_f}
	    //   - nvvm_sqrt_approx_{f,ftz_f}
	    //   - nvvm_rsqrt_approx_{d,f,ftz_f}
	    //   - nvvm_div_approx_{ftz_d,ftz_f,f}
	    //   - nvvm_rcp_approx_ftz_d
	    //
	    // Ideally we'd encode them as e.g. "fast call @llvm.cos", where "fast"
	    // means that fastmath is enabled in the intrinsic. Unfortunately only
	    // binary operators (currently) have a fastmath bit in SelectionDAG, so this
	    // information gets lost and we can't select on it.
	    //
	    // TODO: div and rcp are lowered to a binary op, so these we could in theory
	    // lower them to "fast fdiv".

	    default:
	      return {};
	    }
	  }();

	  // If Action.FtzRequirement is not satisfied by the function's ftz state, we
	  // can bail out now. (Notice that in the case that II is not an NVVM
	  // intrinsic handled above, we never query the function's attributes, as
	  // FtzRequirement will be FTZ_Any.)
	  if (Action.FtzRequirement != FTZ_Any) {
	    bool FtzEnabled =
	        II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() ==
	        "true";

	    if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
	      return nullptr;
	  }

	  // Simplify to target-generic intrinsic.
	  if (Action.IID) {
	    SmallVector<Value *, 4> Args(II->arg_operands());
	    // All the target-generic intrinsics currently of interest to us have one
	    // type argument, equal to that of the nvvm intrinsic's argument.
	    Type *Tys[] = {II->getArgOperand(0)->getType()};
	    return CallInst::Create(
	        Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args);
	  }

	  // Simplify to target-generic binary op.
	  if (Action.BinaryOp)
	    return BinaryOperator::Create(*Action.BinaryOp, II->getArgOperand(0),
	                                  II->getArgOperand(1), II->getName());

	  // Simplify to target-generic cast op.
	  if (Action.CastOp)
	    return CastInst::Create(*Action.CastOp, II->getArgOperand(0), II->getType(),
	                            II->getName());

	  // All that's left are the special cases.
	  if (!Action.Special)
	    return nullptr;

	  switch (*Action.Special) {
	  case SPC_Reciprocal:
	    // Simplify reciprocal: rcp(x) becomes 1 / x.
	    return BinaryOperator::Create(
	        Instruction::FDiv, ConstantFP::get(II->getArgOperand(0)->getType(), 1),
	        II->getArgOperand(0), II->getName());
	  }
	  llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
	}

	/// Visitor for va_start calls: asks removeTriviallyEmptyRange to erase a
	/// vastart that is directly paired with a vaend. Always returns nullptr,
	/// whether or not anything was erased.
	Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) {
	  const unsigned RangeBegin = Intrinsic::vastart;
	  const unsigned RangeEnd = Intrinsic::vaend;
	  // The helper's boolean result is not used here.
	  (void)removeTriviallyEmptyRange(I, RangeBegin, RangeEnd, *this);
	  return nullptr;
	}

	/// Visitor for va_copy calls: asks removeTriviallyEmptyRange to erase a
	/// vacopy that is directly paired with a vaend. Always returns nullptr,
	/// whether or not anything was erased.
	Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) {
	  const unsigned RangeBegin = Intrinsic::vacopy;
	  const unsigned RangeEnd = Intrinsic::vaend;
	  // The helper's boolean result is not used here.
	  (void)removeTriviallyEmptyRange(I, RangeBegin, RangeEnd, *this);
	  return nullptr;
	}

	/// CallInst simplification. This mostly only handles folding of intrinsic
	/// instructions. For normal calls, it allows visitCallSite to do the heavy
	/// lifting.
	Instruction *InstCombiner::visitCallInst(CallInst &CI) {
	if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI)))
	return replaceInstUsesWith(CI, V);

	if (isFreeCall(&CI, &TLI))
	return visitFree(CI);

	// If the caller function is nounwind, mark the call as nounwind, even if the
	// callee isn't.
	if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
	CI.setDoesNotThrow();
	return &CI;
	}

	IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
	if (!II) return visitCallSite(&CI);

	// Intrinsics cannot occur in an invoke, so handle them here instead of in
	// visitCallSite.
	if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
	bool Changed = false;

	// memmove/cpy/set of zero bytes is a noop.
	if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
	if (NumBytes->isNullValue())
	return eraseInstFromFunction(CI);

	if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
	if (CI->getZExtValue() == 1) {
	// Replace the instruction with just byte operations. We would
	// transform other cases to loads/stores, but we don't know if
	// alignment is sufficient.
	}
	}

	// No other transformations apply to volatile transfers.
	if (MI->isVolatile())
	return nullptr;

	// If we have a memmove and the source operation is a constant global,
	// then the source and dest pointers can't alias, so we can change this
	// into a call to memcpy.
	if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
	if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
	if (GVSrc->isConstant()) {
	Module *M = CI.getModule();
	Intrinsic::ID MemCpyID = Intrinsic::memcpy;
	Type *Tys[3] = { CI.getArgOperand(0)->getType(),
	CI.getArgOperand(1)->getType(),
	CI.getArgOperand(2)->getType() };
	CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));
	Changed = true;
	}
	}

	if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
	// memmove(x,x,size) -> noop.
	if (MTI->getSource() == MTI->getDest())
	return eraseInstFromFunction(CI);
	}

	// If we can determine a pointer alignment that is bigger than currently
	// set, update the alignment.
	if (isa<MemTransferInst>(MI)) {
	if (Instruction *I = SimplifyMemTransfer(MI))
	return I;
	} else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
	if (Instruction *I = SimplifyMemSet(MSI))
	return I;
	}

	if (Changed) return II;
	}

	if (auto *AMI = dyn_cast<AtomicMemCpyInst>(II)) {
	if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
	if (C->isNullValue())
	return eraseInstFromFunction(*AMI);

	if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
	return I;
	}

	if (Instruction I = SimplifyNVVMIntrinsic(II, this))
	return I;

	auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
	unsigned DemandedWidth) {
	APInt UndefElts(Width, 0);
	APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
	return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
	};

	switch (II->getIntrinsicID()) {
	default: break;
	case Intrinsic::objectsize:
	if (ConstantInt *N =
	lowerObjectSizeCall(II, DL, &TLI, /MustSucceed=/false))
	return replaceInstUsesWith(CI, N);
	return nullptr;
	case Intrinsic::bswap: {
	Value *IIOperand = II->getArgOperand(0);
	Value *X = nullptr;

	// bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
	if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
	unsigned C = X->getType()->getPrimitiveSizeInBits() -
	IIOperand->getType()->getPrimitiveSizeInBits();
	Value *CV = ConstantInt::get(X->getType(), C);
	Value *V = Builder.CreateLShr(X, CV);
	return new TruncInst(V, IIOperand->getType());
	}
	break;
	}
	case Intrinsic::masked_load:
	if (Value SimplifiedMaskedOp = simplifyMaskedLoad(II, Builder))
	return replaceInstUsesWith(CI, SimplifiedMaskedOp);
	break;
	case Intrinsic::masked_store:
	return simplifyMaskedStore(II, this);
	case Intrinsic::masked_gather:
	return simplifyMaskedGather(II, this);
	case Intrinsic::masked_scatter:
	return simplifyMaskedScatter(II, this);

	case Intrinsic::powi:
	if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
	// 0 and 1 are handled in instsimplify

	// powi(x, -1) -> 1/x
	if (Power->isMinusOne())
	return BinaryOperator::CreateFDiv(ConstantFP::get(CI.getType(), 1.0),
	II->getArgOperand(0));
	// powi(x, 2) -> x*x
	if (Power->equalsInt(2))
	return BinaryOperator::CreateFMul(II->getArgOperand(0),
	II->getArgOperand(0));
	}
	break;

	case Intrinsic::cttz:
	case Intrinsic::ctlz:
	if (auto I = foldCttzCtlz(II, *this))
	return I;
	break;

	case Intrinsic::ctpop:
	if (auto I = foldCtpop(II, *this))
	return I;
	break;

	case Intrinsic::uadd_with_overflow:
	case Intrinsic::sadd_with_overflow:
	case Intrinsic::umul_with_overflow:
	case Intrinsic::smul_with_overflow:
	if (isa<Constant>(II->getArgOperand(0)) &&
	!isa<Constant>(II->getArgOperand(1))) {
	// Canonicalize constants into the RHS.
	Value *LHS = II->getArgOperand(0);
	II->setArgOperand(0, II->getArgOperand(1));
	II->setArgOperand(1, LHS);
	return II;
	}
	LLVM_FALLTHROUGH;

	case Intrinsic::usub_with_overflow:
	case Intrinsic::ssub_with_overflow: {
	OverflowCheckFlavor OCF =
	IntrinsicIDToOverflowCheckFlavor(II->getIntrinsicID());
	assert(OCF != OCF_INVALID && "unexpected!");

	Value *OperationResult = nullptr;
	Constant *OverflowResult = nullptr;
	if (OptimizeOverflowCheck(OCF, II->getArgOperand(0), II->getArgOperand(1),
	*II, OperationResult, OverflowResult))
	return CreateOverflowTuple(II, OperationResult, OverflowResult);

	break;
	}

	case Intrinsic::minnum:
	case Intrinsic::maxnum: {
	Value *Arg0 = II->getArgOperand(0);
	Value *Arg1 = II->getArgOperand(1);
	// Canonicalize constants to the RHS.
	if (isa<ConstantFP>(Arg0) && !isa<ConstantFP>(Arg1)) {
	II->setArgOperand(0, Arg1);
	II->setArgOperand(1, Arg0);
	return II;
	}
	if (Value V = simplifyMinnumMaxnum(II))
	return replaceInstUsesWith(*II, V);
	break;
	}
	case Intrinsic::fmuladd: {
	// Canonicalize fast fmuladd to the separate fmul + fadd.
	if (II->isFast()) {
	BuilderTy::FastMathFlagGuard Guard(Builder);
	Builder.setFastMathFlags(II->getFastMathFlags());
	Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
	II->getArgOperand(1));
	Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2));
	Add->takeName(II);
	return replaceInstUsesWith(*II, Add);
	}

	LLVM_FALLTHROUGH;
	}
	case Intrinsic::fma: {
	Value *Src0 = II->getArgOperand(0);
	Value *Src1 = II->getArgOperand(1);

	// Canonicalize constants into the RHS.
	if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
	II->setArgOperand(0, Src1);
	II->setArgOperand(1, Src0);
	std::swap(Src0, Src1);
	}

	Value *LHS = nullptr;
	Value *RHS = nullptr;

	// fma fneg(x), fneg(y), z -> fma x, y, z
	if (match(Src0, m_FNeg(m_Value(LHS))) &&
	match(Src1, m_FNeg(m_Value(RHS)))) {
	II->setArgOperand(0, LHS);
	II->setArgOperand(1, RHS);
	return II;
	}

	// fma fabs(x), fabs(x), z -> fma x, x, z
	if (match(Src0, m_Intrinsic<Intrinsic::fabs>(m_Value(LHS))) &&
	match(Src1, m_Intrinsic<Intrinsic::fabs>(m_Value(RHS))) && LHS == RHS) {
	II->setArgOperand(0, LHS);
	II->setArgOperand(1, RHS);
	return II;
	}

	// fma x, 1, z -> fadd x, z
	if (match(Src1, m_FPOne())) {
	Instruction *RI = BinaryOperator::CreateFAdd(Src0, II->getArgOperand(2));
	RI->copyFastMathFlags(II);
	return RI;
	}

	break;
	}
	case Intrinsic::fabs: {
	Value *Cond;
	Constant LHS, RHS;
	if (match(II->getArgOperand(0),
	m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) {
	CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS});
	CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS});
	return SelectInst::Create(Cond, Call0, Call1);
	}

	LLVM_FALLTHROUGH;
	}
	case Intrinsic::ceil:
	case Intrinsic::floor:
	case Intrinsic::round:
	case Intrinsic::nearbyint:
	case Intrinsic::rint:
	case Intrinsic::trunc: {
	Value *ExtSrc;
	if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) &&
	II->getArgOperand(0)->hasOneUse()) {
	// fabs (fpext x) -> fpext (fabs x)
	Value *F = Intrinsic::getDeclaration(II->getModule(), II->getIntrinsicID(),
	{ ExtSrc->getType() });
	CallInst *NewFabs = Builder.CreateCall(F, ExtSrc);
	NewFabs->copyFastMathFlags(II);
	NewFabs->takeName(II);
	return new FPExtInst(NewFabs, II->getType());
	}

	break;
	}
	case Intrinsic::cos:
	case Intrinsic::amdgcn_cos: {
	Value *SrcSrc;
	Value *Src = II->getArgOperand(0);
	if (match(Src, m_FNeg(m_Value(SrcSrc))) \|\|
	match(Src, m_Intrinsic<Intrinsic::fabs>(m_Value(SrcSrc)))) {
	// cos(-x) -> cos(x)
	// cos(fabs(x)) -> cos(x)
	II->setArgOperand(0, SrcSrc);
	return II;
	}

	break;
	}
	case Intrinsic::ppc_altivec_lvx:
	case Intrinsic::ppc_altivec_lvxl:
	// Turn PPC lvx -> load if the pointer is known aligned.
	if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
	&DT) >= 16) {
	Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
	PointerType::getUnqual(II->getType()));
	return new LoadInst(Ptr);
	}
	break;
	case Intrinsic::ppc_vsx_lxvw4x:
	case Intrinsic::ppc_vsx_lxvd2x: {
	// Turn PPC VSX loads into normal loads.
	Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
	PointerType::getUnqual(II->getType()));
	return new LoadInst(Ptr, Twine(""), false, 1);
	}
	case Intrinsic::ppc_altivec_stvx:
	case Intrinsic::ppc_altivec_stvxl:
	// Turn stvx -> store if the pointer is known aligned.
	if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
	&DT) >= 16) {
	Type *OpPtrTy =
	PointerType::getUnqual(II->getArgOperand(0)->getType());
	Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
	return new StoreInst(II->getArgOperand(0), Ptr);
	}
	break;
	case Intrinsic::ppc_vsx_stxvw4x:
	case Intrinsic::ppc_vsx_stxvd2x: {
	// Turn PPC VSX stores into normal stores.
	Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
	Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
	return new StoreInst(II->getArgOperand(0), Ptr, false, 1);
	}
	case Intrinsic::ppc_qpx_qvlfs:
	// Turn PPC QPX qvlfs -> load if the pointer is known aligned.
	if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
	&DT) >= 16) {
	Type *VTy = VectorType::get(Builder.getFloatTy(),
	II->getType()->getVectorNumElements());
	Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
	PointerType::getUnqual(VTy));
	Value *Load = Builder.CreateLoad(Ptr);
	return new FPExtInst(Load, II->getType());
	}
	break;
	case Intrinsic::ppc_qpx_qvlfd:
	// Turn PPC QPX qvlfd -> load if the pointer is known aligned.
	if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC,
	&DT) >= 32) {
	Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
	PointerType::getUnqual(II->getType()));
	return new LoadInst(Ptr);
	}
	break;
	case Intrinsic::ppc_qpx_qvstfs:
	// Turn PPC QPX qvstfs -> store if the pointer is known aligned.
	if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
	&DT) >= 16) {
	Type *VTy = VectorType::get(Builder.getFloatTy(),
	II->getArgOperand(0)->getType()->getVectorNumElements());
	Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy);
	Type *OpPtrTy = PointerType::getUnqual(VTy);
	Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
	return new StoreInst(TOp, Ptr);
	}
	break;
	case Intrinsic::ppc_qpx_qvstfd:
	// Turn PPC QPX qvstfd -> store if the pointer is known aligned.
	if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC,
	&DT) >= 32) {
	Type *OpPtrTy =
	PointerType::getUnqual(II->getArgOperand(0)->getType());
	Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
	return new StoreInst(II->getArgOperand(0), Ptr);
	}
	break;

	case Intrinsic::x86_bmi_bextr_32:
	case Intrinsic::x86_bmi_bextr_64:
	case Intrinsic::x86_tbm_bextri_u32:
	case Intrinsic::x86_tbm_bextri_u64:
	// If the RHS is a constant we can try some simplifications.
	if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
	uint64_t Shift = C->getZExtValue();
	uint64_t Length = (Shift >> 8) & 0xff;
	Shift &= 0xff;
	unsigned BitWidth = II->getType()->getIntegerBitWidth();
	// If the length is 0 or the shift is out of range, replace with zero.
	if (Length == 0 \|\| Shift >= BitWidth)
	return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
	// If the LHS is also a constant, we can completely constant fold this.
	if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
	uint64_t Result = InC->getZExtValue() >> Shift;
	if (Length > BitWidth)
	Length = BitWidth;
	Result &= maskTrailingOnes<uint64_t>(Length);
	return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
	}
	// TODO should we turn this into 'and' if shift is 0? Or 'shl' if we
	// are only masking bits that a shift already cleared?
	}
	break;

	case Intrinsic::x86_bmi_bzhi_32:
	case Intrinsic::x86_bmi_bzhi_64:
	// If the RHS is a constant we can try some simplifications.
	if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
	uint64_t Index = C->getZExtValue() & 0xff;
	unsigned BitWidth = II->getType()->getIntegerBitWidth();
	if (Index >= BitWidth)
	return replaceInstUsesWith(CI, II->getArgOperand(0));
	if (Index == 0)
	return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), 0));
	// If the LHS is also a constant, we can completely constant fold this.
	if (auto *InC = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
	uint64_t Result = InC->getZExtValue();
	Result &= maskTrailingOnes<uint64_t>(Index);
	return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Result));
	}
	// TODO should we convert this to an AND if the RHS is constant?
	}
	break;

	case Intrinsic::x86_vcvtph2ps_128:
	case Intrinsic::x86_vcvtph2ps_256: {
	auto Arg = II->getArgOperand(0);
	auto ArgType = cast<VectorType>(Arg->getType());
	auto RetType = cast<VectorType>(II->getType());
	unsigned ArgWidth = ArgType->getNumElements();
	unsigned RetWidth = RetType->getNumElements();
	assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths");
	assert(ArgType->isIntOrIntVectorTy() &&
	ArgType->getScalarSizeInBits() == 16 &&
	"CVTPH2PS input type should be 16-bit integer vector");
	assert(RetType->getScalarType()->isFloatTy() &&
	"CVTPH2PS output type should be 32-bit float vector");

	// Constant folding: Convert to generic half to single conversion.
	if (isa<ConstantAggregateZero>(Arg))
	return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType));

	if (isa<ConstantDataVector>(Arg)) {
	auto VectorHalfAsShorts = Arg;
	if (RetWidth < ArgWidth) {
	SmallVector<uint32_t, 8> SubVecMask;
	for (unsigned i = 0; i != RetWidth; ++i)
	SubVecMask.push_back((int)i);
	VectorHalfAsShorts = Builder.CreateShuffleVector(
	Arg, UndefValue::get(ArgType), SubVecMask);
	}

	auto VectorHalfType =
	VectorType::get(Type::getHalfTy(II->getContext()), RetWidth);
	auto VectorHalfs =
	Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType);
	auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType);
	return replaceInstUsesWith(*II, VectorFloats);
	}

	// We only use the lowest lanes of the argument.
	if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) {
	II->setArgOperand(0, V);
	return II;
	}
	break;
	}

	case Intrinsic::x86_sse_cvtss2si:
	case Intrinsic::x86_sse_cvtss2si64:
	case Intrinsic::x86_sse_cvttss2si:
	case Intrinsic::x86_sse_cvttss2si64:
	case Intrinsic::x86_sse2_cvtsd2si:
	case Intrinsic::x86_sse2_cvtsd2si64:
	case Intrinsic::x86_sse2_cvttsd2si:
	case Intrinsic::x86_sse2_cvttsd2si64:
	case Intrinsic::x86_avx512_vcvtss2si32:
	case Intrinsic::x86_avx512_vcvtss2si64:
	case Intrinsic::x86_avx512_vcvtss2usi32:
	case Intrinsic::x86_avx512_vcvtss2usi64:
	case Intrinsic::x86_avx512_vcvtsd2si32:
	case Intrinsic::x86_avx512_vcvtsd2si64:
	case Intrinsic::x86_avx512_vcvtsd2usi32:
	case Intrinsic::x86_avx512_vcvtsd2usi64:
	case Intrinsic::x86_avx512_cvttss2si:
	case Intrinsic::x86_avx512_cvttss2si64:
	case Intrinsic::x86_avx512_cvttss2usi:
	case Intrinsic::x86_avx512_cvttss2usi64:
	case Intrinsic::x86_avx512_cvttsd2si:
	case Intrinsic::x86_avx512_cvttsd2si64:
	case Intrinsic::x86_avx512_cvttsd2usi:
	case Intrinsic::x86_avx512_cvttsd2usi64: {
	// These intrinsics only demand the 0th element of their input vectors. If
	// we can simplify the input based on that, do so now.
	Value *Arg = II->getArgOperand(0);
	unsigned VWidth = Arg->getType()->getVectorNumElements();
	if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
	II->setArgOperand(0, V);
	return II;
	}
	break;
	}

	case Intrinsic::x86_mmx_pmovmskb:
	case Intrinsic::x86_sse_movmsk_ps:
	case Intrinsic::x86_sse2_movmsk_pd:
	case Intrinsic::x86_sse2_pmovmskb_128:
	case Intrinsic::x86_avx_movmsk_pd_256:
	case Intrinsic::x86_avx_movmsk_ps_256:
	case Intrinsic::x86_avx2_pmovmskb:
	if (Value V = simplifyX86movmsk(II))
	return replaceInstUsesWith(*II, V);
	break;

	case Intrinsic::x86_sse_comieq_ss:
	case Intrinsic::x86_sse_comige_ss:
	case Intrinsic::x86_sse_comigt_ss:
	case Intrinsic::x86_sse_comile_ss:
	case Intrinsic::x86_sse_comilt_ss:
	case Intrinsic::x86_sse_comineq_ss:
	case Intrinsic::x86_sse_ucomieq_ss:
	case Intrinsic::x86_sse_ucomige_ss:
	case Intrinsic::x86_sse_ucomigt_ss:
	case Intrinsic::x86_sse_ucomile_ss:
	case Intrinsic::x86_sse_ucomilt_ss:
	case Intrinsic::x86_sse_ucomineq_ss:
	case Intrinsic::x86_sse2_comieq_sd:
	case Intrinsic::x86_sse2_comige_sd:
	case Intrinsic::x86_sse2_comigt_sd:
	case Intrinsic::x86_sse2_comile_sd:
	case Intrinsic::x86_sse2_comilt_sd:
	case Intrinsic::x86_sse2_comineq_sd:
	case Intrinsic::x86_sse2_ucomieq_sd:
	case Intrinsic::x86_sse2_ucomige_sd:
	case Intrinsic::x86_sse2_ucomigt_sd:
	case Intrinsic::x86_sse2_ucomile_sd:
	case Intrinsic::x86_sse2_ucomilt_sd:
	case Intrinsic::x86_sse2_ucomineq_sd:
	case Intrinsic::x86_avx512_vcomi_ss:
	case Intrinsic::x86_avx512_vcomi_sd:
	case Intrinsic::x86_avx512_mask_cmp_ss:
	case Intrinsic::x86_avx512_mask_cmp_sd: {
	// These intrinsics only demand the 0th element of their input vectors. If
	// we can simplify the input based on that, do so now.
	bool MadeChange = false;
	Value *Arg0 = II->getArgOperand(0);
	Value *Arg1 = II->getArgOperand(1);
	unsigned VWidth = Arg0->getType()->getVectorNumElements();
	if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
	II->setArgOperand(0, V);
	MadeChange = true;
	}
	if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
	II->setArgOperand(1, V);
	MadeChange = true;
	}
	if (MadeChange)
	return II;
	break;
	}
	case Intrinsic::x86_avx512_mask_cmp_pd_128:
	case Intrinsic::x86_avx512_mask_cmp_pd_256:
	case Intrinsic::x86_avx512_mask_cmp_pd_512:
	case Intrinsic::x86_avx512_mask_cmp_ps_128:
	case Intrinsic::x86_avx512_mask_cmp_ps_256:
	case Intrinsic::x86_avx512_mask_cmp_ps_512: {
	// Folding cmp(sub(a,b),0) -> cmp(a,b) and cmp(0,sub(a,b)) -> cmp(b,a)
	Value *Arg0 = II->getArgOperand(0);
	Value *Arg1 = II->getArgOperand(1);
	bool Arg0IsZero = match(Arg0, m_Zero());
	if (Arg0IsZero)
	std::swap(Arg0, Arg1);
	Value A, B;
	// This fold requires only the NINF(not +/- inf) since inf minus
	// inf is nan.
	// NSZ(No Signed Zeros) is not needed because zeros of any sign are
	// equal for both compares.
	// NNAN is not needed because nans compare the same for both compares.
	// The compare intrinsic uses the above assumptions and therefore
	// doesn't require additional flags.
	if ((match(Arg0, m_OneUse(m_FSub(m_Value(A), m_Value(B)))) &&
	match(Arg1, m_Zero()) && isa<Instruction>(Arg0) &&
	cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
	if (Arg0IsZero)
	std::swap(A, B);
	II->setArgOperand(0, A);
	II->setArgOperand(1, B);
	return II;
	}
	break;
	}

	case Intrinsic::x86_avx512_mask_add_ps_512:
	case Intrinsic::x86_avx512_mask_div_ps_512:
	case Intrinsic::x86_avx512_mask_mul_ps_512:
	case Intrinsic::x86_avx512_mask_sub_ps_512:
	case Intrinsic::x86_avx512_mask_add_pd_512:
	case Intrinsic::x86_avx512_mask_div_pd_512:
	case Intrinsic::x86_avx512_mask_mul_pd_512:
	case Intrinsic::x86_avx512_mask_sub_pd_512:
	// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
	// IR operations.
	if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
	if (R->getValue() == 4) {
	Value *Arg0 = II->getArgOperand(0);
	Value *Arg1 = II->getArgOperand(1);

	Value *V;
	switch (II->getIntrinsicID()) {
	default: llvm_unreachable("Case stmts out of sync!");
	case Intrinsic::x86_avx512_mask_add_ps_512:
	case Intrinsic::x86_avx512_mask_add_pd_512:
	V = Builder.CreateFAdd(Arg0, Arg1);
	break;
	case Intrinsic::x86_avx512_mask_sub_ps_512:
	case Intrinsic::x86_avx512_mask_sub_pd_512:
	V = Builder.CreateFSub(Arg0, Arg1);
	break;
	case Intrinsic::x86_avx512_mask_mul_ps_512:
	case Intrinsic::x86_avx512_mask_mul_pd_512:
	V = Builder.CreateFMul(Arg0, Arg1);
	break;
	case Intrinsic::x86_avx512_mask_div_ps_512:
	case Intrinsic::x86_avx512_mask_div_pd_512:
	V = Builder.CreateFDiv(Arg0, Arg1);
	break;
	}

	// Create a select for the masking.
	V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
	Builder);
	return replaceInstUsesWith(*II, V);
	}
	}
	break;

	case Intrinsic::x86_avx512_mask_add_ss_round:
	case Intrinsic::x86_avx512_mask_div_ss_round:
	case Intrinsic::x86_avx512_mask_mul_ss_round:
	case Intrinsic::x86_avx512_mask_sub_ss_round:
	case Intrinsic::x86_avx512_mask_add_sd_round:
	case Intrinsic::x86_avx512_mask_div_sd_round:
	case Intrinsic::x86_avx512_mask_mul_sd_round:
	case Intrinsic::x86_avx512_mask_sub_sd_round:
	// If the rounding mode is CUR_DIRECTION(4) we can turn these into regular
	// IR operations.
	if (auto *R = dyn_cast<ConstantInt>(II->getArgOperand(4))) {
	if (R->getValue() == 4) {
	// Extract the element as scalars.
	Value *Arg0 = II->getArgOperand(0);
	Value *Arg1 = II->getArgOperand(1);
	Value *LHS = Builder.CreateExtractElement(Arg0, (uint64_t)0);
	Value *RHS = Builder.CreateExtractElement(Arg1, (uint64_t)0);

	Value *V;
	switch (II->getIntrinsicID()) {
	default: llvm_unreachable("Case stmts out of sync!");
	case Intrinsic::x86_avx512_mask_add_ss_round:
	case Intrinsic::x86_avx512_mask_add_sd_round:
	V = Builder.CreateFAdd(LHS, RHS);
	break;
	case Intrinsic::x86_avx512_mask_sub_ss_round:
	case Intrinsic::x86_avx512_mask_sub_sd_round:
	V = Builder.CreateFSub(LHS, RHS);
	break;
	case Intrinsic::x86_avx512_mask_mul_ss_round:
	case Intrinsic::x86_avx512_mask_mul_sd_round:
	V = Builder.CreateFMul(LHS, RHS);
	break;
	case Intrinsic::x86_avx512_mask_div_ss_round:
	case Intrinsic::x86_avx512_mask_div_sd_round:
	V = Builder.CreateFDiv(LHS, RHS);
	break;
	}

	// Handle the masking aspect of the intrinsic.
	Value *Mask = II->getArgOperand(3);
	auto *C = dyn_cast<ConstantInt>(Mask);
	// We don't need a select if we know the mask bit is a 1.
	if (!C \|\| !C->getValue()[0]) {
	// Cast the mask to an i1 vector and then extract the lowest element.
	auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
	cast<IntegerType>(Mask->getType())->getBitWidth());
	Mask = Builder.CreateBitCast(Mask, MaskTy);
	Mask = Builder.CreateExtractElement(Mask, (uint64_t)0);
	// Extract the lowest element from the passthru operand.
	Value *Passthru = Builder.CreateExtractElement(II->getArgOperand(2),
	(uint64_t)0);
	V = Builder.CreateSelect(Mask, V, Passthru);
	}

	// Insert the result back into the original argument 0.
	V = Builder.CreateInsertElement(Arg0, V, (uint64_t)0);

	return replaceInstUsesWith(*II, V);
	}
	}
	LLVM_FALLTHROUGH;

	// X86 scalar intrinsics simplified with SimplifyDemandedVectorElts.
	case Intrinsic::x86_avx512_mask_max_ss_round:
	case Intrinsic::x86_avx512_mask_min_ss_round:
	case Intrinsic::x86_avx512_mask_max_sd_round:
	case Intrinsic::x86_avx512_mask_min_sd_round:
	case Intrinsic::x86_avx512_mask_vfmadd_ss:
	case Intrinsic::x86_avx512_mask_vfmadd_sd:
	case Intrinsic::x86_avx512_maskz_vfmadd_ss:
	case Intrinsic::x86_avx512_maskz_vfmadd_sd:
	case Intrinsic::x86_avx512_mask3_vfmadd_ss:
	case Intrinsic::x86_avx512_mask3_vfmadd_sd:
	case Intrinsic::x86_avx512_mask3_vfmsub_ss:
	case Intrinsic::x86_avx512_mask3_vfmsub_sd:
	case Intrinsic::x86_avx512_mask3_vfnmsub_ss:
	case Intrinsic::x86_avx512_mask3_vfnmsub_sd:
	case Intrinsic::x86_fma_vfmadd_ss:
	case Intrinsic::x86_fma_vfmsub_ss:
	case Intrinsic::x86_fma_vfnmadd_ss:
	case Intrinsic::x86_fma_vfnmsub_ss:
	case Intrinsic::x86_fma_vfmadd_sd:
	case Intrinsic::x86_fma_vfmsub_sd:
	case Intrinsic::x86_fma_vfnmadd_sd:
	case Intrinsic::x86_fma_vfnmsub_sd:
	case Intrinsic::x86_sse_cmp_ss:
	case Intrinsic::x86_sse_min_ss:
	case Intrinsic::x86_sse_max_ss:
	case Intrinsic::x86_sse2_cmp_sd:
	case Intrinsic::x86_sse2_min_sd:
	case Intrinsic::x86_sse2_max_sd:
	case Intrinsic::x86_sse41_round_ss:
	case Intrinsic::x86_sse41_round_sd:
	case Intrinsic::x86_xop_vfrcz_ss:
	case Intrinsic::x86_xop_vfrcz_sd: {
	unsigned VWidth = II->getType()->getVectorNumElements();
	APInt UndefElts(VWidth, 0);
	APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
	if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
	if (V != II)
	return replaceInstUsesWith(*II, V);
	return II;
	}
	break;
	}

	// Constant fold ashr( <A x Bi>, Ci ).
	// Constant fold lshr( <A x Bi>, Ci ).
	// Constant fold shl( <A x Bi>, Ci ).
	case Intrinsic::x86_sse2_psrai_d:
	case Intrinsic::x86_sse2_psrai_w:
	case Intrinsic::x86_avx2_psrai_d:
	case Intrinsic::x86_avx2_psrai_w:
	case Intrinsic::x86_avx512_psrai_q_128:
	case Intrinsic::x86_avx512_psrai_q_256:
	case Intrinsic::x86_avx512_psrai_d_512:
	case Intrinsic::x86_avx512_psrai_q_512:
	case Intrinsic::x86_avx512_psrai_w_512:
	case Intrinsic::x86_sse2_psrli_d:
	case Intrinsic::x86_sse2_psrli_q:
	case Intrinsic::x86_sse2_psrli_w:
	case Intrinsic::x86_avx2_psrli_d:
	case Intrinsic::x86_avx2_psrli_q:
	case Intrinsic::x86_avx2_psrli_w:
	case Intrinsic::x86_avx512_psrli_d_512:
	case Intrinsic::x86_avx512_psrli_q_512:
	case Intrinsic::x86_avx512_psrli_w_512:
	case Intrinsic::x86_sse2_pslli_d:
	case Intrinsic::x86_sse2_pslli_q:
	case Intrinsic::x86_sse2_pslli_w:
	case Intrinsic::x86_avx2_pslli_d:
	case Intrinsic::x86_avx2_pslli_q:
	case Intrinsic::x86_avx2_pslli_w:
	case Intrinsic::x86_avx512_pslli_d_512:
	case Intrinsic::x86_avx512_pslli_q_512:
	case Intrinsic::x86_avx512_pslli_w_512:
	if (Value V = simplifyX86immShift(II, Builder))
	return replaceInstUsesWith(*II, V);
	break;

	case Intrinsic::x86_sse2_psra_d:
	case Intrinsic::x86_sse2_psra_w:
	case Intrinsic::x86_avx2_psra_d:
	case Intrinsic::x86_avx2_psra_w:
	case Intrinsic::x86_avx512_psra_q_128:
	case Intrinsic::x86_avx512_psra_q_256:
	case Intrinsic::x86_avx512_psra_d_512:
	case Intrinsic::x86_avx512_psra_q_512:
	case Intrinsic::x86_avx512_psra_w_512:
	case Intrinsic::x86_sse2_psrl_d:
	case Intrinsic::x86_sse2_psrl_q:
	case Intrinsic::x86_sse2_psrl_w:
	case Intrinsic::x86_avx2_psrl_d:
	case Intrinsic::x86_avx2_psrl_q:
	case Intrinsic::x86_avx2_psrl_w:
	case Intrinsic::x86_avx512_psrl_d_512:
	case Intrinsic::x86_avx512_psrl_q_512:
	case Intrinsic::x86_avx512_psrl_w_512:
	case Intrinsic::x86_sse2_psll_d:
	case Intrinsic::x86_sse2_psll_q:
	case Intrinsic::x86_sse2_psll_w:
	case Intrinsic::x86_avx2_psll_d:
	case Intrinsic::x86_avx2_psll_q:
	case Intrinsic::x86_avx2_psll_w:
	case Intrinsic::x86_avx512_psll_d_512:
	case Intrinsic::x86_avx512_psll_q_512:
	case Intrinsic::x86_avx512_psll_w_512: {
	if (Value V = simplifyX86immShift(II, Builder))
	return replaceInstUsesWith(*II, V);

	// SSE2/AVX2 uses only the first 64-bits of the 128-bit vector
	// operand to compute the shift amount.
	Value *Arg1 = II->getArgOperand(1);
	assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
	"Unexpected packed shift size");
	unsigned VWidth = Arg1->getType()->getVectorNumElements();

	if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
	II->setArgOperand(1, V);
	return II;
	}
	break;
	}

	case Intrinsic::x86_avx2_psllv_d:
	case Intrinsic::x86_avx2_psllv_d_256:
	case Intrinsic::x86_avx2_psllv_q:
	case Intrinsic::x86_avx2_psllv_q_256:
	case Intrinsic::x86_avx512_psllv_d_512:
	case Intrinsic::x86_avx512_psllv_q_512:
	case Intrinsic::x86_avx512_psllv_w_128:
	case Intrinsic::x86_avx512_psllv_w_256:
	case Intrinsic::x86_avx512_psllv_w_512:
	case Intrinsic::x86_avx2_psrav_d:
	case Intrinsic::x86_avx2_psrav_d_256:
	case Intrinsic::x86_avx512_psrav_q_128:
	case Intrinsic::x86_avx512_psrav_q_256:
	case Intrinsic::x86_avx512_psrav_d_512:
	case Intrinsic::x86_avx512_psrav_q_512:
	case Intrinsic::x86_avx512_psrav_w_128:
	case Intrinsic::x86_avx512_psrav_w_256:
	case Intrinsic::x86_avx512_psrav_w_512:
	case Intrinsic::x86_avx2_psrlv_d:
	case Intrinsic::x86_avx2_psrlv_d_256:
	case Intrinsic::x86_avx2_psrlv_q:
	case Intrinsic::x86_avx2_psrlv_q_256:
	case Intrinsic::x86_avx512_psrlv_d_512:
	case Intrinsic::x86_avx512_psrlv_q_512:
	case Intrinsic::x86_avx512_psrlv_w_128:
	case Intrinsic::x86_avx512_psrlv_w_256:
	case Intrinsic::x86_avx512_psrlv_w_512:
	if (Value V = simplifyX86varShift(II, Builder))
	return replaceInstUsesWith(*II, V);
	break;

	case Intrinsic::x86_sse2_pmulu_dq:
	case Intrinsic::x86_sse41_pmuldq:
	case Intrinsic::x86_avx2_pmul_dq:
	case Intrinsic::x86_avx2_pmulu_dq:
	case Intrinsic::x86_avx512_pmul_dq_512:
	case Intrinsic::x86_avx512_pmulu_dq_512: {
	if (Value V = simplifyX86muldq(II, Builder))
	return replaceInstUsesWith(*II, V);

	unsigned VWidth = II->getType()->getVectorNumElements();
	APInt UndefElts(VWidth, 0);
	APInt DemandedElts = APInt::getAllOnesValue(VWidth);
	if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) {
	if (V != II)
	return replaceInstUsesWith(*II, V);
	return II;
	}
	break;
	}

	case Intrinsic::x86_sse2_packssdw_128:
	case Intrinsic::x86_sse2_packsswb_128:
	case Intrinsic::x86_avx2_packssdw:
	case Intrinsic::x86_avx2_packsswb:
	case Intrinsic::x86_avx512_packssdw_512:
	case Intrinsic::x86_avx512_packsswb_512:
	if (Value V = simplifyX86pack(II, true))
	return replaceInstUsesWith(*II, V);
	break;

	case Intrinsic::x86_sse2_packuswb_128:
	case Intrinsic::x86_sse41_packusdw:
	case Intrinsic::x86_avx2_packusdw:
	case Intrinsic::x86_avx2_packuswb:
	case Intrinsic::x86_avx512_packusdw_512:
	case Intrinsic::x86_avx512_packuswb_512:
	if (Value V = simplifyX86pack(II, false))
	return replaceInstUsesWith(*II, V);
	break;

	case Intrinsic::x86_pclmulqdq: {
	if (auto *C = dyn_cast<ConstantInt>(II->getArgOperand(2))) {
	unsigned Imm = C->getZExtValue();

	bool MadeChange = false;
	Value *Arg0 = II->getArgOperand(0);
	Value *Arg1 = II->getArgOperand(1);
	unsigned VWidth = Arg0->getType()->getVectorNumElements();
	APInt DemandedElts(VWidth, 0);

	APInt UndefElts1(VWidth, 0);
	DemandedElts = (Imm & 0x01) ? 2 : 1;
	if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts,
	UndefElts1)) {
	II->setArgOperand(0, V);
	MadeChange = true;
	}

	APInt UndefElts2(VWidth, 0);
	DemandedElts = (Imm & 0x10) ? 2 : 1;
	if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts,
	UndefElts2)) {
	II->setArgOperand(1, V);
	MadeChange = true;
	}

	// If either selected input element is undef, the result is zero.
	if (UndefElts1[(Imm & 0x01) ? 1 : 0] \|\|
	UndefElts2[(Imm & 0x10) ? 1 : 0])
	return replaceInstUsesWith(*II,
	ConstantAggregateZero::get(II->getType()));

	if (MadeChange)
	return II;
	}
	break;
	}

	case Intrinsic::x86_sse41_insertps:
	if (Value V = simplifyX86insertps(II, Builder))
	return replaceInstUsesWith(*II, V);
	break;

	case Intrinsic::x86_sse4a_extrq: {
	Value *Op0 = II->getArgOperand(0);
	Value *Op1 = II->getArgOperand(1);
	unsigned VWidth0 = Op0->getType()->getVectorNumElements();
	unsigned VWidth1 = Op1->getType()->getVectorNumElements();
	assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
	Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
	VWidth1 == 16 && "Unexpected operand sizes");

	// See if we're dealing with constant values.
	Constant *C1 = dyn_cast<Constant>(Op1);
	ConstantInt *CILength =
	C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)0))
	: nullptr;
	ConstantInt *CIIndex =
	C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
	: nullptr;

	// Attempt to simplify to a constant, shuffle vector or EXTRQI call.
	if (Value V = simplifyX86extrq(II, Op0, CILength, CIIndex, Builder))
	return replaceInstUsesWith(*II, V);

	// EXTRQ only uses the lowest 64-bits of the first 128-bit vector
	// operand and the lowest 16-bits of the second.
	bool MadeChange = false;
	if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
	II->setArgOperand(0, V);
	MadeChange = true;
	}
	if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
	II->setArgOperand(1, V);
	MadeChange = true;
	}
	if (MadeChange)
	return II;
	break;
	}

	case Intrinsic::x86_sse4a_extrqi: {
	// EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
	// bits of the lower 64-bits. The upper 64-bits are undefined.
	Value *Op0 = II->getArgOperand(0);
	unsigned VWidth = Op0->getType()->getVectorNumElements();
	assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
	"Unexpected operand size");

	// See if we're dealing with constant values.
	ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(1));
	ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(2));

	// Attempt to simplify to a constant or shuffle vector.
	if (Value V = simplifyX86extrq(II, Op0, CILength, CIIndex, Builder))
	return replaceInstUsesWith(*II, V);

	// EXTRQI only uses the lowest 64-bits of the first 128-bit vector
	// operand.
	if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
	II->setArgOperand(0, V);
	return II;
	}
	break;
	}

	case Intrinsic::x86_sse4a_insertq: {
	Value *Op0 = II->getArgOperand(0);
	Value *Op1 = II->getArgOperand(1);
	unsigned VWidth = Op0->getType()->getVectorNumElements();
	assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
	Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
	Op1->getType()->getVectorNumElements() == 2 &&
	"Unexpected operand size");

	// See if we're dealing with constant values.
	Constant *C1 = dyn_cast<Constant>(Op1);
	ConstantInt *CI11 =
	C1 ? dyn_cast_or_null<ConstantInt>(C1->getAggregateElement((unsigned)1))
	: nullptr;

	// Attempt to simplify to a constant, shuffle vector or INSERTQI call.
	if (CI11) {
	const APInt &V11 = CI11->getValue();
	APInt Len = V11.zextOrTrunc(6);
	APInt Idx = V11.lshr(8).zextOrTrunc(6);
	if (Value V = simplifyX86insertq(II, Op0, Op1, Len, Idx, Builder))
	return replaceInstUsesWith(*II, V);
	}

	// INSERTQ only uses the lowest 64-bits of the first 128-bit vector
	// operand.
	if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
	II->setArgOperand(0, V);
	return II;
	}
	break;
	}

	case Intrinsic::x86_sse4a_insertqi: {
	// INSERTQI: Extract lowest Length bits from lower half of second source and
	// insert over first source starting at Index bit. The upper 64-bits are
	// undefined.
	Value *Op0 = II->getArgOperand(0);
	Value *Op1 = II->getArgOperand(1);
	unsigned VWidth0 = Op0->getType()->getVectorNumElements();
	unsigned VWidth1 = Op1->getType()->getVectorNumElements();
	assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
	Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
	VWidth1 == 2 && "Unexpected operand sizes");

	// See if we're dealing with constant values.
	ConstantInt *CILength = dyn_cast<ConstantInt>(II->getArgOperand(2));
	ConstantInt *CIIndex = dyn_cast<ConstantInt>(II->getArgOperand(3));

	// Attempt to simplify to a constant or shuffle vector.
	if (CILength && CIIndex) {
	APInt Len = CILength->getValue().zextOrTrunc(6);
	APInt Idx = CIIndex->getValue().zextOrTrunc(6);
	if (Value V = simplifyX86insertq(II, Op0, Op1, Len, Idx, Builder))
	return replaceInstUsesWith(*II, V);
	}

	// INSERTQI only uses the lowest 64-bits of the first two 128-bit vector
	// operands.
	bool MadeChange = false;
	if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
	II->setArgOperand(0, V);
	MadeChange = true;
	}
	if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
	II->setArgOperand(1, V);
	MadeChange = true;
	}
	if (MadeChange)
	return II;
	break;
	}

	case Intrinsic::x86_sse41_pblendvb:
	case Intrinsic::x86_sse41_blendvps:
	case Intrinsic::x86_sse41_blendvpd:
	case Intrinsic::x86_avx_blendv_ps_256:
	case Intrinsic::x86_avx_blendv_pd_256:
	case Intrinsic::x86_avx2_pblendvb: {
	// Convert blendv* to vector selects if the mask is constant.
	// This optimization is convoluted because the intrinsic is defined as
	// getting a vector of floats or doubles for the ps and pd versions.
	// FIXME: That should be changed.

	Value *Op0 = II->getArgOperand(0);
	Value *Op1 = II->getArgOperand(1);
	Value *Mask = II->getArgOperand(2);

	// fold (blend A, A, Mask) -> A
	if (Op0 == Op1)
	return replaceInstUsesWith(CI, Op0);

	// Zero Mask - select 1st argument.
	if (isa<ConstantAggregateZero>(Mask))
	return replaceInstUsesWith(CI, Op0);

	// Constant Mask - select 1st/2nd argument lane based on top bit of mask.
	if (auto *ConstantMask = dyn_cast<ConstantDataVector>(Mask)) {
	Constant *NewSelector = getNegativeIsTrueBoolVec(ConstantMask);
	return SelectInst::Create(NewSelector, Op1, Op0, "blendv");
	}
	break;
	}

	case Intrinsic::x86_ssse3_pshuf_b_128:
	case Intrinsic::x86_avx2_pshuf_b:
	case Intrinsic::x86_avx512_pshuf_b_512:
	if (Value V = simplifyX86pshufb(II, Builder))
	return replaceInstUsesWith(*II, V);
	break;

	case Intrinsic::x86_avx_vpermilvar_ps:
	case Intrinsic::x86_avx_vpermilvar_ps_256:
	case Intrinsic::x86_avx512_vpermilvar_ps_512:
	case Intrinsic::x86_avx_vpermilvar_pd:
	case Intrinsic::x86_avx_vpermilvar_pd_256:
	case Intrinsic::x86_avx512_vpermilvar_pd_512:
	if (Value V = simplifyX86vpermilvar(II, Builder))
	return replaceInstUsesWith(*II, V);
	break;

	case Intrinsic::x86_avx2_permd:
	case Intrinsic::x86_avx2_permps:
	if (Value V = simplifyX86vpermv(II, Builder))
	return replaceInstUsesWith(*II, V);
	break;

	case Intrinsic::x86_avx512_mask_permvar_df_256:
	case Intrinsic::x86_avx512_mask_permvar_df_512:
	case Intrinsic::x86_avx512_mask_permvar_di_256:
	case Intrinsic::x86_avx512_mask_permvar_di_512:
	case Intrinsic::x86_avx512_mask_permvar_hi_128:
	case Intrinsic::x86_avx512_mask_permvar_hi_256:
	case Intrinsic::x86_avx512_mask_permvar_hi_512:
	case Intrinsic::x86_avx512_mask_permvar_qi_128:
	case Intrinsic::x86_avx512_mask_permvar_qi_256:
	case Intrinsic::x86_avx512_mask_permvar_qi_512:
	case Intrinsic::x86_avx512_mask_permvar_sf_256:
	case Intrinsic::x86_avx512_mask_permvar_sf_512:
	case Intrinsic::x86_avx512_mask_permvar_si_256:
	case Intrinsic::x86_avx512_mask_permvar_si_512:
	if (Value V = simplifyX86vpermv(II, Builder)) {
	// We simplified the permuting, now create a select for the masking.
	V = emitX86MaskSelect(II->getArgOperand(3), V, II->getArgOperand(2),
	Builder);
	return replaceInstUsesWith(*II, V);
	}
	break;

	case Intrinsic::x86_avx_maskload_ps:
	case Intrinsic::x86_avx_maskload_pd:
	case Intrinsic::x86_avx_maskload_ps_256:
	case Intrinsic::x86_avx_maskload_pd_256:
	case Intrinsic::x86_avx2_maskload_d:
	case Intrinsic::x86_avx2_maskload_q:
	case Intrinsic::x86_avx2_maskload_d_256:
	case Intrinsic::x86_avx2_maskload_q_256:
	if (Instruction I = simplifyX86MaskedLoad(II, *this))
	return I;
	break;

	case Intrinsic::x86_sse2_maskmov_dqu:
	case Intrinsic::x86_avx_maskstore_ps:
	case Intrinsic::x86_avx_maskstore_pd:
	case Intrinsic::x86_avx_maskstore_ps_256:
	case Intrinsic::x86_avx_maskstore_pd_256:
	case Intrinsic::x86_avx2_maskstore_d:
	case Intrinsic::x86_avx2_maskstore_q:
	case Intrinsic::x86_avx2_maskstore_d_256:
	case Intrinsic::x86_avx2_maskstore_q_256:
	if (simplifyX86MaskedStore(II, this))
	return nullptr;
	break;

	case Intrinsic::x86_xop_vpcomb:
	case Intrinsic::x86_xop_vpcomd:
	case Intrinsic::x86_xop_vpcomq:
	case Intrinsic::x86_xop_vpcomw:
	if (Value V = simplifyX86vpcom(II, Builder, true))
	return replaceInstUsesWith(*II, V);
	break;

	case Intrinsic::x86_xop_vpcomub:
	case Intrinsic::x86_xop_vpcomud:
	case Intrinsic::x86_xop_vpcomuq:
	case Intrinsic::x86_xop_vpcomuw:
	if (Value V = simplifyX86vpcom(II, Builder, false))
	return replaceInstUsesWith(*II, V);
	break;

	case Intrinsic::ppc_altivec_vperm:
	// Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
	// Note that ppc_altivec_vperm has a big-endian bias, so when creating
	// a vectorshuffle for little endian, we must undo the transformation
	// performed on vec_perm in altivec.h. That is, we must complement
	// the permutation mask with respect to 31 and reverse the order of
	// V1 and V2.
	if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {
	assert(Mask->getType()->getVectorNumElements() == 16 &&
	"Bad type for intrinsic!");

	// Check that all of the elements are integer constants or undefs.
	bool AllEltsOk = true;
	for (unsigned i = 0; i != 16; ++i) {
	Constant *Elt = Mask->getAggregateElement(i);
	if (!Elt \|\| !(isa<ConstantInt>(Elt) \|\| isa<UndefValue>(Elt))) {
	AllEltsOk = false;
	break;
	}
	}

	if (AllEltsOk) {
	// Cast the input vectors to byte vectors.
	Value *Op0 = Builder.CreateBitCast(II->getArgOperand(0),
	Mask->getType());
	Value *Op1 = Builder.CreateBitCast(II->getArgOperand(1),
	Mask->getType());
	Value *Result = UndefValue::get(Op0->getType());

	// Only extract each element once.
	Value *ExtractedElts[32];
	memset(ExtractedElts, 0, sizeof(ExtractedElts));

	for (unsigned i = 0; i != 16; ++i) {
	if (isa<UndefValue>(Mask->getAggregateElement(i)))
	continue;
	unsigned Idx =
	cast<ConstantInt>(Mask->getAggregateElement(i))->getZExtValue();
	Idx &= 31; // Match the hardware behavior.
	if (DL.isLittleEndian())
	Idx = 31 - Idx;

	if (!ExtractedElts[Idx]) {
	Value *Op0ToUse = (DL.isLittleEndian()) ? Op1 : Op0;
	Value *Op1ToUse = (DL.isLittleEndian()) ? Op0 : Op1;
	ExtractedElts[Idx] =
	Builder.CreateExtractElement(Idx < 16 ? Op0ToUse : Op1ToUse,
	Builder.getInt32(Idx&15));
	}

	// Insert this value into the result vector.
	Result = Builder.CreateInsertElement(Result, ExtractedElts[Idx],
	Builder.getInt32(i));
	}
	return CastInst::Create(Instruction::BitCast, Result, CI.getType());
	}
	}
	break;

	case Intrinsic::arm_neon_vld1:
	case Intrinsic::arm_neon_vld2:
	case Intrinsic::arm_neon_vld3:
	case Intrinsic::arm_neon_vld4:
	case Intrinsic::arm_neon_vld2lane:
	case Intrinsic::arm_neon_vld3lane:
	case Intrinsic::arm_neon_vld4lane:
	case Intrinsic::arm_neon_vst1:
	case Intrinsic::arm_neon_vst2:
	case Intrinsic::arm_neon_vst3:
	case Intrinsic::arm_neon_vst4:
	case Intrinsic::arm_neon_vst2lane:
	case Intrinsic::arm_neon_vst3lane:
	case Intrinsic::arm_neon_vst4lane: {
	unsigned MemAlign =
	getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT);
	unsigned AlignArg = II->getNumArgOperands() - 1;
	ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
	if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
	II->setArgOperand(AlignArg,
	ConstantInt::get(Type::getInt32Ty(II->getContext()),
	MemAlign, false));
	return II;
	}
	break;
	}

	case Intrinsic::arm_neon_vmulls:
	case Intrinsic::arm_neon_vmullu:
	case Intrinsic::aarch64_neon_smull:
	case Intrinsic::aarch64_neon_umull: {
	Value *Arg0 = II->getArgOperand(0);
	Value *Arg1 = II->getArgOperand(1);

	// Handle mul by zero first:
	if (isa<ConstantAggregateZero>(Arg0) \|\| isa<ConstantAggregateZero>(Arg1)) {
	return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
	}

	// Check for constant LHS & RHS - in this case we just simplify.
	bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu \|\|
	II->getIntrinsicID() == Intrinsic::aarch64_neon_umull);
	VectorType *NewVT = cast<VectorType>(II->getType());
	if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
	if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
	CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /isSigned=/!Zext);
	CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /isSigned=/!Zext);

	return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1));
	}

	// Couldn't simplify - canonicalize constant to the RHS.
	std::swap(Arg0, Arg1);
	}

	// Handle mul by one:
	if (Constant *CV1 = dyn_cast<Constant>(Arg1))
	if (ConstantInt *Splat =
	dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
	if (Splat->isOne())
	return CastInst::CreateIntegerCast(Arg0, II->getType(),
	/isSigned=/!Zext);

	break;
	}
	case Intrinsic::amdgcn_rcp: {
	Value *Src = II->getArgOperand(0);

	// TODO: Move to ConstantFolding/InstSimplify?
	if (isa<UndefValue>(Src))
	return replaceInstUsesWith(CI, Src);

	if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
	const APFloat &ArgVal = C->getValueAPF();
	APFloat Val(ArgVal.getSemantics(), 1.0);
	APFloat::opStatus Status = Val.divide(ArgVal,
	APFloat::rmNearestTiesToEven);
	// Only do this if it was exact and therefore not dependent on the
	// rounding mode.
	if (Status == APFloat::opOK)
	return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
	}

	break;
	}
	case Intrinsic::amdgcn_rsq: {
	Value *Src = II->getArgOperand(0);

	// TODO: Move to ConstantFolding/InstSimplify?
	if (isa<UndefValue>(Src))
	return replaceInstUsesWith(CI, Src);
	break;
	}
	case Intrinsic::amdgcn_frexp_mant:
	case Intrinsic::amdgcn_frexp_exp: {
	Value *Src = II->getArgOperand(0);
	if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
	int Exp;
	APFloat Significand = frexp(C->getValueAPF(), Exp,
	APFloat::rmNearestTiesToEven);

	if (II->getIntrinsicID() == Intrinsic::amdgcn_frexp_mant) {
	return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(),
	Significand));
	}

	// Match instruction special case behavior.
	if (Exp == APFloat::IEK_NaN \|\| Exp == APFloat::IEK_Inf)
	Exp = 0;

	return replaceInstUsesWith(CI, ConstantInt::get(II->getType(), Exp));
	}

	if (isa<UndefValue>(Src))
	return replaceInstUsesWith(CI, UndefValue::get(II->getType()));

	break;
	}
	case Intrinsic::amdgcn_class: {
	enum {
	S_NAN = 1 << 0, // Signaling NaN
	Q_NAN = 1 << 1, // Quiet NaN
	N_INFINITY = 1 << 2, // Negative infinity
	N_NORMAL = 1 << 3, // Negative normal
	N_SUBNORMAL = 1 << 4, // Negative subnormal
	N_ZERO = 1 << 5, // Negative zero
	P_ZERO = 1 << 6, // Positive zero
	P_SUBNORMAL = 1 << 7, // Positive subnormal
	P_NORMAL = 1 << 8, // Positive normal
	P_INFINITY = 1 << 9 // Positive infinity
	};

	const uint32_t FullMask = S_NAN \| Q_NAN \| N_INFINITY \| N_NORMAL \|
	N_SUBNORMAL \| N_ZERO \| P_ZERO \| P_SUBNORMAL \| P_NORMAL \| P_INFINITY;

	Value *Src0 = II->getArgOperand(0);
	Value *Src1 = II->getArgOperand(1);
	const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
	if (!CMask) {
	if (isa<UndefValue>(Src0))
	return replaceInstUsesWith(*II, UndefValue::get(II->getType()));

	if (isa<UndefValue>(Src1))
	return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));
	break;
	}

	uint32_t Mask = CMask->getZExtValue();

	// If all tests are made, it doesn't matter what the value is.
	if ((Mask & FullMask) == FullMask)
	return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), true));

	if ((Mask & FullMask) == 0)
	return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), false));

	if (Mask == (S_NAN \| Q_NAN)) {
	// Equivalent of isnan. Replace with standard fcmp.
	Value *FCmp = Builder.CreateFCmpUNO(Src0, Src0);
	FCmp->takeName(II);
	return replaceInstUsesWith(*II, FCmp);
	}

	const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
	if (!CVal) {
	if (isa<UndefValue>(Src0))
	return replaceInstUsesWith(*II, UndefValue::get(II->getType()));

	// Clamp mask to used bits
	if ((Mask & FullMask) != Mask) {
	CallInst *NewCall = Builder.CreateCall(II->getCalledFunction(),
	{ Src0, ConstantInt::get(Src1->getType(), Mask & FullMask) }
	);

	NewCall->takeName(II);
	return replaceInstUsesWith(*II, NewCall);
	}

	break;
	}

	const APFloat &Val = CVal->getValueAPF();

	bool Result =
	((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) \|\|
	((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) \|\|
	((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) \|\|
	((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) \|\|
	((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) \|\|
	((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) \|\|
	((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) \|\|
	((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) \|\|
	((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) \|\|
	((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

	return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
	}
	case Intrinsic::amdgcn_cvt_pkrtz: {
	Value *Src0 = II->getArgOperand(0);
	Value *Src1 = II->getArgOperand(1);
	if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
	if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
	const fltSemantics &HalfSem
	= II->getType()->getScalarType()->getFltSemantics();
	bool LosesInfo;
	APFloat Val0 = C0->getValueAPF();
	APFloat Val1 = C1->getValueAPF();
	Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
	Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

	Constant *Folded = ConstantVector::get({
	ConstantFP::get(II->getContext(), Val0),
	ConstantFP::get(II->getContext(), Val1) });
	return replaceInstUsesWith(*II, Folded);
	}
	}

	if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
	return replaceInstUsesWith(*II, UndefValue::get(II->getType()));

	break;
	}
	+ case Intrinsic::amdgcn_cvt_pknorm_i16:
	+ case Intrinsic::amdgcn_cvt_pknorm_u16:
	+ case Intrinsic::amdgcn_cvt_pk_i16:
	+ case Intrinsic::amdgcn_cvt_pk_u16: {
	+ Value *Src0 = II->getArgOperand(0);
	+ Value *Src1 = II->getArgOperand(1);
	+
	+ if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
	+ return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
	+
	+ break;
	+ }
	case Intrinsic::amdgcn_ubfe:
	case Intrinsic::amdgcn_sbfe: {
	// Decompose simple cases into standard shifts.
	Value *Src = II->getArgOperand(0);
	if (isa<UndefValue>(Src))
	return replaceInstUsesWith(*II, Src);

	unsigned Width;
	Type *Ty = II->getType();
	unsigned IntSize = Ty->getIntegerBitWidth();

	ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2));
	if (CWidth) {
	Width = CWidth->getZExtValue();
	if ((Width & (IntSize - 1)) == 0)
	return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty));

	if (Width >= IntSize) {
	// Hardware ignores high bits, so remove those.
	II->setArgOperand(2, ConstantInt::get(CWidth->getType(),
	Width & (IntSize - 1)));
	return II;
	}
	}

	unsigned Offset;
	ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
	if (COffset) {
	Offset = COffset->getZExtValue();
	if (Offset >= IntSize) {
	II->setArgOperand(1, ConstantInt::get(COffset->getType(),
	Offset & (IntSize - 1)));
	return II;
	}
	}

	bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe;

	// TODO: Also emit sub if only width is constant.
	if (!CWidth && COffset && Offset == 0) {
	Constant *KSize = ConstantInt::get(COffset->getType(), IntSize);
	Value *ShiftVal = Builder.CreateSub(KSize, II->getArgOperand(2));
	ShiftVal = Builder.CreateZExt(ShiftVal, II->getType());

	Value *Shl = Builder.CreateShl(Src, ShiftVal);
	Value *RightShift = Signed ? Builder.CreateAShr(Shl, ShiftVal)
	: Builder.CreateLShr(Shl, ShiftVal);
	RightShift->takeName(II);
	return replaceInstUsesWith(*II, RightShift);
	}

	if (!CWidth \|\| !COffset)
	break;

	// TODO: This allows folding to undef when the hardware has specific
	// behavior?
	if (Offset + Width < IntSize) {
	Value *Shl = Builder.CreateShl(Src, IntSize - Offset - Width);
	Value *RightShift = Signed ? Builder.CreateAShr(Shl, IntSize - Width)
	: Builder.CreateLShr(Shl, IntSize - Width);
	RightShift->takeName(II);
	return replaceInstUsesWith(*II, RightShift);
	}

	Value *RightShift = Signed ? Builder.CreateAShr(Src, Offset)
	: Builder.CreateLShr(Src, Offset);

	RightShift->takeName(II);
	return replaceInstUsesWith(*II, RightShift);
	}
	case Intrinsic::amdgcn_exp:
	case Intrinsic::amdgcn_exp_compr: {
	ConstantInt *En = dyn_cast<ConstantInt>(II->getArgOperand(1));
	if (!En) // Illegal.
	break;

	unsigned EnBits = En->getZExtValue();
	if (EnBits == 0xf)
	break; // All inputs enabled.

	bool IsCompr = II->getIntrinsicID() == Intrinsic::amdgcn_exp_compr;
	bool Changed = false;
	for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
	if ((!IsCompr && (EnBits & (1 << I)) == 0) \|\|
	(IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
	Value *Src = II->getArgOperand(I + 2);
	if (!isa<UndefValue>(Src)) {
	II->setArgOperand(I + 2, UndefValue::get(Src->getType()));
	Changed = true;
	}
	}
	}

	if (Changed)
	return II;

	break;
	}
	case Intrinsic::amdgcn_fmed3: {
	// Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
	// for the shader.

	Value *Src0 = II->getArgOperand(0);
	Value *Src1 = II->getArgOperand(1);
	Value *Src2 = II->getArgOperand(2);

	bool Swap = false;
	// Canonicalize constants to RHS operands.
	//
	// fmed3(c0, x, c1) -> fmed3(x, c0, c1)
	if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
	std::swap(Src0, Src1);
	Swap = true;
	}

	if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
	std::swap(Src1, Src2);
	Swap = true;
	}

	if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
	std::swap(Src0, Src1);
	Swap = true;
	}

	if (Swap) {
	II->setArgOperand(0, Src0);
	II->setArgOperand(1, Src1);
	II->setArgOperand(2, Src2);
	return II;
	}

	if (match(Src2, m_NaN()) \|\| isa<UndefValue>(Src2)) {
	CallInst *NewCall = Builder.CreateMinNum(Src0, Src1);
	NewCall->copyFastMathFlags(II);
	NewCall->takeName(II);
	return replaceInstUsesWith(*II, NewCall);
	}

	if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
	if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
	if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
	APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
	C2->getValueAPF());
	return replaceInstUsesWith(*II,
	ConstantFP::get(Builder.getContext(), Result));
	}
	}
	}

	break;
	}
	case Intrinsic::amdgcn_icmp:
	case Intrinsic::amdgcn_fcmp: {
	const ConstantInt *CC = dyn_cast<ConstantInt>(II->getArgOperand(2));
	if (!CC)
	break;

	// Guard against invalid arguments.
	int64_t CCVal = CC->getZExtValue();
	bool IsInteger = II->getIntrinsicID() == Intrinsic::amdgcn_icmp;
	if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE \|\|
	CCVal > CmpInst::LAST_ICMP_PREDICATE)) \|\|
	(!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE \|\|
	CCVal > CmpInst::LAST_FCMP_PREDICATE)))
	break;

	Value *Src0 = II->getArgOperand(0);
	Value *Src1 = II->getArgOperand(1);

	if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
	if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
	Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
	if (CCmp->isNullValue()) {
	return replaceInstUsesWith(
	*II, ConstantExpr::getSExt(CCmp, II->getType()));
	}

	// The result of V_ICMP/V_FCMP assembly instructions (which this
	// intrinsic exposes) is one bit per thread, masked with the EXEC
	// register (which contains the bitmask of live threads). So a
	// comparison that always returns true is the same as a read of the
	// EXEC register.
	Value *NewF = Intrinsic::getDeclaration(
	II->getModule(), Intrinsic::read_register, II->getType());
	Metadata *MDArgs[] = {MDString::get(II->getContext(), "exec")};
	MDNode *MD = MDNode::get(II->getContext(), MDArgs);
	Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)};
	CallInst *NewCall = Builder.CreateCall(NewF, Args);
	NewCall->addAttribute(AttributeList::FunctionIndex,
	Attribute::Convergent);
	NewCall->takeName(II);
	return replaceInstUsesWith(*II, NewCall);
	}

	// Canonicalize constants to RHS.
	CmpInst::Predicate SwapPred
	= CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
	II->setArgOperand(0, Src1);
	II->setArgOperand(1, Src0);
	II->setArgOperand(2, ConstantInt::get(CC->getType(),
	static_cast<int>(SwapPred)));
	return II;
	}

	if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
	break;

	// Canonicalize compare eq with true value to compare != 0
	// llvm.amdgcn.icmp(zext (i1 x), 1, eq)
	// -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
	// llvm.amdgcn.icmp(sext (i1 x), -1, eq)
	// -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
	Value *ExtSrc;
	if (CCVal == CmpInst::ICMP_EQ &&
	((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) \|\|
	(match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) &&
	ExtSrc->getType()->isIntegerTy(1)) {
	II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType()));
	II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
	return II;
	}

	CmpInst::Predicate SrcPred;
	Value *SrcLHS;
	Value *SrcRHS;

	// Fold compare eq/ne with 0 from a compare result as the predicate to the
	// intrinsic. The typical use is a wave vote function in the library, which
	// will be fed from a user code condition compared with 0. Fold in the
	// redundant compare.

	// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
	// -> llvm.amdgcn.[if]cmp(a, b, pred)
	//
	// llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
	// -> llvm.amdgcn.[if]cmp(a, b, inv pred)
	if (match(Src1, m_Zero()) &&
	match(Src0,
	m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) {
	if (CCVal == CmpInst::ICMP_EQ)
	SrcPred = CmpInst::getInversePredicate(SrcPred);

	Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ?
	Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp;

	Value *NewF = Intrinsic::getDeclaration(II->getModule(), NewIID,
	SrcLHS->getType());
	Value *Args[] = { SrcLHS, SrcRHS,
	ConstantInt::get(CC->getType(), SrcPred) };
	CallInst *NewCall = Builder.CreateCall(NewF, Args);
	NewCall->takeName(II);
	return replaceInstUsesWith(*II, NewCall);
	}

	break;
	}
	case Intrinsic::amdgcn_wqm_vote: {
	// wqm_vote is identity when the argument is constant.
	if (!isa<Constant>(II->getArgOperand(0)))
	break;

	return replaceInstUsesWith(*II, II->getArgOperand(0));
	}
	case Intrinsic::amdgcn_kill: {
	const ConstantInt *C = dyn_cast<ConstantInt>(II->getArgOperand(0));
	if (!C \|\| !C->getZExtValue())
	break;

	// amdgcn.kill(i1 1) is a no-op
	return eraseInstFromFunction(CI);
	}
	case Intrinsic::stackrestore: {
	// If the save is right next to the restore, remove the restore. This can
	// happen when variable allocas are DCE'd.
	if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
	if (SS->getIntrinsicID() == Intrinsic::stacksave) {
	if (&*++SS->getIterator() == II)
	return eraseInstFromFunction(CI);
	}
	}

	// Scan down this block to see if there is another stack restore in the
	// same block without an intervening call/alloca.
	BasicBlock::iterator BI(II);
	TerminatorInst *TI = II->getParent()->getTerminator();
	bool CannotRemove = false;
	for (++BI; &*BI != TI; ++BI) {
	if (isa<AllocaInst>(BI)) {
	CannotRemove = true;
	break;
	}
	if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
	if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(BCI)) {
	// If there is a stackrestore below this one, remove this one.
	if (II->getIntrinsicID() == Intrinsic::stackrestore)
	return eraseInstFromFunction(CI);

	// Bail if we cross over an intrinsic with side effects, such as
	// llvm.stacksave, llvm.read_register, or llvm.setjmp.
	if (II->mayHaveSideEffects()) {
	CannotRemove = true;
	break;
	}
	} else {
	// If we found a non-intrinsic call, we can't remove the stack
	// restore.
	CannotRemove = true;
	break;
	}
	}
	}

	// If the stack restore is in a return, resume, or unwind block and if there
	// are no allocas or calls between the restore and the return, nuke the
	// restore.
	if (!CannotRemove && (isa<ReturnInst>(TI) \|\| isa<ResumeInst>(TI)))
	return eraseInstFromFunction(CI);
	break;
	}
	case Intrinsic::lifetime_start:
	// Asan needs to poison memory to detect invalid access which is possible
	// even for empty lifetime range.
	if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) \|\|
	II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress))
	break;

	if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start,
	Intrinsic::lifetime_end, *this))
	return nullptr;
	break;
	case Intrinsic::assume: {
	Value *IIOperand = II->getArgOperand(0);
	// Remove an assume if it is immediately followed by an identical assume.
	if (match(II->getNextNode(),
	m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
	return eraseInstFromFunction(CI);

	// Canonicalize assume(a && b) -> assume(a); assume(b);
	// Note: New assumption intrinsics created here are registered by
	// the InstCombineIRInserter object.
	Value AssumeIntrinsic = II->getCalledValue(), A, *B;
	if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) {
	Builder.CreateCall(AssumeIntrinsic, A, II->getName());
	Builder.CreateCall(AssumeIntrinsic, B, II->getName());
	return eraseInstFromFunction(*II);
	}
	// assume(!(a \|\| b)) -> assume(!a); assume(!b);
	if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) {
	Builder.CreateCall(AssumeIntrinsic, Builder.CreateNot(A), II->getName());
	Builder.CreateCall(AssumeIntrinsic, Builder.CreateNot(B), II->getName());
	return eraseInstFromFunction(*II);
	}

	// assume( (load addr) != null ) -> add 'nonnull' metadata to load
	// (if assume is valid at the load)
	CmpInst::Predicate Pred;
	Instruction *LHS;
	if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
	Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
	LHS->getType()->isPointerTy() &&
	isValidAssumeForContext(II, LHS, &DT)) {
	MDNode *MD = MDNode::get(II->getContext(), None);
	LHS->setMetadata(LLVMContext::MD_nonnull, MD);
	return eraseInstFromFunction(*II);

	// TODO: apply nonnull return attributes to calls and invokes
	// TODO: apply range metadata for range check patterns?
	}

	// If there is a dominating assume with the same condition as this one,
	// then this one is redundant, and should be removed.
	KnownBits Known(1);
	computeKnownBits(IIOperand, Known, 0, II);
	if (Known.isAllOnes())
	return eraseInstFromFunction(*II);

	// Update the cache of affected values for this assumption (we might be
	// here because we just simplified the condition).
	AC.updateAffectedValues(II);
	break;
	}
	case Intrinsic::experimental_gc_relocate: {
	// Translate facts known about a pointer before relocating into
	// facts about the relocate value, while being careful to
	// preserve relocation semantics.
	Value *DerivedPtr = cast<GCRelocateInst>(II)->getDerivedPtr();

	// Remove the relocation if unused, note that this check is required
	// to prevent the cases below from looping forever.
	if (II->use_empty())
	return eraseInstFromFunction(*II);

	// Undef is undef, even after relocation.
	// TODO: provide a hook for this in GCStrategy. This is clearly legal for
	// most practical collectors, but there was discussion in the review thread
	// about whether it was legal for all possible collectors.
	if (isa<UndefValue>(DerivedPtr))
	// Use undef of gc_relocate's type to replace it.
	return replaceInstUsesWith(*II, UndefValue::get(II->getType()));

	if (auto *PT = dyn_cast<PointerType>(II->getType())) {
	// The relocation of null will be null for most any collector.
	// TODO: provide a hook for this in GCStrategy. There might be some
	// weird collector this property does not hold for.
	if (isa<ConstantPointerNull>(DerivedPtr))
	// Use null-pointer of gc_relocate's type to replace it.
	return replaceInstUsesWith(*II, ConstantPointerNull::get(PT));

	// isKnownNonNull -> nonnull attribute
	if (isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT))
	II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
	}

	// TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
	// Canonicalize on the type from the uses to the defs

	// TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
	break;
	}

	case Intrinsic::experimental_guard: {
	// Is this guard followed by another guard?
	Instruction *NextInst = II->getNextNode();
	Value *NextCond = nullptr;
	if (match(NextInst,
	m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
	Value *CurrCond = II->getArgOperand(0);

	// Remove a guard that it is immediately preceded by an identical guard.
	if (CurrCond == NextCond)
	return eraseInstFromFunction(*NextInst);

	// Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
	II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond));
	return eraseInstFromFunction(*NextInst);
	}
	break;
	}
	}
	return visitCallSite(II);
	}

	/// Fence instruction simplification: a fence that is immediately followed by
	/// an identical fence is redundant, so it is erased.
	Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
	  auto *Following = dyn_cast<FenceInst>(FI.getNextNode());
	  // Only a directly adjacent, identical fence makes this one removable.
	  const bool Redundant = Following && FI.isIdenticalTo(Following);
	  return Redundant ? eraseInstFromFunction(FI) : nullptr;
	}

	/// InvokeInst simplification: invokes get no treatment of their own; they
	/// are wrapped in a CallSite and handed to visitCallSite.
	Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
	  CallSite Site(&II);
	  return visitCallSite(Site);
	}

	/// If this cast does not affect the value passed through the varargs area, we
	/// can eliminate the use of the cast.
	///
	/// \param CS the call site that receives the cast result as an argument.
	/// \param DL data layout used to compare allocation sizes.
	/// \param CI the cast feeding the argument.
	/// \param ix index of that argument at the call site.
	/// \returns true if the argument may be replaced by the cast's operand.
	static bool isSafeToEliminateVarargsCast(const CallSite CS,
	                                         const DataLayout &DL,
	                                         const CastInst *const CI,
	                                         const int ix) {
	  if (!CI->isLosslessCast())
	    return false;

	  // If this is a GC intrinsic, avoid munging types. We need types for
	  // statepoint reconstruction in SelectionDAG.
	  // TODO: This is probably something which should be expanded to all
	  // intrinsics since the entire point of intrinsics is that
	  // they are understandable by the optimizer.
	  if (isStatepoint(CS) || isGCRelocate(CS) || isGCResult(CS))
	    return false;

	  // The size of ByVal or InAlloca arguments is derived from the type, so we
	  // can't change to a type with a different size. If the size were
	  // passed explicitly we could avoid this check.
	  if (!CS.isByValOrInAllocaArgument(ix))
	    return true;

	  Type *SrcTy =
	      cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
	  Type *DstTy = cast<PointerType>(CI->getType())->getElementType();
	  if (!SrcTy->isSized() || !DstTy->isSized())
	    return false;

	  // Both pointee types are sized: the cast can go only if they occupy the
	  // same amount of storage.
	  return DL.getTypeAllocSize(SrcTy) == DL.getTypeAllocSize(DstTy);
	}

	/// Try to simplify a direct call with LibCallSimplifier.
	///
	/// \param CI the call to simplify; indirect calls (no called function) are
	///           left alone.
	/// \returns nullptr if nothing was simplified. Otherwise returns \p CI itself
	///          when it has no uses, or the result of replacing its uses with
	///          the simplified value.
	Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
	  if (!CI->getCalledFunction()) return nullptr;

	  // Route replacements made inside the simplifier through InstCombine's own
	  // replaceInstUsesWith.
	  auto InstCombineRAUW = [this](Instruction *From, Value *With) {
	    replaceInstUsesWith(*From, With);
	  };
	  LibCallSimplifier Simplifier(DL, &TLI, ORE, InstCombineRAUW);
	  if (Value *With = Simplifier.optimizeCall(CI)) {
	    ++NumSimplified;
	    return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
	  }

	  return nullptr;
	}

	/// Find the unique llvm.init.trampoline call that initializes \p TrampMem,
	/// when \p TrampMem is an alloca (possibly behind pointer casts) used only
	/// by trampoline intrinsics.
	///
	/// \returns the init.trampoline call, or nullptr if the memory is not an
	///          alloca, has any other kind of user, is initialized more than
	///          once or not at all, or is not the first operand of the
	///          init.trampoline call.
	static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) {
	  // Strip off at most one level of pointer casts, looking for an alloca. This
	  // is good enough in practice and simpler than handling any number of casts.
	  Value *Underlying = TrampMem->stripPointerCasts();
	  if (Underlying != TrampMem &&
	      (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem))
	    return nullptr;
	  if (!isa<AllocaInst>(Underlying))
	    return nullptr;

	  IntrinsicInst *InitTrampoline = nullptr;
	  for (User *U : TrampMem->users()) {
	    IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
	    if (!II)
	      return nullptr;
	    if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
	      if (InitTrampoline)
	        // More than one init_trampoline writes to this value. Give up.
	        return nullptr;
	      InitTrampoline = II;
	      continue;
	    }
	    if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
	      // Allow any number of calls to adjust.trampoline.
	      continue;
	    return nullptr;
	  }

	  // No call to init.trampoline found.
	  if (!InitTrampoline)
	    return nullptr;

	  // Check that the alloca is being used in the expected way.
	  if (InitTrampoline->getOperand(0) != TrampMem)
	    return nullptr;

	  return InitTrampoline;
	}

	/// Walk backwards from \p AdjustTramp within its basic block, looking for an
	/// llvm.init.trampoline call whose first operand is \p TrampMem.
	///
	/// \returns that call, or nullptr if an instruction that may write to memory
	///          is met first or the start of the block is reached.
	static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
	                                               Value *TrampMem) {
	  // Visit all the previous instructions in the basic block, and try to find a
	  // init.trampoline which has a direct path to the adjust.trampoline.
	  for (BasicBlock::iterator I = AdjustTramp->getIterator(),
	                            E = AdjustTramp->getParent()->begin();
	       I != E;) {
	    // Step back first: the iterator starts on AdjustTramp itself.
	    Instruction *Inst = &*--I;
	    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
	      if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
	          II->getOperand(0) == TrampMem)
	        return II;
	    if (Inst->mayWriteToMemory())
	      return nullptr;
	  }
	  return nullptr;
	}

	// Given a call to llvm.adjust.trampoline, find and return the corresponding
	// call to llvm.init.trampoline if the call to the trampoline can be optimized
	// to a direct call to a function. Otherwise return NULL.
	//
	// Callee is looked through pointer casts first. The alloca-based search is
	// tried before the backwards scan of the basic block.
	static IntrinsicInst *findInitTrampoline(Value *Callee) {
	  Callee = Callee->stripPointerCasts();
	  IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
	  if (!AdjustTramp ||
	      AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
	    return nullptr;

	  Value *TrampMem = AdjustTramp->getOperand(0);

	  if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
	    return IT;
	  if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
	    return IT;
	  return nullptr;
	}

	/// Improvements for call and invoke instructions.
	///
	/// Allocation-like calls are forwarded to visitAllocSite. Otherwise this
	/// adds nonnull to pointer arguments known to be non-zero, tries to fold a
	/// constant-expression cast of the callee, drops a stale convergent
	/// attribute, handles calls that must be unreachable (mismatched calling
	/// convention, null or undef callee), turns trampoline calls into direct
	/// calls, strips lossless casts from variadic arguments, marks inline asm
	/// nounwind, and finally tries library-call simplification.
	///
	/// \returns the instruction InstCombine should process next, or nullptr;
	///          when only in-place changes were made this is the call itself.
	Instruction *InstCombiner::visitCallSite(CallSite CS) {
	  if (isAllocLikeFn(CS.getInstruction(), &TLI))
	    return visitAllocSite(*CS.getInstruction());

	  bool Changed = false;

	  // Mark any parameters that are known to be non-null with the nonnull
	  // attribute. This is helpful for inlining calls to functions with null
	  // checks on their arguments.
	  SmallVector<unsigned, 4> ArgNos;
	  unsigned ArgNo = 0;

	  for (Value *V : CS.args()) {
	    if (V->getType()->isPointerTy() &&
	        !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
	        isKnownNonZero(V, DL, 0, &AC, CS.getInstruction(), &DT))
	      ArgNos.push_back(ArgNo);
	    ArgNo++;
	  }

	  assert(ArgNo == CS.arg_size() && "sanity check");

	  if (!ArgNos.empty()) {
	    AttributeList AS = CS.getAttributes();
	    LLVMContext &Ctx = CS.getInstruction()->getContext();
	    AS = AS.addParamAttribute(Ctx, ArgNos,
	                              Attribute::get(Ctx, Attribute::NonNull));
	    CS.setAttributes(AS);
	    Changed = true;
	  }

	  // If the callee is a pointer to a function, attempt to move any casts to the
	  // arguments of the call/invoke.
	  Value *Callee = CS.getCalledValue();
	  if (!isa<Function>(Callee) && transformConstExprCastCall(CS))
	    return nullptr;

	  if (Function *CalleeF = dyn_cast<Function>(Callee)) {
	    // Remove the convergent attr on calls when the callee is not convergent.
	    if (CS.isConvergent() && !CalleeF->isConvergent() &&
	        !CalleeF->isIntrinsic()) {
	      // Stream the instruction itself rather than its address so the debug
	      // output is readable.
	      DEBUG(dbgs() << "Removing convergent attr from instr "
	                   << *CS.getInstruction() << "\n");
	      CS.setNotConvergent();
	      return CS.getInstruction();
	    }

	    // If the call and callee calling conventions don't match, this call must
	    // be unreachable, as the call is undefined.
	    if (CalleeF->getCallingConv() != CS.getCallingConv() &&
	        // Only do this for calls to a function with a body. A prototype may
	        // not actually end up matching the implementation's calling conv for a
	        // variety of reasons (e.g. it may be written in assembly).
	        !CalleeF->isDeclaration()) {
	      Instruction *OldCall = CS.getInstruction();
	      new StoreInst(ConstantInt::getTrue(Callee->getContext()),
	                    UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
	                    OldCall);
	      // If OldCall does not return void then replaceAllUsesWith undef.
	      // This allows ValueHandlers and custom metadata to adjust itself.
	      if (!OldCall->getType()->isVoidTy())
	        replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
	      if (isa<CallInst>(OldCall))
	        return eraseInstFromFunction(*OldCall);

	      // We cannot remove an invoke, because it would change the CFG, just
	      // change the callee to a null pointer.
	      cast<InvokeInst>(OldCall)->setCalledFunction(
	          Constant::getNullValue(CalleeF->getType()));
	      return nullptr;
	    }
	  }

	  if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
	    // If CS does not return void then replaceAllUsesWith undef.
	    // This allows ValueHandlers and custom metadata to adjust itself.
	    if (!CS.getInstruction()->getType()->isVoidTy())
	      replaceInstUsesWith(*CS.getInstruction(),
	                          UndefValue::get(CS.getInstruction()->getType()));

	    if (isa<InvokeInst>(CS.getInstruction())) {
	      // Can't remove an invoke because we cannot change the CFG.
	      return nullptr;
	    }

	    // This instruction is not reachable, just remove it. We insert a store to
	    // undef so that we know that this code is not reachable, despite the fact
	    // that we can't modify the CFG here.
	    new StoreInst(ConstantInt::getTrue(Callee->getContext()),
	                  UndefValue::get(Type::getInt1PtrTy(Callee->getContext())),
	                  CS.getInstruction());

	    return eraseInstFromFunction(*CS.getInstruction());
	  }

	  if (IntrinsicInst *II = findInitTrampoline(Callee))
	    return transformCallThroughTrampoline(CS, II);

	  PointerType *PTy = cast<PointerType>(Callee->getType());
	  FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
	  if (FTy->isVarArg()) {
	    int ix = FTy->getNumParams();
	    // See if we can optimize any arguments passed through the varargs area of
	    // the call.
	    for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(),
	                                E = CS.arg_end(); I != E; ++I, ++ix) {
	      CastInst *CI = dyn_cast<CastInst>(*I);
	      if (CI && isSafeToEliminateVarargsCast(CS, DL, CI, ix)) {
	        *I = CI->getOperand(0);
	        Changed = true;
	      }
	    }
	  }

	  if (isa<InlineAsm>(Callee) && !CS.doesNotThrow()) {
	    // Inline asm calls cannot throw - mark them 'nounwind'.
	    CS.setDoesNotThrow();
	    Changed = true;
	  }

	  // Try to optimize the call if possible, we require DataLayout for most of
	  // this. None of these calls are seen as possibly dead so go ahead and
	  // delete the instruction now.
	  if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
	    Instruction *I = tryOptimizeCall(CI);
	    // If we changed something return the result, etc. Otherwise let
	    // the fallthrough check.
	    if (I) return eraseInstFromFunction(*I);
	  }

	  return Changed ? CS.getInstruction() : nullptr;
	}

	/// If the callee is a constexpr cast of a function, attempt to move the cast to
	/// the arguments of the call/invoke.
	///
	/// On success a new direct call/invoke to the underlying function is built
	/// with bit-or-pointer casts on arguments and on the result as needed, the
	/// uses of the old instruction are replaced, and the old instruction is
	/// erased.
	///
	/// \returns true if the call site was rewritten (the original instruction is
	///          gone), false if the transformation is not safe and nothing was
	///          changed.
	bool InstCombiner::transformConstExprCastCall(CallSite CS) {
	  auto *Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
	  if (!Callee)
	    return false;

	  // The prototype of a thunk is a lie. Don't directly call such a function.
	  if (Callee->hasFnAttribute("thunk"))
	    return false;

	  Instruction *Caller = CS.getInstruction();
	  const AttributeList &CallerPAL = CS.getAttributes();

	  // Okay, this is a cast from a function to a different type. Unless doing so
	  // would cause a type conversion of one of our arguments, change this call to
	  // be a direct call with arguments casted to the appropriate types.
	  FunctionType *FT = Callee->getFunctionType();
	  Type *OldRetTy = Caller->getType();
	  Type *NewRetTy = FT->getReturnType();

	  // Check to see if we are changing the return type...
	  if (OldRetTy != NewRetTy) {

	    if (NewRetTy->isStructTy())
	      return false; // TODO: Handle multiple return values.

	    if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) {
	      if (Callee->isDeclaration())
	        return false;   // Cannot transform this return value.

	      if (!Caller->use_empty() &&
	          // void -> non-void is handled specially
	          !NewRetTy->isVoidTy())
	        return false;   // Cannot transform this return value.
	    }

	    if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
	      AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
	      if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
	        return false;   // Attribute not compatible with transformed value.
	    }

	    // If the callsite is an invoke instruction, and the return value is used by
	    // a PHI node in a successor, we cannot change the return type of the call
	    // because there is no place to put the cast instruction (without breaking
	    // the critical edge). Bail out in this case.
	    if (!Caller->use_empty())
	      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
	        for (User *U : II->users())
	          if (PHINode *PN = dyn_cast<PHINode>(U))
	            if (PN->getParent() == II->getNormalDest() ||
	                PN->getParent() == II->getUnwindDest())
	              return false;
	  }

	  unsigned NumActualArgs = CS.arg_size();
	  unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);

	  // Prevent us turning:
	  // declare void @takes_i32_inalloca(i32* inalloca)
	  //  call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0)
	  //
	  // into:
	  //  call void @takes_i32_inalloca(i32* null)
	  //
	  //  Similarly, avoid folding away bitcasts of byval calls.
	  if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
	      Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
	    return false;

	  CallSite::arg_iterator AI = CS.arg_begin();
	  for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
	    Type *ParamTy = FT->getParamType(i);
	    Type *ActTy = (*AI)->getType();

	    if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
	      return false;   // Cannot transform this parameter value.

	    if (AttrBuilder(CallerPAL.getParamAttributes(i))
	            .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
	      return false;   // Attribute not compatible with transformed value.

	    if (CS.isInAllocaArgument(i))
	      return false;   // Cannot transform to and from inalloca.

	    // If the parameter is passed as a byval argument, then we have to have a
	    // sized type and the sized type has to have the same size as the old type.
	    if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
	      PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
	      if (!ParamPTy || !ParamPTy->getElementType()->isSized())
	        return false;

	      Type *CurElTy = ActTy->getPointerElementType();
	      if (DL.getTypeAllocSize(CurElTy) !=
	          DL.getTypeAllocSize(ParamPTy->getElementType()))
	        return false;
	    }
	  }

	  if (Callee->isDeclaration()) {
	    // Do not delete arguments unless we have a function body.
	    if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
	      return false;

	    // If the callee is just a declaration, don't change the varargsness of the
	    // call.  We don't want to introduce a varargs call where one doesn't
	    // already exist.
	    PointerType *APTy = cast<PointerType>(CS.getCalledValue()->getType());
	    if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
	      return false;

	    // If both the callee and the cast type are varargs, we still have to make
	    // sure the number of fixed parameters are the same or we have the same
	    // ABI issues as if we introduce a varargs call.
	    if (FT->isVarArg() &&
	        cast<FunctionType>(APTy->getElementType())->isVarArg() &&
	        FT->getNumParams() !=
	        cast<FunctionType>(APTy->getElementType())->getNumParams())
	      return false;
	  }

	  if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
	      !CallerPAL.isEmpty()) {
	    // In this case we have more arguments than the new function type, but we
	    // won't be dropping them.  Check that these extra arguments have attributes
	    // that are compatible with being a vararg call argument.
	    unsigned SRetIdx;
	    if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
	        SRetIdx > FT->getNumParams())
	      return false;
	  }

	  // Okay, we decided that this is a safe thing to do: go ahead and start
	  // inserting cast instructions as necessary.
	  SmallVector<Value *, 8> Args;
	  SmallVector<AttributeSet, 8> ArgAttrs;
	  Args.reserve(NumActualArgs);
	  ArgAttrs.reserve(NumActualArgs);

	  // Get any return attributes.
	  AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);

	  // If the return value is not being used, the type may not be compatible
	  // with the existing attributes.  Wipe out any problematic attributes.
	  RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));

	  AI = CS.arg_begin();
	  for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
	    Type *ParamTy = FT->getParamType(i);

	    Value *NewArg = *AI;
	    if ((*AI)->getType() != ParamTy)
	      NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
	    Args.push_back(NewArg);

	    // Add any parameter attributes.
	    ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
	  }

	  // If the function takes more arguments than the call was taking, add them
	  // now.
	  for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
	    Args.push_back(Constant::getNullValue(FT->getParamType(i)));
	    ArgAttrs.push_back(AttributeSet());
	  }

	  // If we are removing arguments to the function, emit an obnoxious warning.
	  if (FT->getNumParams() < NumActualArgs) {
	    // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
	    if (FT->isVarArg()) {
	      // Add all of the arguments in their promoted form to the arg list.
	      for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
	        Type *PTy = getPromotedType((*AI)->getType());
	        Value *NewArg = *AI;
	        if (PTy != (*AI)->getType()) {
	          // Must promote to pass through va_arg area!
	          Instruction::CastOps opcode =
	            CastInst::getCastOpcode(*AI, false, PTy, false);
	          NewArg = Builder.CreateCast(opcode, *AI, PTy);
	        }
	        Args.push_back(NewArg);

	        // Add any parameter attributes.
	        ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
	      }
	    }
	  }

	  AttributeSet FnAttrs = CallerPAL.getFnAttributes();

	  if (NewRetTy->isVoidTy())
	    Caller->setName("");   // Void type should not have a name.

	  assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
	         "missing argument attributes");
	  LLVMContext &Ctx = Callee->getContext();
	  AttributeList NewCallerPAL = AttributeList::get(
	      Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);

	  SmallVector<OperandBundleDef, 1> OpBundles;
	  CS.getOperandBundlesAsDefs(OpBundles);

	  CallSite NewCS;
	  if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
	    NewCS = Builder.CreateInvoke(Callee, II->getNormalDest(),
	                                 II->getUnwindDest(), Args, OpBundles);
	  } else {
	    NewCS = Builder.CreateCall(Callee, Args, OpBundles);
	    cast<CallInst>(NewCS.getInstruction())
	        ->setTailCallKind(cast<CallInst>(Caller)->getTailCallKind());
	  }
	  NewCS->takeName(Caller);
	  NewCS.setCallingConv(CS.getCallingConv());
	  NewCS.setAttributes(NewCallerPAL);

	  // Preserve the weight metadata for the new call instruction. The metadata
	  // is used by SamplePGO to check callsite's hotness.
	  uint64_t W;
	  if (Caller->extractProfTotalWeight(W))
	    NewCS->setProfWeight(W);

	  // Insert a cast of the return type as necessary.
	  Instruction *NC = NewCS.getInstruction();
	  Value *NV = NC;
	  if (OldRetTy != NV->getType() && !Caller->use_empty()) {
	    if (!NV->getType()->isVoidTy()) {
	      NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
	      NC->setDebugLoc(Caller->getDebugLoc());

	      // If this is an invoke instruction, we should insert it after the first
	      // non-phi, instruction in the normal successor block.
	      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
	        BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
	        InsertNewInstBefore(NC, *I);
	      } else {
	        // Otherwise, it's a call, just insert cast right after the call.
	        InsertNewInstBefore(NC, *Caller);
	      }
	      Worklist.AddUsersToWorkList(*Caller);
	    } else {
	      NV = UndefValue::get(Caller->getType());
	    }
	  }

	  if (!Caller->use_empty())
	    replaceInstUsesWith(*Caller, NV);
	  else if (Caller->hasValueHandle()) {
	    if (OldRetTy == NV->getType())
	      ValueHandleBase::ValueIsRAUWd(Caller, NV);
	    else
	      // We cannot call ValueIsRAUWd with a different type, and the
	      // actual tracked value will disappear.
	      ValueHandleBase::ValueIsDeleted(Caller);
	  }

	  eraseInstFromFunction(*Caller);
	  return true;
	}

	/// Turn a call to a function created by init_trampoline / adjust_trampoline
	/// intrinsic pair into a direct call to the underlying function.
	///
	/// \param CS    the call or invoke whose callee comes from the trampoline.
	/// \param Tramp the matching init_trampoline call; its operand 1 is the
	///              nested function and its operand 2 the chain value.
	/// \returns nullptr if the call already carries a 'nest' attribute. If the
	///          nested function has a 'nest' parameter, returns a newly created
	///          call/invoke (created without an insertion point) with the chain
	///          value spliced in. Otherwise retargets \p CS at the nested
	///          function and returns the original instruction.
	Instruction *
	InstCombiner::transformCallThroughTrampoline(CallSite CS,
	IntrinsicInst *Tramp) {
	Value *Callee = CS.getCalledValue();
	PointerType *PTy = cast<PointerType>(Callee->getType());
	FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
	AttributeList Attrs = CS.getAttributes();

	// If the call already has the 'nest' attribute somewhere then give up -
	// otherwise 'nest' would occur twice after splicing in the chain.
	if (Attrs.hasAttrSomewhere(Attribute::Nest))
	return nullptr;

	assert(Tramp &&
	"transformCallThroughTrampoline called with incorrect CallSite.");

	Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts());
	FunctionType *NestFTy = cast<FunctionType>(NestF->getValueType());

	AttributeList NestAttrs = NestF->getAttributes();
	if (!NestAttrs.isEmpty()) {
	unsigned NestArgNo = 0;
	Type *NestTy = nullptr;
	AttributeSet NestAttr;

	// Look for a parameter marked with the 'nest' attribute.
	// On a match NestArgNo is left at that parameter's index.
	for (FunctionType::param_iterator I = NestFTy->param_begin(),
	E = NestFTy->param_end();
	I != E; ++NestArgNo, ++I) {
	AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
	if (AS.hasAttribute(Attribute::Nest)) {
	// Record the parameter type and any other attributes.
	NestTy = *I;
	NestAttr = AS;
	break;
	}
	}

	if (NestTy) {
	Instruction *Caller = CS.getInstruction();
	std::vector<Value*> NewArgs;
	std::vector<AttributeSet> NewArgAttrs;
	NewArgs.reserve(CS.arg_size() + 1);
	NewArgAttrs.reserve(CS.arg_size());

	// Insert the nest argument into the call argument list, which may
	// mean appending it. Likewise for attributes.

	{
	unsigned ArgNo = 0;
	CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
	// The position check comes before the end check so that the chain can
	// also land directly after the last original argument.
	do {
	if (ArgNo == NestArgNo) {
	// Add the chain argument and attributes.
	Value *NestVal = Tramp->getArgOperand(2);
	if (NestVal->getType() != NestTy)
	NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
	NewArgs.push_back(NestVal);
	NewArgAttrs.push_back(NestAttr);
	}

	if (I == E)
	break;

	// Add the original argument and attributes.
	NewArgs.push_back(*I);
	NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));

	++ArgNo;
	++I;
	} while (true);
	}

	// The trampoline may have been bitcast to a bogus type (FTy).
	// Handle this by synthesizing a new function type, equal to FTy
	// with the chain parameter inserted.

	std::vector<Type*> NewTypes;
	NewTypes.reserve(FTy->getNumParams()+1);

	// Insert the chain's type into the list of parameter types, which may
	// mean appending it.
	{
	unsigned ArgNo = 0;
	FunctionType::param_iterator I = FTy->param_begin(),
	E = FTy->param_end();

	// Same loop shape as the argument splice above, applied to types.
	do {
	if (ArgNo == NestArgNo)
	// Add the chain's type.
	NewTypes.push_back(NestTy);

	if (I == E)
	break;

	// Add the original type.
	NewTypes.push_back(*I);

	++ArgNo;
	++I;
	} while (true);
	}

	// Replace the trampoline call with a direct call. Let the generic
	// code sort out any function type mismatches.
	FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
	FTy->isVarArg());
	Constant *NewCallee =
	NestF->getType() == PointerType::getUnqual(NewFTy) ?
	NestF : ConstantExpr::getBitCast(NestF,
	PointerType::getUnqual(NewFTy));
	AttributeList NewPAL =
	AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
	Attrs.getRetAttributes(), NewArgAttrs);

	SmallVector<OperandBundleDef, 1> OpBundles;
	CS.getOperandBundlesAsDefs(OpBundles);

	// Build the replacement and copy over calling convention, attributes,
	// tail-call kind (calls only) and debug location from the original.
	Instruction *NewCaller;
	if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
	NewCaller = InvokeInst::Create(NewCallee,
	II->getNormalDest(), II->getUnwindDest(),
	NewArgs, OpBundles);
	cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
	cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
	} else {
	NewCaller = CallInst::Create(NewCallee, NewArgs, OpBundles);
	cast<CallInst>(NewCaller)->setTailCallKind(
	cast<CallInst>(Caller)->getTailCallKind());
	cast<CallInst>(NewCaller)->setCallingConv(
	cast<CallInst>(Caller)->getCallingConv());
	cast<CallInst>(NewCaller)->setAttributes(NewPAL);
	}
	NewCaller->setDebugLoc(Caller->getDebugLoc());

	return NewCaller;
	}
	}

	// Replace the trampoline call with a direct call. Since there is no 'nest'
	// parameter, there is no need to adjust the argument list. Let the generic
	// code sort out any function type mismatches.
	Constant *NewCallee =
	NestF->getType() == PTy ? NestF :
	ConstantExpr::getBitCast(NestF, PTy);
	CS.setCalledFunction(NewCallee);
	return CS.getInstruction();
	}
	Index: head/contrib/llvm/tools/clang/include/clang/AST/DeclBase.h
	===================================================================
	--- head/contrib/llvm/tools/clang/include/clang/AST/DeclBase.h (revision 329409)
	+++ head/contrib/llvm/tools/clang/include/clang/AST/DeclBase.h (revision 329410)
	@@ -1,2056 +1,2060 @@
	//===- DeclBase.h - Base Classes for representing declarations --- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the Decl and DeclContext interfaces.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CLANG_AST_DECLBASE_H
	#define LLVM_CLANG_AST_DECLBASE_H

	#include "clang/AST/AttrIterator.h"
	#include "clang/AST/DeclarationName.h"
	#include "clang/Basic/LLVM.h"
	#include "clang/Basic/SourceLocation.h"
	#include "clang/Basic/Specifiers.h"
	#include "clang/Basic/VersionTuple.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/PointerIntPair.h"
	#include "llvm/ADT/PointerUnion.h"
	#include "llvm/ADT/iterator.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/PrettyStackTrace.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <iterator>
	#include <string>
	#include <type_traits>
	#include <utility>

	namespace clang {

	class ASTContext;
	class ASTMutationListener;
	class Attr;
	class DeclContext;
	class ExternalSourceSymbolAttr;
	class FunctionDecl;
	class FunctionType;
	class IdentifierInfo;
	enum Linkage : unsigned char;
	class LinkageSpecDecl;
	class Module;
	class NamedDecl;
	class ObjCCategoryDecl;
	class ObjCCategoryImplDecl;
	class ObjCContainerDecl;
	class ObjCImplDecl;
	class ObjCImplementationDecl;
	class ObjCInterfaceDecl;
	class ObjCMethodDecl;
	class ObjCProtocolDecl;
	struct PrintingPolicy;
	class RecordDecl;
	class SourceManager;
	class Stmt;
	class StoredDeclsMap;
	class TemplateDecl;
	class TranslationUnitDecl;
	class UsingDirectiveDecl;

	/// \brief Outcome of checking the availability of a declaration.
	///
	/// The enumerators are numbered consecutively, starting at zero.
	enum AvailabilityResult {
	  AR_Available        = 0,
	  AR_NotYetIntroduced = 1,
	  AR_Deprecated       = 2,
	  AR_Unavailable      = 3
	};

	/// Decl - This represents one declaration (or definition), e.g. a variable,
	/// typedef, function, struct, etc.
	///
	/// Note: There are objects tacked on before the beginning of Decl
	/// (and its subclasses) in its Decl::operator new(). Proper alignment
	/// of all subclasses (not requiring more than the alignment of Decl) is
	/// asserted in DeclBase.cpp.
	class LLVM_ALIGNAS(/*alignof(uint64_t)*/ 8) Decl {
	public:
	/// \brief Lists the kind of concrete classes of Decl.
	///
	/// The enumerators are not written out here; they are produced by expanding
	/// the macros below over the contents of clang/AST/DeclNodes.inc.
	enum Kind {
	// Each DECL entry contributes one enumerator named DERIVED.
	#define DECL(DERIVED, BASE) DERIVED,
	// ABSTRACT_DECL expands to nothing, so its argument adds no enumerator.
	#define ABSTRACT_DECL(DECL)
	// Each range entry adds first<BASE>/last<BASE> enumerators equal to START/END.
	#define DECL_RANGE(BASE, START, END) \
	first##BASE = START, last##BASE = END,
	// Same as DECL_RANGE but without the trailing comma (presumably so the
	// enumerator list does not end with a comma -- confirm against DeclNodes.inc).
	#define LAST_DECL_RANGE(BASE, START, END) \
	first##BASE = START, last##BASE = END
	#include "clang/AST/DeclNodes.inc"
	};

	/// \brief A placeholder type used to construct an empty shell of a
	/// decl-derived type that will be filled in later (e.g., by some
	/// deserialization method).
	struct EmptyShell {};

	/// IdentifierNamespace - The different namespaces in which
	/// declarations may appear. According to C99 6.2.3, there are
	/// four namespaces: labels, tags, members and ordinary
	/// identifiers. C++ describes lookup completely differently:
	/// certain lookups merely "ignore" certain kinds of declarations,
	/// usually based on whether the declaration is of a type, etc.
	///
	/// Every enumerator occupies its own bit so the values can be combined
	/// as bitmasks; this lets searches in C++ look into the "tag" namespace
	/// during ordinary lookup.
	///
	/// The highest flag below is bit 12, so 13 bits are needed to store any
	/// combination of these values.
	enum IdentifierNamespace {
	  /// Labels, declared with 'x:' and referenced with 'goto x'.
	  IDNS_Label = 1 << 0,

	  /// Tags, declared with 'struct foo;' and referenced with
	  /// 'struct foo'. All tags are also types. This is what
	  /// elaborated-type-specifiers look for in C.
	  /// This also contains names that conflict with tags in the
	  /// same scope but that are otherwise ordinary names (non-type
	  /// template parameters and indirect field declarations).
	  IDNS_Tag = 1 << 1,

	  /// Types, declared with 'struct foo', typedefs, etc.
	  /// This is what elaborated-type-specifiers look for in C++,
	  /// but note that it's ill-formed to find a non-tag.
	  IDNS_Type = 1 << 2,

	  /// Members, declared with object declarations within tag
	  /// definitions. In C, these can only be found by "qualified"
	  /// lookup in member expressions. In C++, they're found by
	  /// normal lookup.
	  IDNS_Member = 1 << 3,

	  /// Namespaces, declared with 'namespace foo {}'.
	  /// Lookup for nested-name-specifiers find these.
	  IDNS_Namespace = 1 << 4,

	  /// Ordinary names. In C, everything that's not a label, tag,
	  /// member, or function-local extern ends up here.
	  IDNS_Ordinary = 1 << 5,

	  /// Objective C \@protocol.
	  IDNS_ObjCProtocol = 1 << 6,

	  /// This declaration is a friend function. A friend function
	  /// declaration is always in this namespace but may also be in
	  /// IDNS_Ordinary if it was previously declared.
	  IDNS_OrdinaryFriend = 1 << 7,

	  /// This declaration is a friend class. A friend class
	  /// declaration is always in this namespace but may also be in
	  /// IDNS_Tag|IDNS_Type if it was previously declared.
	  IDNS_TagFriend = 1 << 8,

	  /// This declaration is a using declaration. A using declaration
	  /// introduces a number of other declarations into the current
	  /// scope, and those declarations use the IDNS of their targets,
	  /// but the actual using declarations go in this namespace.
	  IDNS_Using = 1 << 9,

	  /// This declaration is a C++ operator declared in a non-class
	  /// context. All such operators are also in IDNS_Ordinary.
	  /// C++ lexical operator lookup looks for these.
	  IDNS_NonMemberOperator = 1 << 10,

	  /// This declaration is a function-local extern declaration of a
	  /// variable or function. This may also be IDNS_Ordinary if it
	  /// has been declared outside any function. These act mostly like
	  /// invisible friend declarations, but are also visible to unqualified
	  /// lookup within the scope of the declaring function.
	  IDNS_LocalExtern = 1 << 11,

	  /// This declaration is an OpenMP user defined reduction construction.
	  IDNS_OMPReduction = 1 << 12
	};

	/// ObjCDeclQualifier - 'Qualifiers' written next to the return and
	/// parameter types in method declarations. Other than remembering
	/// them and mangling them into the method's signature string, these
	/// are ignored by the compiler; they are consumed by certain
	/// remote-messaging frameworks.
	///
	/// in, inout, and out are mutually exclusive and apply only to
	/// method parameters. bycopy and byref are mutually exclusive and
	/// apply only to method parameters (?). oneway applies only to
	/// results. All of these expect their corresponding parameter to
	/// have a particular type. None of this is currently enforced by
	/// clang.
	///
	/// Each qualifier other than OBJC_TQ_None is a distinct single bit.
	///
	/// This should be kept in sync with ObjCDeclSpec::ObjCDeclQualifier.
	enum ObjCDeclQualifier {
	  OBJC_TQ_None   = 0,
	  OBJC_TQ_In     = 1 << 0,
	  OBJC_TQ_Inout  = 1 << 1,
	  OBJC_TQ_Out    = 1 << 2,
	  OBJC_TQ_Bycopy = 1 << 3,
	  OBJC_TQ_Byref  = 1 << 4,
	  OBJC_TQ_Oneway = 1 << 5,

	  /// The nullability qualifier is set when the nullability of the
	  /// result or parameter was expressed via a context-sensitive
	  /// keyword.
	  OBJC_TQ_CSNullability = 1 << 6
	};

	/// The kind of ownership a declaration has, for visibility purposes.
	/// The enumerators are ordered so that a larger value means a higher
	/// level of name hiding.
	enum class ModuleOwnershipKind : unsigned {
	  /// This declaration is not owned by a module.
	  Unowned = 0,

	  /// This declaration has an owning module, but is globally visible
	  /// (typically because its owning module is visible and we know that
	  /// modules cannot later become hidden in this compilation).
	  /// After serialization and deserialization, this will be converted
	  /// to VisibleWhenImported.
	  Visible = 1,

	  /// This declaration has an owning module, and is visible when that
	  /// module is imported.
	  VisibleWhenImported = 2,

	  /// This declaration has an owning module, but is only visible to
	  /// lookups that occur within that module.
	  ModulePrivate = 3
	};

	protected:
	/// \brief The next declaration within the same lexical
	/// DeclContext. These pointers form the linked list that is
	/// traversed via DeclContext's decls_begin()/decls_end().
	///
	/// The extra two bits are used for the ModuleOwnershipKind.
	llvm::PointerIntPair<Decl *, 2, ModuleOwnershipKind> NextInContextAndBits;

	private:
	friend class DeclContext;

	struct MultipleDC {
	DeclContext *SemanticDC;
	DeclContext *LexicalDC;
	};

	/// DeclCtx - Holds either a DeclContext* or a MultipleDC*.
	/// For declarations that don't contain C++ scope specifiers, it contains
	/// the DeclContext where the Decl was declared.
	/// For declarations with C++ scope specifiers, it contains a MultipleDC*
	/// with the context where it semantically belongs (SemanticDC) and the
	/// context where it was lexically declared (LexicalDC).
	/// e.g.:
	///
	///   namespace A {
	///     void f(); // SemanticDC == LexicalDC == 'namespace A'
	///   }
	///   void A::f(); // SemanticDC == namespace 'A'
	///                // LexicalDC == global namespace
	///
	/// The union members are pointer types; the accessors below query it with
	/// is<DeclContext*>() / is<MultipleDC*>() and get<...>() on those same types.
	llvm::PointerUnion<DeclContext*, MultipleDC*> DeclCtx;

	bool isInSemaDC() const { return DeclCtx.is<DeclContext*>(); }
	bool isOutOfSemaDC() const { return DeclCtx.is<MultipleDC*>(); }

	MultipleDC *getMultipleDC() const {
	return DeclCtx.get<MultipleDC*>();
	}

	DeclContext *getSemanticDC() const {
	return DeclCtx.get<DeclContext*>();
	}

	/// Loc - The location of this decl.
	SourceLocation Loc;

	/// DeclKind - This indicates which class this is.
	unsigned DeclKind : 7;

	/// InvalidDecl - This indicates a semantic error occurred.
	unsigned InvalidDecl : 1;

	/// HasAttrs - This indicates whether the decl has attributes or not.
	unsigned HasAttrs : 1;

	/// Implicit - Whether this declaration was implicitly generated by
	/// the implementation rather than explicitly written by the user.
	unsigned Implicit : 1;

	/// \brief Whether this declaration was "used", meaning that a definition is
	/// required.
	unsigned Used : 1;

	/// \brief Whether this declaration was "referenced".
	/// The difference with 'Used' is whether the reference appears in a
	/// evaluated context or not, e.g. functions used in uninstantiated templates
	/// are regarded as "referenced" but not "used".
	unsigned Referenced : 1;

	/// \brief Whether this declaration is a top-level declaration (function,
	/// global variable, etc.) that is lexically inside an objc container
	/// definition.
	unsigned TopLevelDeclInObjCContainer : 1;

	/// \brief Whether statistic collection is enabled.
	static bool StatisticsEnabled;

	protected:
	friend class ASTDeclReader;
	friend class ASTDeclWriter;
	friend class ASTReader;
	friend class CXXClassMemberWrapper;
	friend class LinkageComputer;
	template<typename decl_type> friend class Redeclarable;

	/// Access - Used by C++ decls for the access specifier.
	// NOTE: VC++ treats enums as signed, avoid using the AccessSpecifier enum
	unsigned Access : 2;

	/// \brief Whether this declaration was loaded from an AST file.
	unsigned FromASTFile : 1;

	/// IdentifierNamespace - This specifies what IDNS_* namespace this lives in.
	unsigned IdentifierNamespace : 13;

	/// \brief If 0, we have not computed the linkage of this declaration.
	/// Otherwise, it is the linkage + 1.
	mutable unsigned CacheValidAndLinkage : 3;

	/// \brief Allocate memory for a deserialized declaration.
	///
	/// This routine must be used to allocate memory for any declaration that is
	/// deserialized from a module file.
	///
	/// \param Size The size of the allocated object.
	/// \param Ctx The context in which we will allocate memory.
	/// \param ID The global ID of the deserialized declaration.
	/// \param Extra The amount of extra space to allocate after the object.
	void *operator new(std::size_t Size, const ASTContext &Ctx, unsigned ID,
	std::size_t Extra = 0);

	/// \brief Allocate memory for a non-deserialized declaration.
	void *operator new(std::size_t Size, const ASTContext &Ctx,
	DeclContext *Parent, std::size_t Extra = 0);

	private:
	bool AccessDeclContextSanity() const;

	/// Get the module ownership kind to use for a local lexical child of \p DC,
	/// which may be either a local or (rarely) an imported declaration.
	///
	/// \param DC The lexical parent context; may be null.
	/// \returns The parent's ownership kind when the parent is owned and is
	/// either not from an AST file or has local owning-module storage;
	/// ModuleOwnershipKind::Unowned in every other case (including null \p DC).
	static ModuleOwnershipKind getModuleOwnershipKindForChildOf(DeclContext *DC) {
	  if (DC) {
	    auto *D = cast<Decl>(DC);
	    auto MOK = D->getModuleOwnershipKind();
	    if (MOK != ModuleOwnershipKind::Unowned &&
	        (!D->isFromASTFile() || D->hasLocalOwningModuleStorage()))
	      return MOK;
	    // If D is not local and we have no local module storage, then we don't
	    // need to track module ownership at all.
	  }
	  return ModuleOwnershipKind::Unowned;
	}

	protected:
	/// Construct a declaration of kind \p DK, located at \p L, whose initial
	/// declaration context is \p DC.
	///
	/// The module-ownership bits are derived from \p DC via
	/// getModuleOwnershipKindForChildOf(). Every flag bit starts out cleared,
	/// the access specifier starts as AS_none, the identifier namespace is
	/// computed from \p DK, and no linkage is cached yet (CacheValidAndLinkage
	/// is 0).
	Decl(Kind DK, DeclContext *DC, SourceLocation L)
	: NextInContextAndBits(nullptr, getModuleOwnershipKindForChildOf(DC)),
	DeclCtx(DC), Loc(L), DeclKind(DK), InvalidDecl(false), HasAttrs(false),
	Implicit(false), Used(false), Referenced(false),
	TopLevelDeclInObjCContainer(false), Access(AS_none), FromASTFile(0),
	IdentifierNamespace(getIdentifierNamespaceForKind(DK)),
	CacheValidAndLinkage(0) {
	// Forward the kind to add() only when statistics collection is enabled.
	if (StatisticsEnabled) add(DK);
	}

	/// Construct an empty shell of kind \p DK (see EmptyShell) that is expected
	/// to be filled in later.
	///
	/// Unlike the main constructor, NextInContextAndBits, DeclCtx and Loc are
	/// not named in the initializer list and are left to their default
	/// construction. The flag bits, access specifier, identifier namespace and
	/// linkage cache are initialized exactly as in the main constructor.
	Decl(Kind DK, EmptyShell Empty)
	: DeclKind(DK), InvalidDecl(false), HasAttrs(false), Implicit(false),
	Used(false), Referenced(false), TopLevelDeclInObjCContainer(false),
	Access(AS_none), FromASTFile(0),
	IdentifierNamespace(getIdentifierNamespaceForKind(DK)),
	CacheValidAndLinkage(0) {
	// Forward the kind to add() only when statistics collection is enabled.
	if (StatisticsEnabled) add(DK);
	}

	virtual ~Decl();

	/// \brief Update a potentially out-of-date declaration.
	void updateOutOfDate(IdentifierInfo &II) const;

	Linkage getCachedLinkage() const {
	return Linkage(CacheValidAndLinkage - 1);
	}

	void setCachedLinkage(Linkage L) const {
	CacheValidAndLinkage = L + 1;
	}

	bool hasCachedLinkage() const {
	return CacheValidAndLinkage;
	}

	public:
	/// \brief Source range that this declaration covers.
	virtual SourceRange getSourceRange() const LLVM_READONLY {
	return SourceRange(getLocation(), getLocation());
	}

	SourceLocation getLocStart() const LLVM_READONLY {
	return getSourceRange().getBegin();
	}

	SourceLocation getLocEnd() const LLVM_READONLY {
	return getSourceRange().getEnd();
	}

	SourceLocation getLocation() const { return Loc; }
	void setLocation(SourceLocation L) { Loc = L; }

	Kind getKind() const { return static_cast<Kind>(DeclKind); }
	const char *getDeclKindName() const;

	Decl *getNextDeclInContext() { return NextInContextAndBits.getPointer(); }
	const Decl *getNextDeclInContext() const {return NextInContextAndBits.getPointer();}

	/// Return the semantic DeclContext of this declaration: the directly
	/// stored context when only one is recorded, otherwise the SemanticDC
	/// member of the MultipleDC record.
	DeclContext *getDeclContext() {
	  return isInSemaDC() ? getSemanticDC() : getMultipleDC()->SemanticDC;
	}
	const DeclContext *getDeclContext() const {
	return const_cast<Decl*>(this)->getDeclContext();
	}

	/// Find the innermost non-closure ancestor of this declaration,
	/// walking up through blocks, lambdas, etc. If that ancestor is
	/// not a code context (!isFunctionOrMethod()), returns null.
	///
	/// A declaration may be its own non-closure context.
	Decl *getNonClosureContext();
	const Decl *getNonClosureContext() const {
	return const_cast<Decl*>(this)->getNonClosureContext();
	}

	TranslationUnitDecl *getTranslationUnitDecl();
	const TranslationUnitDecl *getTranslationUnitDecl() const {
	return const_cast<Decl*>(this)->getTranslationUnitDecl();
	}

	bool isInAnonymousNamespace() const;

	bool isInStdNamespace() const;

	ASTContext &getASTContext() const LLVM_READONLY;

	void setAccess(AccessSpecifier AS) {
	Access = AS;
	assert(AccessDeclContextSanity());
	}

	AccessSpecifier getAccess() const {
	assert(AccessDeclContextSanity());
	return AccessSpecifier(Access);
	}

	/// \brief Retrieve the access specifier for this declaration, even though
	/// it may not yet have been properly set.
	AccessSpecifier getAccessUnsafe() const {
	return AccessSpecifier(Access);
	}

	bool hasAttrs() const { return HasAttrs; }

	void setAttrs(const AttrVec& Attrs) {
	return setAttrsImpl(Attrs, getASTContext());
	}

	AttrVec &getAttrs() {
	return const_cast<AttrVec&>(const_cast<const Decl*>(this)->getAttrs());
	}

	const AttrVec &getAttrs() const;
	void dropAttrs();

	/// Attach attribute \p A to this declaration. The first attribute installs
	/// a fresh one-element vector via setAttrs(); later ones are appended to the
	/// existing vector.
	void addAttr(Attr *A) {
	  if (!hasAttrs()) {
	    setAttrs(AttrVec(1, A));
	    return;
	  }
	  getAttrs().push_back(A);
	}

	using attr_iterator = AttrVec::const_iterator;
	using attr_range = llvm::iterator_range<attr_iterator>;

	attr_range attrs() const {
	return attr_range(attr_begin(), attr_end());
	}

	/// Iterator to the first attribute, or a null iterator when the declaration
	/// has no attributes.
	attr_iterator attr_begin() const {
	  if (!hasAttrs())
	    return nullptr;
	  return getAttrs().begin();
	}
	/// Past-the-end attribute iterator, or a null iterator when the declaration
	/// has no attributes.
	attr_iterator attr_end() const {
	  if (!hasAttrs())
	    return nullptr;
	  return getAttrs().end();
	}

	template <typename T>
	void dropAttr() {
	if (!HasAttrs) return;

	AttrVec &Vec = getAttrs();
	Vec.erase(std::remove_if(Vec.begin(), Vec.end(), isa<T, Attr*>), Vec.end());

	if (Vec.empty())
	HasAttrs = false;
	}

	template <typename T>
	llvm::iterator_range<specific_attr_iterator<T>> specific_attrs() const {
	return llvm::make_range(specific_attr_begin<T>(), specific_attr_end<T>());
	}

	template <typename T>
	specific_attr_iterator<T> specific_attr_begin() const {
	return specific_attr_iterator<T>(attr_begin());
	}

	template <typename T>
	specific_attr_iterator<T> specific_attr_end() const {
	return specific_attr_iterator<T>(attr_end());
	}

	template<typename T> T *getAttr() const {
	return hasAttrs() ? getSpecificAttr<T>(getAttrs()) : nullptr;
	}

	template<typename T> bool hasAttr() const {
	return hasAttrs() && hasSpecificAttr<T>(getAttrs());
	}

	/// getMaxAlignment - return the maximum alignment specified by attributes
	/// on this decl, 0 if there are none.
	unsigned getMaxAlignment() const;

	/// setInvalidDecl - Indicates the Decl had a semantic error. This
	/// allows for graceful error recovery.
	void setInvalidDecl(bool Invalid = true);
	bool isInvalidDecl() const { return (bool) InvalidDecl; }

	/// isImplicit - Indicates whether the declaration was implicitly
	/// generated by the implementation. If false, this declaration
	/// was written explicitly in the source code.
	bool isImplicit() const { return Implicit; }
	void setImplicit(bool I = true) { Implicit = I; }

	/// \brief Whether any (re-)declaration of the entity was used, meaning that
	/// a definition is required.
	///
	/// \param CheckUsedAttr When true, also consider the "used" attribute
	/// (in addition to the "used" bit set by \c setUsed()) when determining
	/// whether the function is used.
	bool isUsed(bool CheckUsedAttr = true) const;

	/// \brief Set whether the declaration is used, in the sense of odr-use.
	///
	/// This should only be used immediately after creating a declaration.
	/// It intentionally doesn't notify any listeners.
	void setIsUsed() { getCanonicalDecl()->Used = true; }

	/// \brief Mark the declaration used, in the sense of odr-use.
	///
	/// This notifies any mutation listeners in addition to setting a bit
	/// indicating the declaration is used.
	void markUsed(ASTContext &C);

	/// \brief Whether any declaration of this entity was referenced.
	bool isReferenced() const;

	/// \brief Whether this declaration was referenced. This should not be relied
	/// upon for anything other than debugging.
	bool isThisDeclarationReferenced() const { return Referenced; }

	void setReferenced(bool R = true) { Referenced = R; }

	/// \brief Whether this declaration is a top-level declaration (function,
	/// global variable, etc.) that is lexically inside an objc container
	/// definition.
	bool isTopLevelDeclInObjCContainer() const {
	return TopLevelDeclInObjCContainer;
	}

	void setTopLevelDeclInObjCContainer(bool V = true) {
	TopLevelDeclInObjCContainer = V;
	}

	/// \brief Looks on this and related declarations for an applicable
	/// external source symbol attribute.
	ExternalSourceSymbolAttr *getExternalSourceSymbolAttr() const;

	/// \brief Whether this declaration was marked as being private to the
	/// module in which it was defined.
	bool isModulePrivate() const {
	return getModuleOwnershipKind() == ModuleOwnershipKind::ModulePrivate;
	}

	/// \brief Whether this declaration is exported (by virtue of being lexically
	/// within an ExportDecl or by being a NamespaceDecl).
	bool isExported() const;

	/// Return true if this declaration has an attribute which acts as
	/// definition of the entity, such as 'alias' or 'ifunc'.
	bool hasDefiningAttr() const;

	/// Return this declaration's defining attribute if it has one.
	const Attr *getDefiningAttr() const;

	protected:
	/// \brief Mark this declaration as private to the module in which it was
	/// defined. Declarations that are not owned by any module are left
	/// unchanged.
	void setModulePrivate() {
	  // The module-private specifier has no effect on unowned declarations.
	  // FIXME: We should track this in some way for source fidelity.
	  if (getModuleOwnershipKind() != ModuleOwnershipKind::Unowned)
	    setModuleOwnershipKind(ModuleOwnershipKind::ModulePrivate);
	}

	/// \brief Set the owning module ID.
	///
	/// The ID is written to the unsigned slot located two `unsigned`s before
	/// this object, the same slot getOwningModuleID() reads. Only valid for
	/// declarations loaded from an AST file (asserted).
	///
	/// \param ID The owning module ID to store.
	void setOwningModuleID(unsigned ID) {
	  assert(isFromASTFile() && "Only works on a deserialized declaration");
	  *((unsigned*)this - 2) = ID;
	}

	public:
	/// \brief Determine the availability of the given declaration.
	///
	/// This routine will determine the most restrictive availability of
	/// the given declaration (e.g., preferring 'unavailable' to
	/// 'deprecated').
	///
	/// \param Message If non-NULL and the result is not \c
	/// AR_Available, will be set to a (possibly empty) message
	/// describing why the declaration has not been introduced, is
	/// deprecated, or is unavailable.
	///
	/// \param EnclosingVersion The version to compare with. If empty, assume the
	/// deployment target version.
	AvailabilityResult
	getAvailability(std::string *Message = nullptr,
	VersionTuple EnclosingVersion = VersionTuple()) const;

	/// \brief Retrieve the version of the target platform in which this
	/// declaration was introduced.
	///
	/// \returns An empty version tuple if this declaration has no 'introduced'
	/// availability attributes, or the version tuple that's specified in the
	/// attribute otherwise.
	VersionTuple getVersionIntroduced() const;

	/// \brief Determine whether this declaration is marked 'deprecated'.
	///
	/// \param Message If non-NULL and the declaration is deprecated,
	/// this will be set to the message describing why the declaration
	/// was deprecated (which may be empty).
	bool isDeprecated(std::string *Message = nullptr) const {
	return getAvailability(Message) == AR_Deprecated;
	}

	/// \brief Determine whether this declaration is marked 'unavailable'.
	///
	/// \param Message If non-NULL and the declaration is unavailable,
	/// this will be set to the message describing why the declaration
	/// was made unavailable (which may be empty).
	bool isUnavailable(std::string *Message = nullptr) const {
	return getAvailability(Message) == AR_Unavailable;
	}

	/// \brief Determine whether this is a weak-imported symbol.
	///
	/// Weak-imported symbols are typically marked with the
	/// 'weak_import' attribute, but may also be marked with an
	/// 'availability' attribute where we're targeting a platform prior to
	/// the introduction of this feature.
	bool isWeakImported() const;

	/// \brief Determines whether this symbol can be weak-imported,
	/// e.g., whether it would be well-formed to add the weak_import
	/// attribute.
	///
	/// \param IsDefinition Set to \c true to indicate that this
	/// declaration cannot be weak-imported because it has a definition.
	bool canBeWeakImported(bool &IsDefinition) const;

	/// \brief Determine whether this declaration came from an AST file (such as
	/// a precompiled header or module) rather than having been parsed.
	bool isFromASTFile() const { return FromASTFile; }

	/// \brief Retrieve the global declaration ID associated with this
	/// declaration, which specifies where this Decl was loaded from.
	///
	/// \returns The value stored in the unsigned slot immediately preceding
	/// this object when the declaration came from an AST file; 0 otherwise.
	unsigned getGlobalID() const {
	  if (isFromASTFile())
	    return *((const unsigned*)this - 1);
	  return 0;
	}

	/// \brief Retrieve the global ID of the module that owns this particular
	/// declaration.
	///
	/// \returns The value stored in the unsigned slot two `unsigned`s before
	/// this object (the slot setOwningModuleID() writes) when the declaration
	/// came from an AST file; 0 otherwise.
	unsigned getOwningModuleID() const {
	  if (isFromASTFile())
	    return *((const unsigned*)this - 2);
	  return 0;
	}

	private:
	Module *getOwningModuleSlow() const;

	protected:
	bool hasLocalOwningModuleStorage() const;

	public:
	/// \brief Get the imported owning module, if this decl is from an imported
	/// (non-local) module.
	///
	/// \returns nullptr when the declaration is not from an AST file or has no
	/// owning module; otherwise the result of getOwningModuleSlow().
	Module *getImportedOwningModule() const {
	  if (!isFromASTFile() || !hasOwningModule())
	    return nullptr;

	  return getOwningModuleSlow();
	}

	/// \brief Get the local owning module, if known. Returns nullptr if owner is
	/// not yet known or declaration is not from a module.
	///
	/// For a local (non-AST-file) declaration that has an owning module, the
	/// module pointer is read from the pointer-sized slot immediately preceding
	/// this object, the same slot setLocalOwningModule() writes.
	Module *getLocalOwningModule() const {
	  if (isFromASTFile() || !hasOwningModule())
	    return nullptr;

	  assert(hasLocalOwningModuleStorage() &&
	         "owned local decl but no local module storage");
	  return reinterpret_cast<Module *const *>(this)[-1];
	}
	void setLocalOwningModule(Module *M) {
	assert(!isFromASTFile() && hasOwningModule() &&
	hasLocalOwningModuleStorage() &&
	"should not have a cached owning module");
	reinterpret_cast<Module **>(this)[-1] = M;
	}

	/// Is this declaration owned by some module?
	bool hasOwningModule() const {
	return getModuleOwnershipKind() != ModuleOwnershipKind::Unowned;
	}

	/// Get the module that owns this declaration (for visibility purposes),
	/// dispatching on whether the declaration was loaded from an AST file.
	Module *getOwningModule() const {
	  if (isFromASTFile())
	    return getImportedOwningModule();
	  return getLocalOwningModule();
	}

	/// Get the module that owns this declaration for linkage purposes.
	/// There only ever is such a module under the C++ Modules TS.
	///
	/// \param IgnoreLinkage Ignore the linkage of the entity; assume that
	/// all declarations in a global module fragment are unowned.
	Module *getOwningModuleForLinkage(bool IgnoreLinkage = false) const;

	/// \brief Determine whether this declaration might be hidden from name
	/// lookup. Note that the declaration might be visible even if this returns
	/// \c false, if the owning module is visible within the query context.
	///
	/// True exactly when the ownership kind ranks above
	/// ModuleOwnershipKind::Visible.
	// FIXME: Rename this to make it clearer what it does.
	bool isHidden() const {
	  const int Current = static_cast<int>(getModuleOwnershipKind());
	  const int Threshold = static_cast<int>(ModuleOwnershipKind::Visible);
	  return Current > Threshold;
	}

	/// Make this declaration globally visible even if it came from a module
	/// that is not visible. Declarations that are not hidden are left untouched.
	void setVisibleDespiteOwningModule() {
	  if (!isHidden())
	    return;
	  setModuleOwnershipKind(ModuleOwnershipKind::Visible);
	}

	/// \brief Get the kind of module ownership for this declaration.
	ModuleOwnershipKind getModuleOwnershipKind() const {
	return NextInContextAndBits.getInt();
	}

	/// \brief Set the kind of module ownership for this declaration, which is
	/// what isHidden() and isModulePrivate() are derived from.
	///
	/// Asserts that a declaration which is currently Unowned, is not from an
	/// AST file, and has no local owning-module storage is not being switched
	/// to an owned kind.
	///
	/// \param MOK The new ownership kind, stored in the integer bits of
	/// NextInContextAndBits.
	void setModuleOwnershipKind(ModuleOwnershipKind MOK) {
	assert(!(getModuleOwnershipKind() == ModuleOwnershipKind::Unowned &&
	MOK != ModuleOwnershipKind::Unowned && !isFromASTFile() &&
	!hasLocalOwningModuleStorage()) &&
	"no storage available for owning module for this declaration");
	NextInContextAndBits.setInt(MOK);
	}

	unsigned getIdentifierNamespace() const {
	return IdentifierNamespace;
	}

	bool isInIdentifierNamespace(unsigned NS) const {
	return getIdentifierNamespace() & NS;
	}

	static unsigned getIdentifierNamespaceForKind(Kind DK);

	bool hasTagIdentifierNamespace() const {
	return isTagIdentifierNamespace(getIdentifierNamespace());
	}

	/// Return true if \p NS is the identifier-namespace mask of a tag
	/// declaration: exactly IDNS_Tag and IDNS_Type, ignoring IDNS_TagFriend.
	static bool isTagIdentifierNamespace(unsigned NS) {
	  // TagDecls have Tag and Type set and may also have TagFriend.
	  return (NS & ~IDNS_TagFriend) == (IDNS_Tag | IDNS_Type);
	}

	/// getLexicalDeclContext - Return the context in which this Decl was
	/// lexically declared (LexicalDC), which may differ from the semantic
	/// context returned by getDeclContext() (SemanticDC).
	/// e.g.:
	///
	///   namespace A {
	///     void f(); // SemanticDC == LexicalDC == 'namespace A'
	///   }
	///   void A::f(); // SemanticDC == namespace 'A'
	///                // LexicalDC == global namespace
	///
	/// When only a single context is stored, that context is returned.
	DeclContext *getLexicalDeclContext() {
	  return isInSemaDC() ? getSemanticDC() : getMultipleDC()->LexicalDC;
	}
	const DeclContext *getLexicalDeclContext() const {
	return const_cast<Decl*>(this)->getLexicalDeclContext();
	}

	/// Determine whether this declaration is declared out of line (outside its
	/// semantic context).
	virtual bool isOutOfLine() const;

	/// setDeclContext - Set both the semantic and lexical DeclContext
	/// to DC.
	void setDeclContext(DeclContext *DC);

	void setLexicalDeclContext(DeclContext *DC);

	+ /// Determine whether this declaration is a templated entity (whether it is
	+ /// within the scope of a template parameter).
	+ bool isTemplated() const;
	+
	/// isDefinedOutsideFunctionOrMethod - This predicate returns true if this
	/// scoped decl is defined outside the current function or method. This is
	/// roughly global variables and functions, but also handles enums (which
	/// could be defined inside or outside a function etc).
	bool isDefinedOutsideFunctionOrMethod() const {
	return getParentFunctionOrMethod() == nullptr;
	}

	/// \brief Returns true if this declaration lexically is inside a function.
	/// It recognizes non-defining declarations as well as members of local
	/// classes:
	/// \code
	/// void foo() { void bar(); }
	/// void foo2() { class ABC { void bar(); }; }
	/// \endcode
	bool isLexicallyWithinFunctionOrMethod() const;

	/// \brief If this decl is defined inside a function/method/block it returns
	/// the corresponding DeclContext, otherwise it returns null.
	const DeclContext *getParentFunctionOrMethod() const;
	DeclContext *getParentFunctionOrMethod() {
	return const_cast<DeclContext*>(
	const_cast<const Decl*>(this)->getParentFunctionOrMethod());
	}

	/// \brief Retrieves the "canonical" declaration of the given declaration.
	virtual Decl *getCanonicalDecl() { return this; }
	const Decl *getCanonicalDecl() const {
	return const_cast<Decl*>(this)->getCanonicalDecl();
	}

	/// \brief Whether this particular Decl is a canonical one.
	bool isCanonicalDecl() const { return getCanonicalDecl() == this; }

	protected:
	/// \brief Returns the next redeclaration or itself if this is the only decl.
	///
	/// Decl subclasses that can be redeclared should override this method so that
	/// Decl::redecl_iterator can iterate over them.
	virtual Decl *getNextRedeclarationImpl() { return this; }

	/// \brief Implementation of getPreviousDecl(), to be overridden by any
	/// subclass that has a redeclaration chain.
	virtual Decl *getPreviousDeclImpl() { return nullptr; }

	/// \brief Implementation of getMostRecentDecl(), to be overridden by any
	/// subclass that has a redeclaration chain.
	virtual Decl *getMostRecentDeclImpl() { return this; }

	public:
	/// \brief Iterates through all the redeclarations of the same decl.
	class redecl_iterator {
	  /// Current - The current declaration; null once iteration has finished.
	  Decl *Current = nullptr;

	  /// Starter - The declaration the walk began at. Seeing it again as the
	  /// "next" redeclaration terminates the iteration. Initialized to null so
	  /// that a default-constructed (end) iterator, which is copied by value in
	  /// comparisons and ranges, never carries an indeterminate pointer.
	  Decl *Starter = nullptr;

	public:
	  using value_type = Decl *;
	  using reference = const value_type &;
	  using pointer = const value_type *;
	  using iterator_category = std::forward_iterator_tag;
	  using difference_type = std::ptrdiff_t;

	  /// Constructs the end iterator (Current is null).
	  redecl_iterator() = default;

	  /// Constructs an iterator that starts, and will stop, at \p C.
	  explicit redecl_iterator(Decl *C) : Current(C), Starter(C) {}

	  reference operator*() const { return Current; }
	  value_type operator->() const { return Current; }

	  /// Advances to the next redeclaration; becomes the end iterator when the
	  /// next redeclaration is the one iteration started from.
	  redecl_iterator& operator++() {
	    assert(Current && "Advancing while iterator has reached end");
	    // Get either previous decl or latest decl.
	    Decl *Next = Current->getNextRedeclarationImpl();
	    assert(Next && "Should return next redeclaration or itself, never null!");
	    Current = (Next != Starter) ? Next : nullptr;
	    return *this;
	  }

	  redecl_iterator operator++(int) {
	    redecl_iterator tmp(*this);
	    ++(*this);
	    return tmp;
	  }

	  // Equality only considers the current position, not the starting point.
	  friend bool operator==(redecl_iterator x, redecl_iterator y) {
	    return x.Current == y.Current;
	  }

	  friend bool operator!=(redecl_iterator x, redecl_iterator y) {
	    return x.Current != y.Current;
	  }
	};

	using redecl_range = llvm::iterator_range<redecl_iterator>;

	/// \brief Returns an iterator range for all the redeclarations of the same
	/// decl. It will iterate at least once (when this decl is the only one).
	redecl_range redecls() const {
	  // The range spans from this declaration to the null end iterator.
	  redecl_iterator First = redecls_begin();
	  redecl_iterator Last = redecls_end();
	  return redecl_range(First, Last);
	}

	redecl_iterator redecls_begin() const {
	  // The iterator stores a mutable Decl pointer, so constness is dropped here.
	  Decl *Self = const_cast<Decl *>(this);
	  return redecl_iterator(Self);
	}

	redecl_iterator redecls_end() const { return redecl_iterator(); }

	/// \brief Retrieve the previous declaration that declares the same entity
	/// as this declaration, or NULL if there is no previous declaration.
	Decl *getPreviousDecl() { return getPreviousDeclImpl(); }

	/// \brief Retrieve the previous declaration that declares the same entity
	/// as this declaration, or NULL if there is no previous declaration.
	const Decl *getPreviousDecl() const {
	return const_cast<Decl *>(this)->getPreviousDeclImpl();
	}

	/// \brief True if this is the first declaration in its redeclaration chain.
	bool isFirstDecl() const {
	  // The first declaration is the one with no predecessor in the chain.
	  return !getPreviousDecl();
	}

	/// \brief Retrieve the most recent declaration that declares the same entity
	/// as this declaration (which may be this declaration).
	Decl *getMostRecentDecl() { return getMostRecentDeclImpl(); }

	/// \brief Retrieve the most recent declaration that declares the same entity
	/// as this declaration (which may be this declaration).
	const Decl *getMostRecentDecl() const {
	  // Forward to the virtual implementation hook, which is non-const.
	  Decl *Self = const_cast<Decl *>(this);
	  return Self->getMostRecentDeclImpl();
	}

	/// getBody - If this Decl represents a declaration for a body of code,
	/// such as a function or method definition, this method returns the
	/// top-level Stmt* of that body. Otherwise this method returns null.
	virtual Stmt* getBody() const { return nullptr; }

	/// \brief Returns true if this \c Decl represents a declaration for a body of
	/// code, such as a function or method definition.
	/// Note that \c hasBody can also return true if any redeclaration of this
	/// \c Decl represents a declaration for a body of code.
	virtual bool hasBody() const { return getBody() != nullptr; }

	/// getBodyRBrace - Gets the right brace of the body, if a body exists.
	/// This works whether the body is a CompoundStmt or a CXXTryStmt.
	SourceLocation getBodyRBrace() const;

	// global temp stats (until we have a per-module visitor)
	static void add(Kind k);
	static void EnableStatistics();
	static void PrintStats();

	/// isTemplateParameter - Determines whether this declaration is a
	/// template parameter.
	bool isTemplateParameter() const;

	/// isTemplateParameterPack - Determines whether this declaration is a
	/// template parameter pack.
	bool isTemplateParameterPack() const;

	/// \brief Whether this declaration is a parameter pack.
	bool isParameterPack() const;

	/// \brief returns true if this declaration is a template
	bool isTemplateDecl() const;

	/// \brief Whether this declaration is a function or function template.
	bool isFunctionOrFunctionTemplate() const {
	  // True for any kind in the [firstFunction, lastFunction] range, or for a
	  // function template.
	  return (DeclKind >= Decl::firstFunction &&
	          DeclKind <= Decl::lastFunction) ||
	         DeclKind == FunctionTemplate;
	}

	/// \brief If this is a declaration that describes some template, this
	/// method returns that template declaration.
	TemplateDecl *getDescribedTemplate() const;

	/// \brief Returns the function itself, or the templated function if this is a
	/// function template.
	FunctionDecl *getAsFunction() LLVM_READONLY;

	const FunctionDecl *getAsFunction() const {
	  // Reuse the non-const overload; the result is re-qualified as const on
	  // return.
	  Decl *Self = const_cast<Decl *>(this);
	  return Self->getAsFunction();
	}

	/// \brief Changes the namespace of this declaration to reflect that it's
	/// a function-local extern declaration.
	///
	/// These declarations appear in the lexical context of the extern
	/// declaration, but in the semantic context of the enclosing namespace
	/// scope.
	void setLocalExternDecl() {
	  Decl *Prev = getPreviousDecl();
	  IdentifierNamespace &= ~IDNS_Ordinary;

	  // It's OK for the declaration to still have the "invisible friend" flag or
	  // the "conflicts with tag declarations in this scope" flag for the outer
	  // scope.
	  assert((IdentifierNamespace & ~(IDNS_OrdinaryFriend | IDNS_Tag)) == 0 &&
	         "namespace is not ordinary");

	  IdentifierNamespace |= IDNS_LocalExtern;
	  // Restore the ordinary bit only when a previous declaration has it set.
	  if (Prev && (Prev->getIdentifierNamespace() & IDNS_Ordinary))
	    IdentifierNamespace |= IDNS_Ordinary;
	}

	/// \brief Determine whether this is a block-scope declaration with linkage.
	/// This will either be a local variable declaration declared 'extern', or a
	/// local function declaration.
	bool isLocalExternDecl() {
	return IdentifierNamespace & IDNS_LocalExtern;
	}

	/// \brief Changes the namespace of this declaration to reflect that it's
	/// the object of a friend declaration.
	///
	/// These declarations appear in the lexical context of the friending
	/// class, but in the semantic context of the actual entity. This property
	/// applies only to a specific decl object; other redeclarations of the
	/// same entity may not (and probably don't) share this property.
	void setObjectOfFriendDecl(bool PerformFriendInjection = false) {
	  unsigned OldNS = IdentifierNamespace;
	  assert((OldNS & (IDNS_Tag | IDNS_Ordinary |
	                   IDNS_TagFriend | IDNS_OrdinaryFriend |
	                   IDNS_LocalExtern)) &&
	         "namespace includes neither ordinary nor tag");
	  assert(!(OldNS & ~(IDNS_Tag | IDNS_Ordinary | IDNS_Type |
	                     IDNS_TagFriend | IDNS_OrdinaryFriend |
	                     IDNS_LocalExtern)) &&
	         "namespace includes other than ordinary or tag");

	  Decl *Prev = getPreviousDecl();
	  // Clear the visible bits; they are re-added below only when friend
	  // injection is requested or a previous declaration already has them.
	  IdentifierNamespace &= ~(IDNS_Ordinary | IDNS_Tag | IDNS_Type);

	  if (OldNS & (IDNS_Tag | IDNS_TagFriend)) {
	    IdentifierNamespace |= IDNS_TagFriend;
	    if (PerformFriendInjection ||
	        (Prev && (Prev->getIdentifierNamespace() & IDNS_Tag)))
	      IdentifierNamespace |= IDNS_Tag | IDNS_Type;
	  }

	  if (OldNS & (IDNS_Ordinary | IDNS_OrdinaryFriend | IDNS_LocalExtern)) {
	    IdentifierNamespace |= IDNS_OrdinaryFriend;
	    if (PerformFriendInjection ||
	        (Prev && (Prev->getIdentifierNamespace() & IDNS_Ordinary)))
	      IdentifierNamespace |= IDNS_Ordinary;
	  }
	}

	enum FriendObjectKind {
	FOK_None, ///< Not a friend object.
	FOK_Declared, ///< A friend of a previously-declared entity.
	FOK_Undeclared ///< A friend of a previously-undeclared entity.
	};

	/// \brief Determines whether this declaration is the object of a
	/// friend declaration and, if so, what kind.
	///
	/// There is currently no direct way to find the associated FriendDecl.
	FriendObjectKind getFriendObjectKind() const {
	  // No friend bit set: this declaration is not the object of a friend decl.
	  unsigned mask =
	      (IdentifierNamespace & (IDNS_TagFriend | IDNS_OrdinaryFriend));
	  if (!mask) return FOK_None;
	  // A friend that also has a Tag or Ordinary bit is reported as declared.
	  return (IdentifierNamespace & (IDNS_Tag | IDNS_Ordinary) ? FOK_Declared
	                                                            : FOK_Undeclared);
	}

	/// Specifies that this declaration is a C++ overloaded non-member.
	void setNonMemberOperator() {
	  // Only functions and function templates may be marked, and only while
	  // they are in the ordinary namespace.
	  assert(getKind() == Function || getKind() == FunctionTemplate);
	  assert((IdentifierNamespace & IDNS_Ordinary) &&
	         "visible non-member operators should be in ordinary namespace");
	  IdentifierNamespace |= IDNS_NonMemberOperator;
	}

	static bool classofKind(Kind K) { return true; }
	// Converts a Decl pointer to the corresponding DeclContext pointer.
	static DeclContext *castToDeclContext(const Decl *);
	// Converts a DeclContext pointer back to the corresponding Decl pointer.
	static Decl *castFromDeclContext(const DeclContext *);

	void print(raw_ostream &Out, unsigned Indentation = 0,
	bool PrintInstantiation = false) const;
	void print(raw_ostream &Out, const PrintingPolicy &Policy,
	unsigned Indentation = 0, bool PrintInstantiation = false) const;
	static void printGroup(Decl** Begin, unsigned NumDecls,
	raw_ostream &Out, const PrintingPolicy &Policy,
	unsigned Indentation = 0);

	// Debuggers don't usually respect default arguments.
	void dump() const;

	// Same as dump(), but forces color printing.
	void dumpColor() const;

	void dump(raw_ostream &Out, bool Deserialize = false) const;

	/// \brief Looks through the Decl's underlying type to extract a FunctionType
	/// when possible. Will return null if the type underlying the Decl does not
	/// have a FunctionType.
	const FunctionType *getFunctionType(bool BlocksToo = true) const;

	private:
	void setAttrsImpl(const AttrVec& Attrs, ASTContext &Ctx);
	void setDeclContextsImpl(DeclContext SemaDC, DeclContext LexicalDC,
	ASTContext &Ctx);

	protected:
	ASTMutationListener *getASTMutationListener() const;
	};

	/// \brief Determine whether two declarations declare the same entity.
	inline bool declaresSameEntity(const Decl D1, const Decl D2) {
	if (!D1 \|\| !D2)
	return false;

	if (D1 == D2)
	return true;

	return D1->getCanonicalDecl() == D2->getCanonicalDecl();
	}

	/// PrettyStackTraceDecl - If a crash occurs, indicate that it happened when
	/// doing something to a specific decl.
	class PrettyStackTraceDecl : public llvm::PrettyStackTraceEntry {
	  // State captured at construction for later use by print().
	  const Decl *TheDecl;
	  SourceLocation Loc;
	  SourceManager &SM;
	  const char *Message;

	public:
	  /// Records the declaration, location, source manager and message for
	  /// this stack-trace entry.
	  PrettyStackTraceDecl(const Decl *D, SourceLocation Where,
	                       SourceManager &SrcMgr, const char *Text)
	      : TheDecl(D),
	        Loc(Where),
	        SM(SrcMgr),
	        Message(Text) {}

	  /// Writes this entry to \p OS.
	  void print(raw_ostream &OS) const override;
	};

	/// \brief The results of name lookup within a DeclContext. This is either a
	/// single result (with no stable storage) or a collection of results (with
	/// stable storage provided by the lookup table).
	class DeclContextLookupResult {
	using ResultTy = ArrayRef<NamedDecl *>;

	// The list of results. The single-result constructor initializes this from
	// SingleElementDummyList rather than from the actual result.
	ResultTy Result;

	// If there is only one lookup result, it would be invalidated by
	// reallocations of the name table, so store it separately.
	NamedDecl *Single = nullptr;

	// Placeholder used to initialize Result when only Single is provided.
	static NamedDecl *const SingleElementDummyList;

	public:
	// Empty result: no list and no single element.
	DeclContextLookupResult() = default;
	// Multi-result form: refers to the given array; Single stays null.
	DeclContextLookupResult(ArrayRef<NamedDecl *> Result)
	: Result(Result) {}
	// Single-result form: the declaration is held in Single, and Result is
	// initialized from the dummy list.
	DeclContextLookupResult(NamedDecl *Single)
	: Result(SingleElementDummyList), Single(Single) {}

	class iterator;

	using IteratorBase =
	llvm::iterator_adaptor_base<iterator, ResultTy::iterator,
	std::random_access_iterator_tag,
	NamedDecl *const>;

	// Iterator over Result that yields SingleElement instead of the underlying
	// element whenever SingleElement is non-null.
	class iterator : public IteratorBase {
	value_type SingleElement;

	public:
	iterator() = default;
	explicit iterator(pointer Pos, value_type Single = nullptr)
	: IteratorBase(Pos), SingleElement(Single) {}

	reference operator*() const {
	return SingleElement ? SingleElement : IteratorBase::operator*();
	}
	};

	using const_iterator = iterator;
	using pointer = iterator::pointer;
	using reference = iterator::reference;

	// Both ends carry Single so dereferencing substitutes it where needed.
	iterator begin() const { return iterator(Result.begin(), Single); }
	iterator end() const { return iterator(Result.end(), Single); }

	// Note: emptiness is decided by Result alone; the accessors below prefer
	// Single whenever it is non-null.
	bool empty() const { return Result.empty(); }
	pointer data() const { return Single ? &Single : Result.data(); }
	size_t size() const { return Single ? 1 : Result.size(); }
	reference front() const { return Single ? Single : Result.front(); }
	reference back() const { return Single ? Single : Result.back(); }
	// When Single is set, N is ignored and Single is returned.
	reference operator[](size_t N) const { return Single ? Single : Result[N]; }

	// FIXME: Remove this from the interface
	// Slices Result by N and copies Single over unchanged.
	// NOTE(review): with Single set, size() of the slice still reports 1 even
	// when the sliced Result is empty -- confirm callers never rely on that.
	DeclContextLookupResult slice(size_t N) const {
	DeclContextLookupResult Sliced = Result.slice(N);
	Sliced.Single = Single;
	return Sliced;
	}
	};

	/// DeclContext - This is used only as base class of specific decl types that
	/// can act as declaration contexts. These decls are (only the top classes
	/// that directly derive from DeclContext are mentioned, not their subclasses):
	///
	/// TranslationUnitDecl
	/// NamespaceDecl
	/// FunctionDecl
	/// TagDecl
	/// ObjCMethodDecl
	/// ObjCContainerDecl
	/// LinkageSpecDecl
	/// ExportDecl
	/// BlockDecl
	/// OMPDeclareReductionDecl
	class DeclContext {
	/// DeclKind - This indicates which class this is.
	unsigned DeclKind : 8;

	/// \brief Whether this declaration context also has some external
	/// storage that contains additional declarations that are lexically
	/// part of this context.
	mutable bool ExternalLexicalStorage : 1;

	/// \brief Whether this declaration context also has some external
	/// storage that contains additional declarations that are visible
	/// in this context.
	mutable bool ExternalVisibleStorage : 1;

	/// \brief Whether this declaration context has had external visible
	/// storage added since the last lookup. In this case, \c LookupPtr's
	/// invariant may not hold and needs to be fixed before we perform
	/// another lookup.
	mutable bool NeedToReconcileExternalVisibleStorage : 1;

	/// \brief If \c true, this context may have local lexical declarations
	/// that are missing from the lookup table.
	mutable bool HasLazyLocalLexicalLookups : 1;

	/// \brief If \c true, the external source may have lexical declarations
	/// that are missing from the lookup table.
	mutable bool HasLazyExternalLexicalLookups : 1;

	/// \brief If \c true, lookups should only return identifier from
	/// DeclContext scope (for example TranslationUnit). Used in
	/// LookupQualifiedName()
	mutable bool UseQualifiedLookup : 1;

	/// \brief Pointer to the data structure used to lookup declarations
	/// within this context (or a DependentStoredDeclsMap if this is a
	/// dependent context). We maintain the invariant that, if the map
	/// contains an entry for a DeclarationName (and we haven't lazily
	/// omitted anything), then it contains all relevant entries for that
	/// name (modulo the hasExternalDecls() flag).
	mutable StoredDeclsMap *LookupPtr = nullptr;

	protected:
	friend class ASTDeclReader;
	friend class ASTWriter;
	friend class ExternalASTSource;

	/// FirstDecl - The first declaration stored within this declaration
	/// context.
	mutable Decl *FirstDecl = nullptr;

	/// LastDecl - The last declaration stored within this declaration
	/// context. FIXME: We could probably cache this value somewhere
	/// outside of the DeclContext, to reduce the size of DeclContext by
	/// another pointer.
	mutable Decl *LastDecl = nullptr;

	/// \brief Build up a chain of declarations.
	///
	/// \returns the first/last pair of declarations.
	// The returned pair holds pointers to the first and last declaration.
	static std::pair<Decl *, Decl *>
	BuildDeclChain(ArrayRef<Decl*> Decls, bool FieldsAlreadyLoaded);

	DeclContext(Decl::Kind K)
	: DeclKind(K), ExternalLexicalStorage(false),
	ExternalVisibleStorage(false),
	NeedToReconcileExternalVisibleStorage(false),
	HasLazyLocalLexicalLookups(false), HasLazyExternalLexicalLookups(false),
	UseQualifiedLookup(false) {}

	public:
	~DeclContext();

	Decl::Kind getDeclKind() const {
	return static_cast<Decl::Kind>(DeclKind);
	}

	const char *getDeclKindName() const;

	/// getParent - Returns the containing DeclContext.
	DeclContext *getParent() {
	return cast<Decl>(this)->getDeclContext();
	}
	const DeclContext *getParent() const {
	return const_cast<DeclContext*>(this)->getParent();
	}

	/// getLexicalParent - Returns the containing lexical DeclContext. May be
	/// different from getParent, e.g.:
	///
	/// namespace A {
	/// struct S;
	/// }
	/// struct A::S {}; // getParent() == namespace 'A'
	/// // getLexicalParent() == translation unit
	///
	DeclContext *getLexicalParent() {
	return cast<Decl>(this)->getLexicalDeclContext();
	}
	const DeclContext *getLexicalParent() const {
	return const_cast<DeclContext*>(this)->getLexicalParent();
	}

	DeclContext *getLookupParent();

	const DeclContext *getLookupParent() const {
	return const_cast<DeclContext*>(this)->getLookupParent();
	}

	ASTContext &getParentASTContext() const {
	return cast<Decl>(this)->getASTContext();
	}

	bool isClosure() const {
	return DeclKind == Decl::Block;
	}

	bool isObjCContainer() const {
	  // True exactly for the five Objective-C container kinds listed below.
	  switch (DeclKind) {
	  case Decl::ObjCCategory:
	  case Decl::ObjCCategoryImpl:
	  case Decl::ObjCImplementation:
	  case Decl::ObjCInterface:
	  case Decl::ObjCProtocol:
	    return true;
	  default:
	    return false;
	  }
	}

	bool isFunctionOrMethod() const {
	  // Blocks, captured statements and Objective-C methods count as
	  // function-like contexts.
	  const bool IsFunctionLike = DeclKind == Decl::Block ||
	                              DeclKind == Decl::Captured ||
	                              DeclKind == Decl::ObjCMethod;
	  if (IsFunctionLike)
	    return true;

	  // Everything else qualifies only if it lies in the function kind range.
	  return DeclKind >= Decl::firstFunction && DeclKind <= Decl::lastFunction;
	}

	/// \brief Test whether the context supports looking up names.
	bool isLookupContext() const {
	  // Function-like contexts, linkage specifications and export declarations
	  // are the only contexts excluded.
	  return !(isFunctionOrMethod() || DeclKind == Decl::LinkageSpec ||
	           DeclKind == Decl::Export);
	}

	bool isFileContext() const {
	  // File contexts are the translation unit and namespaces.
	  return DeclKind == Decl::TranslationUnit || DeclKind == Decl::Namespace;
	}

	bool isTranslationUnit() const {
	return DeclKind == Decl::TranslationUnit;
	}

	bool isRecord() const {
	  // Record kinds occupy the contiguous range [firstRecord, lastRecord].
	  if (DeclKind < Decl::firstRecord)
	    return false;
	  return DeclKind <= Decl::lastRecord;
	}

	bool isNamespace() const {
	return DeclKind == Decl::Namespace;
	}

	bool isStdNamespace() const;

	bool isInlineNamespace() const;

	/// \brief Determines whether this context is dependent on a
	/// template parameter.
	bool isDependentContext() const;

	/// isTransparentContext - Determines whether this context is a
	/// "transparent" context, meaning that the members declared in this
	/// context are semantically declared in the nearest enclosing
	/// non-transparent (opaque) context but are lexically declared in
	/// this context. For example, consider the enumerators of an
	/// enumeration type:
	/// @code
	/// enum E {
	/// Val1
	/// };
	/// @endcode
	/// Here, E is a transparent context, so its enumerator (Val1) will
	/// appear (semantically) that it is in the same context of E.
	/// Examples of transparent contexts include: enumerations (except for
	/// C++0x scoped enums), and C++ linkage specifications.
	bool isTransparentContext() const;

	/// \brief Determines whether this context or some of its ancestors is a
	/// linkage specification context that specifies C linkage.
	bool isExternCContext() const;

	/// \brief Retrieve the nearest enclosing C linkage specification context.
	const LinkageSpecDecl *getExternCContext() const;

	/// \brief Determines whether this context or some of its ancestors is a
	/// linkage specification context that specifies C++ linkage.
	bool isExternCXXContext() const;

	/// \brief Determine whether this declaration context is equivalent
	/// to the declaration context DC.
	bool Equals(const DeclContext *DC) const {
	return DC && this->getPrimaryContext() == DC->getPrimaryContext();
	}

	/// \brief Determine whether this declaration context encloses the
	/// declaration context DC.
	bool Encloses(const DeclContext *DC) const;

	/// \brief Find the nearest non-closure ancestor of this context,
	/// i.e. the innermost semantic parent of this context which is not
	/// a closure. A context may be its own non-closure ancestor.
	Decl *getNonClosureAncestor();
	const Decl *getNonClosureAncestor() const {
	return const_cast<DeclContext*>(this)->getNonClosureAncestor();
	}

	/// getPrimaryContext - There may be many different
	/// declarations of the same entity (including forward declarations
	/// of classes, multiple definitions of namespaces, etc.), each with
	/// a different set of declarations. This routine returns the
	/// "primary" DeclContext structure, which will contain the
	/// information needed to perform name lookup into this context.
	DeclContext *getPrimaryContext();
	const DeclContext *getPrimaryContext() const {
	return const_cast<DeclContext*>(this)->getPrimaryContext();
	}

	/// getRedeclContext - Retrieve the context in which an entity conflicts with
	/// other entities of the same name, or where it is a redeclaration if the
	/// two entities are compatible. This skips through transparent contexts.
	DeclContext *getRedeclContext();
	const DeclContext *getRedeclContext() const {
	return const_cast<DeclContext *>(this)->getRedeclContext();
	}

	/// \brief Retrieve the nearest enclosing namespace context.
	DeclContext *getEnclosingNamespaceContext();
	const DeclContext *getEnclosingNamespaceContext() const {
	return const_cast<DeclContext *>(this)->getEnclosingNamespaceContext();
	}

	/// \brief Retrieve the outermost lexically enclosing record context.
	RecordDecl *getOuterLexicalRecordContext();
	const RecordDecl *getOuterLexicalRecordContext() const {
	return const_cast<DeclContext *>(this)->getOuterLexicalRecordContext();
	}

	/// \brief Test if this context is part of the enclosing namespace set of
	/// the context NS, as defined in C++0x [namespace.def]p9. If either context
	/// isn't a namespace, this is equivalent to Equals().
	///
	/// The enclosing namespace set of a namespace is the namespace and, if it is
	/// inline, its enclosing namespace, recursively.
	bool InEnclosingNamespaceSetOf(const DeclContext *NS) const;

	/// \brief Collects all of the declaration contexts that are semantically
	/// connected to this declaration context.
	///
	/// For declaration contexts that have multiple semantically connected but
	/// syntactically distinct contexts, such as C++ namespaces, this routine
	/// retrieves the complete set of such declaration contexts in source order.
	/// For example, given:
	///
	/// \code
	/// namespace N {
	/// int x;
	/// }
	/// namespace N {
	/// int y;
	/// }
	/// \endcode
	///
	/// The \c Contexts parameter will contain both definitions of N.
	///
	/// \param Contexts Will be cleared and set to the set of declaration
	/// contexts that are semantically connected to this declaration context,
	/// in source order, including this context (which may be the only result,
	/// for non-namespace contexts).
	void collectAllContexts(SmallVectorImpl<DeclContext *> &Contexts);

	/// decl_iterator - Iterates through the declarations stored
	/// within this context.
	class decl_iterator {
	/// Current - The current declaration.
	Decl *Current = nullptr;

	public:
	using value_type = Decl *;
	using reference = const value_type &;
	using pointer = const value_type *;
	using iterator_category = std::forward_iterator_tag;
	using difference_type = std::ptrdiff_t;

	decl_iterator() = default;
	explicit decl_iterator(Decl *C) : Current(C) {}

	reference operator*() const { return Current; }

	// This doesn't meet the iterator requirements, but it's convenient
	value_type operator->() const { return Current; }

	decl_iterator& operator++() {
	Current = Current->getNextDeclInContext();
	return *this;
	}

	decl_iterator operator++(int) {
	decl_iterator tmp(*this);
	++(*this);
	return tmp;
	}

	friend bool operator==(decl_iterator x, decl_iterator y) {
	return x.Current == y.Current;
	}

	friend bool operator!=(decl_iterator x, decl_iterator y) {
	return x.Current != y.Current;
	}
	};

	using decl_range = llvm::iterator_range<decl_iterator>;

	/// decls_begin/decls_end - Iterate over the declarations stored in
	/// this context.
	decl_range decls() const { return decl_range(decls_begin(), decls_end()); }
	decl_iterator decls_begin() const;
	decl_iterator decls_end() const { return decl_iterator(); }
	bool decls_empty() const;

	/// noload_decls_begin/end - Iterate over the declarations stored in this
	/// context that are currently loaded; don't attempt to retrieve anything
	/// from an external source.
	decl_range noload_decls() const {
	  // Pair the begin/end iterators of the no-load traversal into a range.
	  decl_iterator First = noload_decls_begin();
	  decl_iterator Last = noload_decls_end();
	  return decl_range(First, Last);
	}
	decl_iterator noload_decls_begin() const { return decl_iterator(FirstDecl); }
	decl_iterator noload_decls_end() const { return decl_iterator(); }

	/// specific_decl_iterator - Iterates over a subrange of
	/// declarations stored in a DeclContext, providing only those that
	/// are of type SpecificDecl (or a class derived from it). This
	/// iterator is used, for example, to provide iteration over just
	/// the fields within a RecordDecl (with SpecificDecl = FieldDecl).
	template<typename SpecificDecl>
	class specific_decl_iterator {
	/// Current - The current, underlying declaration iterator, which
	/// will either be NULL or will point to a declaration of
	/// type SpecificDecl.
	DeclContext::decl_iterator Current;

	/// SkipToNextDecl - Advances the current position up to the next
	/// declaration of type SpecificDecl that also meets the criteria
	/// required by Acceptable.
	void SkipToNextDecl() {
	while (Current && !isa<SpecificDecl>(Current))
	++Current;
	}

	public:
	using value_type = SpecificDecl *;
	// TODO: Add reference and pointer types (with some appropriate proxy type)
	// if we ever have a need for them.
	using reference = void;
	using pointer = void;
	using difference_type =
	std::iterator_traits<DeclContext::decl_iterator>::difference_type;
	using iterator_category = std::forward_iterator_tag;

	specific_decl_iterator() = default;

	/// specific_decl_iterator - Construct a new iterator over a
	/// subset of the declarations the range [C,
	/// end-of-declarations). If A is non-NULL, it is a pointer to a
	/// member function of SpecificDecl that should return true for
	/// all of the SpecificDecl instances that will be in the subset
	/// of iterators. For example, if you want Objective-C instance
	/// methods, SpecificDecl will be ObjCMethodDecl and A will be
	/// &ObjCMethodDecl::isInstanceMethod.
	explicit specific_decl_iterator(DeclContext::decl_iterator C) : Current(C) {
	SkipToNextDecl();
	}

	value_type operator() const { return cast<SpecificDecl>(Current); }

	// This doesn't meet the iterator requirements, but it's convenient
	value_type operator->() const { return **this; }

	specific_decl_iterator& operator++() {
	++Current;
	SkipToNextDecl();
	return *this;
	}

	specific_decl_iterator operator++(int) {
	specific_decl_iterator tmp(*this);
	++(*this);
	return tmp;
	}

	friend bool operator==(const specific_decl_iterator& x,
	const specific_decl_iterator& y) {
	return x.Current == y.Current;
	}

	friend bool operator!=(const specific_decl_iterator& x,
	const specific_decl_iterator& y) {
	return x.Current != y.Current;
	}
	};

	/// \brief Iterates over a filtered subrange of declarations stored
	/// in a DeclContext.
	///
	/// This iterator visits only those declarations that are of type
	/// SpecificDecl (or a class derived from it) and that meet some
	/// additional run-time criteria. This iterator is used, for
	/// example, to provide access to the instance methods within an
	/// Objective-C interface (with SpecificDecl = ObjCMethodDecl and
	/// Acceptable = ObjCMethodDecl::isInstanceMethod).
	template<typename SpecificDecl, bool (SpecificDecl::*Acceptable)() const>
	class filtered_decl_iterator {
	  /// Current - The current, underlying declaration iterator, which
	  /// will either be NULL or will point to a declaration of
	  /// type SpecificDecl.
	  DeclContext::decl_iterator Current;

	  /// SkipToNextDecl - Advances the current position up to the next
	  /// declaration of type SpecificDecl that also meets the criteria
	  /// required by Acceptable.
	  void SkipToNextDecl() {
	    // Skip declarations of the wrong type, and declarations of the right
	    // type that the Acceptable member predicate rejects.
	    while (*Current &&
	           (!isa<SpecificDecl>(*Current) ||
	            (Acceptable && !(cast<SpecificDecl>(*Current)->*Acceptable)())))
	      ++Current;
	  }

	public:
	  using value_type = SpecificDecl *;
	  // TODO: Add reference and pointer types (with some appropriate proxy type)
	  // if we ever have a need for them.
	  using reference = void;
	  using pointer = void;
	  using difference_type =
	      std::iterator_traits<DeclContext::decl_iterator>::difference_type;
	  using iterator_category = std::forward_iterator_tag;

	  filtered_decl_iterator() = default;

	  /// filtered_decl_iterator - Construct a new iterator over a
	  /// subset of the declarations in the range [C,
	  /// end-of-declarations). The subset consists of the SpecificDecl
	  /// instances for which the Acceptable template argument returns true
	  /// (all SpecificDecl instances if Acceptable is null). For example, if
	  /// you want Objective-C instance methods, SpecificDecl will be
	  /// ObjCMethodDecl and Acceptable will be
	  /// &ObjCMethodDecl::isInstanceMethod.
	  explicit filtered_decl_iterator(DeclContext::decl_iterator C) : Current(C) {
	    SkipToNextDecl();
	  }

	  /// Both accessors return the current declaration cast to SpecificDecl.
	  value_type operator*() const { return cast<SpecificDecl>(*Current); }
	  value_type operator->() const { return cast<SpecificDecl>(*Current); }

	  filtered_decl_iterator& operator++() {
	    ++Current;
	    SkipToNextDecl();
	    return *this;
	  }

	  filtered_decl_iterator operator++(int) {
	    filtered_decl_iterator tmp(*this);
	    ++(*this);
	    return tmp;
	  }

	  friend bool operator==(const filtered_decl_iterator& x,
	                         const filtered_decl_iterator& y) {
	    return x.Current == y.Current;
	  }

	  friend bool operator!=(const filtered_decl_iterator& x,
	                         const filtered_decl_iterator& y) {
	    return x.Current != y.Current;
	  }
	};

	/// @brief Add the declaration D into this context.
	///
	/// This routine should be invoked when the declaration D has first
	/// been declared, to place D into the context where it was
	/// (lexically) defined. Every declaration must be added to one
	/// (and only one!) context, where it can be visited via
	/// [decls_begin(), decls_end()). Once a declaration has been added
	/// to its lexical context, the corresponding DeclContext owns the
	/// declaration.
	///
	/// If D is also a NamedDecl, it will be made visible within its
	/// semantic context via makeDeclVisibleInContext.
	void addDecl(Decl *D);

	/// @brief Add the declaration D into this context, but suppress
	/// searches for external declarations with the same name.
	///
	/// Although analogous in function to addDecl, this removes an
	/// important check. This is only useful if the Decl is being
	/// added in response to an external search; in all other cases,
	/// addDecl() is the right function to use.
	/// See the ASTImporter for use cases.
	void addDeclInternal(Decl *D);

	/// @brief Add the declaration D to this context without modifying
	/// any lookup tables.
	///
	/// This is useful for some operations in dependent contexts where
	/// the semantic context might not be dependent; this basically
	/// only happens with friends.
	void addHiddenDecl(Decl *D);

	/// @brief Removes a declaration from this context.
	void removeDecl(Decl *D);

	/// @brief Checks whether a declaration is in this context.
	bool containsDecl(Decl *D) const;

	using lookup_result = DeclContextLookupResult;
	using lookup_iterator = lookup_result::iterator;

	/// lookup - Find the declarations (if any) with the given Name in
	/// this context. Returns a range of iterators that contains all of
	/// the declarations with this name, with object, function, member,
	/// and enumerator names preceding any tag name. Note that this
	/// routine will not look into parent contexts.
	lookup_result lookup(DeclarationName Name) const;

	/// \brief Find the declarations with the given name that are visible
	/// within this context; don't attempt to retrieve anything from an
	/// external source.
	lookup_result noload_lookup(DeclarationName Name);

	/// \brief A simplistic name lookup mechanism that performs name lookup
	/// into this declaration context without consulting the external source.
	///
	/// This function should almost never be used, because it subverts the
	/// usual relationship between a DeclContext and the external source.
	/// See the ASTImporter for the (few, but important) use cases.
	///
	/// FIXME: This is very inefficient; replace uses of it with uses of
	/// noload_lookup.
	void localUncachedLookup(DeclarationName Name,
	SmallVectorImpl<NamedDecl *> &Results);

	/// @brief Makes a declaration visible within this context.
	///
	/// This routine makes the declaration D visible to name lookup
	/// within this context and, if this is a transparent context,
	/// within its parent contexts up to the first enclosing
	/// non-transparent context. Making a declaration visible within a
	/// context does not transfer ownership of a declaration, and a
	/// declaration can be visible in many contexts that aren't its
	/// lexical context.
	///
	/// If D is a redeclaration of an existing declaration that is
	/// visible from this context, as determined by
	/// NamedDecl::declarationReplaces, the previous declaration will be
	/// replaced with D.
	void makeDeclVisibleInContext(NamedDecl *D);

	/// all_lookups_iterator - An iterator that provides a view over the results
	/// of looking up every possible name.
	class all_lookups_iterator;

	using lookups_range = llvm::iterator_range<all_lookups_iterator>;

	lookups_range lookups() const;
	lookups_range noload_lookups() const;

	/// \brief Iterators over all possible lookups within this context.
	all_lookups_iterator lookups_begin() const;
	all_lookups_iterator lookups_end() const;

	/// \brief Iterators over all possible lookups within this context that are
	/// currently loaded; don't attempt to retrieve anything from an external
	/// source.
	all_lookups_iterator noload_lookups_begin() const;
	all_lookups_iterator noload_lookups_end() const;

	struct udir_iterator;

	using udir_iterator_base =
	llvm::iterator_adaptor_base<udir_iterator, lookup_iterator,
	std::random_access_iterator_tag,
	UsingDirectiveDecl *>;

	/// Iterator adaptor over lookup results whose value type is
	/// UsingDirectiveDecl *.
	struct udir_iterator : udir_iterator_base {
	  udir_iterator(lookup_iterator I) : udir_iterator_base(I) {}

	  /// Dereference to the using-directive at the current position.
	  UsingDirectiveDecl *operator*() const;
	};

	using udir_range = llvm::iterator_range<udir_iterator>;

	udir_range using_directives() const;

	// These are all defined in DependentDiagnostic.h.
	class ddiag_iterator;

	using ddiag_range = llvm::iterator_range<DeclContext::ddiag_iterator>;

	inline ddiag_range ddiags() const;

	// Low-level accessors

	/// \brief Mark that there are external lexical declarations that we need
	/// to include in our lookup table (and that are not available as external
	/// visible lookups). These extra lookup results will be found by walking
	/// the lexical declarations of this context. This should be used only if
	/// setHasExternalLexicalStorage() has been called on any decl context for
	/// which this is the primary context.
	void setMustBuildLookupTable() {
	// Assert-only check: the receiver must be its own primary context.
	assert(this == getPrimaryContext() &&
	"should only be called on primary context");
	// Record that external lexical declarations still have to be folded
	// into the lookup table.
	HasLazyExternalLexicalLookups = true;
	}

	/// \brief Retrieve the internal representation of the lookup structure.
	/// This may omit some names if we are lazily building the structure.
	StoredDeclsMap *getLookupPtr() const {
	  // Hand back the stored pointer as-is; nothing is built here.
	  return LookupPtr;
	}

	/// \brief Ensure the lookup structure is fully-built and return it.
	StoredDeclsMap *buildLookup();

	/// \brief Whether this DeclContext has external storage containing
	/// additional declarations that are lexically in this context.
	bool hasExternalLexicalStorage() const {
	  // Plain read of the flag written by setHasExternalLexicalStorage().
	  return ExternalLexicalStorage;
	}

	/// \brief State whether this DeclContext has external storage for
	/// declarations lexically in this context.
	void setHasExternalLexicalStorage(bool ES = true) { ExternalLexicalStorage = ES; }

	/// \brief Whether this DeclContext has external storage containing
	/// additional declarations that are visible in this context.
	bool hasExternalVisibleStorage() const {
	  // Plain read of the flag written by setHasExternalVisibleStorage().
	  return ExternalVisibleStorage;
	}

	/// \brief State whether this DeclContext has external storage for
	/// declarations visible in this context.
	void setHasExternalVisibleStorage(bool ES = true) {
	  ExternalVisibleStorage = ES;

	  // Reconciliation is only requested when storage is being switched on
	  // and a lookup structure already exists.
	  if (!ES || !LookupPtr)
	    return;
	  NeedToReconcileExternalVisibleStorage = true;
	}

	/// \brief Determine whether the given declaration is stored in the list of
	/// declarations lexically within this context.
	bool isDeclInLexicalTraversal(const Decl *D) const {
	  // A null D is never in the list. Otherwise D is in the list if it has a
	  // successor pointer, or if it is this context's first or last
	  // declaration.
	  return D && (D->NextInContextAndBits.getPointer() || D == FirstDecl ||
	               D == LastDecl);
	}

	/// Set the qualified-lookup flag and return the value it held before.
	bool setUseQualifiedLookup(bool use = true) {
	  const bool Previous = UseQualifiedLookup;
	  UseQualifiedLookup = use;
	  return Previous;
	}

	/// Read back the flag written by setUseQualifiedLookup().
	bool shouldUseQualifiedLookup() const { return UseQualifiedLookup; }

	static bool classof(const Decl *D);
	static bool classof(const DeclContext *D) {
	  // Every DeclContext is accepted; the argument is not inspected.
	  return true;
	}

	void dumpDeclContext() const;
	void dumpLookups() const;
	void dumpLookups(llvm::raw_ostream &OS, bool DumpDecls = false,
	bool Deserialize = false) const;

	private:
	friend class DependentDiagnostic;

	void reconcileExternalVisibleStorage() const;
	bool LoadLexicalDeclsFromExternalStorage() const;

	/// @brief Makes a declaration visible within this context, but
	/// suppresses searches for external declarations with the same
	/// name.
	///
	/// Analogous to makeDeclVisibleInContext, but for the exclusive
	/// use of addDeclInternal().
	void makeDeclVisibleInContextInternal(NamedDecl *D);

	StoredDeclsMap *CreateStoredDeclsMap(ASTContext &C) const;

	void buildLookupImpl(DeclContext *DCtx, bool Internal);
	void makeDeclVisibleInContextWithFlags(NamedDecl *D, bool Internal,
	bool Rediscoverable);
	void makeDeclVisibleInContextImpl(NamedDecl *D, bool Internal);
	};

	/// True when this declaration's kind is one of the three template
	/// parameter kinds (type, non-type, or template template parameter).
	inline bool Decl::isTemplateParameter() const {
	  return getKind() == TemplateTypeParm || getKind() == NonTypeTemplateParm ||
	         getKind() == TemplateTemplateParm;
	}

	// Specialization selected when ToTy is not a known subclass of DeclContext.
	template <class ToTy,
	          bool IsKnownSubtype = ::std::is_base_of<DeclContext, ToTy>::value>
	struct cast_convert_decl_context {
	  // ToTy is not known to derive from DeclContext, so convert through
	  // Decl::castFromDeclContext before the static_cast.
	  static const ToTy *doit(const DeclContext *Val) {
	    return static_cast<const ToTy*>(Decl::castFromDeclContext(Val));
	  }

	  static ToTy *doit(DeclContext *Val) {
	    return static_cast<ToTy*>(Decl::castFromDeclContext(Val));
	  }
	};

	// Specialization selected when ToTy is a known subclass of DeclContext.
	template <class ToTy>
	struct cast_convert_decl_context<ToTy, true> {
	  // ToTy derives from DeclContext, so a direct static_cast is enough.
	  static const ToTy *doit(const DeclContext *Val) {
	    return static_cast<const ToTy*>(Val);
	  }

	  static ToTy *doit(DeclContext *Val) {
	    return static_cast<ToTy*>(Val);
	  }
	};

	} // namespace clang

	namespace llvm {

	/// isa<T>(DeclContext*)
	template <typename To>
	struct isa_impl<To, ::clang::DeclContext> {
	  static bool doit(const ::clang::DeclContext &Val) {
	    return To::classofKind(Val.getDeclKind());
	  }
	};

	/// cast<T>(DeclContext*)
	// Reference forms: const and non-const DeclContext -> ToTy.
	template<class ToTy>
	struct cast_convert_val<ToTy,
	                        const ::clang::DeclContext,const ::clang::DeclContext> {
	  static const ToTy &doit(const ::clang::DeclContext &Val) {
	    return *::clang::cast_convert_decl_context<ToTy>::doit(&Val);
	  }
	};

	template<class ToTy>
	struct cast_convert_val<ToTy, ::clang::DeclContext, ::clang::DeclContext> {
	  static ToTy &doit(::clang::DeclContext &Val) {
	    return *::clang::cast_convert_decl_context<ToTy>::doit(&Val);
	  }
	};

	// Pointer forms: const and non-const DeclContext* -> ToTy*.
	template<class ToTy>
	struct cast_convert_val<ToTy,
	                     const ::clang::DeclContext*, const ::clang::DeclContext*> {
	  static const ToTy *doit(const ::clang::DeclContext *Val) {
	    return ::clang::cast_convert_decl_context<ToTy>::doit(Val);
	  }
	};

	template<class ToTy>
	struct cast_convert_val<ToTy, ::clang::DeclContext*, ::clang::DeclContext*> {
	  static ToTy *doit(::clang::DeclContext *Val) {
	    return ::clang::cast_convert_decl_context<ToTy>::doit(Val);
	  }
	};

	/// Implement cast_convert_val for Decl -> DeclContext conversions.
	template<class FromTy>
	struct cast_convert_val< ::clang::DeclContext, FromTy, FromTy> {
	  static ::clang::DeclContext &doit(const FromTy &Val) {
	    return *FromTy::castToDeclContext(&Val);
	  }
	};

	template<class FromTy>
	struct cast_convert_val< ::clang::DeclContext, FromTy*, FromTy*> {
	  static ::clang::DeclContext *doit(const FromTy *Val) {
	    return FromTy::castToDeclContext(Val);
	  }
	};

	template<class FromTy>
	struct cast_convert_val< const ::clang::DeclContext, FromTy, FromTy> {
	  static const ::clang::DeclContext &doit(const FromTy &Val) {
	    return *FromTy::castToDeclContext(&Val);
	  }
	};

	template<class FromTy>
	struct cast_convert_val< const ::clang::DeclContext, FromTy*, FromTy*> {
	  static const ::clang::DeclContext *doit(const FromTy *Val) {
	    return FromTy::castToDeclContext(Val);
	  }
	};

	} // namespace llvm

	#endif // LLVM_CLANG_AST_DECLBASE_H
	Index: head/contrib/llvm/tools/clang/lib/AST/ASTContext.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/AST/ASTContext.cpp (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/AST/ASTContext.cpp (revision 329410)
	@@ -1,9952 +1,9952 @@
	//===- ASTContext.cpp - Context to hold long-lived AST nodes --------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the ASTContext interface.
	//
	//===----------------------------------------------------------------------===//

	#include "clang/AST/ASTContext.h"
	#include "CXXABI.h"
	#include "clang/AST/APValue.h"
	#include "clang/AST/ASTMutationListener.h"
	#include "clang/AST/ASTTypeTraits.h"
	#include "clang/AST/Attr.h"
	#include "clang/AST/AttrIterator.h"
	#include "clang/AST/CharUnits.h"
	#include "clang/AST/Comment.h"
	#include "clang/AST/Decl.h"
	#include "clang/AST/DeclBase.h"
	#include "clang/AST/DeclCXX.h"
	#include "clang/AST/DeclContextInternals.h"
	#include "clang/AST/DeclObjC.h"
	#include "clang/AST/DeclOpenMP.h"
	#include "clang/AST/DeclTemplate.h"
	#include "clang/AST/DeclarationName.h"
	#include "clang/AST/Expr.h"
	#include "clang/AST/ExprCXX.h"
	#include "clang/AST/ExternalASTSource.h"
	#include "clang/AST/Mangle.h"
	#include "clang/AST/MangleNumberingContext.h"
	#include "clang/AST/NestedNameSpecifier.h"
	#include "clang/AST/RawCommentList.h"
	#include "clang/AST/RecordLayout.h"
	#include "clang/AST/RecursiveASTVisitor.h"
	#include "clang/AST/Stmt.h"
	#include "clang/AST/TemplateBase.h"
	#include "clang/AST/TemplateName.h"
	#include "clang/AST/Type.h"
	#include "clang/AST/TypeLoc.h"
	#include "clang/AST/UnresolvedSet.h"
	#include "clang/AST/VTableBuilder.h"
	#include "clang/Basic/AddressSpaces.h"
	#include "clang/Basic/Builtins.h"
	#include "clang/Basic/CommentOptions.h"
	#include "clang/Basic/IdentifierTable.h"
	#include "clang/Basic/LLVM.h"
	#include "clang/Basic/LangOptions.h"
	#include "clang/Basic/Linkage.h"
	#include "clang/Basic/ObjCRuntime.h"
	#include "clang/Basic/SanitizerBlacklist.h"
	#include "clang/Basic/SourceLocation.h"
	#include "clang/Basic/SourceManager.h"
	#include "clang/Basic/Specifiers.h"
	#include "clang/Basic/TargetCXXABI.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Basic/XRayLists.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/APSInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/FoldingSet.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/PointerUnion.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/Support/Capacity.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <cstdlib>
	#include <map>
	#include <memory>
	#include <string>
	#include <tuple>
	#include <utility>

	using namespace clang;

	// Out-of-line definitions of ASTContext's unsigned members. No initializer
	// is given for any of them.
	// NOTE(review): presumably statistics counters for implicitly
	// declared special members -- confirm against ASTContext.h.
	unsigned ASTContext::NumImplicitDefaultConstructors;
	unsigned ASTContext::NumImplicitDefaultConstructorsDeclared;
	unsigned ASTContext::NumImplicitCopyConstructors;
	unsigned ASTContext::NumImplicitCopyConstructorsDeclared;
	unsigned ASTContext::NumImplicitMoveConstructors;
	unsigned ASTContext::NumImplicitMoveConstructorsDeclared;
	unsigned ASTContext::NumImplicitCopyAssignmentOperators;
	unsigned ASTContext::NumImplicitCopyAssignmentOperatorsDeclared;
	unsigned ASTContext::NumImplicitMoveAssignmentOperators;
	unsigned ASTContext::NumImplicitMoveAssignmentOperatorsDeclared;
	unsigned ASTContext::NumImplicitDestructors;
	unsigned ASTContext::NumImplicitDestructorsDeclared;

	// Enumerators take consecutive values starting at zero, in the order
	// listed.
	enum FloatingRank {
	  Float16Rank,
	  HalfRank,
	  FloatRank,
	  DoubleRank,
	  LongDoubleRank,
	  Float128Rank
	};

	/// Find the raw comment attached to \p D by inspecting the comment list
	/// directly.
	///
	/// Returns nullptr when \p D is a kind of declaration that cannot carry
	/// user documentation, when there are no comments at all, when D's location
	/// does not map to a file, or when no suitably placed documentation comment
	/// is found. Otherwise returns either a trailing comment on the same line
	/// as the declaration or the documentation comment immediately preceding it.
	RawComment *ASTContext::getRawCommentForDeclNoCache(const Decl *D) const {
	  if (!CommentsLoaded && ExternalSource) {
	    ExternalSource->ReadComments();

	#ifndef NDEBUG
	    ArrayRef<RawComment *> RawComments = Comments.getComments();
	    assert(std::is_sorted(RawComments.begin(), RawComments.end(),
	                          BeforeThanCompare<RawComment>(SourceMgr)));
	#endif

	    CommentsLoaded = true;
	  }

	  assert(D);

	  // User can not attach documentation to implicit declarations.
	  if (D->isImplicit())
	    return nullptr;

	  // User can not attach documentation to implicit instantiations.
	  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
	    if (FD->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
	      return nullptr;
	  }

	  if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
	    if (VD->isStaticDataMember() &&
	        VD->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
	      return nullptr;
	  }

	  if (const CXXRecordDecl *CRD = dyn_cast<CXXRecordDecl>(D)) {
	    if (CRD->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
	      return nullptr;
	  }

	  if (const ClassTemplateSpecializationDecl *CTSD =
	          dyn_cast<ClassTemplateSpecializationDecl>(D)) {
	    TemplateSpecializationKind TSK = CTSD->getSpecializationKind();
	    if (TSK == TSK_ImplicitInstantiation ||
	        TSK == TSK_Undeclared)
	      return nullptr;
	  }

	  if (const EnumDecl *ED = dyn_cast<EnumDecl>(D)) {
	    if (ED->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
	      return nullptr;
	  }
	  if (const TagDecl *TD = dyn_cast<TagDecl>(D)) {
	    // When tag declaration (but not definition!) is part of the
	    // decl-specifier-seq of some other declaration, it doesn't get comment
	    if (TD->isEmbeddedInDeclarator() && !TD->isCompleteDefinition())
	      return nullptr;
	  }
	  // TODO: handle comments for function parameters properly.
	  if (isa<ParmVarDecl>(D))
	    return nullptr;

	  // TODO: we could look up template parameter documentation in the template
	  // documentation.
	  if (isa<TemplateTypeParmDecl>(D) ||
	      isa<NonTypeTemplateParmDecl>(D) ||
	      isa<TemplateTemplateParmDecl>(D))
	    return nullptr;

	  ArrayRef<RawComment *> RawComments = Comments.getComments();

	  // If there are no comments anywhere, we won't find anything.
	  if (RawComments.empty())
	    return nullptr;

	  // Find declaration location.
	  // For Objective-C declarations we generally don't expect to have multiple
	  // declarators, thus use declaration starting location as the "declaration
	  // location".
	  // For all other declarations multiple declarators are used quite frequently,
	  // so we use the location of the identifier as the "declaration location".
	  SourceLocation DeclLoc;
	  if (isa<ObjCMethodDecl>(D) || isa<ObjCContainerDecl>(D) ||
	      isa<ObjCPropertyDecl>(D) ||
	      isa<RedeclarableTemplateDecl>(D) ||
	      isa<ClassTemplateSpecializationDecl>(D))
	    DeclLoc = D->getLocStart();
	  else {
	    DeclLoc = D->getLocation();
	    if (DeclLoc.isMacroID()) {
	      if (isa<TypedefDecl>(D)) {
	        // If location of the typedef name is in a macro, it is because being
	        // declared via a macro. Try using declaration's starting location as
	        // the "declaration location".
	        DeclLoc = D->getLocStart();
	      } else if (const TagDecl *TD = dyn_cast<TagDecl>(D)) {
	        // If location of the tag decl is inside a macro, but the spelling of
	        // the tag name comes from a macro argument, it looks like a special
	        // macro like NS_ENUM is being used to define the tag decl. In that
	        // case, adjust the source location to the expansion loc so that we can
	        // attach the comment to the tag decl.
	        if (SourceMgr.isMacroArgExpansion(DeclLoc) &&
	            TD->isCompleteDefinition())
	          DeclLoc = SourceMgr.getExpansionLoc(DeclLoc);
	      }
	    }
	  }

	  // If the declaration doesn't map directly to a location in a file, we
	  // can't find the comment.
	  if (DeclLoc.isInvalid() || !DeclLoc.isFileID())
	    return nullptr;

	  // Find the comment that occurs just after this declaration.
	  ArrayRef<RawComment *>::iterator Comment;
	  {
	    // When searching for comments during parsing, the comment we are looking
	    // for is usually among the last two comments we parsed -- check them
	    // first.
	    RawComment CommentAtDeclLoc(
	        SourceMgr, SourceRange(DeclLoc), false,
	        LangOpts.CommentOpts.ParseAllComments);
	    BeforeThanCompare<RawComment> Compare(SourceMgr);
	    // RawComments is non-empty here (checked above), so end() - 1 is valid.
	    ArrayRef<RawComment *>::iterator MaybeBeforeDecl = RawComments.end() - 1;
	    bool Found = Compare(*MaybeBeforeDecl, &CommentAtDeclLoc);
	    if (!Found && RawComments.size() >= 2) {
	      MaybeBeforeDecl--;
	      Found = Compare(*MaybeBeforeDecl, &CommentAtDeclLoc);
	    }

	    if (Found) {
	      Comment = MaybeBeforeDecl + 1;
	      assert(Comment == std::lower_bound(RawComments.begin(), RawComments.end(),
	                                         &CommentAtDeclLoc, Compare));
	    } else {
	      // Slow path.
	      Comment = std::lower_bound(RawComments.begin(), RawComments.end(),
	                                 &CommentAtDeclLoc, Compare);
	    }
	  }

	  // Decompose the location for the declaration and find the beginning of the
	  // file buffer.
	  std::pair<FileID, unsigned> DeclLocDecomp = SourceMgr.getDecomposedLoc(DeclLoc);

	  // First check whether we have a trailing comment.
	  if (Comment != RawComments.end() &&
	      (*Comment)->isDocumentation() && (*Comment)->isTrailingComment() &&
	      (isa<FieldDecl>(D) || isa<EnumConstantDecl>(D) || isa<VarDecl>(D) ||
	       isa<ObjCMethodDecl>(D) || isa<ObjCPropertyDecl>(D))) {
	    std::pair<FileID, unsigned> CommentBeginDecomp
	      = SourceMgr.getDecomposedLoc((*Comment)->getSourceRange().getBegin());
	    // Check that Doxygen trailing comment comes after the declaration, starts
	    // on the same line and in the same file as the declaration.
	    if (DeclLocDecomp.first == CommentBeginDecomp.first &&
	        SourceMgr.getLineNumber(DeclLocDecomp.first, DeclLocDecomp.second)
	          == SourceMgr.getLineNumber(CommentBeginDecomp.first,
	                                     CommentBeginDecomp.second)) {
	      return *Comment;
	    }
	  }

	  // The comment just after the declaration was not a trailing comment.
	  // Let's look at the previous comment.
	  if (Comment == RawComments.begin())
	    return nullptr;
	  --Comment;

	  // Check that we actually have a non-member Doxygen comment.
	  if (!(*Comment)->isDocumentation() || (*Comment)->isTrailingComment())
	    return nullptr;

	  // Decompose the end of the comment.
	  std::pair<FileID, unsigned> CommentEndDecomp
	    = SourceMgr.getDecomposedLoc((*Comment)->getSourceRange().getEnd());

	  // If the comment and the declaration aren't in the same file, then they
	  // aren't related.
	  if (DeclLocDecomp.first != CommentEndDecomp.first)
	    return nullptr;

	  // Get the corresponding buffer.
	  bool Invalid = false;
	  const char *Buffer = SourceMgr.getBufferData(DeclLocDecomp.first,
	                                               &Invalid).data();
	  if (Invalid)
	    return nullptr;

	  // Extract text between the comment and declaration.
	  StringRef Text(Buffer + CommentEndDecomp.second,
	                 DeclLocDecomp.second - CommentEndDecomp.second);

	  // There should be no other declarations or preprocessor directives between
	  // comment and declaration.
	  if (Text.find_first_of(";{}#@") != StringRef::npos)
	    return nullptr;

	  return *Comment;
	}

	/// If we have a 'templated' declaration for a template, adjust 'D' to
	/// refer to the actual template.
	/// If we have an implicit instantiation, adjust 'D' to refer to template.
	///
	/// Returns \p D unchanged when none of the adjustments below applies.
	static const Decl *adjustDeclToTemplate(const Decl *D) {
	  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
	    // Is this function declaration part of a function template?
	    if (const FunctionTemplateDecl *FTD = FD->getDescribedFunctionTemplate())
	      return FTD;

	    // Nothing to do if function is not an implicit instantiation.
	    if (FD->getTemplateSpecializationKind() != TSK_ImplicitInstantiation)
	      return D;

	    // Function is an implicit instantiation of a function template?
	    if (const FunctionTemplateDecl *FTD = FD->getPrimaryTemplate())
	      return FTD;

	    // Function is instantiated from a member definition of a class template?
	    if (const FunctionDecl *MemberDecl =
	            FD->getInstantiatedFromMemberFunction())
	      return MemberDecl;

	    return D;
	  }
	  if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
	    // Static data member is instantiated from a member definition of a class
	    // template?
	    if (VD->isStaticDataMember())
	      if (const VarDecl *MemberDecl = VD->getInstantiatedFromStaticDataMember())
	        return MemberDecl;

	    return D;
	  }
	  if (const CXXRecordDecl *CRD = dyn_cast<CXXRecordDecl>(D)) {
	    // Is this class declaration part of a class template?
	    if (const ClassTemplateDecl *CTD = CRD->getDescribedClassTemplate())
	      return CTD;

	    // Class is an implicit instantiation of a class template or partial
	    // specialization?
	    if (const ClassTemplateSpecializationDecl *CTSD =
	            dyn_cast<ClassTemplateSpecializationDecl>(CRD)) {
	      if (CTSD->getSpecializationKind() != TSK_ImplicitInstantiation)
	        return D;
	      llvm::PointerUnion<ClassTemplateDecl *,
	                         ClassTemplatePartialSpecializationDecl *>
	          PU = CTSD->getSpecializedTemplateOrPartial();
	      // Both arms are cast to const Decl* so the conditional has one type.
	      return PU.is<ClassTemplateDecl*>() ?
	          static_cast<const Decl*>(PU.get<ClassTemplateDecl *>()) :
	          static_cast<const Decl*>(
	              PU.get<ClassTemplatePartialSpecializationDecl *>());
	    }

	    // Class is instantiated from a member definition of a class template?
	    if (const MemberSpecializationInfo *Info =
	            CRD->getMemberSpecializationInfo())
	      return Info->getInstantiatedFrom();

	    return D;
	  }
	  if (const EnumDecl *ED = dyn_cast<EnumDecl>(D)) {
	    // Enum is instantiated from a member definition of a class template?
	    if (const EnumDecl *MemberDecl = ED->getInstantiatedFromMemberEnum())
	      return MemberDecl;

	    return D;
	  }
	  // FIXME: Adjust alias templates?
	  return D;
	}

	/// Find a raw comment for \p D or any of its redeclarations, using and
	/// updating the RedeclComments cache.
	///
	/// \p D is first passed through adjustDeclToTemplate. If \p OriginalDecl is
	/// non-null it receives the declaration the returned comment was found on
	/// (or nullptr when no comment is found by the search).
	const RawComment *ASTContext::getRawCommentForAnyRedecl(
	                                                const Decl *D,
	                                                const Decl **OriginalDecl) const {
	  D = adjustDeclToTemplate(D);

	  // Check whether we have cached a comment for this declaration already.
	  {
	    llvm::DenseMap<const Decl *, RawCommentAndCacheFlags>::iterator Pos =
	        RedeclComments.find(D);
	    if (Pos != RedeclComments.end()) {
	      const RawCommentAndCacheFlags &Raw = Pos->second;
	      if (Raw.getKind() != RawCommentAndCacheFlags::NoCommentInDecl) {
	        if (OriginalDecl)
	          *OriginalDecl = Raw.getOriginalDecl();
	        return Raw.getRaw();
	      }
	    }
	  }

	  // Search for comments attached to declarations in the redeclaration chain.
	  const RawComment *RC = nullptr;
	  const Decl *OriginalDeclForRC = nullptr;
	  for (auto I : D->redecls()) {
	    llvm::DenseMap<const Decl *, RawCommentAndCacheFlags>::iterator Pos =
	        RedeclComments.find(I);
	    if (Pos != RedeclComments.end()) {
	      const RawCommentAndCacheFlags &Raw = Pos->second;
	      if (Raw.getKind() != RawCommentAndCacheFlags::NoCommentInDecl) {
	        RC = Raw.getRaw();
	        OriginalDeclForRC = Raw.getOriginalDecl();
	        break;
	      }
	    } else {
	      RC = getRawCommentForDeclNoCache(I);
	      OriginalDeclForRC = I;
	      RawCommentAndCacheFlags Raw;
	      if (RC) {
	        // Call order swapped to work around ICE in VS2015 RTM (Release Win32)
	        // https://connect.microsoft.com/VisualStudio/feedback/details/1741530
	        Raw.setKind(RawCommentAndCacheFlags::FromDecl);
	        Raw.setRaw(RC);
	      } else
	        Raw.setKind(RawCommentAndCacheFlags::NoCommentInDecl);
	      Raw.setOriginalDecl(I);
	      RedeclComments[I] = Raw;
	      if (RC)
	        break;
	    }
	  }

	  // If we found a comment, it should be a documentation comment.
	  assert(!RC || RC->isDocumentation());

	  if (OriginalDecl)
	    *OriginalDecl = OriginalDeclForRC;

	  // Update cache for every declaration in the redeclaration chain.
	  RawCommentAndCacheFlags Raw;
	  Raw.setRaw(RC);
	  Raw.setKind(RawCommentAndCacheFlags::FromRedecl);
	  Raw.setOriginalDecl(OriginalDeclForRC);

	  // Only entries still marked NoCommentInDecl are overwritten.
	  for (auto I : D->redecls()) {
	    RawCommentAndCacheFlags &R = RedeclComments[I];
	    if (R.getKind() == RawCommentAndCacheFlags::NoCommentInDecl)
	      R = Raw;
	  }

	  return RC;
	}

	static void addRedeclaredMethods(const ObjCMethodDecl *ObjCMethod,
	SmallVectorImpl<const NamedDecl *> &Redeclared) {
	const DeclContext *DC = ObjCMethod->getDeclContext();
	if (const ObjCImplDecl *IMD = dyn_cast<ObjCImplDecl>(DC)) {
	const ObjCInterfaceDecl *ID = IMD->getClassInterface();
	if (!ID)
	return;
	// Add redeclared method here.
	for (const auto *Ext : ID->known_extensions()) {
	if (ObjCMethodDecl *RedeclaredMethod =
	Ext->getMethod(ObjCMethod->getSelector(),
	ObjCMethod->isInstanceMethod()))
	Redeclared.push_back(RedeclaredMethod);
	}
	}
	}

	/// Build a new FullComment that shares \p FC's blocks but carries a fresh
	/// DeclInfo filled for \p D.
	///
	/// Both the DeclInfo and the returned FullComment are allocated with
	/// placement new on this ASTContext.
	comments::FullComment *ASTContext::cloneFullComment(comments::FullComment *FC,
	                                                    const Decl *D) const {
	  comments::DeclInfo *ThisDeclInfo = new (*this) comments::DeclInfo;
	  // Fill the info using D, then point CommentDecl at the declaration the
	  // original comment belongs to.
	  ThisDeclInfo->CommentDecl = D;
	  ThisDeclInfo->IsFilled = false;
	  ThisDeclInfo->fill();
	  ThisDeclInfo->CommentDecl = FC->getDecl();
	  // Fall back to the original comment's template parameters if fill()
	  // left none.
	  if (!ThisDeclInfo->TemplateParameters)
	    ThisDeclInfo->TemplateParameters = FC->getDeclInfo()->TemplateParameters;
	  comments::FullComment *CFC =
	    new (*this) comments::FullComment(FC->getBlocks(),
	                                      ThisDeclInfo);
	  return CFC;
	}

	/// Parse the raw comment found directly on \p D, bypassing the caches.
	/// Returns nullptr when getRawCommentForDeclNoCache finds no comment.
	comments::FullComment *ASTContext::getLocalCommentForDeclUncached(const Decl *D) const {
	  const RawComment *RC = getRawCommentForDeclNoCache(D);
	  return RC ? RC->parse(*this, nullptr, D) : nullptr;
	}

	/// Return the parsed documentation comment for \p D, or nullptr.
	///
	/// Results are cached in ParsedComments keyed by the canonical declaration.
	/// When \p D has no raw comment of its own (on any redeclaration), a
	/// comment is looked up on related declarations -- the property for an
	/// accessor, redeclared/overridden methods, the tag behind a typedef,
	/// Objective-C superclasses or the category's interface, and public C++
	/// base classes -- and a clone of it bound to \p D is returned.
	comments::FullComment *ASTContext::getCommentForDecl(
	                                              const Decl *D,
	                                              const Preprocessor *PP) const {
	  if (D->isInvalidDecl())
	    return nullptr;
	  D = adjustDeclToTemplate(D);

	  const Decl *Canonical = D->getCanonicalDecl();
	  llvm::DenseMap<const Decl *, comments::FullComment *>::iterator Pos =
	      ParsedComments.find(Canonical);

	  if (Pos != ParsedComments.end()) {
	    // A cached comment is cloned when D is not the canonical declaration.
	    if (Canonical != D) {
	      comments::FullComment *FC = Pos->second;
	      comments::FullComment *CFC = cloneFullComment(FC, D);
	      return CFC;
	    }
	    return Pos->second;
	  }

	  const Decl *OriginalDecl;

	  const RawComment *RC = getRawCommentForAnyRedecl(D, &OriginalDecl);
	  if (!RC) {
	    if (isa<ObjCMethodDecl>(D) || isa<FunctionDecl>(D)) {
	      SmallVector<const NamedDecl*, 8> Overridden;
	      const ObjCMethodDecl *OMD = dyn_cast<ObjCMethodDecl>(D);
	      if (OMD && OMD->isPropertyAccessor())
	        if (const ObjCPropertyDecl *PDecl = OMD->findPropertyDecl())
	          if (comments::FullComment *FC = getCommentForDecl(PDecl, PP))
	            return cloneFullComment(FC, D);
	      if (OMD)
	        addRedeclaredMethods(OMD, Overridden);
	      getOverriddenMethods(dyn_cast<NamedDecl>(D), Overridden);
	      for (unsigned i = 0, e = Overridden.size(); i < e; i++)
	        if (comments::FullComment *FC = getCommentForDecl(Overridden[i], PP))
	          return cloneFullComment(FC, D);
	    }
	    else if (const TypedefNameDecl *TD = dyn_cast<TypedefNameDecl>(D)) {
	      // Attach any tag type's documentation to its typedef if latter
	      // does not have one of its own.
	      QualType QT = TD->getUnderlyingType();
	      if (const TagType *TT = QT->getAs<TagType>())
	        if (const Decl *TD = TT->getDecl())
	          if (comments::FullComment *FC = getCommentForDecl(TD, PP))
	            return cloneFullComment(FC, D);
	    }
	    else if (const ObjCInterfaceDecl *IC = dyn_cast<ObjCInterfaceDecl>(D)) {
	      while (IC->getSuperClass()) {
	        IC = IC->getSuperClass();
	        if (comments::FullComment *FC = getCommentForDecl(IC, PP))
	          return cloneFullComment(FC, D);
	      }
	    }
	    else if (const ObjCCategoryDecl *CD = dyn_cast<ObjCCategoryDecl>(D)) {
	      if (const ObjCInterfaceDecl *IC = CD->getClassInterface())
	        if (comments::FullComment *FC = getCommentForDecl(IC, PP))
	          return cloneFullComment(FC, D);
	    }
	    else if (const CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(D)) {
	      if (!(RD = RD->getDefinition()))
	        return nullptr;
	      // Check non-virtual bases.
	      for (const auto &I : RD->bases()) {
	        if (I.isVirtual() || (I.getAccessSpecifier() != AS_public))
	          continue;
	        QualType Ty = I.getType();
	        if (Ty.isNull())
	          continue;
	        if (const CXXRecordDecl *NonVirtualBase = Ty->getAsCXXRecordDecl()) {
	          if (!(NonVirtualBase= NonVirtualBase->getDefinition()))
	            continue;

	          if (comments::FullComment *FC = getCommentForDecl((NonVirtualBase), PP))
	            return cloneFullComment(FC, D);
	        }
	      }
	      // Check virtual bases.
	      for (const auto &I : RD->vbases()) {
	        if (I.getAccessSpecifier() != AS_public)
	          continue;
	        QualType Ty = I.getType();
	        if (Ty.isNull())
	          continue;
	        if (const CXXRecordDecl *VirtualBase = Ty->getAsCXXRecordDecl()) {
	          if (!(VirtualBase= VirtualBase->getDefinition()))
	            continue;
	          if (comments::FullComment *FC = getCommentForDecl((VirtualBase), PP))
	            return cloneFullComment(FC, D);
	        }
	      }
	    }
	    return nullptr;
	  }

	  // If the RawComment was attached to other redeclaration of this Decl, we
	  // should parse the comment in context of that other Decl. This is important
	  // because comments can contain references to parameter names which can be
	  // different across redeclarations.
	  if (D != OriginalDecl)
	    return getCommentForDecl(OriginalDecl, PP);

	  comments::FullComment *FC = RC->parse(*this, PP, D);
	  ParsedComments[Canonical] = FC;
	  return FC;
	}

	/// Add \p Parm's depth, position, pack-ness and the shape of its template
	/// parameter list to \p ID.
	///
	/// Each inner parameter contributes a tag (0 = type, 1 = non-type,
	/// 2 = template template) followed by kind-specific data; template template
	/// parameters are profiled recursively.
	void
	ASTContext::CanonicalTemplateTemplateParm::Profile(llvm::FoldingSetNodeID &ID,
	                                               TemplateTemplateParmDecl *Parm) {
	  ID.AddInteger(Parm->getDepth());
	  ID.AddInteger(Parm->getPosition());
	  ID.AddBoolean(Parm->isParameterPack());

	  TemplateParameterList *Params = Parm->getTemplateParameters();
	  ID.AddInteger(Params->size());
	  for (TemplateParameterList::const_iterator P = Params->begin(),
	                                          PEnd = Params->end();
	       P != PEnd; ++P) {
	    if (TemplateTypeParmDecl *TTP = dyn_cast<TemplateTypeParmDecl>(*P)) {
	      ID.AddInteger(0);
	      ID.AddBoolean(TTP->isParameterPack());
	      continue;
	    }

	    if (NonTypeTemplateParmDecl *NTTP = dyn_cast<NonTypeTemplateParmDecl>(*P)) {
	      ID.AddInteger(1);
	      ID.AddBoolean(NTTP->isParameterPack());
	      ID.AddPointer(NTTP->getType().getCanonicalType().getAsOpaquePtr());
	      if (NTTP->isExpandedParameterPack()) {
	        ID.AddBoolean(true);
	        ID.AddInteger(NTTP->getNumExpansionTypes());
	        for (unsigned I = 0, N = NTTP->getNumExpansionTypes(); I != N; ++I) {
	          QualType T = NTTP->getExpansionType(I);
	          ID.AddPointer(T.getCanonicalType().getAsOpaquePtr());
	        }
	      } else
	        ID.AddBoolean(false);
	      continue;
	    }

	    TemplateTemplateParmDecl *TTP = cast<TemplateTemplateParmDecl>(*P);
	    ID.AddInteger(2);
	    Profile(ID, TTP);
	  }
	}

	/// Return the canonical template template parameter for \p TTP, creating
	/// and registering it in CanonTemplateTemplateParms on first use.
	///
	/// A new canonical parameter is built in the translation unit decl with
	/// default-constructed source locations, canonicalized non-type parameter
	/// types, and recursively canonicalized template template parameters.
	TemplateTemplateParmDecl *
	ASTContext::getCanonicalTemplateTemplateParmDecl(
	                                          TemplateTemplateParmDecl *TTP) const {
	  // Check if we already have a canonical template template parameter.
	  llvm::FoldingSetNodeID ID;
	  CanonicalTemplateTemplateParm::Profile(ID, TTP);
	  void *InsertPos = nullptr;
	  CanonicalTemplateTemplateParm *Canonical
	    = CanonTemplateTemplateParms.FindNodeOrInsertPos(ID, InsertPos);
	  if (Canonical)
	    return Canonical->getParam();

	  // Build a canonical template parameter list.
	  TemplateParameterList *Params = TTP->getTemplateParameters();
	  SmallVector<NamedDecl *, 4> CanonParams;
	  CanonParams.reserve(Params->size());
	  for (TemplateParameterList::const_iterator P = Params->begin(),
	                                          PEnd = Params->end();
	       P != PEnd; ++P) {
	    if (TemplateTypeParmDecl *TTP = dyn_cast<TemplateTypeParmDecl>(*P))
	      CanonParams.push_back(
	                  TemplateTypeParmDecl::Create(*this, getTranslationUnitDecl(),
	                                               SourceLocation(),
	                                               SourceLocation(),
	                                               TTP->getDepth(),
	                                               TTP->getIndex(), nullptr, false,
	                                               TTP->isParameterPack()));
	    else if (NonTypeTemplateParmDecl *NTTP
	             = dyn_cast<NonTypeTemplateParmDecl>(*P)) {
	      QualType T = getCanonicalType(NTTP->getType());
	      TypeSourceInfo *TInfo = getTrivialTypeSourceInfo(T);
	      NonTypeTemplateParmDecl *Param;
	      if (NTTP->isExpandedParameterPack()) {
	        SmallVector<QualType, 2> ExpandedTypes;
	        SmallVector<TypeSourceInfo *, 2> ExpandedTInfos;
	        for (unsigned I = 0, N = NTTP->getNumExpansionTypes(); I != N; ++I) {
	          ExpandedTypes.push_back(getCanonicalType(NTTP->getExpansionType(I)));
	          ExpandedTInfos.push_back(
	                                getTrivialTypeSourceInfo(ExpandedTypes.back()));
	        }

	        Param = NonTypeTemplateParmDecl::Create(*this, getTranslationUnitDecl(),
	                                                SourceLocation(),
	                                                SourceLocation(),
	                                                NTTP->getDepth(),
	                                                NTTP->getPosition(), nullptr,
	                                                T,
	                                                TInfo,
	                                                ExpandedTypes,
	                                                ExpandedTInfos);
	      } else {
	        Param = NonTypeTemplateParmDecl::Create(*this, getTranslationUnitDecl(),
	                                                SourceLocation(),
	                                                SourceLocation(),
	                                                NTTP->getDepth(),
	                                                NTTP->getPosition(), nullptr,
	                                                T,
	                                                NTTP->isParameterPack(),
	                                                TInfo);
	      }
	      CanonParams.push_back(Param);

	    } else
	      CanonParams.push_back(getCanonicalTemplateTemplateParmDecl(
	                                           cast<TemplateTemplateParmDecl>(*P)));
	  }

	  assert(!TTP->getRequiresClause() &&
	         "Unexpected requires-clause on template template-parameter");
	  Expr *const CanonRequiresClause = nullptr;

	  TemplateTemplateParmDecl *CanonTTP
	    = TemplateTemplateParmDecl::Create(*this, getTranslationUnitDecl(),
	                                       SourceLocation(), TTP->getDepth(),
	                                       TTP->getPosition(),
	                                       TTP->isParameterPack(),
	                                       nullptr,
	                         TemplateParameterList::Create(*this, SourceLocation(),
	                                                       SourceLocation(),
	                                                       CanonParams,
	                                                       SourceLocation(),
	                                                       CanonRequiresClause));

	  // Get the new insert position for the node we care about.
	  Canonical = CanonTemplateTemplateParms.FindNodeOrInsertPos(ID, InsertPos);
	  assert(!Canonical && "Shouldn't be in the map!");
	  (void)Canonical;

	  // Create the canonical template template parameter entry.
	  Canonical = new (*this) CanonicalTemplateTemplateParm(CanonTTP);
	  CanonTemplateTemplateParms.InsertNode(Canonical, InsertPos);
	  return CanonTTP;
	}

	/// Create the C++ ABI object that corresponds to the ABI kind reported by
	/// the target \p T.
	///
	/// \returns null when the language is not C++; otherwise the result of
	/// CreateItaniumCXXABI or CreateMicrosoftCXXABI.
	CXXABI *ASTContext::createCXXABI(const TargetInfo &T) {
	  if (!LangOpts.CPlusPlus)
	    return nullptr;

	  switch (T.getCXXABI().getKind()) {
	  case TargetCXXABI::Microsoft:
	    return CreateMicrosoftCXXABI(*this);

	  // All of the remaining kinds are handled like Itanium at this level.
	  case TargetCXXABI::GenericItanium:
	  case TargetCXXABI::GenericARM:
	  case TargetCXXABI::GenericAArch64:
	  case TargetCXXABI::GenericMIPS:
	  case TargetCXXABI::iOS:
	  case TargetCXXABI::iOS64:
	  case TargetCXXABI::WatchOS:
	  case TargetCXXABI::WebAssembly:
	    return CreateItaniumCXXABI(*this);
	  }
	  llvm_unreachable("Invalid CXXABI type!");
	}

	/// Pick the address space map to use: a fixed, function-local fake map when
	/// LOpts.FakeAddressSpaceMap is set, otherwise the map provided by the
	/// target \p T.
	static const LangASMap *getAddressSpaceMap(const TargetInfo &T,
	                                           const LangOptions &LOpts) {
	  if (!LOpts.FakeAddressSpaceMap)
	    return &T.getAddressSpaceMap();

	  // The fake address space map must have a distinct entry for each
	  // language-specific address space.
	  static const unsigned FakeAddrSpaceMap[] = {
	    0, // Default
	    1, // opencl_global
	    3, // opencl_local
	    2, // opencl_constant
	    0, // opencl_private
	    4, // opencl_generic
	    5, // cuda_device
	    6, // cuda_constant
	    7  // cuda_shared
	  };
	  return &FakeAddrSpaceMap;
	}

	/// Decide whether address-space-map mangling is enabled: forced on or off by
	/// the language option, or delegated to the target \p TI.
	static bool isAddrSpaceMapManglingEnabled(const TargetInfo &TI,
	                                          const LangOptions &LangOpts) {
	  switch (LangOpts.getAddressSpaceMapMangling()) {
	  case LangOptions::ASMM_On:
	    return true;
	  case LangOptions::ASMM_Off:
	    return false;
	  case LangOptions::ASMM_Target:
	    // No explicit override: ask the target.
	    return TI.useAddressSpaceMapMangling();
	  }
	  llvm_unreachable("getAddressSpaceMapMangling() doesn't cover anything.");
	}

	/// Construct an ASTContext from the given language options, source manager,
	/// identifier table, selector table and builtin context.
	///
	/// The member initializers bind the references passed in, allocate the
	/// sanitizer blacklist and XRay function filter from the file lists in the
	/// language options, and the body creates the translation unit declaration.
	ASTContext::ASTContext(LangOptions &LOpts, SourceManager &SM,
	IdentifierTable &idents, SelectorTable &sels,
	Builtin::Context &builtins)
	: FunctionProtoTypes(this_()), TemplateSpecializationTypes(this_()),
	DependentTemplateSpecializationTypes(this_()),
	SubstTemplateTemplateParmPacks(this_()), SourceMgr(SM), LangOpts(LOpts),
	SanitizerBL(new SanitizerBlacklist(LangOpts.SanitizerBlacklistFiles, SM)),
	XRayFilter(new XRayFunctionFilter(LangOpts.XRayAlwaysInstrumentFiles,
	LangOpts.XRayNeverInstrumentFiles, SM)),
	PrintingPolicy(LOpts), Idents(idents), Selectors(sels),
	BuiltinInfo(builtins), DeclarationNames(*this), Comments(SM),
	CommentCommandTraits(BumpAlloc, LOpts.CommentOpts), LastSDM(nullptr, 0) {
	// Create the translation unit declaration for this context.
	TUDecl = TranslationUnitDecl::Create(*this);
	}

	/// Tear down the context: release the parent-map entries and DeclContext
	/// maps, run every registered deallocation callback, and explicitly destroy
	/// the objects referenced from the context's side tables (record layouts,
	/// attribute vectors, materialized temporary values, module initializers).
	ASTContext::~ASTContext() {
	  ReleaseParentMapEntries();

	  // Release the DenseMaps associated with DeclContext objects.
	  // FIXME: Is this the ideal solution?
	  ReleaseDeclContextMaps();

	  // Call all of the deallocation functions on all of their targets.
	  for (auto &Pair : Deallocations)
	    (Pair.first)(Pair.second);

	  // ASTRecordLayout objects in ASTRecordLayouts must always be destroyed
	  // because they can contain DenseMaps.
	  for (llvm::DenseMap<const ObjCContainerDecl *,
	                      const ASTRecordLayout *>::iterator
	           I = ObjCLayouts.begin(), E = ObjCLayouts.end();
	       I != E;) {
	    // Increment in loop to prevent using deallocated memory.
	    if (ASTRecordLayout *R = const_cast<ASTRecordLayout *>((I++)->second))
	      R->Destroy(*this);
	  }

	  for (llvm::DenseMap<const RecordDecl *, const ASTRecordLayout *>::iterator
	           I = ASTRecordLayouts.begin(), E = ASTRecordLayouts.end();
	       I != E;) {
	    // Increment in loop to prevent using deallocated memory.
	    if (ASTRecordLayout *R = const_cast<ASTRecordLayout *>((I++)->second))
	      R->Destroy(*this);
	  }

	  // The attribute vectors are only destroyed here, not deallocated.
	  for (llvm::DenseMap<const Decl *, AttrVec *>::iterator
	           A = DeclAttrs.begin(), AEnd = DeclAttrs.end();
	       A != AEnd; ++A)
	    A->second->~AttrVec();

	  for (std::pair<const MaterializeTemporaryExpr *, APValue *> &MTVPair :
	       MaterializedTemporaryValues)
	    MTVPair.second->~APValue();

	  for (const auto &Value : ModuleInitializers)
	    Value.second->~PerModuleInitializers();
	}

	/// Delete the heap-allocated DynTypedNode / ParentVector objects stored in
	/// the two parent maps. Does nothing when PointerParents is not set.
	void ASTContext::ReleaseParentMapEntries() {
	  if (!PointerParents)
	    return;

	  for (const auto &KV : *PointerParents) {
	    const auto &Parents = KV.second;
	    if (Parents.is<ast_type_traits::DynTypedNode *>())
	      delete Parents.get<ast_type_traits::DynTypedNode *>();
	    else if (Parents.is<ParentVector *>())
	      delete Parents.get<ParentVector *>();
	  }

	  for (const auto &KV : *OtherParents) {
	    const auto &Parents = KV.second;
	    if (Parents.is<ast_type_traits::DynTypedNode *>())
	      delete Parents.get<ast_type_traits::DynTypedNode *>();
	    else if (Parents.is<ParentVector *>())
	      delete Parents.get<ParentVector *>();
	  }
	}

	/// Register \p Callback to be invoked with \p Data when the context is
	/// destroyed (the destructor walks Deallocations and calls each pair).
	void ASTContext::AddDeallocation(void (*Callback)(void *), void *Data) {
	  Deallocations.push_back({Callback, Data});
	}

	/// Install \p Source as this context's external AST source, taking over the
	/// reference passed in.
	void ASTContext::setExternalSource(
	    IntrusiveRefCntPtr<ExternalASTSource> Source) {
	  ExternalSource = std::move(Source);
	}

	/// Print statistics about this context to llvm::errs(): the total number of
	/// types, a per-type-class count and byte estimate, the counts of implicit
	/// special member functions, the external source's statistics (if any) and
	/// the bump allocator's statistics.
	void ASTContext::PrintStats() const {
	llvm::errs() << "\n*** AST Context Stats:\n";
	llvm::errs() << " " << Types.size() << " types total.\n";

	// One zero-initialized counter per TYPE entry of TypeNodes.def
	// (ABSTRACT_TYPE entries expand to nothing), plus one extra slot.
	unsigned counts[] = {
	#define TYPE(Name, Parent) 0,
	#define ABSTRACT_TYPE(Name, Parent)
	#include "clang/AST/TypeNodes.def"
	0 // Extra
	};

	// Tally every allocated type by its type class.
	for (unsigned i = 0, e = Types.size(); i != e; ++i) {
	Type *T = Types[i];
	counts[(unsigned)T->getTypeClass()]++;
	}

	// Walk the TYPE entries again in the same order, printing each non-zero
	// count and accumulating count * sizeof(<Name>Type) into TotalBytes.
	unsigned Idx = 0;
	unsigned TotalBytes = 0;
	#define TYPE(Name, Parent) \
	if (counts[Idx]) \
	llvm::errs() << " " << counts[Idx] << " " << #Name \
	<< " types\n"; \
	TotalBytes += counts[Idx] * sizeof(Name##Type); \
	++Idx;
	#define ABSTRACT_TYPE(Name, Parent)
	#include "clang/AST/TypeNodes.def"

	llvm::errs() << "Total bytes = " << TotalBytes << "\n";

	// Implicit special member functions.
	llvm::errs() << NumImplicitDefaultConstructorsDeclared << "/"
	<< NumImplicitDefaultConstructors
	<< " implicit default constructors created\n";
	llvm::errs() << NumImplicitCopyConstructorsDeclared << "/"
	<< NumImplicitCopyConstructors
	<< " implicit copy constructors created\n";
	// Move operations are only reported when compiling C++.
	if (getLangOpts().CPlusPlus)
	llvm::errs() << NumImplicitMoveConstructorsDeclared << "/"
	<< NumImplicitMoveConstructors
	<< " implicit move constructors created\n";
	llvm::errs() << NumImplicitCopyAssignmentOperatorsDeclared << "/"
	<< NumImplicitCopyAssignmentOperators
	<< " implicit copy assignment operators created\n";
	if (getLangOpts().CPlusPlus)
	llvm::errs() << NumImplicitMoveAssignmentOperatorsDeclared << "/"
	<< NumImplicitMoveAssignmentOperators
	<< " implicit move assignment operators created\n";
	llvm::errs() << NumImplicitDestructorsDeclared << "/"
	<< NumImplicitDestructors
	<< " implicit destructors created\n";

	// Let the external AST source, when present, append its own statistics.
	if (ExternalSource) {
	llvm::errs() << "\n";
	ExternalSource->PrintStats();
	}

	BumpAlloc.PrintStats();
	}

	void ASTContext::mergeDefinitionIntoModule(NamedDecl ND, Module M,
	bool NotifyListeners) {
	if (NotifyListeners)
	if (auto *Listener = getASTMutationListener())
	Listener->RedefinedHiddenDefinition(ND, M);

	if (getLangOpts().ModulesLocalVisibility)
	MergedDefModules[ND].push_back(M);
	else
	ND->setVisibleDespiteOwningModule();
	}

	/// Remove duplicate modules from the merged-definition list recorded for
	/// \p ND, keeping the first occurrence of each module. No-op when \p ND has
	/// no entry in MergedDefModules.
	void ASTContext::deduplicateMergedDefinitonsFor(NamedDecl *ND) {
	  auto Entry = MergedDefModules.find(ND);
	  if (Entry == MergedDefModules.end())
	    return;

	  auto &Modules = Entry->second;

	  // Null out every repeat, then compact the list in one pass.
	  llvm::DenseSet<Module*> Seen;
	  for (Module *&Mod : Modules) {
	    const bool FirstTime = Seen.insert(Mod).second;
	    if (!FirstTime)
	      Mod = nullptr;
	  }
	  auto NewEnd = std::remove(Modules.begin(), Modules.end(), nullptr);
	  Modules.erase(NewEnd, Modules.end());
	}

	/// Turn every pending lazy initializer ID into a declaration by asking the
	/// external source of \p Ctx, appending the results to Initializers.
	void ASTContext::PerModuleInitializers::resolve(ASTContext &Ctx) {
	  if (LazyInitializers.empty())
	    return;

	  auto *ExtSource = Ctx.getExternalSource();
	  assert(ExtSource && "lazy initializers but no external source");

	  // Take the pending IDs out of the member before resolving them, so the
	  // assert below can detect any that are added while resolving.
	  auto Pending = std::move(LazyInitializers);
	  LazyInitializers.clear();

	  for (auto DeclID : Pending)
	    Initializers.push_back(ExtSource->GetExternalDecl(DeclID));

	  assert(LazyInitializers.empty() &&
	         "GetExternalDecl for lazy module initializer added more inits");
	}

	void ASTContext::addModuleInitializer(Module M, Decl D) {
	// One special case: if we add a module initializer that imports another
	// module, and that module's only initializer is an ImportDecl, simplify.
	if (auto *ID = dyn_cast<ImportDecl>(D)) {
	auto It = ModuleInitializers.find(ID->getImportedModule());

	// Maybe the ImportDecl does nothing at all. (Common case.)
	if (It == ModuleInitializers.end())
	return;

	// Maybe the ImportDecl only imports another ImportDecl.
	auto &Imported = *It->second;
	if (Imported.Initializers.size() + Imported.LazyInitializers.size() == 1) {
	Imported.resolve(*this);
	auto *OnlyDecl = Imported.Initializers.front();
	if (isa<ImportDecl>(OnlyDecl))
	D = OnlyDecl;
	}
	}

	auto *&Inits = ModuleInitializers[M];
	if (!Inits)
	Inits = new (*this) PerModuleInitializers;
	Inits->Initializers.push_back(D);
	}

	/// Append \p IDs to the lazy initializer list of module \p M, creating the
	/// per-module record on first use.
	void ASTContext::addLazyModuleInitializers(Module *M, ArrayRef<uint32_t> IDs) {
	  auto *&PerModule = ModuleInitializers[M];
	  if (!PerModule)
	    PerModule = new (*this) PerModuleInitializers;

	  auto &Lazy = PerModule->LazyInitializers;
	  Lazy.insert(Lazy.end(), IDs.begin(), IDs.end());
	}

	/// Return the initializers recorded for module \p M, resolving any lazy
	/// initializers first. Returns an empty array when \p M has no record.
	ArrayRef<Decl *> ASTContext::getModuleInitializers(Module *M) {
	  auto It = ModuleInitializers.find(M);
	  if (It == ModuleInitializers.end())
	    return None;

	  auto *Inits = It->second;
	  Inits->resolve(*this);
	  return Inits->Initializers;
	}

	/// Return the extern "C" context declaration, creating it inside the
	/// translation unit on first request and caching it in ExternCContext.
	ExternCContextDecl *ASTContext::getExternCContextDecl() const {
	  if (ExternCContext)
	    return ExternCContext;

	  ExternCContext = ExternCContextDecl::Create(*this, getTranslationUnitDecl());
	  return ExternCContext;
	}

	/// Create an implicit builtin template declaration of kind \p BTK named
	/// \p II and add it to the translation unit.
	BuiltinTemplateDecl *
	ASTContext::buildBuiltinTemplateDecl(BuiltinTemplateKind BTK,
	                                     const IdentifierInfo *II) const {
	  auto *BuiltinTemplate = BuiltinTemplateDecl::Create(*this, TUDecl, II, BTK);
	  BuiltinTemplate->setImplicit();
	  TUDecl->addDecl(BuiltinTemplate);

	  return BuiltinTemplate;
	}

	/// Return the __make_integer_seq builtin template declaration, building and
	/// caching it on first request.
	BuiltinTemplateDecl *
	ASTContext::getMakeIntegerSeqDecl() const {
	  if (MakeIntegerSeqDecl)
	    return MakeIntegerSeqDecl;

	  MakeIntegerSeqDecl = buildBuiltinTemplateDecl(BTK__make_integer_seq,
	                                                getMakeIntegerSeqName());
	  return MakeIntegerSeqDecl;
	}

	/// Return the __type_pack_element builtin template declaration, building
	/// and caching it on first request.
	BuiltinTemplateDecl *
	ASTContext::getTypePackElementDecl() const {
	  if (TypePackElementDecl)
	    return TypePackElementDecl;

	  TypePackElementDecl = buildBuiltinTemplateDecl(BTK__type_pack_element,
	                                                 getTypePackElementName());
	  return TypePackElementDecl;
	}

	/// Create an implicit record named \p Name with tag kind \p TK in the
	/// translation unit. A CXXRecordDecl is created when compiling C++, a plain
	/// RecordDecl otherwise; the result gets default type visibility.
	RecordDecl *ASTContext::buildImplicitRecord(StringRef Name,
	                                            RecordDecl::TagKind TK) const {
	  const SourceLocation NoLoc;
	  RecordDecl *Record;
	  if (!getLangOpts().CPlusPlus)
	    Record = RecordDecl::Create(*this, TK, getTranslationUnitDecl(), NoLoc,
	                                NoLoc, &Idents.get(Name));
	  else
	    Record = CXXRecordDecl::Create(*this, TK, getTranslationUnitDecl(), NoLoc,
	                                   NoLoc, &Idents.get(Name));

	  Record->setImplicit();
	  Record->addAttr(TypeVisibilityAttr::CreateImplicit(
	      const_cast<ASTContext &>(*this), TypeVisibilityAttr::Default));
	  return Record;
	}

	/// Create an implicit typedef named \p Name for type \p T in the
	/// translation unit, using trivial type source information.
	TypedefDecl *ASTContext::buildImplicitTypedef(QualType T,
	                                              StringRef Name) const {
	  TypeSourceInfo *TrivialTInfo = getTrivialTypeSourceInfo(T);
	  TypedefDecl *Typedef = TypedefDecl::Create(
	      const_cast<ASTContext &>(*this), getTranslationUnitDecl(),
	      SourceLocation(), SourceLocation(), &Idents.get(Name), TrivialTInfo);
	  Typedef->setImplicit();
	  return Typedef;
	}

	/// Return the implicit __int128_t typedef, creating it on first request.
	TypedefDecl *ASTContext::getInt128Decl() const {
	  if (Int128Decl)
	    return Int128Decl;

	  Int128Decl = buildImplicitTypedef(Int128Ty, "__int128_t");
	  return Int128Decl;
	}

	/// Return the implicit __uint128_t typedef, creating it on first request.
	TypedefDecl *ASTContext::getUInt128Decl() const {
	  if (UInt128Decl)
	    return UInt128Decl;

	  UInt128Decl = buildImplicitTypedef(UnsignedInt128Ty, "__uint128_t");
	  return UInt128Decl;
	}

	/// Allocate the builtin type of kind \p K in this context, store its
	/// canonical form in \p R and register it in the Types list.
	void ASTContext::InitBuiltinType(CanQualType &R, BuiltinType::Kind K) {
	  BuiltinType *Ty = new (*this, TypeAlignment) BuiltinType(K);
	  R = CanQualType::CreateUnsafe(QualType(Ty, 0));
	  Types.push_back(Ty);
	}

	/// Bind the context to \p Target (and optional \p AuxTarget) and create all
	/// builtin types.
	///
	/// Besides the types themselves this sets up the C++ ABI object, the address
	/// space map and its mangling flag. It must run only once per context: the
	/// asserts reject a different target or an already initialized VoidTy.
	void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
	                                  const TargetInfo *AuxTarget) {
	  assert((!this->Target || this->Target == &Target) &&
	         "Incorrect target reinitialization");
	  assert(VoidTy.isNull() && "Context reinitialized?");

	  this->Target = &Target;
	  this->AuxTarget = AuxTarget;

	  ABI.reset(createCXXABI(Target));
	  AddrSpaceMap = getAddressSpaceMap(Target, LangOpts);
	  AddrSpaceMapMangling = isAddrSpaceMapManglingEnabled(Target, LangOpts);

	  // C99 6.2.5p19.
	  InitBuiltinType(VoidTy, BuiltinType::Void);

	  // C99 6.2.5p2.
	  InitBuiltinType(BoolTy, BuiltinType::Bool);
	  // C99 6.2.5p3.
	  if (LangOpts.CharIsSigned)
	    InitBuiltinType(CharTy, BuiltinType::Char_S);
	  else
	    InitBuiltinType(CharTy, BuiltinType::Char_U);
	  // C99 6.2.5p4.
	  InitBuiltinType(SignedCharTy, BuiltinType::SChar);
	  InitBuiltinType(ShortTy, BuiltinType::Short);
	  InitBuiltinType(IntTy, BuiltinType::Int);
	  InitBuiltinType(LongTy, BuiltinType::Long);
	  InitBuiltinType(LongLongTy, BuiltinType::LongLong);

	  // C99 6.2.5p6.
	  InitBuiltinType(UnsignedCharTy, BuiltinType::UChar);
	  InitBuiltinType(UnsignedShortTy, BuiltinType::UShort);
	  InitBuiltinType(UnsignedIntTy, BuiltinType::UInt);
	  InitBuiltinType(UnsignedLongTy, BuiltinType::ULong);
	  InitBuiltinType(UnsignedLongLongTy, BuiltinType::ULongLong);

	  // C99 6.2.5p10.
	  InitBuiltinType(FloatTy, BuiltinType::Float);
	  InitBuiltinType(DoubleTy, BuiltinType::Double);
	  InitBuiltinType(LongDoubleTy, BuiltinType::LongDouble);

	  // GNU extension, __float128 for IEEE quadruple precision
	  InitBuiltinType(Float128Ty, BuiltinType::Float128);

	  // C11 extension ISO/IEC TS 18661-3
	  InitBuiltinType(Float16Ty, BuiltinType::Float16);

	  // GNU extension, 128-bit integers.
	  InitBuiltinType(Int128Ty, BuiltinType::Int128);
	  InitBuiltinType(UnsignedInt128Ty, BuiltinType::UInt128);

	  // C++ 3.9.1p5
	  if (TargetInfo::isTypeSigned(Target.getWCharType()))
	    InitBuiltinType(WCharTy, BuiltinType::WChar_S);
	  else // -fshort-wchar makes wchar_t be unsigned.
	    InitBuiltinType(WCharTy, BuiltinType::WChar_U);
	  if (LangOpts.CPlusPlus && LangOpts.WChar)
	    WideCharTy = WCharTy;
	  else {
	    // C99 (or C++ using -fno-wchar).
	    WideCharTy = getFromTargetType(Target.getWCharType());
	  }

	  WIntTy = getFromTargetType(Target.getWIntType());

	  if (LangOpts.CPlusPlus) // C++0x 3.9.1p5, extension for C++
	    InitBuiltinType(Char16Ty, BuiltinType::Char16);
	  else // C99
	    Char16Ty = getFromTargetType(Target.getChar16Type());

	  if (LangOpts.CPlusPlus) // C++0x 3.9.1p5, extension for C++
	    InitBuiltinType(Char32Ty, BuiltinType::Char32);
	  else // C99
	    Char32Ty = getFromTargetType(Target.getChar32Type());

	  // Placeholder type for type-dependent expressions whose type is
	  // completely unknown. No code should ever check a type against
	  // DependentTy and users should never see it; however, it is here to
	  // help diagnose failures to properly check for type-dependent
	  // expressions.
	  InitBuiltinType(DependentTy, BuiltinType::Dependent);

	  // Placeholder type for functions.
	  InitBuiltinType(OverloadTy, BuiltinType::Overload);

	  // Placeholder type for bound members.
	  InitBuiltinType(BoundMemberTy, BuiltinType::BoundMember);

	  // Placeholder type for pseudo-objects.
	  InitBuiltinType(PseudoObjectTy, BuiltinType::PseudoObject);

	  // "any" type; useful for debugger-like clients.
	  InitBuiltinType(UnknownAnyTy, BuiltinType::UnknownAny);

	  // Placeholder type for unbridged ARC casts.
	  InitBuiltinType(ARCUnbridgedCastTy, BuiltinType::ARCUnbridgedCast);

	  // Placeholder type for builtin functions.
	  InitBuiltinType(BuiltinFnTy, BuiltinType::BuiltinFn);

	  // Placeholder type for OMP array sections.
	  if (LangOpts.OpenMP)
	    InitBuiltinType(OMPArraySectionTy, BuiltinType::OMPArraySection);

	  // C99 6.2.5p11.
	  FloatComplexTy = getComplexType(FloatTy);
	  DoubleComplexTy = getComplexType(DoubleTy);
	  LongDoubleComplexTy = getComplexType(LongDoubleTy);
	  Float128ComplexTy = getComplexType(Float128Ty);

	  // Builtin types for 'id', 'Class', and 'SEL'.
	  InitBuiltinType(ObjCBuiltinIdTy, BuiltinType::ObjCId);
	  InitBuiltinType(ObjCBuiltinClassTy, BuiltinType::ObjCClass);
	  InitBuiltinType(ObjCBuiltinSelTy, BuiltinType::ObjCSel);

	  if (LangOpts.OpenCL) {
	    // One builtin type per IMAGE_TYPE entry of OpenCLImageTypes.def.
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	    InitBuiltinType(SingletonId, BuiltinType::Id);
	#include "clang/Basic/OpenCLImageTypes.def"

	    InitBuiltinType(OCLSamplerTy, BuiltinType::OCLSampler);
	    InitBuiltinType(OCLEventTy, BuiltinType::OCLEvent);
	    InitBuiltinType(OCLClkEventTy, BuiltinType::OCLClkEvent);
	    InitBuiltinType(OCLQueueTy, BuiltinType::OCLQueue);
	    InitBuiltinType(OCLReserveIDTy, BuiltinType::OCLReserveID);
	  }

	  // Builtin type for __objc_yes and __objc_no
	  ObjCBuiltinBoolTy = (Target.useSignedCharForObjCBool() ?
	                       SignedCharTy : BoolTy);

	  ObjCConstantStringType = QualType();

	  ObjCSuperType = QualType();

	  // void * type
	  if (LangOpts.OpenCLVersion >= 200) {
	    // From OpenCL version 200 on, void* points into the generic address
	    // space.
	    auto Q = VoidTy.getQualifiers();
	    Q.setAddressSpace(LangAS::opencl_generic);
	    VoidPtrTy = getPointerType(getCanonicalType(
	        getQualifiedType(VoidTy.getUnqualifiedType(), Q)));
	  } else {
	    VoidPtrTy = getPointerType(VoidTy);
	  }

	  // nullptr type (C++0x 2.14.7)
	  InitBuiltinType(NullPtrTy, BuiltinType::NullPtr);

	  // half type (OpenCL 6.1.1.1) / ARM NEON __fp16
	  InitBuiltinType(HalfTy, BuiltinType::Half);

	  // Builtin type used to help define __builtin_va_list.
	  VaListTagDecl = nullptr;
	}

	/// Return the diagnostics engine of the source manager this context uses.
	DiagnosticsEngine &ASTContext::getDiagnostics() const {
	  DiagnosticsEngine &Diags = SourceMgr.getDiagnostics();
	  return Diags;
	}

	/// Return the attribute vector recorded for \p D, constructing an empty one
	/// in context-allocated memory on first access.
	AttrVec& ASTContext::getDeclAttrs(const Decl *D) {
	  AttrVec *&Slot = DeclAttrs[D];
	  if (Slot)
	    return *Slot;

	  void *Storage = Allocate(sizeof(AttrVec));
	  Slot = new (Storage) AttrVec;
	  return *Slot;
	}

	/// \brief Erase the attributes corresponding to the given declaration.
	///
	/// The attribute vector is destroyed in place and its map entry removed;
	/// nothing happens when \p D has no recorded attributes.
	void ASTContext::eraseDeclAttrs(const Decl *D) {
	  llvm::DenseMap<const Decl *, AttrVec *>::iterator Pos = DeclAttrs.find(D);
	  if (Pos != DeclAttrs.end()) {
	    Pos->second->~AttrVec();
	    DeclAttrs.erase(Pos);
	  }
	}

	// FIXME: Remove ?
	/// Return the member specialization info recorded for the static data
	/// member \p Var, or null when the recorded info is of a different kind.
	MemberSpecializationInfo *
	ASTContext::getInstantiatedFromStaticDataMember(const VarDecl *Var) {
	  assert(Var->isStaticDataMember() && "Not a static data member");
	  TemplateOrSpecializationInfo Info = getTemplateOrSpecializationInfo(Var);
	  return Info.dyn_cast<MemberSpecializationInfo *>();
	}

	/// Look up the template-or-specialization info recorded for \p Var; a
	/// default-constructed value is returned when nothing is recorded.
	ASTContext::TemplateOrSpecializationInfo
	ASTContext::getTemplateOrSpecializationInfo(const VarDecl *Var) {
	  auto Found = TemplateOrInstantiation.find(Var);
	  if (Found != TemplateOrInstantiation.end())
	    return Found->second;

	  return TemplateOrSpecializationInfo();
	}

	void
	ASTContext::setInstantiatedFromStaticDataMember(VarDecl Inst, VarDecl Tmpl,
	TemplateSpecializationKind TSK,
	SourceLocation PointOfInstantiation) {
	assert(Inst->isStaticDataMember() && "Not a static data member");
	assert(Tmpl->isStaticDataMember() && "Not a static data member");
	setTemplateOrSpecializationInfo(Inst, new (*this) MemberSpecializationInfo(
	Tmpl, TSK, PointOfInstantiation));
	}

	/// Record \p TSI as the template-or-specialization info of \p Inst. The
	/// variable must not have any info recorded yet.
	void
	ASTContext::setTemplateOrSpecializationInfo(VarDecl *Inst,
	                                            TemplateOrSpecializationInfo TSI) {
	  auto &Slot = TemplateOrInstantiation[Inst];
	  assert(!Slot &&
	         "Already noted what the variable was instantiated from");
	  Slot = TSI;
	}

	/// Return the pattern recorded for the class-scope specialization \p FD, or
	/// null when none has been recorded.
	FunctionDecl *ASTContext::getClassScopeSpecializationPattern(
	                                                     const FunctionDecl *FD){
	  assert(FD && "Specialization is 0");
	  llvm::DenseMap<const FunctionDecl *, FunctionDecl *>::const_iterator Pos
	    = ClassScopeSpecializationPattern.find(FD);
	  if (Pos == ClassScopeSpecializationPattern.end())
	    return nullptr;

	  return Pos->second;
	}

	/// Record \p Pattern as the pattern of the class-scope specialization
	/// \p FD. Both pointers must be non-null.
	void ASTContext::setClassScopeSpecializationPattern(FunctionDecl *FD,
	                                                    FunctionDecl *Pattern) {
	  assert(FD && "Specialization is 0");
	  assert(Pattern && "Class scope specialization pattern is 0");

	  auto &Slot = ClassScopeSpecializationPattern[FD];
	  Slot = Pattern;
	}

	/// Return the using declaration that \p UUD was instantiated from, or null
	/// when no such mapping has been recorded.
	NamedDecl *
	ASTContext::getInstantiatedFromUsingDecl(NamedDecl *UUD) {
	  auto Found = InstantiatedFromUsingDecl.find(UUD);
	  if (Found != InstantiatedFromUsingDecl.end())
	    return Found->second;

	  return nullptr;
	}

	void
	ASTContext::setInstantiatedFromUsingDecl(NamedDecl Inst, NamedDecl Pattern) {
	assert((isa<UsingDecl>(Pattern) \|\|
	isa<UnresolvedUsingValueDecl>(Pattern) \|\|
	isa<UnresolvedUsingTypenameDecl>(Pattern)) &&
	"pattern decl is not a using decl");
	assert((isa<UsingDecl>(Inst) \|\|
	isa<UnresolvedUsingValueDecl>(Inst) \|\|
	isa<UnresolvedUsingTypenameDecl>(Inst)) &&
	"instantiation did not produce a using decl");
	assert(!InstantiatedFromUsingDecl[Inst] && "pattern already exists");
	InstantiatedFromUsingDecl[Inst] = Pattern;
	}

	/// Return the using-shadow declaration that \p Inst was instantiated from,
	/// or null when no such mapping has been recorded.
	UsingShadowDecl *
	ASTContext::getInstantiatedFromUsingShadowDecl(UsingShadowDecl *Inst) {
	  llvm::DenseMap<UsingShadowDecl *, UsingShadowDecl *>::const_iterator Pos
	    = InstantiatedFromUsingShadowDecl.find(Inst);
	  if (Pos == InstantiatedFromUsingShadowDecl.end())
	    return nullptr;

	  return Pos->second;
	}

	/// Record that the using-shadow declaration \p Inst was instantiated from
	/// \p Pattern. \p Inst must not have a pattern recorded yet.
	void
	ASTContext::setInstantiatedFromUsingShadowDecl(UsingShadowDecl *Inst,
	                                               UsingShadowDecl *Pattern) {
	  auto &Slot = InstantiatedFromUsingShadowDecl[Inst];
	  assert(!Slot && "pattern already exists");
	  Slot = Pattern;
	}

	/// Return the unnamed field that \p Field was instantiated from, or null
	/// when no such mapping has been recorded.
	FieldDecl *ASTContext::getInstantiatedFromUnnamedFieldDecl(FieldDecl *Field) {
	  llvm::DenseMap<FieldDecl *, FieldDecl *>::iterator Pos
	    = InstantiatedFromUnnamedFieldDecl.find(Field);
	  if (Pos == InstantiatedFromUnnamedFieldDecl.end())
	    return nullptr;

	  return Pos->second;
	}

	/// Record that the unnamed field \p Inst was instantiated from the unnamed
	/// field \p Tmpl. \p Inst must not have a mapping recorded yet.
	void ASTContext::setInstantiatedFromUnnamedFieldDecl(FieldDecl *Inst,
	                                                     FieldDecl *Tmpl) {
	  assert(!Inst->getDeclName() && "Instantiated field decl is not unnamed");
	  assert(!Tmpl->getDeclName() && "Template field decl is not unnamed");

	  auto &Slot = InstantiatedFromUnnamedFieldDecl[Inst];
	  assert(!Slot &&
	         "Already noted what unnamed field was instantiated from");
	  Slot = Tmpl;
	}

	/// Return an iterator to the first method recorded as overridden by
	/// \p Method.
	ASTContext::overridden_cxx_method_iterator
	ASTContext::overridden_methods_begin(const CXXMethodDecl *Method) const {
	  auto Overrides = overridden_methods(Method);
	  return Overrides.begin();
	}

	/// Return the past-the-end iterator of the methods recorded as overridden
	/// by \p Method.
	ASTContext::overridden_cxx_method_iterator
	ASTContext::overridden_methods_end(const CXXMethodDecl *Method) const {
	  auto Overrides = overridden_methods(Method);
	  return Overrides.end();
	}

	/// Return how many methods are recorded as overridden by \p Method.
	unsigned
	ASTContext::overridden_methods_size(const CXXMethodDecl *Method) const {
	  auto Overrides = overridden_methods(Method);
	  auto First = Overrides.begin();
	  auto Last = Overrides.end();
	  return Last - First;
	}

	/// Return the range of methods recorded as overridden by \p Method. The
	/// lookup uses the canonical declaration; an empty range is returned when
	/// nothing is recorded.
	ASTContext::overridden_method_range
	ASTContext::overridden_methods(const CXXMethodDecl *Method) const {
	  auto Found = OverriddenMethods.find(Method->getCanonicalDecl());
	  if (Found != OverriddenMethods.end())
	    return overridden_method_range(Found->second.begin(),
	                                   Found->second.end());

	  return overridden_method_range(nullptr, nullptr);
	}

	/// Record that \p Method overrides \p Overridden. Both must be canonical
	/// declarations.
	void ASTContext::addOverriddenMethod(const CXXMethodDecl *Method,
	                                     const CXXMethodDecl *Overridden) {
	  assert(Method->isCanonicalDecl() && Overridden->isCanonicalDecl());

	  auto &Overrides = OverriddenMethods[Method];
	  Overrides.push_back(Overridden);
	}

	void ASTContext::getOverriddenMethods(
	const NamedDecl *D,
	SmallVectorImpl<const NamedDecl *> &Overridden) const {
	assert(D);

	if (const CXXMethodDecl *CXXMethod = dyn_cast<CXXMethodDecl>(D)) {
	Overridden.append(overridden_methods_begin(CXXMethod),
	overridden_methods_end(CXXMethod));
	return;
	}

	const ObjCMethodDecl *Method = dyn_cast<ObjCMethodDecl>(D);
	if (!Method)
	return;

	SmallVector<const ObjCMethodDecl *, 8> OverDecls;
	Method->getOverriddenMethods(OverDecls);
	Overridden.append(OverDecls.begin(), OverDecls.end());
	}

	/// Append \p Import to the chain of local import declarations. The import
	/// must not already be chained and must not come from an AST file.
	void ASTContext::addedLocalImportDecl(ImportDecl *Import) {
	  assert(!Import->NextLocalImport && "Import declaration already in the chain");
	  assert(!Import->isFromASTFile() && "Non-local import declaration");

	  // Either start the chain or link behind the current tail; in both cases
	  // the new import becomes the tail.
	  if (FirstLocalImport)
	    LastLocalImport->NextLocalImport = Import;
	  else
	    FirstLocalImport = Import;
	  LastLocalImport = Import;
	}

	//===----------------------------------------------------------------------===//
	// Type Sizing and Analysis
	//===----------------------------------------------------------------------===//

	/// getFloatTypeSemantics - Map a scalar floating point builtin type to the
	/// APFloat 'semantics' the target uses for it. Float16 and Half share the
	/// target's half format.
	const llvm::fltSemantics &ASTContext::getFloatTypeSemantics(QualType T) const {
	  const BuiltinType *Builtin = T->getAs<BuiltinType>();
	  assert(Builtin && "Not a floating point type!");

	  switch (Builtin->getKind()) {
	  case BuiltinType::Half:
	  case BuiltinType::Float16:
	    return Target->getHalfFormat();
	  case BuiltinType::Float:
	    return Target->getFloatFormat();
	  case BuiltinType::Double:
	    return Target->getDoubleFormat();
	  case BuiltinType::LongDouble:
	    return Target->getLongDoubleFormat();
	  case BuiltinType::Float128:
	    return Target->getFloat128Format();
	  default:
	    llvm_unreachable("Not a floating point type!");
	  }
	}

	/// Compute the alignment of declaration \p D, returned in character units.
	///
	/// The computation is done in bits, starting from the target's char width.
	/// An explicit alignment attribute may be used on its own (see below);
	/// otherwise, for value declarations, the alignment is derived from the
	/// declared type and then constrained by the field layout for fields.
	///
	/// \param ForAlignof when true, references are measured as their pointee
	/// type, and the large-array and minimum-global-alignment adjustments are
	/// skipped.
	CharUnits ASTContext::getDeclAlign(const Decl *D, bool ForAlignof) const {
	  unsigned Align = Target->getCharWidth();

	  bool UseAlignAttrOnly = false;
	  if (unsigned AlignFromAttr = D->getMaxAlignment()) {
	    Align = AlignFromAttr;

	    // __attribute__((aligned)) can increase or decrease alignment
	    // *except* on a struct or struct member, where it only increases
	    // alignment unless 'packed' is also specified.
	    //
	    // It is an error for alignas to decrease alignment, so we can
	    // ignore that possibility; Sema should diagnose it.
	    if (isa<FieldDecl>(D)) {
	      UseAlignAttrOnly = D->hasAttr<PackedAttr>() ||
	        cast<FieldDecl>(D)->getParent()->hasAttr<PackedAttr>();
	    } else {
	      UseAlignAttrOnly = true;
	    }
	  }
	  else if (isa<FieldDecl>(D))
	      UseAlignAttrOnly =
	        D->hasAttr<PackedAttr>() ||
	        cast<FieldDecl>(D)->getParent()->hasAttr<PackedAttr>();

	  // If we're using the align attribute only, just ignore everything
	  // else about the declaration and its type.
	  if (UseAlignAttrOnly) {
	    // do nothing
	  } else if (const ValueDecl *VD = dyn_cast<ValueDecl>(D)) {
	    QualType T = VD->getType();
	    if (const ReferenceType *RT = T->getAs<ReferenceType>()) {
	      if (ForAlignof)
	        T = RT->getPointeeType();
	      else
	        T = getPointerType(RT->getPointeeType());
	    }
	    QualType BaseT = getBaseElementType(T);
	    if (T->isFunctionType())
	      Align = getTypeInfoImpl(T.getTypePtr()).Align;
	    else if (!BaseT->isIncompleteType()) {
	      // Adjust alignments of declarations with array type by the
	      // large-array alignment on the target.
	      if (const ArrayType *arrayType = getAsArrayType(T)) {
	        unsigned MinWidth = Target->getLargeArrayMinWidth();
	        if (!ForAlignof && MinWidth) {
	          if (isa<VariableArrayType>(arrayType))
	            Align = std::max(Align, Target->getLargeArrayAlign());
	          else if (isa<ConstantArrayType>(arrayType) &&
	                   MinWidth <= getTypeSize(cast<ConstantArrayType>(arrayType)))
	            Align = std::max(Align, Target->getLargeArrayAlign());
	        }
	      }
	      Align = std::max(Align, getPreferredTypeAlign(T.getTypePtr()));
	      // An unaligned-qualified base type drops back to char alignment.
	      if (BaseT.getQualifiers().hasUnaligned())
	        Align = Target->getCharWidth();
	      if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
	        if (VD->hasGlobalStorage() && !ForAlignof)
	          Align = std::max(Align, getTargetInfo().getMinGlobalAlign());
	      }
	    }

	    // Fields can be subject to extra alignment constraints, like if
	    // the field is packed, the struct is packed, or the struct has a
	    // a max-field-alignment constraint (#pragma pack).  So calculate
	    // the actual alignment of the field within the struct, and then
	    // (as we're expected to) constrain that by the alignment of the type.
	    if (const FieldDecl *Field = dyn_cast<FieldDecl>(VD)) {
	      const RecordDecl *Parent = Field->getParent();
	      // We can only produce a sensible answer if the record is valid.
	      if (!Parent->isInvalidDecl()) {
	        const ASTRecordLayout &Layout = getASTRecordLayout(Parent);

	        // Start with the record's overall alignment.
	        unsigned FieldAlign = toBits(Layout.getAlignment());

	        // Use the GCD of that and the offset within the record.
	        uint64_t Offset = Layout.getFieldOffset(Field->getFieldIndex());
	        if (Offset > 0) {
	          // Alignment is always a power of 2, so the GCD will be a power of 2,
	          // which means we get to do this crazy thing instead of Euclid's.
	          uint64_t LowBitOfOffset = Offset & (~Offset + 1);
	          if (LowBitOfOffset < FieldAlign)
	            FieldAlign = static_cast<unsigned>(LowBitOfOffset);
	        }

	        Align = std::min(Align, FieldAlign);
	      }
	    }
	  }

	  return toCharUnitsFromBits(Align);
	}

	// getTypeInfoDataSizeInChars - Return the size and alignment of a type, in
	// chars, substituting the data size for the size when the type is a record
	// and the language is C++. That is the size of the memcpy performed when
	// assigning this type using a trivial copy/move assignment operator.
	std::pair<CharUnits, CharUnits>
	ASTContext::getTypeInfoDataSizeInChars(QualType T) const {
	  std::pair<CharUnits, CharUnits> Info = getTypeInfoInChars(T);

	  // In C++, objects can sometimes be allocated into the tail padding
	  // of a base-class subobject.  We decide whether that's possible
	  // during class layout, so here we can just trust the layout results.
	  if (!getLangOpts().CPlusPlus)
	    return Info;

	  if (const RecordType *Record = T->getAs<RecordType>()) {
	    const ASTRecordLayout &RecordLayout = getASTRecordLayout(Record->getDecl());
	    Info.first = RecordLayout.getDataSize();
	  }
	  return Info;
	}

	/// getConstantArrayInfoInChars - Return the size and alignment of the
	/// constant array type \p CAT, in chars. Performing the computation in
	/// CharUnits instead of in bits prevents overflowing the uint64_t for some
	/// large arrays.
	///
	/// The width is rounded up to the element alignment, except for the
	/// Microsoft C++ ABI when the width of pointers in address space 0 is not
	/// 64.
	static std::pair<CharUnits, CharUnits>
	getConstantArrayInfoInChars(const ASTContext &Context,
	                            const ConstantArrayType *CAT) {
	  std::pair<CharUnits, CharUnits> EltInfo =
	      Context.getTypeInfoInChars(CAT->getElementType());
	  uint64_t Size = CAT->getSize().getZExtValue();
	  assert((Size == 0 || static_cast<uint64_t>(EltInfo.first.getQuantity()) <=
	              (uint64_t)(-1)/Size) &&
	         "Overflow in array type char size evaluation");
	  uint64_t Width = EltInfo.first.getQuantity() * Size;
	  unsigned Align = EltInfo.second.getQuantity();
	  if (!Context.getTargetInfo().getCXXABI().isMicrosoft() ||
	      Context.getTargetInfo().getPointerWidth(0) == 64)
	    Width = llvm::alignTo(Width, Align);
	  return std::make_pair(CharUnits::fromQuantity(Width),
	                        CharUnits::fromQuantity(Align));
	}

	std::pair<CharUnits, CharUnits>
	ASTContext::getTypeInfoInChars(const Type *T) const {
	if (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(T))
	return getConstantArrayInfoInChars(*this, CAT);
	TypeInfo Info = getTypeInfo(T);
	return std::make_pair(toCharUnitsFromBits(Info.Width),
	toCharUnitsFromBits(Info.Align));
	}

	/// QualType convenience overload: forwards the underlying type pointer to
	/// the const Type * overload.
	std::pair<CharUnits, CharUnits>
	ASTContext::getTypeInfoInChars(QualType T) const {
	  const Type *Underlying = T.getTypePtr();
	  return getTypeInfoInChars(Underlying);
	}

	bool ASTContext::isAlignmentRequired(const Type *T) const {
	return getTypeInfo(T).AlignIsRequired;
	}

	/// QualType convenience overload: forwards the underlying type pointer to
	/// the const Type * overload.
	bool ASTContext::isAlignmentRequired(QualType T) const {
	  const Type *Underlying = T.getTypePtr();
	  return isAlignmentRequired(Underlying);
	}

	/// Return the alignment of \p T when it can be determined, even if the type
	/// is incomplete; returns 0 when no alignment is known.
	unsigned ASTContext::getTypeAlignIfKnown(QualType T) const {
	  // Alignment attached to a typedef, or 0 when Ty is not a typedef type or
	  // its declaration carries no alignment.
	  auto TypedefAlign = [](QualType Ty) -> unsigned {
	    if (auto *TT = Ty->getAs<TypedefType>())
	      return TT->getDecl()->getMaxAlignment();
	    return 0;
	  };

	  // An alignment on a typedef overrides anything else.
	  if (unsigned Align = TypedefAlign(T))
	    return Align;

	  // If we have an (array of) complete type, we're done.
	  T = getBaseElementType(T);
	  if (!T->isIncompleteType())
	    return getTypeAlign(T);

	  // If we had an array type, its element type might be a typedef
	  // type with an alignment attribute.
	  if (unsigned Align = TypedefAlign(T))
	    return Align;

	  // Otherwise, see if the declaration of the type had an attribute.
	  if (auto *Tag = T->getAs<TagType>())
	    return Tag->getDecl()->getMaxAlignment();

	  return 0;
	}

	/// Return the type info of \p T, computing it with getTypeInfoImpl on the
	/// first request and memoizing the result in MemoizedTypeInfo.
	TypeInfo ASTContext::getTypeInfo(const Type *T) const {
	  auto Cached = MemoizedTypeInfo.find(T);
	  if (Cached != MemoizedTypeInfo.end())
	    return Cached->second;

	  // This call can invalidate MemoizedTypeInfo[T], so we need a second lookup.
	  TypeInfo Computed = getTypeInfoImpl(T);
	  MemoizedTypeInfo[T] = Computed;
	  return Computed;
	}

	/// getTypeInfoImpl - Compute the TypeInfo (width in bits, alignment in bits,
	/// and whether the alignment was explicitly required) of the specified type.
	/// This method does not work on incomplete types.
	///
	/// Sugared type classes are handled by recursing through getTypeInfo on the
	/// underlying type; the remaining classes fill in Width/Align directly and
	/// fall through to the common return at the bottom.
	///
	/// FIXME: Pointers into different addr spaces could have different sizes and
	/// alignment requirements: getPointerInfo should take an AddrSpace, this
	/// should take a QualType, &c.
	TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
	  uint64_t Width = 0;
	  unsigned Align = 8;
	  bool AlignIsRequired = false;
	  unsigned AS = 0;
	  switch (T->getTypeClass()) {
	// Expand TypeNodes.def so that dependent classes become bare case labels
	// (reaching the llvm_unreachable below) and non-canonical-unless-dependent
	// classes desugar and recurse.
	#define TYPE(Class, Base)
	#define ABSTRACT_TYPE(Class, Base)
	#define NON_CANONICAL_TYPE(Class, Base)
	#define DEPENDENT_TYPE(Class, Base) case Type::Class:
	#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(Class, Base) \
	  case Type::Class: \
	  assert(!T->isDependentType() && "should not see dependent types here"); \
	  return getTypeInfo(cast<Class##Type>(T)->desugar().getTypePtr());
	#include "clang/AST/TypeNodes.def"
	    llvm_unreachable("Should not see dependent types");

	  case Type::FunctionNoProto:
	  case Type::FunctionProto:
	    // GCC extension: alignof(function) = 32 bits
	    Width = 0;
	    Align = 32;
	    break;

	  case Type::IncompleteArray:
	  case Type::VariableArray:
	    // No static size; alignment comes from the element type.
	    Width = 0;
	    Align = getTypeAlign(cast<ArrayType>(T)->getElementType());
	    break;

	  case Type::ConstantArray: {
	    const ConstantArrayType *CAT = cast<ConstantArrayType>(T);

	    TypeInfo EltInfo = getTypeInfo(CAT->getElementType());
	    uint64_t Size = CAT->getSize().getZExtValue();
	    assert((Size == 0 || EltInfo.Width <= (uint64_t)(-1) / Size) &&
	           "Overflow in array type bit size evaluation");
	    Width = EltInfo.Width * Size;
	    Align = EltInfo.Align;
	    // The total width is rounded up to the alignment except on Microsoft
	    // C++ ABI targets whose pointers are not 64 bits wide.
	    if (!getTargetInfo().getCXXABI().isMicrosoft() ||
	        getTargetInfo().getPointerWidth(0) == 64)
	      Width = llvm::alignTo(Width, Align);
	    break;
	  }
	  case Type::ExtVector:
	  case Type::Vector: {
	    const VectorType *VT = cast<VectorType>(T);
	    TypeInfo EltInfo = getTypeInfo(VT->getElementType());
	    Width = EltInfo.Width * VT->getNumElements();
	    Align = Width;
	    // If the alignment is not a power of 2, round up to the next power of 2.
	    // This happens for non-power-of-2 length vectors.
	    if (Align & (Align-1)) {
	      Align = llvm::NextPowerOf2(Align);
	      Width = llvm::alignTo(Width, Align);
	    }
	    // Adjust the alignment based on the target max.
	    uint64_t TargetVectorAlign = Target->getMaxVectorAlign();
	    if (TargetVectorAlign && TargetVectorAlign < Align)
	      Align = TargetVectorAlign;
	    break;
	  }

	  case Type::Builtin:
	    switch (cast<BuiltinType>(T)->getKind()) {
	    default: llvm_unreachable("Unknown builtin type!");
	    case BuiltinType::Void:
	      // GCC extension: alignof(void) = 8 bits.
	      Width = 0;
	      Align = 8;
	      break;
	    case BuiltinType::Bool:
	      Width = Target->getBoolWidth();
	      Align = Target->getBoolAlign();
	      break;
	    case BuiltinType::Char_S:
	    case BuiltinType::Char_U:
	    case BuiltinType::UChar:
	    case BuiltinType::SChar:
	      Width = Target->getCharWidth();
	      Align = Target->getCharAlign();
	      break;
	    case BuiltinType::WChar_S:
	    case BuiltinType::WChar_U:
	      Width = Target->getWCharWidth();
	      Align = Target->getWCharAlign();
	      break;
	    case BuiltinType::Char16:
	      Width = Target->getChar16Width();
	      Align = Target->getChar16Align();
	      break;
	    case BuiltinType::Char32:
	      Width = Target->getChar32Width();
	      Align = Target->getChar32Align();
	      break;
	    case BuiltinType::UShort:
	    case BuiltinType::Short:
	      Width = Target->getShortWidth();
	      Align = Target->getShortAlign();
	      break;
	    case BuiltinType::UInt:
	    case BuiltinType::Int:
	      Width = Target->getIntWidth();
	      Align = Target->getIntAlign();
	      break;
	    case BuiltinType::ULong:
	    case BuiltinType::Long:
	      Width = Target->getLongWidth();
	      Align = Target->getLongAlign();
	      break;
	    case BuiltinType::ULongLong:
	    case BuiltinType::LongLong:
	      Width = Target->getLongLongWidth();
	      Align = Target->getLongLongAlign();
	      break;
	    case BuiltinType::Int128:
	    case BuiltinType::UInt128:
	      Width = 128;
	      Align = 128; // int128_t is 128-bit aligned on all targets.
	      break;
	    case BuiltinType::Float16:
	    case BuiltinType::Half:
	      Width = Target->getHalfWidth();
	      Align = Target->getHalfAlign();
	      break;
	    case BuiltinType::Float:
	      Width = Target->getFloatWidth();
	      Align = Target->getFloatAlign();
	      break;
	    case BuiltinType::Double:
	      Width = Target->getDoubleWidth();
	      Align = Target->getDoubleAlign();
	      break;
	    case BuiltinType::LongDouble:
	      Width = Target->getLongDoubleWidth();
	      Align = Target->getLongDoubleAlign();
	      break;
	    case BuiltinType::Float128:
	      Width = Target->getFloat128Width();
	      Align = Target->getFloat128Align();
	      break;
	    case BuiltinType::NullPtr:
	      Width = Target->getPointerWidth(0); // C++ 3.9.1p11: sizeof(nullptr_t)
	      Align = Target->getPointerAlign(0); // == sizeof(void*)
	      break;
	    case BuiltinType::ObjCId:
	    case BuiltinType::ObjCClass:
	    case BuiltinType::ObjCSel:
	      Width = Target->getPointerWidth(0);
	      Align = Target->getPointerAlign(0);
	      break;
	    case BuiltinType::OCLSampler:
	    case BuiltinType::OCLEvent:
	    case BuiltinType::OCLClkEvent:
	    case BuiltinType::OCLQueue:
	    case BuiltinType::OCLReserveID:
	// Every OpenCL image type contributes one more case label here.
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	    case BuiltinType::Id:
	#include "clang/Basic/OpenCLImageTypes.def"
	      // These are sized like a pointer in the address space the target
	      // assigns to the OpenCL type kind.
	      AS = getTargetAddressSpace(
	          Target->getOpenCLTypeAddrSpace(getOpenCLTypeKind(T)));
	      Width = Target->getPointerWidth(AS);
	      Align = Target->getPointerAlign(AS);
	      break;
	    }
	    break;
	  case Type::ObjCObjectPointer:
	    Width = Target->getPointerWidth(0);
	    Align = Target->getPointerAlign(0);
	    break;
	  case Type::BlockPointer:
	    AS = getTargetAddressSpace(cast<BlockPointerType>(T)->getPointeeType());
	    Width = Target->getPointerWidth(AS);
	    Align = Target->getPointerAlign(AS);
	    break;
	  case Type::LValueReference:
	  case Type::RValueReference:
	    // alignof and sizeof should never enter this code path here, so we go
	    // the pointer route.
	    AS = getTargetAddressSpace(cast<ReferenceType>(T)->getPointeeType());
	    Width = Target->getPointerWidth(AS);
	    Align = Target->getPointerAlign(AS);
	    break;
	  case Type::Pointer:
	    AS = getTargetAddressSpace(cast<PointerType>(T)->getPointeeType());
	    Width = Target->getPointerWidth(AS);
	    Align = Target->getPointerAlign(AS);
	    break;
	  case Type::MemberPointer: {
	    const MemberPointerType *MPT = cast<MemberPointerType>(T);
	    CXXABI::MemberPointerInfo MPI = ABI->getMemberPointerInfo(MPT);
	    Width = MPI.Width;
	    Align = MPI.Align;
	    break;
	  }
	  case Type::Complex: {
	    // Complex types have the same alignment as their elements, but twice the
	    // size.
	    TypeInfo EltInfo = getTypeInfo(cast<ComplexType>(T)->getElementType());
	    Width = EltInfo.Width * 2;
	    Align = EltInfo.Align;
	    break;
	  }
	  case Type::ObjCObject:
	    return getTypeInfo(cast<ObjCObjectType>(T)->getBaseType().getTypePtr());
	  case Type::Adjusted:
	  case Type::Decayed:
	    return getTypeInfo(cast<AdjustedType>(T)->getAdjustedType().getTypePtr());
	  case Type::ObjCInterface: {
	    const ObjCInterfaceType *ObjCI = cast<ObjCInterfaceType>(T);
	    const ASTRecordLayout &Layout = getASTObjCInterfaceLayout(ObjCI->getDecl());
	    Width = toBits(Layout.getSize());
	    Align = toBits(Layout.getAlignment());
	    break;
	  }
	  case Type::Record:
	  case Type::Enum: {
	    const TagType *TT = cast<TagType>(T);

	    // Invalid declarations get a placeholder size and alignment of 8 bits.
	    if (TT->getDecl()->isInvalidDecl()) {
	      Width = 8;
	      Align = 8;
	      break;
	    }

	    // Enums take the info of their integer type, with any alignment recorded
	    // on the declaration overriding the computed alignment.
	    if (const EnumType *ET = dyn_cast<EnumType>(TT)) {
	      const EnumDecl *ED = ET->getDecl();
	      TypeInfo Info =
	          getTypeInfo(ED->getIntegerType()->getUnqualifiedDesugaredType());
	      if (unsigned AttrAlign = ED->getMaxAlignment()) {
	        Info.Align = AttrAlign;
	        Info.AlignIsRequired = true;
	      }
	      return Info;
	    }

	    const RecordType *RT = cast<RecordType>(TT);
	    const RecordDecl *RD = RT->getDecl();
	    const ASTRecordLayout &Layout = getASTRecordLayout(RD);
	    Width = toBits(Layout.getSize());
	    Align = toBits(Layout.getAlignment());
	    AlignIsRequired = RD->hasAttr<AlignedAttr>();
	    break;
	  }

	  case Type::SubstTemplateTypeParm:
	    return getTypeInfo(cast<SubstTemplateTypeParmType>(T)->
	                       getReplacementType().getTypePtr());

	  case Type::Auto:
	  case Type::DeducedTemplateSpecialization: {
	    const DeducedType *A = cast<DeducedType>(T);
	    assert(!A->getDeducedType().isNull() &&
	           "cannot request the size of an undeduced or dependent auto type");
	    return getTypeInfo(A->getDeducedType().getTypePtr());
	  }

	  case Type::Paren:
	    return getTypeInfo(cast<ParenType>(T)->getInnerType().getTypePtr());

	  case Type::ObjCTypeParam:
	    return getTypeInfo(cast<ObjCTypeParamType>(T)->desugar().getTypePtr());

	  case Type::Typedef: {
	    const TypedefNameDecl *Typedef = cast<TypedefType>(T)->getDecl();
	    TypeInfo Info = getTypeInfo(Typedef->getUnderlyingType().getTypePtr());
	    // If the typedef has an aligned attribute on it, it overrides any computed
	    // alignment we have. This violates the GCC documentation (which says that
	    // attribute(aligned) can only round up) but matches its implementation.
	    if (unsigned AttrAlign = Typedef->getMaxAlignment()) {
	      Align = AttrAlign;
	      AlignIsRequired = true;
	    } else {
	      Align = Info.Align;
	      AlignIsRequired = Info.AlignIsRequired;
	    }
	    Width = Info.Width;
	    break;
	  }

	  case Type::Elaborated:
	    return getTypeInfo(cast<ElaboratedType>(T)->getNamedType().getTypePtr());

	  case Type::Attributed:
	    return getTypeInfo(
	        cast<AttributedType>(T)->getEquivalentType().getTypePtr());

	  case Type::Atomic: {
	    // Start with the base type information.
	    TypeInfo Info = getTypeInfo(cast<AtomicType>(T)->getValueType());
	    Width = Info.Width;
	    Align = Info.Align;

	    // If the size of the type doesn't exceed the platform's max
	    // atomic promotion width, make the size and alignment more
	    // favorable to atomic operations:
	    if (Width != 0 && Width <= Target->getMaxAtomicPromoteWidth()) {
	      // Round the size up to a power of 2.
	      if (!llvm::isPowerOf2_64(Width))
	        Width = llvm::NextPowerOf2(Width);

	      // Set the alignment equal to the size.
	      Align = static_cast<unsigned>(Width);
	    }
	    break;
	  }

	  case Type::Pipe:
	    Width = Target->getPointerWidth(getTargetAddressSpace(LangAS::opencl_global));
	    Align = Target->getPointerAlign(getTargetAddressSpace(LangAS::opencl_global));
	    break;
	  }

	  assert(llvm::isPowerOf2_32(Align) && "Alignment must be power of 2");
	  return TypeInfo(Width, Align, AlignIsRequired);
	}

	/// Returns the default SIMD alignment for \p T: the target's default, except
	/// that on ppc64/ppc64le with the "elfv1-qpx" ABI a double gets 256.
	unsigned ASTContext::getOpenMPDefaultSimdAlign(QualType T) const {
	  unsigned SimdAlign = getTargetInfo().getSimdDefaultAlign();
	  // Target ppc64 with QPX: simd default alignment for pointer to double is 32.
	  if ((getTargetInfo().getTriple().getArch() == llvm::Triple::ppc64 ||
	       getTargetInfo().getTriple().getArch() == llvm::Triple::ppc64le) &&
	      getTargetInfo().getABI() == "elfv1-qpx" &&
	      T->isSpecificBuiltinType(BuiltinType::Double))
	    SimdAlign = 256;
	  return SimdAlign;
	}

	/// toCharUnitsFromBits - Turn a bit count into CharUnits by dividing it by
	/// the character width reported by getCharWidth().
	CharUnits ASTContext::toCharUnitsFromBits(int64_t BitSize) const {
	  const auto Chars = BitSize / getCharWidth();
	  return CharUnits::fromQuantity(Chars);
	}

	/// toBits - Convert a size in characters to a size in bits by multiplying
	/// the character count by the character width reported by getCharWidth().
	int64_t ASTContext::toBits(CharUnits CharSize) const {
	  return CharSize.getQuantity() * getCharWidth();
	}

	/// getTypeSizeInChars - Return the size of the specified type, in characters,
	/// taken from the first element of getTypeInfoInChars.
	/// This method does not work on incomplete types.
	CharUnits ASTContext::getTypeSizeInChars(QualType T) const {
	  std::pair<CharUnits, CharUnits> Info = getTypeInfoInChars(T);
	  return Info.first;
	}
	/// const Type * overload: the size is the first element of
	/// getTypeInfoInChars.
	CharUnits ASTContext::getTypeSizeInChars(const Type *T) const {
	  std::pair<CharUnits, CharUnits> Info = getTypeInfoInChars(T);
	  return Info.first;
	}

	/// getTypeAlignInChars - Return the ABI-specified alignment of a type, in
	/// characters, by converting getTypeAlign's bit count.
	/// This method does not work on incomplete types.
	CharUnits ASTContext::getTypeAlignInChars(QualType T) const {
	  const auto AlignInBits = getTypeAlign(T);
	  return toCharUnitsFromBits(AlignInBits);
	}
	/// const Type * overload: converts getTypeAlign's bit count to CharUnits.
	CharUnits ASTContext::getTypeAlignInChars(const Type *T) const {
	  const auto AlignInBits = getTypeAlign(T);
	  return toCharUnitsFromBits(AlignInBits);
	}

	/// getPreferredTypeAlign - Return the "preferred" alignment of the specified
	/// type for the current target in bits. This can be different than the ABI
	/// alignment in cases where it is beneficial for performance to overalign
	/// a data type.
	///
	/// The ABI alignment is returned unchanged unless the target allows a larger
	/// preferred alignment and the base element type (looking through complex
	/// and enum types) is double, long long or unsigned long long without an
	/// explicitly required alignment.
	unsigned ASTContext::getPreferredTypeAlign(const Type *T) const {
	  TypeInfo TI = getTypeInfo(T);
	  unsigned ABIAlign = TI.Align;

	  T = T->getBaseElementTypeUnsafe();

	  // The preferred alignment of member pointers is that of a pointer.
	  if (T->isMemberPointerType())
	    return getPreferredTypeAlign(getPointerDiffType().getTypePtr());

	  if (!Target->allowsLargerPreferedTypeAlignment())
	    return ABIAlign;

	  // Double and long long should be naturally aligned if possible.
	  if (const ComplexType *CT = T->getAs<ComplexType>())
	    T = CT->getElementType().getTypePtr();
	  if (const EnumType *ET = T->getAs<EnumType>())
	    T = ET->getDecl()->getIntegerType().getTypePtr();
	  if (T->isSpecificBuiltinType(BuiltinType::Double) ||
	      T->isSpecificBuiltinType(BuiltinType::LongLong) ||
	      T->isSpecificBuiltinType(BuiltinType::ULongLong)) {
	    // Don't increase the alignment if an alignment attribute was specified on a
	    // typedef declaration.
	    if (!TI.AlignIsRequired)
	      return std::max(ABIAlign, (unsigned)getTypeSize(T));
	  }

	  return ABIAlign;
	}

	/// getTargetDefaultAlignForAttributeAligned - Return the default alignment
	/// for __attribute__((aligned)) on this target, to be used if no alignment
	/// value is specified.
	unsigned ASTContext::getTargetDefaultAlignForAttributeAligned() const {
	return getTargetInfo().getDefaultAlignForAttributeAligned();
	}

	/// getAlignOfGlobalVar - Return the alignment in bits that should be given
	/// to a global variable of the specified type: the larger of the type's own
	/// alignment and the target's minimum global alignment.
	unsigned ASTContext::getAlignOfGlobalVar(QualType T) const {
	  const auto TypeAlign = getTypeAlign(T);
	  const auto MinGlobalAlign = getTargetInfo().getMinGlobalAlign();
	  return std::max(TypeAlign, MinGlobalAlign);
	}

	/// getAlignOfGlobalVarInChars - Same as getAlignOfGlobalVar, but with the
	/// result converted from bits to characters.
	CharUnits ASTContext::getAlignOfGlobalVarInChars(QualType T) const {
	  const auto AlignInBits = getAlignOfGlobalVar(T);
	  return toCharUnitsFromBits(AlignInBits);
	}

	/// Follows the chain of bases reported by getBaseSharingVBPtr(), starting at
	/// \p RD, and returns the sum of the base-class offsets along that chain
	/// (zero when \p RD's layout reports no such base).
	CharUnits ASTContext::getOffsetOfBaseWithVBPtr(const CXXRecordDecl *RD) const {
	  CharUnits Total = CharUnits::Zero();
	  const ASTRecordLayout *Cur = &getASTRecordLayout(RD);
	  for (const CXXRecordDecl *Sharing = Cur->getBaseSharingVBPtr(); Sharing;
	       Sharing = Cur->getBaseSharingVBPtr()) {
	    Total += Cur->getBaseClassOffset(Sharing);
	    // Descend: the next iteration asks the base's own layout.
	    Cur = &getASTRecordLayout(Sharing);
	  }
	  return Total;
	}

	/// DeepCollectObjCIvars -
	/// This routine first collects all declared, but not synthesized, ivars in
	/// super class and then collects all ivars, including those synthesized for
	/// current class. This routine is used for implementation of current class
	/// when all ivars, declared and synthesized are known.
	void ASTContext::DeepCollectObjCIvars(const ObjCInterfaceDecl *OI,
	bool leafClass,
	SmallVectorImpl<const ObjCIvarDecl*> &Ivars) const {
	if (const ObjCInterfaceDecl *SuperClass = OI->getSuperClass())
	DeepCollectObjCIvars(SuperClass, false, Ivars);
	if (!leafClass) {
	for (const auto *I : OI->ivars())
	Ivars.push_back(I);
	} else {
	ObjCInterfaceDecl IDecl = const_cast<ObjCInterfaceDecl >(OI);
	for (const ObjCIvarDecl *Iv = IDecl->all_declared_ivar_begin(); Iv;
	Iv= Iv->getNextIvar())
	Ivars.push_back(Iv);
	}
	}

	/// CollectInheritedProtocols - Collect all protocols in current class and
	/// those inherited by it.
	void ASTContext::CollectInheritedProtocols(const Decl *CDecl,
	llvm::SmallPtrSet<ObjCProtocolDecl*, 8> &Protocols) {
	if (const ObjCInterfaceDecl *OI = dyn_cast<ObjCInterfaceDecl>(CDecl)) {
	// We can use protocol_iterator here instead of
	// all_referenced_protocol_iterator since we are walking all categories.
	for (auto *Proto : OI->all_referenced_protocols()) {
	CollectInheritedProtocols(Proto, Protocols);
	}

	// Categories of this Interface.
	for (const auto *Cat : OI->visible_categories())
	CollectInheritedProtocols(Cat, Protocols);

	if (ObjCInterfaceDecl *SD = OI->getSuperClass())
	while (SD) {
	CollectInheritedProtocols(SD, Protocols);
	SD = SD->getSuperClass();
	}
	} else if (const ObjCCategoryDecl *OC = dyn_cast<ObjCCategoryDecl>(CDecl)) {
	for (auto *Proto : OC->protocols()) {
	CollectInheritedProtocols(Proto, Protocols);
	}
	} else if (const ObjCProtocolDecl *OP = dyn_cast<ObjCProtocolDecl>(CDecl)) {
	// Insert the protocol.
	if (!Protocols.insert(
	const_cast<ObjCProtocolDecl *>(OP->getCanonicalDecl())).second)
	return;

	for (auto *Proto : OP->protocols())
	CollectInheritedProtocols(Proto, Protocols);
	}
	}

	/// Helper for ASTContext::hasUniqueObjectRepresentations on union types.
	/// Returns false as soon as a field either lacks unique object
	/// representations itself or has a size different from the union's size.
	static bool unionHasUniqueObjectRepresentations(const ASTContext &Context,
	const RecordDecl *RD) {
	assert(RD->isUnion() && "Must be union type");
	CharUnits UnionSize = Context.getTypeSizeInChars(RD->getTypeForDecl());

	for (const auto *Field : RD->fields()) {
	if (!Context.hasUniqueObjectRepresentations(Field->getType()))
	return false;
	CharUnits FieldSize = Context.getTypeSizeInChars(Field->getType());
	if (FieldSize != UnionSize)
	return false;
	}
	// The two lines below are the change recorded by this diff: the final result
	// goes from an unconditional true to "the union has at least one field".
	- return true;
	+ return !RD->field_empty();
	}

	/// Returns true when the record named by \p Ty has no fields and, if it is a
	/// C++ class, is also reported empty by CXXRecordDecl::isEmpty().
	static bool isStructEmpty(QualType Ty) {
	  const RecordDecl *Record = Ty->castAs<RecordType>()->getDecl();

	  if (Record->field_empty()) {
	    const auto *CXXRecord = dyn_cast<CXXRecordDecl>(Record);
	    return !CXXRecord || CXXRecord->isEmpty();
	  }

	  return false;
	}

	/// Helper for ASTContext::hasUniqueObjectRepresentations on struct/class
	/// types. Walks the non-empty bases (in layout order) and then the fields,
	/// checking that each one starts exactly where the previous one ended.
	///
	/// \returns the bit offset just past the last base or field when everything
	/// is contiguous from offset zero, or llvm::None when the class is dynamic,
	/// a base or field fails the check, or a gap is found. The caller compares
	/// the returned offset against the size of the whole type.
	static llvm::Optional<int64_t>
	structHasUniqueObjectRepresentations(const ASTContext &Context,
	                                     const RecordDecl *RD) {
	  assert(!RD->isUnion() && "Must be struct/class type");
	  const auto &Layout = Context.getASTRecordLayout(RD);

	  int64_t CurOffsetInBits = 0;
	  if (const auto *ClassDecl = dyn_cast<CXXRecordDecl>(RD)) {
	    if (ClassDecl->isDynamicClass())
	      return llvm::None;

	    SmallVector<std::pair<QualType, int64_t>, 4> Bases;
	    // Iterate by reference: there is no need to copy each base specifier.
	    for (const auto &Base : ClassDecl->bases()) {
	      // Empty types can be inherited from, and non-empty types can potentially
	      // have tail padding, so just make sure there isn't an error.
	      if (!isStructEmpty(Base.getType())) {
	        llvm::Optional<int64_t> Size = structHasUniqueObjectRepresentations(
	            Context, Base.getType()->getAs<RecordType>()->getDecl());
	        if (!Size)
	          return llvm::None;
	        Bases.emplace_back(Base.getType(), Size.getValue());
	      }
	    }

	    // Order the bases by their offset within this record's layout.
	    std::sort(
	        Bases.begin(), Bases.end(), [&](const std::pair<QualType, int64_t> &L,
	                                        const std::pair<QualType, int64_t> &R) {
	          return Layout.getBaseClassOffset(L.first->getAsCXXRecordDecl()) <
	                 Layout.getBaseClassOffset(R.first->getAsCXXRecordDecl());
	        });

	    for (const auto &Base : Bases) {
	      int64_t BaseOffset = Context.toBits(
	          Layout.getBaseClassOffset(Base.first->getAsCXXRecordDecl()));
	      int64_t BaseSize = Base.second;
	      // A base that does not start where the previous one ended means padding.
	      if (BaseOffset != CurOffsetInBits)
	        return llvm::None;
	      CurOffsetInBits = BaseOffset + BaseSize;
	    }
	  }

	  for (const auto *Field : RD->fields()) {
	    // Reference-typed fields skip the recursive check; all others must pass.
	    if (!Field->getType()->isReferenceType() &&
	        !Context.hasUniqueObjectRepresentations(Field->getType()))
	      return llvm::None;

	    int64_t FieldSizeInBits =
	        Context.toBits(Context.getTypeSizeInChars(Field->getType()));
	    if (Field->isBitField()) {
	      int64_t BitfieldSize = Field->getBitWidthValue(Context);

	      // A bit-field wider than its type is rejected.
	      if (BitfieldSize > FieldSizeInBits)
	        return llvm::None;
	      FieldSizeInBits = BitfieldSize;
	    }

	    int64_t FieldOffsetInBits = Context.getFieldOffset(Field);

	    if (FieldOffsetInBits != CurOffsetInBits)
	      return llvm::None;

	    CurOffsetInBits = FieldSizeInBits + FieldOffsetInBits;
	  }

	  return CurOffsetInBits;
	}

	/// Implements the has_unique_object_representations predicate for \p Ty.
	/// Arrays defer to their base element type; integral, enumeration and
	/// pointer types are accepted; member pointers are accepted when the ABI
	/// reports no padding; records are checked field by field. Every other type
	/// currently yields false.
	bool ASTContext::hasUniqueObjectRepresentations(QualType Ty) const {
	  // C++17 [meta.unary.prop]:
	  //   The predicate condition for a template specialization
	  //   has_unique_object_representations<T> shall be satisfied if and only if:
	  //     (9.1) - T is trivially copyable, and
	  //     (9.2) - any two objects of type T with the same value have the same
	  //     object representation, where two objects of array or non-union class
	  //     type are considered to have the same value if their respective
	  //     sequences of direct subobjects have the same values, and two objects
	  //     of union type are considered to have the same value if they have the
	  //     same active member and the corresponding members have the same value.
	  //   The set of scalar types for which this condition holds is
	  //   implementation-defined. [ Note: If a type has padding bits, the
	  //   condition does not hold; otherwise, the condition holds true for
	  //   unsigned integral types. -- end note ]
	  assert(!Ty.isNull() && "Null QualType sent to unique object rep check");

	  // An array is unique exactly when its base element type is.
	  if (Ty->isArrayType()) {
	    QualType Element = getBaseElementType(Ty);
	    return hasUniqueObjectRepresentations(Element);
	  }

	  // (9.1) - T is trivially copyable...
	  if (!Ty.isTriviallyCopyableType(*this))
	    return false;

	  // Integrals, enums and pointers are all unique.
	  if (Ty->isIntegralOrEnumerationType() || Ty->isPointerType())
	    return true;

	  if (Ty->isMemberPointerType()) {
	    const MemberPointerType *MemberPtr = Ty->getAs<MemberPointerType>();
	    CXXABI::MemberPointerInfo MemberPtrInfo = ABI->getMemberPointerInfo(MemberPtr);
	    return !MemberPtrInfo.HasPadding;
	  }

	  if (!Ty->isRecordType()) {
	    // FIXME: More cases to handle here (list by rsmith):
	    // vectors (careful about, eg, vector of 3 foo)
	    // _Complex int and friends
	    // _Atomic T
	    // Obj-C block pointers
	    // Obj-C object pointers
	    // and perhaps OpenCL's various builtin types (pipe, sampler_t, event_t,
	    // clk_event_t, queue_t, reserve_id_t)
	    // There're also Obj-C class types and the Obj-C selector type, but I think
	    // it makes sense for those to return false here.
	    return false;
	  }

	  const RecordDecl *Record = Ty->getAs<RecordType>()->getDecl();

	  if (Record->isInvalidDecl())
	    return false;

	  if (Record->isUnion())
	    return unionHasUniqueObjectRepresentations(*this, Record);

	  // For structs the helper reports where the last subobject ends; that must
	  // coincide with the size of the whole type.
	  Optional<int64_t> EndOffsetInBits =
	      structHasUniqueObjectRepresentations(*this, Record);
	  if (!EndOffsetInBits)
	    return false;
	  return EndOffsetInBits.getValue() == static_cast<int64_t>(getTypeSize(Ty));
	}

	unsigned ASTContext::CountNonClassIvars(const ObjCInterfaceDecl *OI) const {
	unsigned count = 0;
	// Count ivars declared in class extension.
	for (const auto *Ext : OI->known_extensions())
	count += Ext->ivar_size();

	// Count ivar defined in this class's implementation. This
	// includes synthesized ivars.
	if (ObjCImplementationDecl *ImplDecl = OI->getImplementation())
	count += ImplDecl->ivar_size();

	return count;
	}

	bool ASTContext::isSentinelNullExpr(const Expr *E) {
	if (!E)
	return false;

	// nullptr_t is always treated as null.
	if (E->getType()->isNullPtrType()) return true;

	if (E->getType()->isAnyPointerType() &&
	E->IgnoreParenCasts()->isNullPointerConstant(*this,
	Expr::NPC_ValueDependentIsNull))
	return true;

	// Unfortunately, __null has type 'int'.
	if (isa<GNUNullExpr>(E)) return true;

	return false;
	}

	/// \brief Get the implementation of ObjCInterfaceDecl, or nullptr if none
	/// exists.
	///
	/// Looks \p D up in the ObjCImpls map and casts the stored declaration to
	/// ObjCImplementationDecl.
	ObjCImplementationDecl *ASTContext::getObjCImplementation(ObjCInterfaceDecl *D) {
	  llvm::DenseMap<ObjCContainerDecl*, ObjCImplDecl*>::iterator
	    I = ObjCImpls.find(D);
	  if (I != ObjCImpls.end())
	    return cast<ObjCImplementationDecl>(I->second);
	  return nullptr;
	}

	/// \brief Get the implementation of ObjCCategoryDecl, or nullptr if none
	/// exists.
	///
	/// Looks \p D up in the ObjCImpls map and casts the stored declaration to
	/// ObjCCategoryImplDecl.
	ObjCCategoryImplDecl *ASTContext::getObjCImplementation(ObjCCategoryDecl *D) {
	  llvm::DenseMap<ObjCContainerDecl*, ObjCImplDecl*>::iterator
	    I = ObjCImpls.find(D);
	  if (I != ObjCImpls.end())
	    return cast<ObjCCategoryImplDecl>(I->second);
	  return nullptr;
	}

	/// \brief Record \p ImplD as the implementation of the interface \p IFaceD,
	/// replacing any entry already stored for it. Both must be non-null.
	void ASTContext::setObjCImplementation(ObjCInterfaceDecl *IFaceD,
	                                       ObjCImplementationDecl *ImplD) {
	  assert(IFaceD && ImplD && "Passed null params");

	  ObjCImpls[IFaceD] = ImplD;
	}

	/// \brief Record \p ImplD as the implementation of the category \p CatD,
	/// replacing any entry already stored for it. Both must be non-null.
	void ASTContext::setObjCImplementation(ObjCCategoryDecl *CatD,
	                                       ObjCCategoryImplDecl *ImplD) {
	  assert(CatD && ImplD && "Passed null params");

	  ObjCImpls[CatD] = ImplD;
	}

	/// Returns whatever ObjCMethodRedecls.lookup() yields for \p MD.
	const ObjCMethodDecl *
	ASTContext::getObjCMethodRedeclaration(const ObjCMethodDecl *MD) const {
	  const ObjCMethodDecl *Redecl = ObjCMethodRedecls.lookup(MD);
	  return Redecl;
	}

	/// Records \p Redecl as the redeclaration of \p MD. Asserts that no
	/// redeclaration has been recorded for \p MD yet.
	void ASTContext::setObjCMethodRedeclaration(const ObjCMethodDecl *MD,
	                                            const ObjCMethodDecl *Redecl) {
	  assert(!getObjCMethodRedeclaration(MD) && "MD already has a redeclaration");

	  ObjCMethodRedecls[MD] = Redecl;
	}

	/// Returns the Objective-C interface associated with the declaration context
	/// of \p ND: the context itself when it is an interface, or the class
	/// interface of a category or implementation context. Returns nullptr for
	/// any other kind of context.
	const ObjCInterfaceDecl *ASTContext::getObjContainingInterface(
	                                              const NamedDecl *ND) const {
	  const DeclContext *DC = ND->getDeclContext();

	  if (const auto *Interface = dyn_cast<ObjCInterfaceDecl>(DC))
	    return Interface;
	  if (const auto *Category = dyn_cast<ObjCCategoryDecl>(DC))
	    return Category->getClassInterface();
	  if (const auto *Impl = dyn_cast<ObjCImplDecl>(DC))
	    return Impl->getClassInterface();

	  return nullptr;
	}

	/// \brief Get the copy initialization expression of VarDecl, or nullptr if
	/// none exists.
	///
	/// \p VD must be non-null and must carry the BlocksAttr attribute (both are
	/// asserted).
	Expr *ASTContext::getBlockVarCopyInits(const VarDecl *VD) {
	  assert(VD && "Passed null params");
	  assert(VD->hasAttr<BlocksAttr>() &&
	         "getBlockVarCopyInits - not __block var");
	  llvm::DenseMap<const VarDecl*, Expr*>::iterator
	    I = BlockVarCopyInits.find(VD);
	  return (I != BlockVarCopyInits.end()) ? cast<Expr>(I->second) : nullptr;
	}

	/// \brief Set the copy inialization expression of a block var decl.
	void ASTContext::setBlockVarCopyInits(VarDeclVD, Expr Init) {
	assert(VD && Init && "Passed null params");
	assert(VD->hasAttr<BlocksAttr>() &&
	"setBlockVarCopyInits - not __block var");
	BlockVarCopyInits[VD] = Init;
	}

	/// Allocates a TypeSourceInfo for \p T from BumpAlloc, followed by
	/// \p DataSize bytes of trailing storage. When \p DataSize is zero it is
	/// computed with TypeLoc::getFullDataSizeForType; otherwise the supplied
	/// value is asserted to match that computation.
	TypeSourceInfo *ASTContext::CreateTypeSourceInfo(QualType T,
	                                                 unsigned DataSize) const {
	  if (DataSize)
	    assert(DataSize == TypeLoc::getFullDataSizeForType(T) &&
	           "incorrect data size provided to CreateTypeSourceInfo!");
	  else
	    DataSize = TypeLoc::getFullDataSizeForType(T);

	  // Header plus trailing data, requested with an alignment of 8.
	  void *Storage = BumpAlloc.Allocate(sizeof(TypeSourceInfo) + DataSize, 8);
	  TypeSourceInfo *Result = new (Storage) TypeSourceInfo(T);
	  return Result;
	}

	/// Creates a TypeSourceInfo for \p T and initializes its TypeLoc with the
	/// single source location \p L.
	TypeSourceInfo *ASTContext::getTrivialTypeSourceInfo(QualType T,
	                                                     SourceLocation L) const {
	  TypeSourceInfo *TSI = CreateTypeSourceInfo(T);
	  ASTContext &MutableThis = const_cast<ASTContext &>(*this);
	  TSI->getTypeLoc().initialize(MutableThis, L);
	  return TSI;
	}

	/// Returns the layout of the interface \p D, requesting it from
	/// getObjCLayout with no implementation.
	const ASTRecordLayout &
	ASTContext::getASTObjCInterfaceLayout(const ObjCInterfaceDecl *D) const {
	  const ASTRecordLayout &Layout = getObjCLayout(D, nullptr);
	  return Layout;
	}

	/// Returns the layout for the implementation \p D, requesting it from
	/// getObjCLayout with the implementation's class interface and \p D itself.
	const ASTRecordLayout &
	ASTContext::getASTObjCImplementationLayout(
	                                        const ObjCImplementationDecl *D) const {
	  const ObjCInterfaceDecl *Interface = D->getClassInterface();
	  return getObjCLayout(Interface, D);
	}

	//===----------------------------------------------------------------------===//
	// Type creation/memoization methods
	//===----------------------------------------------------------------------===//

	/// Returns the uniqued QualType made of \p baseType with the qualifiers
	/// \p quals. The fast qualifiers are carried on the returned QualType; the
	/// remaining ones live in an ExtQuals node that is looked up in, or added
	/// to, ExtQualNodes.
	QualType
	ASTContext::getExtQualType(const Type *baseType, Qualifiers quals) const {
	  unsigned fastQuals = quals.getFastQualifiers();
	  quals.removeFastQualifiers();

	  // Check if we've already instantiated this type.
	  llvm::FoldingSetNodeID ID;
	  ExtQuals::Profile(ID, baseType, quals);
	  void *insertPos = nullptr;
	  if (ExtQuals *eq = ExtQualNodes.FindNodeOrInsertPos(ID, insertPos)) {
	    assert(eq->getQualifiers() == quals);
	    return QualType(eq, fastQuals);
	  }

	  // If the base type is not canonical, make the appropriate canonical type.
	  QualType canon;
	  if (!baseType->isCanonicalUnqualified()) {
	    SplitQualType canonSplit = baseType->getCanonicalTypeInternal().split();
	    canonSplit.Quals.addConsistentQualifiers(quals);
	    canon = getExtQualType(canonSplit.Ty, canonSplit.Quals);

	    // Re-find the insert position: the recursive call above may have
	    // changed the folding set.
	    (void) ExtQualNodes.FindNodeOrInsertPos(ID, insertPos);
	  }

	  ExtQuals *eq = new (this, TypeAlignment) ExtQuals(baseType, canon, quals);
	  ExtQualNodes.InsertNode(eq, insertPos);
	  return QualType(eq, fastQuals);
	}

	/// Returns \p T qualified with \p AddressSpace. If the canonical form of
	/// \p T is already in that address space, \p T is returned unchanged.
	/// Asserts that \p T does not already carry an address space.
	QualType ASTContext::getAddrSpaceQualType(QualType T,
	                                          LangAS AddressSpace) const {
	  if (getCanonicalType(T).getAddressSpace() == AddressSpace)
	    return T;

	  // Collect the existing qualifiers so that everything ends up merged into a
	  // single ExtQuals node.
	  QualifierCollector Collected;
	  const Type *Stripped = Collected.strip(T);

	  // A type cannot be placed in a second address space.
	  assert(!Collected.hasAddressSpace() &&
	         "Type cannot be in multiple addr spaces!");
	  Collected.addAddressSpace(AddressSpace);

	  return getExtQualType(Stripped, Collected);
	}

	/// Returns \p T with its address space qualifier removed; \p T is returned
	/// unchanged when it has none.
	QualType ASTContext::removeAddrSpaceQualType(QualType T) const {
	  // Collect the qualifiers so that the remainder can be merged into one
	  // ExtQuals node.
	  QualifierCollector Collected;
	  const Type *Stripped = Collected.strip(T);

	  // Nothing to remove.
	  if (!Collected.hasAddressSpace())
	    return T;

	  Collected.removeAddressSpace();

	  // Removal of the address space can mean there are no longer any
	  // non-fast qualifiers, so creating an ExtQualType isn't possible (asserts)
	  // or required.
	  if (!Collected.hasNonFastQualifiers())
	    return QualType(Stripped, Collected.getFastQualifiers());

	  return getExtQualType(Stripped, Collected);
	}

	/// Returns \p T qualified with the ObjC GC attribute \p GCAttr. If the
	/// canonical form of \p T already has that attribute, \p T is returned
	/// unchanged. For a pointer whose pointee is itself any kind of pointer,
	/// the attribute is applied to the pointee and a pointer to the result is
	/// returned. Asserts that \p T does not already carry a GC attribute.
	QualType ASTContext::getObjCGCQualType(QualType T,
	                                       Qualifiers::GC GCAttr) const {
	  if (getCanonicalType(T).getObjCGCAttr() == GCAttr)
	    return T;

	  if (const PointerType *Ptr = T->getAs<PointerType>()) {
	    QualType Inner = Ptr->getPointeeType();
	    if (Inner->isAnyPointerType())
	      return getPointerType(getObjCGCQualType(Inner, GCAttr));
	  }

	  // Collect the existing qualifiers so that everything ends up merged into a
	  // single ExtQuals node.
	  QualifierCollector Collected;
	  const Type *Stripped = Collected.strip(T);

	  // A type cannot be given a second ObjCGC attribute.
	  assert(!Collected.hasObjCGCAttr() &&
	         "Type cannot have multiple ObjCGCs!");
	  Collected.addObjCGCAttr(GCAttr);

	  return getExtQualType(Stripped, Collected);
	}

	/// Returns a function type equal to \p T except that its ExtInfo is
	/// \p Info. \p T itself is returned when it already has that ExtInfo.
	const FunctionType *ASTContext::adjustFunctionType(const FunctionType *T,
	                                                   FunctionType::ExtInfo Info) {
	  if (T->getExtInfo() == Info)
	    return T;

	  QualType Result;
	  if (const FunctionNoProtoType *FNPT = dyn_cast<FunctionNoProtoType>(T)) {
	    Result = getFunctionNoProtoType(FNPT->getReturnType(), Info);
	  } else {
	    // Prototyped function: rebuild it with the same return type, parameters
	    // and prototype info, changing only the ExtInfo.
	    const FunctionProtoType *FPT = cast<FunctionProtoType>(T);
	    FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
	    EPI.ExtInfo = Info;
	    Result = getFunctionType(FPT->getReturnType(), FPT->getParamTypes(), EPI);
	  }

	  return cast<FunctionType>(Result.getTypePtr());
	}

	/// Rewrites the type of every declaration of \p FD, from the most recent one
	/// back through its previous declarations, so that it returns
	/// \p ResultType, then notifies the AST mutation listener if there is one.
	void ASTContext::adjustDeducedFunctionResultType(FunctionDecl *FD,
	                                                 QualType ResultType) {
	  FD = FD->getMostRecentDecl();
	  for (;;) {
	    const FunctionProtoType *Proto = FD->getType()->castAs<FunctionProtoType>();
	    FunctionProtoType::ExtProtoInfo ProtoInfo = Proto->getExtProtoInfo();
	    FD->setType(getFunctionType(ResultType, Proto->getParamTypes(), ProtoInfo));

	    FunctionDecl *Previous = FD->getPreviousDecl();
	    if (!Previous)
	      break;
	    FD = Previous;
	  }

	  // FD now names the last declaration visited by the loop above.
	  ASTMutationListener *Listener = getASTMutationListener();
	  if (Listener)
	    Listener->DeducedReturnType(FD, ResultType);
	}

	/// Get a function type and produce the equivalent function type with the
	/// specified exception specification. Type sugar that can be present on a
	/// declaration of a function with an exception specification is permitted
	/// and preserved. Other type sugar (for instance, typedefs) is not.
	static QualType getFunctionTypeWithExceptionSpec(
	    ASTContext &Context, QualType Orig,
	    const FunctionProtoType::ExceptionSpecInfo &ESI) {
	  // Parentheses: rebuild the inner type and wrap it again.
	  if (auto *Paren = dyn_cast<ParenType>(Orig)) {
	    QualType Inner =
	        getFunctionTypeWithExceptionSpec(Context, Paren->getInnerType(), ESI);
	    return Context.getParenType(Inner);
	  }

	  // Attributed type (e.g. a calling-convention attribute): rebuild both the
	  // modified and the equivalent type under the same attribute kind.
	  if (auto *Attributed = dyn_cast<AttributedType>(Orig)) {
	    QualType Modified = getFunctionTypeWithExceptionSpec(
	        Context, Attributed->getModifiedType(), ESI);
	    QualType Equivalent = getFunctionTypeWithExceptionSpec(
	        Context, Attributed->getEquivalentType(), ESI);
	    return Context.getAttributedType(Attributed->getAttrKind(), Modified,
	                                     Equivalent);
	  }

	  // Anything else must be a function type. Rebuild it with the new exception
	  // specification.
	  const FunctionProtoType *Proto = cast<FunctionProtoType>(Orig);
	  FunctionProtoType::ExtProtoInfo ProtoInfo =
	      Proto->getExtProtoInfo().withExceptionSpec(ESI);
	  return Context.getFunctionType(Proto->getReturnType(),
	                                 Proto->getParamTypes(), ProtoInfo);
	}

	/// Returns true when \p T and \p U are the same type, or, when the language
	/// options have CPlusPlus17 set, when they become the same type after both
	/// are rebuilt with the EST_None exception specification.
	bool ASTContext::hasSameFunctionTypeIgnoringExceptionSpec(QualType T,
	                                                          QualType U) {
	  return hasSameType(T, U) ||
	         (getLangOpts().CPlusPlus17 &&
	          hasSameType(getFunctionTypeWithExceptionSpec(*this, T, EST_None),
	                      getFunctionTypeWithExceptionSpec(*this, U, EST_None)));
	}

	/// Replaces the exception specification in the type of \p FD with \p ESI.
	/// When \p AsWritten is true and \p FD has type source information, the
	/// type recorded there is overridden as well.
	void ASTContext::adjustExceptionSpec(
	    FunctionDecl *FD, const FunctionProtoType::ExceptionSpecInfo &ESI,
	    bool AsWritten) {
	  // Update the declaration's type.
	  QualType NewType =
	      getFunctionTypeWithExceptionSpec(*this, FD->getType(), ESI);
	  FD->setType(NewType);

	  if (!AsWritten)
	    return;

	  // Update the type in the type source information too, if there is any.
	  TypeSourceInfo *Written = FD->getTypeSourceInfo();
	  if (!Written)
	    return;

	  // If the type and the type-as-written differ, we may need to update
	  // the type-as-written too.
	  if (Written->getType() != FD->getType())
	    NewType = getFunctionTypeWithExceptionSpec(*this, Written->getType(), ESI);

	  // FIXME: When we get proper type location information for exceptions,
	  // we'll also have to rebuild the TypeSourceInfo. For now, we just patch
	  // up the TypeSourceInfo;
	  assert(TypeLoc::getFullDataSizeForType(NewType) ==
	             TypeLoc::getFullDataSizeForType(Written->getType()) &&
	         "TypeLoc size mismatch from updating exception specification");
	  Written->overrideType(NewType);
	}

	/// getComplexType - Return the uniqued reference to the type for a complex
	/// number with the specified element type.
	QualType ASTContext::getComplexType(QualType T) const {
	  // Unique complex types, to guarantee there is only one node of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  ComplexType::Profile(ID, T);

	  void *InsertPos = nullptr;
	  if (ComplexType *CT = ComplexTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(CT, 0);

	  // If the element type isn't canonical, this won't be a canonical type either,
	  // so fill in the canonical type field.
	  QualType Canonical;
	  if (!T.isCanonical()) {
	    Canonical = getComplexType(getCanonicalType(T));

	    // Get the new insert position for the node we care about.
	    ComplexType *NewIP = ComplexTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  // Allocate the node from this context, then record it in both the list of
	  // all types and the uniquing set.
	  ComplexType *New = new (*this, TypeAlignment) ComplexType(T, Canonical);
	  Types.push_back(New);
	  ComplexTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// getPointerType - Return the uniqued reference to the type for a pointer to
	/// the specified type.
	QualType ASTContext::getPointerType(QualType T) const {
	  // Unique pointers, to guarantee there is only one pointer of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  PointerType::Profile(ID, T);

	  void *InsertPos = nullptr;
	  if (PointerType *PT = PointerTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(PT, 0);

	  // If the pointee type isn't canonical, this won't be a canonical type either,
	  // so fill in the canonical type field.
	  QualType Canonical;
	  if (!T.isCanonical()) {
	    Canonical = getPointerType(getCanonicalType(T));

	    // Get the new insert position for the node we care about.
	    PointerType *NewIP = PointerTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  // Allocate the node from this context, then record it in both the list of
	  // all types and the uniquing set.
	  PointerType *New = new (*this, TypeAlignment) PointerType(T, Canonical);
	  Types.push_back(New);
	  PointerTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// Return the uniqued AdjustedType node for the pair (\p Orig, \p New). The
	/// node's canonical type is the canonical type of \p New.
	QualType ASTContext::getAdjustedType(QualType Orig, QualType New) const {
	  llvm::FoldingSetNodeID Key;
	  AdjustedType::Profile(Key, Orig, New);

	  // Reuse an existing node when there is one.
	  void *Where = nullptr;
	  if (AdjustedType *Found = AdjustedTypes.FindNodeOrInsertPos(Key, Where))
	    return QualType(Found, 0);

	  QualType CanonOfNew = getCanonicalType(New);

	  // Get the new insert position for the node we care about.
	  AdjustedType *Unexpected = AdjustedTypes.FindNodeOrInsertPos(Key, Where);
	  assert(!Unexpected && "Shouldn't be in the map!");
	  (void)Unexpected;

	  AdjustedType *Node = new (*this, TypeAlignment)
	      AdjustedType(Type::Adjusted, Orig, New, CanonOfNew);
	  Types.push_back(Node);
	  AdjustedTypes.InsertNode(Node, Where);
	  return QualType(Node, 0);
	}

	/// Return the uniqued DecayedType node describing the decay of \p T, which
	/// must be an array type or a function type. Array types decay through
	/// getArrayDecayedType(); function types decay to a pointer to the function
	/// type.
	QualType ASTContext::getDecayedType(QualType T) const {
	  assert((T->isArrayType() || T->isFunctionType()) && "T does not decay");

	  QualType Decayed;

	  // C99 6.7.5.3p7:
	  //   A declaration of a parameter as "array of type" shall be
	  //   adjusted to "qualified pointer to type", where the type
	  //   qualifiers (if any) are those specified within the [ and ] of
	  //   the array type derivation.
	  if (T->isArrayType())
	    Decayed = getArrayDecayedType(T);

	  // C99 6.7.5.3p8:
	  //   A declaration of a parameter as "function returning type"
	  //   shall be adjusted to "pointer to function returning type", as
	  //   in 6.3.2.1.
	  if (T->isFunctionType())
	    Decayed = getPointerType(T);

	  // Decayed types are uniqued in the same set as other adjusted types.
	  llvm::FoldingSetNodeID ID;
	  AdjustedType::Profile(ID, T, Decayed);
	  void *InsertPos = nullptr;
	  AdjustedType *AT = AdjustedTypes.FindNodeOrInsertPos(ID, InsertPos);
	  if (AT)
	    return QualType(AT, 0);

	  QualType Canonical = getCanonicalType(Decayed);

	  // Get the new insert position for the node we care about.
	  AT = AdjustedTypes.FindNodeOrInsertPos(ID, InsertPos);
	  assert(!AT && "Shouldn't be in the map!");

	  AT = new (*this, TypeAlignment) DecayedType(T, Decayed, Canonical);
	  Types.push_back(AT);
	  AdjustedTypes.InsertNode(AT, InsertPos);
	  return QualType(AT, 0);
	}

	/// getBlockPointerType - Return the uniqued reference to the type for
	/// a pointer to the specified block. \p T must be a function type.
	QualType ASTContext::getBlockPointerType(QualType T) const {
	  assert(T->isFunctionType() && "block of function types only");

	  // Block pointer types are uniqued: look for an existing node first.
	  llvm::FoldingSetNodeID Key;
	  BlockPointerType::Profile(Key, T);

	  void *Where = nullptr;
	  BlockPointerType *Found = BlockPointerTypes.FindNodeOrInsertPos(Key, Where);
	  if (Found)
	    return QualType(Found, 0);

	  // A non-canonical pointee means the new node is not canonical either, so
	  // its canonical type has to be computed and stored.
	  QualType CanonTy;
	  if (!T.isCanonical()) {
	    CanonTy = getBlockPointerType(getCanonicalType(T));

	    // Get the new insert position for the node we care about.
	    BlockPointerType *Unexpected =
	        BlockPointerTypes.FindNodeOrInsertPos(Key, Where);
	    assert(!Unexpected && "Shouldn't be in the map!");
	    (void)Unexpected;
	  }

	  BlockPointerType *Node =
	      new (*this, TypeAlignment) BlockPointerType(T, CanonTy);
	  Types.push_back(Node);
	  BlockPointerTypes.InsertNode(Node, Where);
	  return QualType(Node, 0);
	}

	/// getLValueReferenceType - Return the uniqued reference to the type for an
	/// lvalue reference to the specified type. \p SpelledAsLValue is part of the
	/// uniquing key and is stored in the new node.
	QualType
	ASTContext::getLValueReferenceType(QualType T, bool SpelledAsLValue) const {
	  assert(getCanonicalType(T) != OverloadTy &&
	         "Unresolved overloaded function type");

	  // Unique pointers, to guarantee there is only one pointer of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  ReferenceType::Profile(ID, T, SpelledAsLValue);

	  void *InsertPos = nullptr;
	  if (LValueReferenceType *RT =
	          LValueReferenceTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(RT, 0);

	  const ReferenceType *InnerRef = T->getAs<ReferenceType>();

	  // If the referencee type isn't canonical, this won't be a canonical type
	  // either, so fill in the canonical type field. The same applies when the
	  // reference was not spelled as an lvalue reference or when T is itself a
	  // reference; in the latter case the canonical type refers to the inner
	  // reference's pointee instead of T.
	  QualType Canonical;
	  if (!SpelledAsLValue || InnerRef || !T.isCanonical()) {
	    QualType PointeeType = (InnerRef ? InnerRef->getPointeeType() : T);
	    Canonical = getLValueReferenceType(getCanonicalType(PointeeType));

	    // Get the new insert position for the node we care about.
	    LValueReferenceType *NewIP =
	        LValueReferenceTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  LValueReferenceType *New
	    = new (*this, TypeAlignment) LValueReferenceType(T, Canonical,
	                                                     SpelledAsLValue);
	  Types.push_back(New);
	  LValueReferenceTypes.InsertNode(New, InsertPos);

	  return QualType(New, 0);
	}

	/// getRValueReferenceType - Return the uniqued reference to the type for an
	/// rvalue reference to the specified type.
	QualType ASTContext::getRValueReferenceType(QualType T) const {
	  // Unique pointers, to guarantee there is only one pointer of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  ReferenceType::Profile(ID, T, false);

	  void *InsertPos = nullptr;
	  if (RValueReferenceType *RT =
	          RValueReferenceTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(RT, 0);

	  const ReferenceType *InnerRef = T->getAs<ReferenceType>();

	  // If the referencee type isn't canonical, this won't be a canonical type
	  // either, so fill in the canonical type field. The same applies when T is
	  // itself a reference; the canonical type then refers to the inner
	  // reference's pointee instead of T.
	  QualType Canonical;
	  if (InnerRef || !T.isCanonical()) {
	    QualType PointeeType = (InnerRef ? InnerRef->getPointeeType() : T);
	    Canonical = getRValueReferenceType(getCanonicalType(PointeeType));

	    // Get the new insert position for the node we care about.
	    RValueReferenceType *NewIP =
	        RValueReferenceTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  RValueReferenceType *New
	    = new (*this, TypeAlignment) RValueReferenceType(T, Canonical);
	  Types.push_back(New);
	  RValueReferenceTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// getMemberPointerType - Return the uniqued reference to the type for a
	/// member pointer to the specified type, in the specified class.
	QualType ASTContext::getMemberPointerType(QualType T, const Type *Cls) const {
	  // Unique pointers, to guarantee there is only one pointer of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  MemberPointerType::Profile(ID, T, Cls);

	  void *InsertPos = nullptr;
	  if (MemberPointerType *PT =
	          MemberPointerTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(PT, 0);

	  // If the pointee or class type isn't canonical, this won't be a canonical
	  // type either, so fill in the canonical type field.
	  QualType Canonical;
	  if (!T.isCanonical() || !Cls->isCanonicalUnqualified()) {
	    Canonical = getMemberPointerType(getCanonicalType(T),getCanonicalType(Cls));

	    // Get the new insert position for the node we care about.
	    MemberPointerType *NewIP =
	        MemberPointerTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }
	  MemberPointerType *New
	    = new (*this, TypeAlignment) MemberPointerType(T, Cls, Canonical);
	  Types.push_back(New);
	  MemberPointerTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// getConstantArrayType - Return the unique reference to the type for an
	/// array of the specified element type. The size \p ArySizeIn is first
	/// zero-extended or truncated to the target's maximum pointer width, so the
	/// uniquing key and the stored size always use that width.
	QualType ASTContext::getConstantArrayType(QualType EltTy,
	                                          const llvm::APInt &ArySizeIn,
	                                          ArrayType::ArraySizeModifier ASM,
	                                          unsigned IndexTypeQuals) const {
	  assert((EltTy->isDependentType() ||
	          EltTy->isIncompleteType() || EltTy->isConstantSizeType()) &&
	         "Constant array of VLAs is illegal!");

	  // Convert the array size into a canonical width matching the pointer size for
	  // the target.
	  llvm::APInt ArySize(ArySizeIn);
	  ArySize = ArySize.zextOrTrunc(Target->getMaxPointerWidth());

	  llvm::FoldingSetNodeID ID;
	  ConstantArrayType::Profile(ID, EltTy, ArySize, ASM, IndexTypeQuals);

	  void *InsertPos = nullptr;
	  if (ConstantArrayType *ATP =
	          ConstantArrayTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(ATP, 0);

	  // If the element type isn't canonical or has qualifiers, this won't
	  // be a canonical type either, so fill in the canonical type field.
	  QualType Canon;
	  if (!EltTy.isCanonical() || EltTy.hasLocalQualifiers()) {
	    // The canonical array is built over the unqualified canonical element
	    // type; the element's qualifiers are then applied to the array type.
	    SplitQualType canonSplit = getCanonicalType(EltTy).split();
	    Canon = getConstantArrayType(QualType(canonSplit.Ty, 0), ArySize,
	                                 ASM, IndexTypeQuals);
	    Canon = getQualifiedType(Canon, canonSplit.Quals);

	    // Get the new insert position for the node we care about.
	    ConstantArrayType *NewIP =
	        ConstantArrayTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  ConstantArrayType *New = new(*this,TypeAlignment)
	    ConstantArrayType(EltTy, Canon, ArySize, ASM, IndexTypeQuals);
	  ConstantArrayTypes.InsertNode(New, InsertPos);
	  Types.push_back(New);
	  return QualType(New, 0);
	}

	/// getVariableArrayDecayedType - Turns the given type, which may be
	/// variably-modified, into the corresponding type with all the known
	/// sizes replaced with [*].
	///
	/// Types that are not variably-modified are returned unchanged. Otherwise the
	/// type is desugared, rebuilt recursively over its pointee/element/value type,
	/// and the top-level qualifiers of the desugared type are re-applied.
	QualType ASTContext::getVariableArrayDecayedType(QualType type) const {
	  // Vastly most common case.
	  if (!type->isVariablyModifiedType()) return type;

	  QualType result;

	  SplitQualType split = type.getSplitDesugaredType();
	  const Type *ty = split.Ty;
	  switch (ty->getTypeClass()) {
	// Only NON_CANONICAL_TYPE expands to a case label here, so every
	// non-canonical type class lands on the unreachable below.
	#define TYPE(Class, Base)
	#define ABSTRACT_TYPE(Class, Base)
	#define NON_CANONICAL_TYPE(Class, Base) case Type::Class:
	#include "clang/AST/TypeNodes.def"
	    llvm_unreachable("didn't desugar past all non-canonical types?");

	  // These types should never be variably-modified.
	  case Type::Builtin:
	  case Type::Complex:
	  case Type::Vector:
	  case Type::ExtVector:
	  case Type::DependentSizedExtVector:
	  case Type::DependentAddressSpace:
	  case Type::ObjCObject:
	  case Type::ObjCInterface:
	  case Type::ObjCObjectPointer:
	  case Type::Record:
	  case Type::Enum:
	  case Type::UnresolvedUsing:
	  case Type::TypeOfExpr:
	  case Type::TypeOf:
	  case Type::Decltype:
	  case Type::UnaryTransform:
	  case Type::DependentName:
	  case Type::InjectedClassName:
	  case Type::TemplateSpecialization:
	  case Type::DependentTemplateSpecialization:
	  case Type::TemplateTypeParm:
	  case Type::SubstTemplateTypeParmPack:
	  case Type::Auto:
	  case Type::DeducedTemplateSpecialization:
	  case Type::PackExpansion:
	    llvm_unreachable("type should never be variably-modified");

	  // These types can be variably-modified but should never need to
	  // further decay.
	  case Type::FunctionNoProto:
	  case Type::FunctionProto:
	  case Type::BlockPointer:
	  case Type::MemberPointer:
	  case Type::Pipe:
	    return type;

	  // These types can be variably-modified.  All these modifications
	  // preserve structure except as noted by comments.
	  // TODO: if we ever care about optimizing VLAs, there are no-op
	  // optimizations available here.
	  case Type::Pointer:
	    result = getPointerType(getVariableArrayDecayedType(
	                              cast<PointerType>(ty)->getPointeeType()));
	    break;

	  case Type::LValueReference: {
	    const LValueReferenceType *lv = cast<LValueReferenceType>(ty);
	    result = getLValueReferenceType(
	                 getVariableArrayDecayedType(lv->getPointeeType()),
	                                    lv->isSpelledAsLValue());
	    break;
	  }

	  case Type::RValueReference: {
	    const RValueReferenceType *lv = cast<RValueReferenceType>(ty);
	    result = getRValueReferenceType(
	                 getVariableArrayDecayedType(lv->getPointeeType()));
	    break;
	  }

	  case Type::Atomic: {
	    const AtomicType *at = cast<AtomicType>(ty);
	    result = getAtomicType(getVariableArrayDecayedType(at->getValueType()));
	    break;
	  }

	  case Type::ConstantArray: {
	    const ConstantArrayType *cat = cast<ConstantArrayType>(ty);
	    result = getConstantArrayType(
	                 getVariableArrayDecayedType(cat->getElementType()),
	                                  cat->getSize(),
	                                  cat->getSizeModifier(),
	                                  cat->getIndexTypeCVRQualifiers());
	    break;
	  }

	  case Type::DependentSizedArray: {
	    const DependentSizedArrayType *dat = cast<DependentSizedArrayType>(ty);
	    result = getDependentSizedArrayType(
	                 getVariableArrayDecayedType(dat->getElementType()),
	                                        dat->getSizeExpr(),
	                                        dat->getSizeModifier(),
	                                        dat->getIndexTypeCVRQualifiers(),
	                                        dat->getBracketsRange());
	    break;
	  }

	  // Turn incomplete types into [*] types.
	  // NOTE(review): this case passes ArrayType::Normal while the VariableArray
	  // case below passes ArrayType::Star; kept as in the original.
	  case Type::IncompleteArray: {
	    const IncompleteArrayType *iat = cast<IncompleteArrayType>(ty);
	    result = getVariableArrayType(
	                 getVariableArrayDecayedType(iat->getElementType()),
	                                  /*size*/ nullptr,
	                                  ArrayType::Normal,
	                                  iat->getIndexTypeCVRQualifiers(),
	                                  SourceRange());
	    break;
	  }

	  // Turn VLA types into [*] types.
	  case Type::VariableArray: {
	    const VariableArrayType *vat = cast<VariableArrayType>(ty);
	    result = getVariableArrayType(
	                 getVariableArrayDecayedType(vat->getElementType()),
	                                  /*size*/ nullptr,
	                                  ArrayType::Star,
	                                  vat->getIndexTypeCVRQualifiers(),
	                                  vat->getBracketsRange());
	    break;
	  }
	  }

	  // Apply the top-level qualifiers from the original.
	  return getQualifiedType(result, split.Quals);
	}

	/// getVariableArrayType - Returns a non-unique reference to the type for a
	/// variable array of the specified element type. A new node is created on
	/// every call.
	QualType ASTContext::getVariableArrayType(QualType EltTy,
	                                          Expr *NumElts,
	                                          ArrayType::ArraySizeModifier ASM,
	                                          unsigned IndexTypeQuals,
	                                          SourceRange Brackets) const {
	  // Since we don't unique expressions, it isn't possible to unique VLA's
	  // that have an expression provided for their size.
	  QualType Canon;

	  // Be sure to pull qualifiers off the element type.
	  if (!EltTy.isCanonical() || EltTy.hasLocalQualifiers()) {
	    // The canonical array is built over the unqualified canonical element
	    // type; the element's qualifiers are then applied to the array type.
	    SplitQualType canonSplit = getCanonicalType(EltTy).split();
	    Canon = getVariableArrayType(QualType(canonSplit.Ty, 0), NumElts, ASM,
	                                 IndexTypeQuals, Brackets);
	    Canon = getQualifiedType(Canon, canonSplit.Quals);
	  }

	  VariableArrayType *New = new(*this, TypeAlignment)
	    VariableArrayType(EltTy, Canon, NumElts, ASM, IndexTypeQuals, Brackets);

	  VariableArrayTypes.push_back(New);
	  Types.push_back(New);
	  return QualType(New, 0);
	}

	/// getDependentSizedArrayType - Returns a non-unique reference to
	/// the type for a dependently-sized array of the specified element
	/// type.
	///
	/// \p numElements may be null; otherwise it must be type- or value-dependent.
	QualType ASTContext::getDependentSizedArrayType(QualType elementType,
	                                                Expr *numElements,
	                                                ArrayType::ArraySizeModifier ASM,
	                                                unsigned elementTypeQuals,
	                                                SourceRange brackets) const {
	  assert((!numElements || numElements->isTypeDependent() ||
	          numElements->isValueDependent()) &&
	         "Size must be type- or value-dependent!");

	  // Dependently-sized array types that do not have a specified number
	  // of elements will have their sizes deduced from a dependent
	  // initializer.  We do no canonicalization here at all, which is okay
	  // because they can't be used in most locations.
	  if (!numElements) {
	    DependentSizedArrayType *newType
	      = new (*this, TypeAlignment)
	          DependentSizedArrayType(*this, elementType, QualType(),
	                                  numElements, ASM, elementTypeQuals,
	                                  brackets);
	    Types.push_back(newType);
	    return QualType(newType, 0);
	  }

	  // Otherwise, we actually build a new type every time, but we
	  // also build a canonical type.

	  SplitQualType canonElementType = getCanonicalType(elementType).split();

	  void *insertPos = nullptr;
	  llvm::FoldingSetNodeID ID;
	  DependentSizedArrayType::Profile(ID, *this,
	                                   QualType(canonElementType.Ty, 0),
	                                   ASM, elementTypeQuals, numElements);

	  // Look for an existing type with these properties.
	  DependentSizedArrayType *canonTy =
	    DependentSizedArrayTypes.FindNodeOrInsertPos(ID, insertPos);

	  // If we don't have one, build one.
	  if (!canonTy) {
	    canonTy = new (*this, TypeAlignment)
	      DependentSizedArrayType(*this, QualType(canonElementType.Ty, 0),
	                              QualType(), numElements, ASM, elementTypeQuals,
	                              brackets);
	    DependentSizedArrayTypes.InsertNode(canonTy, insertPos);
	    Types.push_back(canonTy);
	  }

	  // Apply qualifiers from the element type to the array.
	  QualType canon = getQualifiedType(QualType(canonTy,0),
	                                    canonElementType.Quals);

	  // If we didn't need extra canonicalization for the element type or the size
	  // expression, then just use that as our result.
	  if (QualType(canonElementType.Ty, 0) == elementType &&
	      canonTy->getSizeExpr() == numElements)
	    return canon;

	  // Otherwise, we need to build a type which follows the spelling
	  // of the element type.
	  DependentSizedArrayType *sugaredType
	    = new (*this, TypeAlignment)
	        DependentSizedArrayType(*this, elementType, canon, numElements,
	                                ASM, elementTypeQuals, brackets);
	  Types.push_back(sugaredType);
	  return QualType(sugaredType, 0);
	}

	/// Return the uniqued reference to the type for an incomplete array of the
	/// specified element type, size modifier and index type qualifiers.
	QualType ASTContext::getIncompleteArrayType(QualType elementType,
	                                            ArrayType::ArraySizeModifier ASM,
	                                            unsigned elementTypeQuals) const {
	  llvm::FoldingSetNodeID ID;
	  IncompleteArrayType::Profile(ID, elementType, ASM, elementTypeQuals);

	  void *insertPos = nullptr;
	  if (IncompleteArrayType *iat =
	       IncompleteArrayTypes.FindNodeOrInsertPos(ID, insertPos))
	    return QualType(iat, 0);

	  // If the element type isn't canonical, this won't be a canonical type
	  // either, so fill in the canonical type field.  We also have to pull
	  // qualifiers off the element type.
	  QualType canon;

	  if (!elementType.isCanonical() || elementType.hasLocalQualifiers()) {
	    SplitQualType canonSplit = getCanonicalType(elementType).split();
	    canon = getIncompleteArrayType(QualType(canonSplit.Ty, 0),
	                                   ASM, elementTypeQuals);
	    canon = getQualifiedType(canon, canonSplit.Quals);

	    // Get the new insert position for the node we care about.
	    IncompleteArrayType *existing =
	      IncompleteArrayTypes.FindNodeOrInsertPos(ID, insertPos);
	    assert(!existing && "Shouldn't be in the map!"); (void) existing;
	  }

	  IncompleteArrayType *newType = new (*this, TypeAlignment)
	    IncompleteArrayType(elementType, canon, ASM, elementTypeQuals);

	  IncompleteArrayTypes.InsertNode(newType, insertPos);
	  Types.push_back(newType);
	  return QualType(newType, 0);
	}

	/// getVectorType - Return the unique reference to a vector type of
	/// the specified element type and size. VectorType must be a built-in type.
	QualType ASTContext::getVectorType(QualType vecType, unsigned NumElts,
	                                   VectorType::VectorKind VecKind) const {
	  assert(vecType->isBuiltinType());

	  // Check if we've already instantiated a vector of this type.
	  llvm::FoldingSetNodeID ID;
	  VectorType::Profile(ID, vecType, NumElts, Type::Vector, VecKind);

	  void *InsertPos = nullptr;
	  if (VectorType *VTP = VectorTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(VTP, 0);

	  // If the element type isn't canonical, this won't be a canonical type either,
	  // so fill in the canonical type field.
	  QualType Canonical;
	  if (!vecType.isCanonical()) {
	    Canonical = getVectorType(getCanonicalType(vecType), NumElts, VecKind);

	    // Get the new insert position for the node we care about.
	    VectorType *NewIP = VectorTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }
	  VectorType *New = new (*this, TypeAlignment)
	    VectorType(vecType, NumElts, Canonical, VecKind);
	  VectorTypes.InsertNode(New, InsertPos);
	  Types.push_back(New);
	  return QualType(New, 0);
	}

	/// getExtVectorType - Return the unique reference to an extended vector type of
	/// the specified element type and size. The element type must be a built-in
	/// type or a dependent type.
	QualType
	ASTContext::getExtVectorType(QualType vecType, unsigned NumElts) const {
	  assert(vecType->isBuiltinType() || vecType->isDependentType());

	  // Check if we've already instantiated a vector of this type. Extended
	  // vectors share the VectorTypes set and are told apart by the
	  // Type::ExtVector class in the profile.
	  llvm::FoldingSetNodeID ID;
	  VectorType::Profile(ID, vecType, NumElts, Type::ExtVector,
	                      VectorType::GenericVector);
	  void *InsertPos = nullptr;
	  if (VectorType *VTP = VectorTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(VTP, 0);

	  // If the element type isn't canonical, this won't be a canonical type either,
	  // so fill in the canonical type field.
	  QualType Canonical;
	  if (!vecType.isCanonical()) {
	    Canonical = getExtVectorType(getCanonicalType(vecType), NumElts);

	    // Get the new insert position for the node we care about.
	    VectorType *NewIP = VectorTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }
	  ExtVectorType *New = new (*this, TypeAlignment)
	    ExtVectorType(vecType, NumElts, Canonical);
	  VectorTypes.InsertNode(New, InsertPos);
	  Types.push_back(New);
	  return QualType(New, 0);
	}

	/// Build a DependentSizedExtVectorType for element type \p vecType and size
	/// expression \p SizeExpr. A new node is created on every call; only the
	/// node built from an already-canonical element type with no prior match is
	/// inserted into the uniquing set.
	QualType
	ASTContext::getDependentSizedExtVectorType(QualType vecType,
	                                           Expr *SizeExpr,
	                                           SourceLocation AttrLoc) const {
	  QualType CanonElt = getCanonicalType(vecType);

	  // The uniquing key is computed from the canonical element type.
	  llvm::FoldingSetNodeID Key;
	  DependentSizedExtVectorType::Profile(Key, *this, CanonElt, SizeExpr);

	  void *Where = nullptr;
	  DependentSizedExtVectorType *Existing =
	      DependentSizedExtVectorTypes.FindNodeOrInsertPos(Key, Where);

	  DependentSizedExtVectorType *Result;
	  if (Existing) {
	    // We already have a canonical version of this array type; use it as
	    // the canonical type for a newly-built type.
	    Result = new (*this, TypeAlignment)
	        DependentSizedExtVectorType(*this, vecType, QualType(Existing, 0),
	                                    SizeExpr, AttrLoc);
	  } else if (CanonElt == vecType) {
	    // The element type is already canonical: build the node with a null
	    // canonical type and register it in the uniquing set.
	    Result = new (*this, TypeAlignment)
	        DependentSizedExtVectorType(*this, vecType, QualType(), SizeExpr,
	                                    AttrLoc);

	    DependentSizedExtVectorType *Recheck =
	        DependentSizedExtVectorTypes.FindNodeOrInsertPos(Key, Where);
	    assert(!Recheck && "Dependent-sized ext_vector canonical type broken");
	    (void)Recheck;
	    DependentSizedExtVectorTypes.InsertNode(Result, Where);
	  } else {
	    // Obtain the type over the canonical element type first (with an empty
	    // source location), then build the sugared node on top of it.
	    QualType CanonVector =
	        getDependentSizedExtVectorType(CanonElt, SizeExpr, SourceLocation());
	    Result = new (*this, TypeAlignment)
	        DependentSizedExtVectorType(*this, vecType, CanonVector, SizeExpr,
	                                    AttrLoc);
	  }

	  Types.push_back(Result);
	  return QualType(Result, 0);
	}

	/// Build a DependentAddressSpaceType for \p PointeeType whose address space
	/// is given by \p AddrSpaceExpr, which must be instantiation-dependent.
	/// Returns the uniqued canonical node directly when neither the pointee type
	/// nor the expression needed canonicalization; otherwise returns a new
	/// sugared node pointing at it.
	QualType ASTContext::getDependentAddressSpaceType(QualType PointeeType,
	                                                  Expr *AddrSpaceExpr,
	                                                  SourceLocation AttrLoc) const {
	  assert(AddrSpaceExpr->isInstantiationDependent());

	  QualType CanonPointee = getCanonicalType(PointeeType);

	  llvm::FoldingSetNodeID Key;
	  DependentAddressSpaceType::Profile(Key, *this, CanonPointee, AddrSpaceExpr);

	  // Find the canonical node for this pointee/expression pair, creating and
	  // registering it if this is the first request.
	  void *Where = nullptr;
	  DependentAddressSpaceType *CanonNode =
	      DependentAddressSpaceTypes.FindNodeOrInsertPos(Key, Where);
	  if (!CanonNode) {
	    CanonNode = new (*this, TypeAlignment)
	        DependentAddressSpaceType(*this, CanonPointee, QualType(),
	                                  AddrSpaceExpr, AttrLoc);
	    DependentAddressSpaceTypes.InsertNode(CanonNode, Where);
	    Types.push_back(CanonNode);
	  }

	  // The canonical node is the answer when the request matches it exactly.
	  const bool MatchesCanonical =
	      CanonPointee == PointeeType &&
	      CanonNode->getAddrSpaceExpr() == AddrSpaceExpr;
	  if (MatchesCanonical)
	    return QualType(CanonNode, 0);

	  // Otherwise build a node that keeps the spelling that was passed in.
	  DependentAddressSpaceType *Sugared = new (*this, TypeAlignment)
	      DependentAddressSpaceType(*this, PointeeType, QualType(CanonNode, 0),
	                                AddrSpaceExpr, AttrLoc);
	  Types.push_back(Sugared);
	  return QualType(Sugared, 0);
	}

	/// \brief Determine whether \p T is canonical as the result type of a function.
	///
	/// That is the case when \p T is canonical and its Objective-C lifetime
	/// qualifier is either OCL_None or OCL_ExplicitNone.
	static bool isCanonicalResultType(QualType T) {
	  return T.isCanonical() &&
	         (T.getObjCLifetime() == Qualifiers::OCL_None ||
	          T.getObjCLifetime() == Qualifiers::OCL_ExplicitNone);
	}

	/// getFunctionNoProtoType - Return a K&R style C function type like 'int()'.
	QualType
	ASTContext::getFunctionNoProtoType(QualType ResultTy,
	                                   const FunctionType::ExtInfo &Info) const {
	  // Unique functions, to guarantee there is only one function of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  FunctionNoProtoType::Profile(ID, ResultTy, Info);

	  void *InsertPos = nullptr;
	  if (FunctionNoProtoType *FT =
	        FunctionNoProtoTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(FT, 0);

	  // If the result type isn't canonical as a function result, this won't be a
	  // canonical type either, so fill in the canonical type field.
	  QualType Canonical;
	  if (!isCanonicalResultType(ResultTy)) {
	    Canonical =
	      getFunctionNoProtoType(getCanonicalFunctionResultType(ResultTy), Info);

	    // Get the new insert position for the node we care about.
	    FunctionNoProtoType *NewIP =
	      FunctionNoProtoTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  FunctionNoProtoType *New = new (*this, TypeAlignment)
	    FunctionNoProtoType(ResultTy, Canonical, Info);
	  Types.push_back(New);
	  FunctionNoProtoTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// Return the canonical form of \p ResultType for use as a function result
	/// type: its canonical type with any Objective-C lifetime qualifier removed.
	CanQualType
	ASTContext::getCanonicalFunctionResultType(QualType ResultType) const {
	  CanQualType Canon = getCanonicalType(ResultType);
	  Qualifiers Quals = Canon.getQualifiers();

	  // Without an ARC lifetime qualifier the canonical type is used as-is.
	  if (!Quals.hasObjCLifetime())
	    return Canon;

	  // Canonical result types do not have ARC lifetime qualifiers: strip the
	  // lifetime and re-apply the remaining qualifiers to the unqualified type.
	  Quals.removeObjCLifetime();
	  return CanQualType::CreateUnsafe(
	      getQualifiedType(Canon.getUnqualifiedType(), Quals));
	}

	/// Determine whether the exception specification \p ESI may appear unchanged
	/// in a canonical function type. \p NoexceptInType says whether exception
	/// specifications are part of the type (the caller passes the C++17 language
	/// option); when it is false only EST_None is canonical.
	static bool isCanonicalExceptionSpecification(
	    const FunctionProtoType::ExceptionSpecInfo &ESI, bool NoexceptInType) {
	  // No exception specification at all is always canonical.
	  if (ESI.Type == EST_None)
	    return true;

	  // Anything else can only be canonical when the specification is part of
	  // the type.
	  if (!NoexceptInType)
	    return false;

	  switch (ESI.Type) {
	  // C++17 onwards: exception specification is part of the type, as a simple
	  // boolean "can this function type throw".
	  case EST_BasicNoexcept:
	    return true;

	  // A dynamic exception specification is canonical if it only contains pack
	  // expansions (so we can't tell whether it's non-throwing) and all its
	  // contained types are canonical.
	  case EST_Dynamic: {
	    bool SawPackExpansion = false;
	    for (QualType Thrown : ESI.Exceptions) {
	      if (!Thrown.isCanonical())
	        return false;
	      if (Thrown->getAs<PackExpansionType>())
	        SawPackExpansion = true;
	    }
	    return SawPackExpansion;
	  }

	  // A noexcept(expr) specification is (possibly) canonical if expr is
	  // value-dependent.
	  case EST_ComputedNoexcept:
	    return ESI.NoexceptExpr && ESI.NoexceptExpr->isValueDependent();

	  // Every remaining kind is not canonical.
	  default:
	    return false;
	  }
	}

	/// Build (or find) the FunctionProtoType with result type \p ResultTy,
	/// parameter types \p ArgArray and extra prototype information \p EPI.
	///
	/// When \p OnlyWantCanonical is true, an existing node with a matching
	/// profile is returned as-is, and the inputs are asserted to describe a type
	/// that is already canonical. Otherwise a matching node is reused unless the
	/// exception specification is a computed noexcept whose expression differs
	/// from the one stored in that node; in that case a new, non-uniqued sugar
	/// node is created whose canonical type is that of the existing node.
	QualType ASTContext::getFunctionTypeInternal(
	    QualType ResultTy, ArrayRef<QualType> ArgArray,
	    const FunctionProtoType::ExtProtoInfo &EPI, bool OnlyWantCanonical) const {
	  size_t NumArgs = ArgArray.size();

	  // Unique functions, to guarantee there is only one function of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  FunctionProtoType::Profile(ID, ResultTy, ArgArray.begin(), NumArgs, EPI,
	                             *this, true);

	  QualType Canonical;
	  // Set when the new node must not be inserted into the uniquing set because
	  // an equivalent node is already there.
	  bool Unique = false;

	  void *InsertPos = nullptr;
	  if (FunctionProtoType *FPT =
	        FunctionProtoTypes.FindNodeOrInsertPos(ID, InsertPos)) {
	    QualType Existing = QualType(FPT, 0);

	    // If we find a pre-existing equivalent FunctionProtoType, we can just reuse
	    // it so long as our exception specification doesn't contain a dependent
	    // noexcept expression, or we're just looking for a canonical type.
	    // Otherwise, we're going to need to create a type
	    // sugar node to hold the concrete expression.
	    if (OnlyWantCanonical || EPI.ExceptionSpec.Type != EST_ComputedNoexcept ||
	        EPI.ExceptionSpec.NoexceptExpr == FPT->getNoexceptExpr())
	      return Existing;

	    // We need a new type sugar node for this one, to hold the new noexcept
	    // expression. We do no canonicalization here, but that's OK since we don't
	    // expect to see the same noexcept expression much more than once.
	    Canonical = getCanonicalType(Existing);
	    Unique = true;
	  }

	  bool NoexceptInType = getLangOpts().CPlusPlus17;
	  bool IsCanonicalExceptionSpec =
	      isCanonicalExceptionSpecification(EPI.ExceptionSpec, NoexceptInType);

	  // Determine whether the type being created is already canonical or not.
	  bool isCanonical = !Unique && IsCanonicalExceptionSpec &&
	                     isCanonicalResultType(ResultTy) && !EPI.HasTrailingReturn;
	  for (unsigned i = 0; i != NumArgs && isCanonical; ++i)
	    if (!ArgArray[i].isCanonicalAsParam())
	      isCanonical = false;

	  if (OnlyWantCanonical)
	    assert(isCanonical &&
	           "given non-canonical parameters constructing canonical type");

	  // If this type isn't canonical, get the canonical version of it if we don't
	  // already have it. The exception spec is only partially part of the
	  // canonical type, and only in C++17 onwards.
	  if (!isCanonical && Canonical.isNull()) {
	    SmallVector<QualType, 16> CanonicalArgs;
	    CanonicalArgs.reserve(NumArgs);
	    for (unsigned i = 0; i != NumArgs; ++i)
	      CanonicalArgs.push_back(getCanonicalParamType(ArgArray[i]));

	    // Backing storage for the canonicalized exception types referenced by
	    // CanonicalEPI below.
	    llvm::SmallVector<QualType, 8> ExceptionTypeStorage;
	    FunctionProtoType::ExtProtoInfo CanonicalEPI = EPI;
	    CanonicalEPI.HasTrailingReturn = false;

	    if (IsCanonicalExceptionSpec) {
	      // Exception spec is already OK.
	    } else if (NoexceptInType) {
	      switch (EPI.ExceptionSpec.Type) {
	      case EST_Unparsed: case EST_Unevaluated: case EST_Uninstantiated:
	        // We don't know yet. It shouldn't matter what we pick here; no-one
	        // should ever look at this.
	        LLVM_FALLTHROUGH;
	      case EST_None: case EST_MSAny:
	        CanonicalEPI.ExceptionSpec.Type = EST_None;
	        break;

	        // A dynamic exception specification is almost always "not noexcept",
	        // with the exception that a pack expansion might expand to no types.
	      case EST_Dynamic: {
	        bool AnyPacks = false;
	        for (QualType ET : EPI.ExceptionSpec.Exceptions) {
	          if (ET->getAs<PackExpansionType>())
	            AnyPacks = true;
	          ExceptionTypeStorage.push_back(getCanonicalType(ET));
	        }
	        if (!AnyPacks)
	          CanonicalEPI.ExceptionSpec.Type = EST_None;
	        else {
	          CanonicalEPI.ExceptionSpec.Type = EST_Dynamic;
	          CanonicalEPI.ExceptionSpec.Exceptions = ExceptionTypeStorage;
	        }
	        break;
	      }

	      case EST_DynamicNone: case EST_BasicNoexcept:
	        CanonicalEPI.ExceptionSpec.Type = EST_BasicNoexcept;
	        break;

	      case EST_ComputedNoexcept: {
	        // Fold the noexcept expression to a constant: true maps to
	        // EST_BasicNoexcept, false to EST_None.
	        llvm::APSInt Value(1);
	        auto *E = CanonicalEPI.ExceptionSpec.NoexceptExpr;
	        if (!E || !E->isIntegerConstantExpr(Value, *this, nullptr,
	                                            /*IsEvaluated*/false)) {
	          // This noexcept specification is invalid.
	          // FIXME: Should this be able to happen?
	          CanonicalEPI.ExceptionSpec.Type = EST_None;
	          break;
	        }

	        CanonicalEPI.ExceptionSpec.Type =
	            Value.getBoolValue() ? EST_BasicNoexcept : EST_None;
	        break;
	      }
	      }
	    } else {
	      // Exception specifications are not part of the type: drop it entirely.
	      CanonicalEPI.ExceptionSpec = FunctionProtoType::ExceptionSpecInfo();
	    }

	    // Adjust the canonical function result type.
	    CanQualType CanResultTy = getCanonicalFunctionResultType(ResultTy);
	    Canonical =
	        getFunctionTypeInternal(CanResultTy, CanonicalArgs, CanonicalEPI, true);

	    // Get the new insert position for the node we care about.
	    FunctionProtoType *NewIP =
	      FunctionProtoTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }

	  // FunctionProtoType objects are allocated with extra bytes after
	  // them for three variable size arrays at the end:
	  //  - parameter types
	  //  - exception types
	  //  - extended parameter information
	  // Instead of the exception types, there could be a noexcept
	  // expression, or information used to resolve the exception
	  // specification.
	  size_t Size = sizeof(FunctionProtoType) +
	                NumArgs * sizeof(QualType);

	  if (EPI.ExceptionSpec.Type == EST_Dynamic) {
	    Size += EPI.ExceptionSpec.Exceptions.size() * sizeof(QualType);
	  } else if (EPI.ExceptionSpec.Type == EST_ComputedNoexcept) {
	    Size += sizeof(Expr*);
	  } else if (EPI.ExceptionSpec.Type == EST_Uninstantiated) {
	    Size += 2 * sizeof(FunctionDecl*);
	  } else if (EPI.ExceptionSpec.Type == EST_Unevaluated) {
	    Size += sizeof(FunctionDecl*);
	  }

	  // Put the ExtParameterInfos last.  If all were equal, it would make
	  // more sense to put these before the exception specification, because
	  // it's much easier to skip past them compared to the elaborate switch
	  // required to skip the exception specification.  However, all is not
	  // equal; ExtParameterInfos are used to model very uncommon features,
	  // and it's better not to burden the more common paths.
	  if (EPI.ExtParameterInfos) {
	    Size += NumArgs * sizeof(FunctionProtoType::ExtParameterInfo);
	  }

	  // Allocate raw storage of the computed size and construct the node in it.
	  FunctionProtoType *FTP = (FunctionProtoType*) Allocate(Size, TypeAlignment);
	  FunctionProtoType::ExtProtoInfo newEPI = EPI;
	  new (FTP) FunctionProtoType(ResultTy, ArgArray, Canonical, newEPI);
	  Types.push_back(FTP);
	  if (!Unique)
	    FunctionProtoTypes.InsertNode(FTP, InsertPos);
	  return QualType(FTP, 0);
	}

	/// Return the uniqued pipe type whose element type is \p T. \p ReadOnly is
	/// part of the profile, so read and write pipes over the same element type
	/// are distinct nodes.
	QualType ASTContext::getPipeType(QualType T, bool ReadOnly) const {
	  llvm::FoldingSetNodeID ID;
	  PipeType::Profile(ID, T, ReadOnly);

	  void *InsertPos = nullptr;
	  if (PipeType *PT = PipeTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(PT, 0);

	  // If the pipe element type isn't canonical, this won't be a canonical type
	  // either, so fill in the canonical type field.
	  QualType Canonical;
	  if (!T.isCanonical()) {
	    Canonical = getPipeType(getCanonicalType(T), ReadOnly);

	    // Get the new insert position for the node we care about.
	    PipeType *NewIP = PipeTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!");
	    (void)NewIP;
	  }

	  // Placement-new takes the context by reference and yields a pointer to the
	  // newly built node.
	  PipeType *New = new (*this, TypeAlignment) PipeType(T, Canonical, ReadOnly);
	  Types.push_back(New);
	  PipeTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// Return the pipe type over \p T with the read-only flag set.
	QualType ASTContext::getReadPipeType(QualType T) const {
	  const bool ReadOnly = true;
	  return getPipeType(T, ReadOnly);
	}

	/// Return the pipe type over \p T with the read-only flag cleared.
	QualType ASTContext::getWritePipeType(QualType T) const {
	  const bool ReadOnly = false;
	  return getPipeType(T, ReadOnly);
	}

	#ifndef NDEBUG
	static bool NeedsInjectedClassNameType(const RecordDecl *D) {
	if (!isa<CXXRecordDecl>(D)) return false;
	const CXXRecordDecl *RD = cast<CXXRecordDecl>(D);
	if (isa<ClassTemplatePartialSpecializationDecl>(RD))
	return true;
	if (RD->getDescribedClassTemplate() &&
	!isa<ClassTemplateSpecializationDecl>(RD))
	return true;
	return false;
	}
	#endif

	/// getInjectedClassNameType - Return the unique reference to the
	/// injected class name type for the specified templated declaration.
	///
	/// The type is cached in Decl->TypeForDecl. If it is not cached yet it is
	/// taken from the previous declaration when there is one, and otherwise a
	/// new InjectedClassNameType is built from \p Decl and \p TST.
	QualType ASTContext::getInjectedClassNameType(CXXRecordDecl *Decl,
	                                              QualType TST) const {
	  assert(NeedsInjectedClassNameType(Decl));

	  // Already cached on this declaration.
	  if (Decl->TypeForDecl) {
	    assert(isa<InjectedClassNameType>(Decl->TypeForDecl));
	    return QualType(Decl->TypeForDecl, 0);
	  }

	  // Share the type node of the previous declaration.
	  if (CXXRecordDecl *Previous = Decl->getPreviousDecl()) {
	    assert(Previous->TypeForDecl && "previous declaration has no type");
	    Decl->TypeForDecl = Previous->TypeForDecl;
	    assert(isa<InjectedClassNameType>(Decl->TypeForDecl));
	    return QualType(Decl->TypeForDecl, 0);
	  }

	  // First declaration seen: build the node and remember it.
	  Type *Injected =
	      new (*this, TypeAlignment) InjectedClassNameType(Decl, TST);
	  Decl->TypeForDecl = Injected;
	  Types.push_back(Injected);
	  return QualType(Decl->TypeForDecl, 0);
	}

	/// getTypeDeclTypeSlow - Return the unique reference to the type for the
	/// specified type declaration.
	///
	/// This is the slow path: \p Decl must be non-null and must not have a
	/// cached TypeForDecl yet (both are asserted). Typedefs, records and enums
	/// are forwarded to their dedicated getters; an unresolved using-typename
	/// declaration gets a new UnresolvedUsingType that is cached on the
	/// declaration.
	QualType ASTContext::getTypeDeclTypeSlow(const TypeDecl *Decl) const {
	  assert(Decl && "Passed null for Decl param");
	  assert(!Decl->TypeForDecl && "TypeForDecl present in slow case");

	  if (const TypedefNameDecl *Typedef = dyn_cast<TypedefNameDecl>(Decl))
	    return getTypedefType(Typedef);

	  assert(!isa<TemplateTypeParmDecl>(Decl) &&
	         "Template type parameter types are always available.");

	  if (const RecordDecl *Record = dyn_cast<RecordDecl>(Decl)) {
	    assert(Record->isFirstDecl() && "struct/union has previous declaration");
	    assert(!NeedsInjectedClassNameType(Record));
	    return getRecordType(Record);
	  } else if (const EnumDecl *Enum = dyn_cast<EnumDecl>(Decl)) {
	    assert(Enum->isFirstDecl() && "enum has previous declaration");
	    return getEnumType(Enum);
	  } else if (const UnresolvedUsingTypenameDecl *Using =
	                 dyn_cast<UnresolvedUsingTypenameDecl>(Decl)) {
	    // Placement-new takes the context by reference and yields a pointer.
	    Type *newType = new (*this, TypeAlignment) UnresolvedUsingType(Using);
	    Decl->TypeForDecl = newType;
	    Types.push_back(newType);
	  } else
	    llvm_unreachable("TypeDecl without a type?");

	  return QualType(Decl->TypeForDecl, 0);
	}

	/// getTypedefType - Return the unique reference to the type for the
	/// specified typedef name decl.
	///
	/// If \p Canonical is null it is computed from the declaration's underlying
	/// type. The resulting node is cached in Decl->TypeForDecl.
	QualType
	ASTContext::getTypedefType(const TypedefNameDecl *Decl,
	                           QualType Canonical) const {
	  if (Decl->TypeForDecl) return QualType(Decl->TypeForDecl, 0);

	  if (Canonical.isNull())
	    Canonical = getCanonicalType(Decl->getUnderlyingType());

	  // Placement-new takes the context by reference and yields a pointer.
	  TypedefType *newType = new (*this, TypeAlignment)
	      TypedefType(Type::Typedef, Decl, Canonical);
	  Decl->TypeForDecl = newType;
	  Types.push_back(newType);
	  return QualType(newType, 0);
	}

	/// Return the record type for \p Decl. A cached TypeForDecl is returned
	/// directly; otherwise the type of the previous declaration is reused when
	/// it has one, and failing that a new RecordType is created and cached.
	QualType ASTContext::getRecordType(const RecordDecl *Decl) const {
	  if (Decl->TypeForDecl) return QualType(Decl->TypeForDecl, 0);

	  if (const RecordDecl *PrevDecl = Decl->getPreviousDecl())
	    if (PrevDecl->TypeForDecl)
	      return QualType(Decl->TypeForDecl = PrevDecl->TypeForDecl, 0);

	  // Placement-new takes the context by reference and yields a pointer.
	  RecordType *newType = new (*this, TypeAlignment) RecordType(Decl);
	  Decl->TypeForDecl = newType;
	  Types.push_back(newType);
	  return QualType(newType, 0);
	}

	/// Return the enum type for \p Decl. A cached TypeForDecl is returned
	/// directly; otherwise the type of the previous declaration is reused when
	/// it has one, and failing that a new EnumType is created and cached.
	QualType ASTContext::getEnumType(const EnumDecl *Decl) const {
	  if (Decl->TypeForDecl) return QualType(Decl->TypeForDecl, 0);

	  if (const EnumDecl *PrevDecl = Decl->getPreviousDecl())
	    if (PrevDecl->TypeForDecl)
	      return QualType(Decl->TypeForDecl = PrevDecl->TypeForDecl, 0);

	  // Placement-new takes the context by reference and yields a pointer.
	  EnumType *newType = new (*this, TypeAlignment) EnumType(Decl);
	  Decl->TypeForDecl = newType;
	  Types.push_back(newType);
	  return QualType(newType, 0);
	}

	/// Return the uniqued AttributedType for the given attribute kind, modified
	/// type and equivalent type. The canonical type of the node is the
	/// canonical form of \p equivalentType.
	QualType ASTContext::getAttributedType(AttributedType::Kind attrKind,
	                                       QualType modifiedType,
	                                       QualType equivalentType) {
	  llvm::FoldingSetNodeID profile;
	  AttributedType::Profile(profile, attrKind, modifiedType, equivalentType);

	  // Reuse an existing node when one matches the profile.
	  void *insertPos = nullptr;
	  if (AttributedType *existing =
	          AttributedTypes.FindNodeOrInsertPos(profile, insertPos))
	    return QualType(existing, 0);

	  QualType canonical = getCanonicalType(equivalentType);
	  AttributedType *created = new (*this, TypeAlignment)
	      AttributedType(canonical, attrKind, modifiedType, equivalentType);

	  Types.push_back(created);
	  AttributedTypes.InsertNode(created, insertPos);
	  return QualType(created, 0);
	}

	/// \brief Retrieve a substitution-result type.
	///
	/// Returns the uniqued SubstTemplateTypeParmType for the pair (\p Parm,
	/// \p Replacement). \p Replacement must already be canonical (asserted).
	QualType
	ASTContext::getSubstTemplateTypeParmType(const TemplateTypeParmType *Parm,
	                                         QualType Replacement) const {
	  assert(Replacement.isCanonical()
	         && "replacement types must always be canonical");

	  llvm::FoldingSetNodeID ID;
	  SubstTemplateTypeParmType::Profile(ID, Parm, Replacement);

	  void *InsertPos = nullptr;
	  if (SubstTemplateTypeParmType *Existing =
	          SubstTemplateTypeParmTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(Existing, 0);

	  // No node yet: build one and register it in the folding set.
	  SubstTemplateTypeParmType *Created = new (*this, TypeAlignment)
	      SubstTemplateTypeParmType(Parm, Replacement);
	  Types.push_back(Created);
	  SubstTemplateTypeParmTypes.InsertNode(Created, InsertPos);
	  return QualType(Created, 0);
	}

	/// \brief Retrieve the uniqued SubstTemplateTypeParmPackType for the
	/// template type parameter \p Parm and the argument pack \p ArgPack.
	///
	/// In asserting builds every pack element is checked to be a canonical type
	/// argument. If \p Parm is not canonical, the canonical type of the result
	/// is the same kind of node built over the canonical parameter.
	QualType ASTContext::getSubstTemplateTypeParmPackType(
	                                          const TemplateTypeParmType *Parm,
	                                          const TemplateArgument &ArgPack) {
	#ifndef NDEBUG
	  for (const auto &Element : ArgPack.pack_elements()) {
	    assert(Element.getKind() == TemplateArgument::Type &&"Pack contains a non-type");
	    assert(Element.getAsType().isCanonical() && "Pack contains non-canonical type");
	  }
	#endif

	  llvm::FoldingSetNodeID ID;
	  SubstTemplateTypeParmPackType::Profile(ID, Parm, ArgPack);

	  void *InsertPos = nullptr;
	  SubstTemplateTypeParmPackType *Existing =
	      SubstTemplateTypeParmPackTypes.FindNodeOrInsertPos(ID, InsertPos);
	  if (Existing)
	    return QualType(Existing, 0);

	  QualType Canon;
	  if (!Parm->isCanonicalUnqualified()) {
	    QualType CanonParm = getCanonicalType(QualType(Parm, 0));
	    Canon = getSubstTemplateTypeParmPackType(
	        cast<TemplateTypeParmType>(CanonParm), ArgPack);
	    // Look up the insert position again after the recursive call.
	    SubstTemplateTypeParmPackTypes.FindNodeOrInsertPos(ID, InsertPos);
	  }

	  SubstTemplateTypeParmPackType *Created = new (*this, TypeAlignment)
	      SubstTemplateTypeParmPackType(Parm, Canon, ArgPack);
	  Types.push_back(Created);
	  SubstTemplateTypeParmPackTypes.InsertNode(Created, InsertPos);
	  return QualType(Created, 0);
	}

	/// \brief Retrieve the template type parameter type for a template
	/// parameter or parameter pack with the given depth, index, and (optionally)
	/// name.
	///
	/// When \p TTPDecl is given, the node's canonical type is the type obtained
	/// for the same depth/index/pack-ness without a declaration.
	QualType ASTContext::getTemplateTypeParmType(unsigned Depth, unsigned Index,
	                                             bool ParameterPack,
	                                             TemplateTypeParmDecl *TTPDecl) const {
	  llvm::FoldingSetNodeID ID;
	  TemplateTypeParmType::Profile(ID, Depth, Index, ParameterPack, TTPDecl);

	  void *InsertPos = nullptr;
	  if (TemplateTypeParmType *Existing =
	          TemplateTypeParmTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(Existing, 0);

	  TemplateTypeParmType *Created;
	  if (!TTPDecl) {
	    // Declaration-less form.
	    Created = new (*this, TypeAlignment)
	        TemplateTypeParmType(Depth, Index, ParameterPack);
	  } else {
	    QualType Canon = getTemplateTypeParmType(Depth, Index, ParameterPack);
	    Created = new (*this, TypeAlignment) TemplateTypeParmType(TTPDecl, Canon);

	    // Look up the insert position again after the recursive call; the node
	    // for this profile must still be absent.
	    TemplateTypeParmType *TypeCheck =
	        TemplateTypeParmTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!TypeCheck && "Template type parameter canonical type broken");
	    (void)TypeCheck;
	  }

	  Types.push_back(Created);
	  TemplateTypeParmTypes.InsertNode(Created, InsertPos);
	  return QualType(Created, 0);
	}

	/// Build a TypeSourceInfo for the template specialization type formed from
	/// \p Name, \p Args and \p Underlying, and fill in its location data: an
	/// empty template-keyword location, \p NameLoc as the template name
	/// location, and the angle-bracket and per-argument locations from \p Args.
	/// \p Name must not be a dependent template name (asserted).
	TypeSourceInfo *
	ASTContext::getTemplateSpecializationTypeInfo(TemplateName Name,
	                                              SourceLocation NameLoc,
	                                        const TemplateArgumentListInfo &Args,
	                                              QualType Underlying) const {
	  assert(!Name.getAsDependentTemplateName() &&
	         "No dependent template names here!");

	  QualType SpecType = getTemplateSpecializationType(Name, Args, Underlying);
	  TypeSourceInfo *TSInfo = CreateTypeSourceInfo(SpecType);

	  TemplateSpecializationTypeLoc SpecLoc =
	      TSInfo->getTypeLoc().castAs<TemplateSpecializationTypeLoc>();
	  SpecLoc.setTemplateKeywordLoc(SourceLocation());
	  SpecLoc.setTemplateNameLoc(NameLoc);
	  SpecLoc.setLAngleLoc(Args.getLAngleLoc());
	  SpecLoc.setRAngleLoc(Args.getRAngleLoc());

	  // Copy the location info of every argument.
	  const unsigned NumArgs = SpecLoc.getNumArgs();
	  for (unsigned Idx = 0; Idx != NumArgs; ++Idx)
	    SpecLoc.setArgLocInfo(Idx, Args[Idx].getLocInfo());

	  return TSInfo;
	}

	/// Overload taking arguments with location info: strips the location info
	/// from \p Args and forwards to the ArrayRef<TemplateArgument> overload.
	/// \p Template must not be a dependent template name (asserted).
	QualType
	ASTContext::getTemplateSpecializationType(TemplateName Template,
	                                          const TemplateArgumentListInfo &Args,
	                                          QualType Underlying) const {
	  assert(!Template.getAsDependentTemplateName() &&
	         "No dependent template names here!");

	  const unsigned NumArgs = Args.size();
	  SmallVector<TemplateArgument, 4> PlainArgs;
	  PlainArgs.reserve(NumArgs);
	  for (unsigned Idx = 0; Idx != NumArgs; ++Idx)
	    PlainArgs.push_back(Args[Idx].getArgument());

	  return getTemplateSpecializationType(Template, PlainArgs, Underlying);
	}

	#ifndef NDEBUG
	/// Debug-only predicate: returns true if at least one argument in \p Args
	/// is a pack expansion, and false otherwise (including for an empty list).
	static bool hasAnyPackExpansions(ArrayRef<TemplateArgument> Args) {
	  for (const TemplateArgument &Arg : Args)
	    if (Arg.isPackExpansion())
	      return true;

	  // No pack expansion was found. Returning true here would make the
	  // predicate constant and any assertion built on it vacuous.
	  return false;
	}
	#endif

	/// Build a (non-canonical, non-uniqued) template specialization type for
	/// \p Template with arguments \p Args.
	///
	/// If \p Underlying is non-null its canonical form becomes the canonical
	/// type of the result; otherwise the canonical template specialization type
	/// is used. For an alias template with a non-null \p Underlying, the
	/// aliased type is stored in trailing storage after the arguments.
	/// \p Template must not be a dependent template name (asserted).
	QualType
	ASTContext::getTemplateSpecializationType(TemplateName Template,
	                                          ArrayRef<TemplateArgument> Args,
	                                          QualType Underlying) const {
	  assert(!Template.getAsDependentTemplateName() &&
	         "No dependent template names here!");
	  // Look through qualified template names.
	  if (QualifiedTemplateName *QTN = Template.getAsQualifiedTemplateName())
	    Template = TemplateName(QTN->getTemplateDecl());

	  bool IsTypeAlias =
	    Template.getAsTemplateDecl() &&
	    isa<TypeAliasTemplateDecl>(Template.getAsTemplateDecl());
	  QualType CanonType;
	  if (!Underlying.isNull())
	    CanonType = getCanonicalType(Underlying);
	  else {
	    // We can get here with an alias template when the specialization contains
	    // a pack expansion that does not match up with a parameter pack.
	    assert((!IsTypeAlias || hasAnyPackExpansions(Args)) &&
	           "Caller must compute aliased type");
	    IsTypeAlias = false;
	    CanonType = getCanonicalTemplateSpecializationType(Template, Args);
	  }

	  // Allocate the (non-canonical) template specialization type, but don't
	  // try to unique it: these types typically have location information that
	  // we don't unique and don't want to lose.
	  void *Mem = Allocate(sizeof(TemplateSpecializationType) +
	                       sizeof(TemplateArgument) * Args.size() +
	                       (IsTypeAlias? sizeof(QualType) : 0),
	                       TypeAlignment);
	  TemplateSpecializationType *Spec
	    = new (Mem) TemplateSpecializationType(Template, Args, CanonType,
	                                         IsTypeAlias ? Underlying : QualType());

	  Types.push_back(Spec);
	  return QualType(Spec, 0);
	}

	/// Return the uniqued canonical template specialization type for
	/// \p Template and \p Args. The template name and every argument are
	/// canonicalized first. The result is asserted to be a dependent type.
	/// \p Template must not be a dependent template name (asserted).
	QualType ASTContext::getCanonicalTemplateSpecializationType(
	    TemplateName Template, ArrayRef<TemplateArgument> Args) const {
	  assert(!Template.getAsDependentTemplateName() &&
	         "No dependent template names here!");

	  // Look through qualified template names.
	  if (QualifiedTemplateName *Qualified = Template.getAsQualifiedTemplateName())
	    Template = TemplateName(Qualified->getTemplateDecl());

	  // Canonicalize the template name, then each argument in order.
	  TemplateName CanonTemplate = getCanonicalTemplateName(Template);
	  const unsigned NumArgs = Args.size();
	  SmallVector<TemplateArgument, 4> CanonArgs;
	  CanonArgs.reserve(NumArgs);
	  for (unsigned Idx = 0; Idx != NumArgs; ++Idx)
	    CanonArgs.push_back(getCanonicalTemplateArgument(Args[Idx]));

	  // See whether this canonical specialization already exists.
	  llvm::FoldingSetNodeID ID;
	  TemplateSpecializationType::Profile(ID, CanonTemplate,
	                                      CanonArgs, *this);

	  void *InsertPos = nullptr;
	  TemplateSpecializationType *Result =
	      TemplateSpecializationTypes.FindNodeOrInsertPos(ID, InsertPos);

	  if (Result == nullptr) {
	    // Not found: allocate the node with room for its trailing arguments.
	    void *Storage = Allocate((sizeof(TemplateSpecializationType) +
	                              sizeof(TemplateArgument) * NumArgs),
	                             TypeAlignment);
	    Result = new (Storage) TemplateSpecializationType(CanonTemplate,
	                                                      CanonArgs,
	                                                      QualType(), QualType());
	    Types.push_back(Result);
	    TemplateSpecializationTypes.InsertNode(Result, InsertPos);
	  }

	  assert(Result->isDependentType() &&
	         "Non-dependent template-id type must have a canonical type");
	  return QualType(Result, 0);
	}

	/// Return the uniqued ElaboratedType for (\p Keyword, \p NNS, \p NamedType).
	/// Its canonical type is \p NamedType itself when that is canonical, and
	/// the canonical form of \p NamedType otherwise.
	QualType
	ASTContext::getElaboratedType(ElaboratedTypeKeyword Keyword,
	                              NestedNameSpecifier *NNS,
	                              QualType NamedType) const {
	  llvm::FoldingSetNodeID ID;
	  ElaboratedType::Profile(ID, Keyword, NNS, NamedType);

	  void *InsertPos = nullptr;
	  if (ElaboratedType *Existing =
	          ElaboratedTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(Existing, 0);

	  QualType Canon = NamedType;
	  if (!NamedType.isCanonical()) {
	    Canon = getCanonicalType(NamedType);
	    // Look up the insert position again; the node must still be absent.
	    ElaboratedType *CheckT = ElaboratedTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!CheckT && "Elaborated canonical type broken");
	    (void)CheckT;
	  }

	  ElaboratedType *Created =
	      new (*this, TypeAlignment) ElaboratedType(Keyword, NNS, NamedType, Canon);
	  Types.push_back(Created);
	  ElaboratedTypes.InsertNode(Created, InsertPos);
	  return QualType(Created, 0);
	}

	/// Return the uniqued ParenType wrapping \p InnerType. Its canonical type
	/// is \p InnerType itself when that is canonical, and the canonical form of
	/// \p InnerType otherwise.
	QualType
	ASTContext::getParenType(QualType InnerType) const {
	  llvm::FoldingSetNodeID ID;
	  ParenType::Profile(ID, InnerType);

	  void *InsertPos = nullptr;
	  if (ParenType *Existing = ParenTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(Existing, 0);

	  QualType Canon = InnerType;
	  if (!InnerType.isCanonical()) {
	    Canon = getCanonicalType(InnerType);
	    // Look up the insert position again; the node must still be absent.
	    ParenType *CheckT = ParenTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!CheckT && "Paren canonical type broken");
	    (void)CheckT;
	  }

	  ParenType *Created = new (*this, TypeAlignment) ParenType(InnerType, Canon);
	  Types.push_back(Created);
	  ParenTypes.InsertNode(Created, InsertPos);
	  return QualType(Created, 0);
	}

	/// Return the uniqued DependentNameType for (\p Keyword, \p NNS, \p Name).
	///
	/// If \p Canon is null and the canonical nested-name-specifier differs from
	/// \p NNS, the canonical type is the dependent name type built over the
	/// canonical specifier; otherwise \p Canon is used as given.
	QualType ASTContext::getDependentNameType(ElaboratedTypeKeyword Keyword,
	                                          NestedNameSpecifier *NNS,
	                                          const IdentifierInfo *Name,
	                                          QualType Canon) const {
	  // Resolve the canonical type first, before the folding-set lookup below.
	  if (Canon.isNull()) {
	    NestedNameSpecifier *CanonQualifier = getCanonicalNestedNameSpecifier(NNS);
	    if (CanonQualifier != NNS)
	      Canon = getDependentNameType(Keyword, CanonQualifier, Name);
	  }

	  llvm::FoldingSetNodeID ID;
	  DependentNameType::Profile(ID, Keyword, NNS, Name);

	  void *InsertPos = nullptr;
	  if (DependentNameType *Existing =
	          DependentNameTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(Existing, 0);

	  DependentNameType *Created =
	      new (*this, TypeAlignment) DependentNameType(Keyword, NNS, Name, Canon);
	  Types.push_back(Created);
	  DependentNameTypes.InsertNode(Created, InsertPos);
	  return QualType(Created, 0);
	}

	/// Overload taking arguments with location info: copies the bare template
	/// arguments out of \p Args and forwards to the ArrayRef overload.
	QualType
	ASTContext::getDependentTemplateSpecializationType(
	                                 ElaboratedTypeKeyword Keyword,
	                                 NestedNameSpecifier *NNS,
	                                 const IdentifierInfo *Name,
	                                 const TemplateArgumentListInfo &Args) const {
	  // TODO: avoid this copy
	  SmallVector<TemplateArgument, 16> PlainArgs;
	  for (const TemplateArgumentLoc &ArgLoc : Args.arguments())
	    PlainArgs.push_back(ArgLoc.getArgument());

	  return getDependentTemplateSpecializationType(Keyword, NNS, Name, PlainArgs);
	}

	/// Return the uniqued DependentTemplateSpecializationType for
	/// (\p Keyword, \p NNS, \p Name, \p Args).
	///
	/// \p NNS must be null or dependent (asserted). The canonical form uses the
	/// canonical nested-name-specifier, canonical arguments, and ETK_Typename
	/// in place of ETK_None. When any of those differ from the inputs, the
	/// canonical node is built first and used as this node's canonical type.
	QualType
	ASTContext::getDependentTemplateSpecializationType(
	                                 ElaboratedTypeKeyword Keyword,
	                                 NestedNameSpecifier *NNS,
	                                 const IdentifierInfo *Name,
	                                 ArrayRef<TemplateArgument> Args) const {
	  assert((!NNS || NNS->isDependent()) &&
	         "nested-name-specifier must be dependent");

	  llvm::FoldingSetNodeID ID;
	  DependentTemplateSpecializationType::Profile(ID, *this, Keyword, NNS,
	                                               Name, Args);

	  void *InsertPos = nullptr;
	  DependentTemplateSpecializationType *T
	    = DependentTemplateSpecializationTypes.FindNodeOrInsertPos(ID, InsertPos);
	  if (T)
	    return QualType(T, 0);

	  NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS);

	  ElaboratedTypeKeyword CanonKeyword = Keyword;
	  if (Keyword == ETK_None) CanonKeyword = ETK_Typename;

	  // Canonicalize every argument and note whether any of them changed.
	  bool AnyNonCanonArgs = false;
	  unsigned NumArgs = Args.size();
	  SmallVector<TemplateArgument, 16> CanonArgs(NumArgs);
	  for (unsigned I = 0; I != NumArgs; ++I) {
	    CanonArgs[I] = getCanonicalTemplateArgument(Args[I]);
	    if (!CanonArgs[I].structurallyEquals(Args[I]))
	      AnyNonCanonArgs = true;
	  }

	  QualType Canon;
	  if (AnyNonCanonArgs || CanonNNS != NNS || CanonKeyword != Keyword) {
	    Canon = getDependentTemplateSpecializationType(CanonKeyword, CanonNNS,
	                                                   Name,
	                                                   CanonArgs);

	    // Find the insert position again.
	    DependentTemplateSpecializationTypes.FindNodeOrInsertPos(ID, InsertPos);
	  }

	  // The node is followed by trailing storage for its template arguments.
	  void *Mem = Allocate((sizeof(DependentTemplateSpecializationType) +
	                        sizeof(TemplateArgument) * NumArgs),
	                       TypeAlignment);
	  T = new (Mem) DependentTemplateSpecializationType(Keyword, NNS,
	                                                    Name, Args, Canon);
	  Types.push_back(T);
	  DependentTemplateSpecializationTypes.InsertNode(T, InsertPos);
	  return QualType(T, 0);
	}

	/// Build the template argument that refers back to the template parameter
	/// \p Param itself.
	///
	/// - Type parameter: the parameter's type, wrapped in a pack expansion type
	///   if the parameter is a pack.
	/// - Non-type parameter: a DeclRefExpr to the parameter, wrapped in a
	///   PackExpansionExpr if the parameter is a pack.
	/// - Template template parameter: a template-name argument, built with an
	///   empty expansion count if the parameter is a pack.
	///
	/// If \p Param is a template parameter pack, the argument is finally
	/// wrapped in a pack via TemplateArgument::CreatePackCopy.
	TemplateArgument ASTContext::getInjectedTemplateArg(NamedDecl *Param) {
	  TemplateArgument Arg;
	  if (auto *TTP = dyn_cast<TemplateTypeParmDecl>(Param)) {
	    QualType ArgType = getTypeDeclType(TTP);
	    if (TTP->isParameterPack())
	      ArgType = getPackExpansionType(ArgType, None);

	    Arg = TemplateArgument(ArgType);
	  } else if (auto *NTTP = dyn_cast<NonTypeTemplateParmDecl>(Param)) {
	    // Placement-new takes the context by reference and yields a pointer.
	    Expr *E = new (*this) DeclRefExpr(
	        NTTP, /*enclosing*/false,
	        NTTP->getType().getNonLValueExprType(*this),
	        Expr::getValueKindForType(NTTP->getType()), NTTP->getLocation());

	    if (NTTP->isParameterPack())
	      E = new (*this) PackExpansionExpr(DependentTy, E, NTTP->getLocation(),
	                                        None);
	    Arg = TemplateArgument(E);
	  } else {
	    auto *TTP = cast<TemplateTemplateParmDecl>(Param);
	    if (TTP->isParameterPack())
	      Arg = TemplateArgument(TemplateName(TTP), Optional<unsigned>());
	    else
	      Arg = TemplateArgument(TemplateName(TTP));
	  }

	  if (Param->isTemplateParameterPack())
	    Arg = TemplateArgument::CreatePackCopy(*this, Arg);

	  return Arg;
	}

	void
	ASTContext::getInjectedTemplateArgs(const TemplateParameterList *Params,
	SmallVectorImpl<TemplateArgument> &Args) {
	Args.reserve(Args.size() + Params->size());

	for (NamedDecl Param : Params)
	Args.push_back(getInjectedTemplateArg(Param));
	}

	/// Return the uniqued PackExpansionType for \p Pattern with the optional
	/// expansion count \p NumExpansions. \p Pattern must contain an unexpanded
	/// parameter pack (asserted).
	///
	/// For a non-canonical pattern, the canonical type is a pack expansion of
	/// the canonical pattern if that still contains an unexpanded pack, and the
	/// canonical pattern itself otherwise.
	QualType ASTContext::getPackExpansionType(QualType Pattern,
	                                          Optional<unsigned> NumExpansions) {
	  llvm::FoldingSetNodeID ID;
	  PackExpansionType::Profile(ID, Pattern, NumExpansions);

	  assert(Pattern->containsUnexpandedParameterPack() &&
	         "Pack expansions must expand one or more parameter packs");

	  void *InsertPos = nullptr;
	  if (PackExpansionType *Existing =
	          PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(Existing, 0);

	  QualType Canon;
	  if (!Pattern.isCanonical()) {
	    Canon = getCanonicalType(Pattern);
	    // The canonical type might not contain an unexpanded parameter pack, if it
	    // contains an alias template specialization which ignores one of its
	    // parameters.
	    const bool CanonStillHasPack = Canon->containsUnexpandedParameterPack();
	    if (CanonStillHasPack) {
	      Canon = getPackExpansionType(Canon, NumExpansions);

	      // Find the insert position again, in case we inserted an element into
	      // PackExpansionTypes and invalidated our insert position.
	      PackExpansionTypes.FindNodeOrInsertPos(ID, InsertPos);
	    }
	  }

	  PackExpansionType *Created = new (*this, TypeAlignment)
	      PackExpansionType(Pattern, Canon, NumExpansions);
	  Types.push_back(Created);
	  PackExpansionTypes.InsertNode(Created, InsertPos);
	  return QualType(Created, 0);
	}

	/// CmpProtocolNames - Comparison predicate for sorting protocols
	/// alphabetically.
	static int CmpProtocolNames(ObjCProtocolDecl const LHS,
	ObjCProtocolDecl const RHS) {
	return DeclarationName::compare((LHS)->getDeclName(), (RHS)->getDeclName());
	}

	/// Return true if \p Protocols is strictly increasing under
	/// CmpProtocolNames (so it has no adjacent duplicates) and every entry is
	/// its own canonical declaration. An empty list qualifies.
	static bool areSortedAndUniqued(ArrayRef<ObjCProtocolDecl *> Protocols) {
	  if (Protocols.empty()) return true;

	  if (Protocols[0]->getCanonicalDecl() != Protocols[0])
	    return false;

	  for (unsigned i = 1; i != Protocols.size(); ++i)
	    if (CmpProtocolNames(&Protocols[i - 1], &Protocols[i]) >= 0 ||
	        Protocols[i]->getCanonicalDecl() != Protocols[i])
	      return false;
	  return true;
	}

	/// Sort \p Protocols in place by name, replace each entry with its canonical
	/// declaration, and drop adjacent duplicate entries.
	static void
	SortAndUniqueProtocols(SmallVectorImpl<ObjCProtocolDecl *> &Protocols) {
	  // Order by declaration name.
	  llvm::array_pod_sort(Protocols.begin(), Protocols.end(), CmpProtocolNames);

	  // Replace every entry with its canonical declaration.
	  for (unsigned Idx = 0, Count = Protocols.size(); Idx != Count; ++Idx)
	    Protocols[Idx] = Protocols[Idx]->getCanonicalDecl();

	  // Collapse runs of equal pointers and trim the tail.
	  auto FirstDuplicate = std::unique(Protocols.begin(), Protocols.end());
	  Protocols.erase(FirstDuplicate, Protocols.end());
	}

	/// Convenience overload: build an ObjCObjectType over \p BaseType qualified
	/// by the \p NumProtocols protocols starting at \p Protocols, with no type
	/// arguments and without the kind-of flag.
	QualType ASTContext::getObjCObjectType(QualType BaseType,
	                                       ObjCProtocolDecl * const *Protocols,
	                                       unsigned NumProtocols) const {
	  return getObjCObjectType(BaseType, {},
	                           llvm::makeArrayRef(Protocols, NumProtocols),
	                           /*isKindOf=*/false);
	}

	/// Return the uniqued ObjCObjectType for \p baseType with the given type
	/// arguments, protocol qualifiers and kind-of flag.
	///
	/// If \p baseType is an ObjCInterfaceType and there is nothing to add, it
	/// is returned unchanged. Otherwise the canonical type is built from the
	/// canonical base type, canonical type arguments and a sorted-and-uniqued
	/// protocol list whenever any of those differ from the inputs.
	QualType ASTContext::getObjCObjectType(
	           QualType baseType,
	           ArrayRef<QualType> typeArgs,
	           ArrayRef<ObjCProtocolDecl *> protocols,
	           bool isKindOf) const {
	  // If the base type is an interface and there aren't any protocols or
	  // type arguments to add, then the interface type will do just fine.
	  if (typeArgs.empty() && protocols.empty() && !isKindOf &&
	      isa<ObjCInterfaceType>(baseType))
	    return baseType;

	  // Look in the folding set for an existing type.
	  llvm::FoldingSetNodeID ID;
	  ObjCObjectTypeImpl::Profile(ID, baseType, typeArgs, protocols, isKindOf);
	  void *InsertPos = nullptr;
	  if (ObjCObjectType *QT = ObjCObjectTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(QT, 0);

	  // Determine the type arguments to be used for canonicalization,
	  // which may be explicitly specified here or written on the base
	  // type.
	  ArrayRef<QualType> effectiveTypeArgs = typeArgs;
	  if (effectiveTypeArgs.empty()) {
	    if (auto baseObject = baseType->getAs<ObjCObjectType>())
	      effectiveTypeArgs = baseObject->getTypeArgs();
	  }

	  // Build the canonical type, which has the canonical base type and a
	  // sorted-and-uniqued list of protocols and the type arguments
	  // canonicalized.
	  QualType canonical;
	  bool typeArgsAreCanonical = std::all_of(effectiveTypeArgs.begin(),
	                                          effectiveTypeArgs.end(),
	                                          [&](QualType type) {
	                                            return type.isCanonical();
	                                          });
	  bool protocolsSorted = areSortedAndUniqued(protocols);
	  if (!typeArgsAreCanonical || !protocolsSorted || !baseType.isCanonical()) {
	    // Determine the canonical type arguments.
	    ArrayRef<QualType> canonTypeArgs;
	    SmallVector<QualType, 4> canonTypeArgsVec;
	    if (!typeArgsAreCanonical) {
	      canonTypeArgsVec.reserve(effectiveTypeArgs.size());
	      for (auto typeArg : effectiveTypeArgs)
	        canonTypeArgsVec.push_back(getCanonicalType(typeArg));
	      canonTypeArgs = canonTypeArgsVec;
	    } else {
	      canonTypeArgs = effectiveTypeArgs;
	    }

	    // Determine the canonical protocol list.
	    ArrayRef<ObjCProtocolDecl *> canonProtocols;
	    SmallVector<ObjCProtocolDecl*, 8> canonProtocolsVec;
	    if (!protocolsSorted) {
	      canonProtocolsVec.append(protocols.begin(), protocols.end());
	      SortAndUniqueProtocols(canonProtocolsVec);
	      canonProtocols = canonProtocolsVec;
	    } else {
	      canonProtocols = protocols;
	    }

	    canonical = getObjCObjectType(getCanonicalType(baseType), canonTypeArgs,
	                                  canonProtocols, isKindOf);

	    // Regenerate InsertPos.
	    ObjCObjectTypes.FindNodeOrInsertPos(ID, InsertPos);
	  }

	  // The node is followed by trailing storage for the type arguments as
	  // written and the protocol pointers.
	  unsigned size = sizeof(ObjCObjectTypeImpl);
	  size += typeArgs.size() * sizeof(QualType);
	  size += protocols.size() * sizeof(ObjCProtocolDecl *);
	  void *mem = Allocate(size, TypeAlignment);
	  ObjCObjectTypeImpl *T =
	    new (mem) ObjCObjectTypeImpl(canonical, baseType, typeArgs, protocols,
	                                 isKindOf);

	  Types.push_back(T);
	  ObjCObjectTypes.InsertNode(T, InsertPos);
	  return QualType(T, 0);
	}

	/// Apply Objective-C protocol qualifiers to the given type.
	/// If this is for the canonical type of a type parameter, we can apply
	/// protocol qualifiers on the ObjCObjectPointerType.
	///
	/// \p hasError is set to false on entry and to true only when \p type is
	/// none of the handled forms, in which case \p type is returned unchanged.
	QualType
	ASTContext::applyObjCProtocolQualifiers(QualType type,
	                  ArrayRef<ObjCProtocolDecl *> protocols, bool &hasError,
	                  bool allowOnPointerType) const {
	  hasError = false;

	  // Type parameter: rebuild the type-parameter type with these protocols.
	  if (const auto *typeParam = dyn_cast<ObjCTypeParamType>(type.getTypePtr()))
	    return getObjCTypeParamType(typeParam->getDecl(), protocols);

	  // Apply protocol qualifiers to ObjCObjectPointerType.
	  if (allowOnPointerType) {
	    if (const auto *pointerTy =
	            dyn_cast<ObjCObjectPointerType>(type.getTypePtr())) {
	      const ObjCObjectType *pointee = pointerTy->getObjectType();
	      // Merge protocol lists and construct ObjCObjectType.
	      SmallVector<ObjCProtocolDecl*, 8> mergedVec;
	      mergedVec.append(pointee->qual_begin(), pointee->qual_end());
	      mergedVec.append(protocols.begin(), protocols.end());
	      ArrayRef<ObjCProtocolDecl *> merged = mergedVec;
	      type = getObjCObjectType(pointee->getBaseType(),
	                               pointee->getTypeArgsAsWritten(),
	                               merged,
	                               pointee->isKindOfTypeAsWritten());
	      return getObjCObjectPointerType(type);
	    }
	  }

	  // Apply protocol qualifiers to ObjCObjectType.
	  if (const auto *objectTy = dyn_cast<ObjCObjectType>(type.getTypePtr())) {
	    // FIXME: Check for protocols to which the class type is already
	    // known to conform.
	    return getObjCObjectType(objectTy->getBaseType(),
	                             objectTy->getTypeArgsAsWritten(),
	                             protocols,
	                             objectTy->isKindOfTypeAsWritten());
	  }

	  // If the canonical type is ObjCObjectType, ...
	  if (type->isObjCObjectType()) {
	    // Silently overwrite any existing protocol qualifiers.
	    // TODO: determine whether that's the right thing to do.

	    // FIXME: Check for protocols to which the class type is already
	    // known to conform.
	    return getObjCObjectType(type, {}, protocols, false);
	  }

	  // id<protocol-list>
	  if (type->isObjCIdType()) {
	    const auto *idPointer = type->castAs<ObjCObjectPointerType>();
	    type = getObjCObjectType(ObjCBuiltinIdTy, {}, protocols,
	                             idPointer->isKindOfType());
	    return getObjCObjectPointerType(type);
	  }

	  // Class<protocol-list>
	  if (type->isObjCClassType()) {
	    const auto *classPointer = type->castAs<ObjCObjectPointerType>();
	    type = getObjCObjectType(ObjCBuiltinClassTy, {}, protocols,
	                             classPointer->isKindOfType());
	    return getObjCObjectPointerType(type);
	  }

	  // Nothing matched: report the failure and hand the type back untouched.
	  hasError = true;
	  return type;
	}

	/// Return the uniqued ObjCTypeParamType for \p Decl qualified by
	/// \p protocols.
	///
	/// If \p Canonical is null, the canonical type is the canonical form of the
	/// declaration's underlying type, with the protocol qualifiers applied to
	/// it when \p protocols is non-empty.
	QualType
	ASTContext::getObjCTypeParamType(const ObjCTypeParamDecl *Decl,
	                                 ArrayRef<ObjCProtocolDecl *> protocols,
	                                 QualType Canonical) const {
	  // Look in the folding set for an existing type.
	  llvm::FoldingSetNodeID ID;
	  ObjCTypeParamType::Profile(ID, Decl, protocols);
	  void *InsertPos = nullptr;
	  if (ObjCTypeParamType *TypeParam =
	      ObjCTypeParamTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(TypeParam, 0);

	  if (Canonical.isNull()) {
	    // We canonicalize to the underlying type.
	    Canonical = getCanonicalType(Decl->getUnderlyingType());
	    if (!protocols.empty()) {
	      // Apply the protocol qualifiers.
	      bool hasError;
	      Canonical = applyObjCProtocolQualifiers(Canonical, protocols, hasError,
	                                              true/*allowOnPointerType*/);
	      assert(!hasError && "Error when apply protocol qualifier to bound type");
	    }
	  }

	  // The node is followed by trailing storage for the protocol pointers.
	  unsigned size = sizeof(ObjCTypeParamType);
	  size += protocols.size() * sizeof(ObjCProtocolDecl *);
	  void *mem = Allocate(size, TypeAlignment);
	  ObjCTypeParamType *newType = new (mem)
	      ObjCTypeParamType(Decl, Canonical, protocols);

	  Types.push_back(newType);
	  ObjCTypeParamTypes.InsertNode(newType, InsertPos);
	  return QualType(newType, 0);
	}

	/// ObjCObjectAdoptsQTypeProtocols - Checks that protocols in IC's
	/// protocol list adopt all protocols in QT's qualified-id protocol
	/// list.
	///
	/// Returns false when \p QT is not a qualified-id type or is not an
	/// ObjCObjectPointerType.
	bool ASTContext::ObjCObjectAdoptsQTypeProtocols(QualType QT,
	                                                ObjCInterfaceDecl *IC) {
	  if (!QT->isObjCQualifiedIdType())
	    return false;

	  const ObjCObjectPointerType *Pointer = QT->getAs<ObjCObjectPointerType>();
	  if (!Pointer)
	    return false;

	  // Every qualifier on QT must be implemented by IC.
	  for (auto *Qualifier : Pointer->quals())
	    if (!IC->ClassImplementsProtocol(Qualifier, false))
	      return false;

	  return true;
	}

	/// QIdProtocolsAdoptObjCObjectProtocols - Checks that protocols in
	/// QT's qualified-id protocol list adopt all protocols in IDecl's list
	/// of protocols.
	///
	/// Returns false when \p QT is not a qualified-id object pointer type, when
	/// \p IDecl has no definition, or when \p IDecl inherits no protocols.
	bool ASTContext::QIdProtocolsAdoptObjCObjectProtocols(QualType QT,
	                                                ObjCInterfaceDecl *IDecl) {
	  if (!QT->isObjCQualifiedIdType())
	    return false;

	  const ObjCObjectPointerType *Pointer = QT->getAs<ObjCObjectPointerType>();
	  if (!Pointer)
	    return false;

	  if (!IDecl->hasDefinition())
	    return false;

	  llvm::SmallPtrSet<ObjCProtocolDecl *, 8> Inherited;
	  CollectInheritedProtocols(IDecl, Inherited);
	  if (Inherited.empty())
	    return false;

	  // Check that if every protocol in list of id<plist> conforms to a protocol
	  // of IDecl's, then bridge casting is ok.
	  bool EveryQualifierConforms = false;
	  for (auto *Qualifier : Pointer->quals()) {
	    EveryQualifierConforms = false;
	    for (auto *Candidate : Inherited) {
	      if (ProtocolCompatibleWithProtocol(Qualifier, Candidate)) {
	        EveryQualifierConforms = true;
	        break;
	      }
	    }
	    // Stop at the first qualifier that matches nothing.
	    if (!EveryQualifierConforms)
	      break;
	  }
	  if (EveryQualifierConforms)
	    return true;

	  // Otherwise every inherited protocol must be compatible with at least one
	  // qualifier.
	  for (auto *Candidate : Inherited) {
	    bool Adopted = false;
	    for (auto *Qualifier : Pointer->quals()) {
	      // return 'true' if 'Candidate' is in the inheritance hierarchy of
	      // Qualifier
	      Adopted = ProtocolCompatibleWithProtocol(Candidate, Qualifier);
	      if (Adopted)
	        break;
	    }
	    if (!Adopted)
	      return false;
	  }
	  return true;
	}

	/// getObjCObjectPointerType - Return a ObjCObjectPointerType type for
	/// the given object type.
	///
	/// Nodes are uniqued on \p ObjectT. For a non-canonical \p ObjectT, the
	/// canonical type is the pointer to the canonical object type.
	QualType ASTContext::getObjCObjectPointerType(QualType ObjectT) const {
	  llvm::FoldingSetNodeID ID;
	  ObjCObjectPointerType::Profile(ID, ObjectT);

	  void *InsertPos = nullptr;
	  ObjCObjectPointerType *Existing =
	      ObjCObjectPointerTypes.FindNodeOrInsertPos(ID, InsertPos);
	  if (Existing)
	    return QualType(Existing, 0);

	  // Find the canonical object type.
	  QualType Canonical;
	  if (!ObjectT.isCanonical()) {
	    Canonical = getObjCObjectPointerType(getCanonicalType(ObjectT));

	    // Regenerate InsertPos.
	    ObjCObjectPointerTypes.FindNodeOrInsertPos(ID, InsertPos);
	  }

	  // No match: allocate and register a new node.
	  void *Storage = Allocate(sizeof(ObjCObjectPointerType), TypeAlignment);
	  ObjCObjectPointerType *Created =
	      new (Storage) ObjCObjectPointerType(Canonical, ObjectT);

	  Types.push_back(Created);
	  ObjCObjectPointerTypes.InsertNode(Created, InsertPos);
	  return QualType(Created, 0);
	}

	/// getObjCInterfaceType - Return the unique reference to the type for the
	/// specified ObjC interface decl. The list of protocols is optional.
	///
	/// A cached TypeForDecl wins. Otherwise, if \p PrevDecl is given, its type
	/// (which must exist) is shared. Otherwise a new ObjCInterfaceType is built
	/// for the definition when there is one, or for \p Decl itself, and cached
	/// on that declaration.
	QualType ASTContext::getObjCInterfaceType(const ObjCInterfaceDecl *Decl,
	                                          ObjCInterfaceDecl *PrevDecl) const {
	  if (Decl->TypeForDecl)
	    return QualType(Decl->TypeForDecl, 0);

	  if (PrevDecl) {
	    assert(PrevDecl->TypeForDecl && "previous decl has no TypeForDecl");
	    Decl->TypeForDecl = PrevDecl->TypeForDecl;
	    return QualType(PrevDecl->TypeForDecl, 0);
	  }

	  // Prefer the definition, if there is one.
	  const ObjCInterfaceDecl *Definition = Decl->getDefinition();
	  if (Definition)
	    Decl = Definition;

	  void *Storage = Allocate(sizeof(ObjCInterfaceType), TypeAlignment);
	  ObjCInterfaceType *InterfaceTy = new (Storage) ObjCInterfaceType(Decl);
	  Decl->TypeForDecl = InterfaceTy;
	  Types.push_back(InterfaceTy);
	  return QualType(InterfaceTy, 0);
	}

	/// getTypeOfExprType - Unlike many "get<Type>" functions, we can't unique
	/// TypeOfExprType AST's (since expressions are never shared). For example,
	/// multiple declarations that refer to "typeof(x)" all contain different
	/// DeclRefExpr's. This doesn't affect the type checker, since it operates
	/// on canonical types (which are always unique).
	///
	/// For a type-dependent expression, a canonical DependentTypeOfExprType is
	/// looked up (or created) in DependentTypeOfExprTypes; otherwise the canonical
	/// type is the canonical type of the expression's type.
	///
	/// \param tofExpr the operand of the typeof expression.
	/// \returns a QualType wrapping the newly created (or canonical) node.
	QualType ASTContext::getTypeOfExprType(Expr *tofExpr) const {
	  TypeOfExprType *toe;
	  if (tofExpr->isTypeDependent()) {
	    llvm::FoldingSetNodeID ID;
	    DependentTypeOfExprType::Profile(ID, *this, tofExpr);

	    void *InsertPos = nullptr;
	    DependentTypeOfExprType *Canon
	      = DependentTypeOfExprTypes.FindNodeOrInsertPos(ID, InsertPos);
	    if (Canon) {
	      // We already have a "canonical" version of an identical, dependent
	      // typeof(expr) type. Use that as our canonical type.
	      toe = new (*this, TypeAlignment) TypeOfExprType(tofExpr,
	                                          QualType((TypeOfExprType*)Canon, 0));
	    } else {
	      // Build a new, canonical typeof(expr) type.
	      Canon
	        = new (*this, TypeAlignment) DependentTypeOfExprType(*this, tofExpr);
	      DependentTypeOfExprTypes.InsertNode(Canon, InsertPos);
	      toe = Canon;
	    }
	  } else {
	    QualType Canonical = getCanonicalType(tofExpr->getType());
	    toe = new (*this, TypeAlignment) TypeOfExprType(tofExpr, Canonical);
	  }
	  Types.push_back(toe);
	  return QualType(toe, 0);
	}

	/// getTypeOfType - Unlike many "get<Type>" functions, we don't unique
	/// TypeOfType nodes. The only motivation to unique these nodes would be
	/// memory savings. Since typeof(t) is fairly uncommon, space shouldn't be
	/// an issue. This doesn't affect the type checker, since it operates
	/// on canonical types (which are always unique).
	///
	/// \param tofType the operand type of the typeof.
	/// \returns a QualType wrapping a freshly allocated TypeOfType node whose
	/// canonical type is the canonical type of \p tofType.
	QualType ASTContext::getTypeOfType(QualType tofType) const {
	  QualType Canonical = getCanonicalType(tofType);
	  TypeOfType *tot = new (*this, TypeAlignment) TypeOfType(tofType, Canonical);
	  Types.push_back(tot);
	  return QualType(tot, 0);
	}

	/// \brief Unlike many "get<Type>" functions, we don't unique DecltypeType
	/// nodes. This would never be helpful, since each such type has its own
	/// expression, and would not give a significant memory saving, since there
	/// is an Expr tree under each such type.
	///
	/// \param e the operand expression of the decltype.
	/// \param UnderlyingType the type the decltype denotes.
	/// \returns a QualType wrapping a newly allocated DecltypeType node.
	QualType ASTContext::getDecltypeType(Expr *e, QualType UnderlyingType) const {
	  DecltypeType *dt;

	  // C++11 [temp.type]p2:
	  //   If an expression e involves a template parameter, decltype(e) denotes a
	  //   unique dependent type. Two such decltype-specifiers refer to the same
	  //   type only if their expressions are equivalent (14.5.6.1).
	  if (e->isInstantiationDependent()) {
	    llvm::FoldingSetNodeID ID;
	    DependentDecltypeType::Profile(ID, *this, e);

	    void *InsertPos = nullptr;
	    DependentDecltypeType *Canon
	      = DependentDecltypeTypes.FindNodeOrInsertPos(ID, InsertPos);
	    if (!Canon) {
	      // Build a new, canonical decltype(expr) type.
	      Canon = new (*this, TypeAlignment) DependentDecltypeType(*this, e);
	      DependentDecltypeTypes.InsertNode(Canon, InsertPos);
	    }
	    // The sugared node uses the uniqued dependent node as its canonical type.
	    dt = new (*this, TypeAlignment)
	        DecltypeType(e, UnderlyingType, QualType((DecltypeType *)Canon, 0));
	  } else {
	    dt = new (*this, TypeAlignment)
	        DecltypeType(e, UnderlyingType, getCanonicalType(UnderlyingType));
	  }
	  Types.push_back(dt);
	  return QualType(dt, 0);
	}

	/// getUnaryTransformType - Build a new UnaryTransformType node for the given
	/// base type, underlying type and transform kind. The resulting node itself is
	/// not uniqued; for a dependent base type its canonical type is a
	/// DependentUnaryTransformType that is looked up (or created) in a folding set.
	QualType ASTContext::getUnaryTransformType(QualType BaseType,
	                                           QualType UnderlyingType,
	                                           UnaryTransformType::UTTKind Kind)
	    const {
	  UnaryTransformType *Result = nullptr;

	  if (!BaseType->isDependentType()) {
	    // Non-dependent: canonicalize the underlying type directly.
	    QualType CanonUnderlying = getCanonicalType(UnderlyingType);
	    Result = new (*this, TypeAlignment)
	        UnaryTransformType(BaseType, UnderlyingType, Kind, CanonUnderlying);
	  } else {
	    // Look in the folding set for an existing canonical dependent node.
	    llvm::FoldingSetNodeID Key;
	    DependentUnaryTransformType::Profile(Key, getCanonicalType(BaseType), Kind);

	    void *Slot = nullptr;
	    DependentUnaryTransformType *CanonNode =
	        DependentUnaryTransformTypes.FindNodeOrInsertPos(Key, Slot);

	    if (!CanonNode) {
	      // Build a new, canonical __underlying_type(type) type.
	      CanonNode = new (*this, TypeAlignment)
	          DependentUnaryTransformType(*this, getCanonicalType(BaseType), Kind);
	      DependentUnaryTransformTypes.InsertNode(CanonNode, Slot);
	    }
	    Result = new (*this, TypeAlignment)
	        UnaryTransformType(BaseType, QualType(), Kind, QualType(CanonNode, 0));
	  }

	  Types.push_back(Result);
	  return QualType(Result, 0);
	}

	/// getAutoType - Return the uniqued reference to the 'auto' type which has been
	/// deduced to the given type, or to the canonical undeduced 'auto' type, or the
	/// canonical deduced-but-dependent 'auto' type.
	///
	/// \param DeducedType the deduced type, or a null QualType if undeduced.
	/// \param Keyword which auto-like keyword was written.
	/// \param IsDependent whether the type is dependent.
	/// \returns the existing node with the same profile, or a newly created one.
	QualType ASTContext::getAutoType(QualType DeducedType, AutoTypeKeyword Keyword,
	                                 bool IsDependent) const {
	  // The plain undeduced, non-dependent 'auto' is served by getAutoDeductType().
	  if (DeducedType.isNull() && Keyword == AutoTypeKeyword::Auto && !IsDependent)
	    return getAutoDeductType();

	  // Look in the folding set for an existing type.
	  void *InsertPos = nullptr;
	  llvm::FoldingSetNodeID ID;
	  AutoType::Profile(ID, DeducedType, Keyword, IsDependent);
	  if (AutoType *AT = AutoTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(AT, 0);

	  AutoType *AT = new (*this, TypeAlignment) AutoType(DeducedType,
	                                                     Keyword,
	                                                     IsDependent);
	  Types.push_back(AT);
	  if (InsertPos)
	    AutoTypes.InsertNode(AT, InsertPos);
	  return QualType(AT, 0);
	}

	/// Return the uniqued reference to the deduced template specialization type
	/// which has been deduced to the given type, or to the canonical undeduced
	/// such type, or the canonical deduced-but-dependent such type.
	///
	/// \param Template the template name being specialized.
	/// \param DeducedType the deduced type (may be a null QualType).
	/// \param IsDependent whether the type is dependent.
	/// \returns the existing node with the same profile, or a newly created one.
	QualType ASTContext::getDeducedTemplateSpecializationType(
	    TemplateName Template, QualType DeducedType, bool IsDependent) const {
	  // Look in the folding set for an existing type.
	  void *InsertPos = nullptr;
	  llvm::FoldingSetNodeID ID;
	  DeducedTemplateSpecializationType::Profile(ID, Template, DeducedType,
	                                             IsDependent);
	  if (DeducedTemplateSpecializationType *DTST =
	          DeducedTemplateSpecializationTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(DTST, 0);

	  DeducedTemplateSpecializationType *DTST = new (*this, TypeAlignment)
	      DeducedTemplateSpecializationType(Template, DeducedType, IsDependent);
	  Types.push_back(DTST);
	  if (InsertPos)
	    DeducedTemplateSpecializationTypes.InsertNode(DTST, InsertPos);
	  return QualType(DTST, 0);
	}

	/// getAtomicType - Return the uniqued reference to the atomic type for
	/// the given value type.
	///
	/// \param T the value type to wrap.
	/// \returns the existing AtomicType for \p T, or a newly created node whose
	/// canonical type is the atomic type of the canonical form of \p T.
	QualType ASTContext::getAtomicType(QualType T) const {
	  // Unique pointers, to guarantee there is only one pointer of a particular
	  // structure.
	  llvm::FoldingSetNodeID ID;
	  AtomicType::Profile(ID, T);

	  void *InsertPos = nullptr;
	  if (AtomicType *AT = AtomicTypes.FindNodeOrInsertPos(ID, InsertPos))
	    return QualType(AT, 0);

	  // If the atomic value type isn't canonical, this won't be a canonical type
	  // either, so fill in the canonical type field.
	  QualType Canonical;
	  if (!T.isCanonical()) {
	    Canonical = getAtomicType(getCanonicalType(T));

	    // Get the new insert position for the node we care about.
	    AtomicType *NewIP = AtomicTypes.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!NewIP && "Shouldn't be in the map!"); (void)NewIP;
	  }
	  AtomicType *New = new (*this, TypeAlignment) AtomicType(T, Canonical);
	  Types.push_back(New);
	  AtomicTypes.InsertNode(New, InsertPos);
	  return QualType(New, 0);
	}

	/// getAutoDeductType - Get type pattern for deducing against 'auto'.
	///
	/// The pattern is created on first use and cached in AutoDeductTy; later
	/// calls return the cached value.
	QualType ASTContext::getAutoDeductType() const {
	  if (AutoDeductTy.isNull())
	    AutoDeductTy = QualType(
	      new (*this, TypeAlignment) AutoType(QualType(), AutoTypeKeyword::Auto,
	                                          /*dependent*/false),
	      0);
	  return AutoDeductTy;
	}

	/// getAutoRRefDeductType - Get the type pattern used for deducing against
	/// 'auto &&'. The pattern is built on first use and cached in
	/// AutoRRefDeductTy.
	QualType ASTContext::getAutoRRefDeductType() const {
	  if (AutoRRefDeductTy.isNull()) {
	    // An rvalue reference to the plain 'auto' deduction pattern.
	    QualType AutoPattern = getAutoDeductType();
	    AutoRRefDeductTy = getRValueReferenceType(AutoPattern);
	  }
	  assert(!AutoRRefDeductTy.isNull() && "can't build 'auto &&' pattern");
	  return AutoRRefDeductTy;
	}

	/// getTagDeclType - Return the type for the specified TagDecl
	/// (struct/union/class/enum) declaration by forwarding to getTypeDeclType().
	QualType ASTContext::getTagDeclType(const TagDecl *Decl) const {
	  assert(Decl);
	  // FIXME: What is the design on getTagDeclType when it requires casting
	  // away const? mutable?
	  TagDecl *MutableDecl = const_cast<TagDecl*>(Decl);
	  return getTypeDeclType(MutableDecl);
	}

	/// getSizeType - Return the type for "size_t" (C99 7.17), the result type of
	/// the sizeof operator (C99 6.5.3.4p4). The choice is taken from the target
	/// and needs to agree with the definition in <stddef.h>.
	CanQualType ASTContext::getSizeType() const {
	  CanQualType SizeTy = getFromTargetType(Target->getSizeType());
	  return SizeTy;
	}

	/// Return the signed counterpart of the integer type corresponding to
	/// size_t, as reported by the target.
	CanQualType ASTContext::getSignedSizeType() const {
	  CanQualType SignedSizeTy = getFromTargetType(Target->getSignedSizeType());
	  return SignedSizeTy;
	}

	/// getIntMaxType - Return the type for "intmax_t" (C99 7.18.1.5), as reported
	/// by the target.
	CanQualType ASTContext::getIntMaxType() const {
	  CanQualType IntMaxTy = getFromTargetType(Target->getIntMaxType());
	  return IntMaxTy;
	}

	/// getUIntMaxType - Return the type for "uintmax_t" (C99 7.18.1.5), as
	/// reported by the target.
	CanQualType ASTContext::getUIntMaxType() const {
	  CanQualType UIntMaxTy = getFromTargetType(Target->getUIntMaxType());
	  return UIntMaxTy;
	}

	/// getSignedWCharType - Return the type used for "signed wchar_t" (a GCC
	/// extension available in C++). Currently this is always WCharTy.
	QualType ASTContext::getSignedWCharType() const {
	  // FIXME: derive from "Target" ?
	  QualType SignedWChar = WCharTy;
	  return SignedWChar;
	}

	/// getUnsignedWCharType - Return the type used for "unsigned wchar_t" (a GCC
	/// extension available in C++). Currently this is always UnsignedIntTy.
	QualType ASTContext::getUnsignedWCharType() const {
	  // FIXME: derive from "Target" ?
	  QualType UnsignedWChar = UnsignedIntTy;
	  return UnsignedWChar;
	}

	/// Return the type the target reports as its intptr type.
	QualType ASTContext::getIntPtrType() const {
	  QualType IntPtrTy = getFromTargetType(Target->getIntPtrType());
	  return IntPtrTy;
	}

	/// Return the unsigned type corresponding to getIntPtrType().
	QualType ASTContext::getUIntPtrType() const {
	  QualType SignedTy = getIntPtrType();
	  return getCorrespondingUnsignedType(SignedTy);
	}

	/// getPointerDiffType - Return the type for "ptrdiff_t" (C99 7.17) defined in
	/// <stddef.h>; it is the result type of pointer - pointer (C99 6.5.6p9).
	QualType ASTContext::getPointerDiffType() const {
	  QualType PtrDiffTy = getFromTargetType(Target->getPtrDiffType(0));
	  return PtrDiffTy;
	}

	/// \brief Return the unsigned counterpart of the "ptrdiff_t" integer type.
	/// The standard (C11 7.21.6.1p7) refers to this type in the definition of
	/// the %tu format specifier.
	QualType ASTContext::getUnsignedPointerDiffType() const {
	  QualType UnsignedPtrDiffTy =
	      getFromTargetType(Target->getUnsignedPtrDiffType(0));
	  return UnsignedPtrDiffTy;
	}

	/// \brief Return the type for "pid_t" defined in <sys/types.h>, as reported
	/// by the target. This is needed to compute the correct type for vfork().
	QualType ASTContext::getProcessIDType() const {
	  QualType PidTy = getFromTargetType(Target->getProcessIDType());
	  return PidTy;
	}

	//===----------------------------------------------------------------------===//
	// Type Operators
	//===----------------------------------------------------------------------===//

	/// Return the canonical parameter type for \p T: the type is canonicalized,
	/// variable arrays are decayed, arrays and functions are turned into
	/// pointers, and any remaining top-level qualifiers are dropped.
	CanQualType ASTContext::getCanonicalParamType(QualType T) const {
	  // Push qualifiers into arrays, and then discard any remaining
	  // qualifiers.
	  QualType Canon = getVariableArrayDecayedType(getCanonicalType(T));
	  const Type *Unqualified = Canon.getTypePtr();

	  if (isa<ArrayType>(Unqualified))
	    return CanQualType::CreateUnsafe(
	        getArrayDecayedType(QualType(Unqualified, 0)));

	  if (isa<FunctionType>(Unqualified))
	    return CanQualType::CreateUnsafe(
	        getPointerType(QualType(Unqualified, 0)));

	  return CanQualType::CreateUnsafe(QualType(Unqualified, 0));
	}

	/// Strip qualifiers from \p type, looking through array types so that
	/// qualifiers on array element types are removed as well.
	///
	/// \param type the type to unqualify.
	/// \param quals receives the qualifiers that were removed.
	/// \returns the unqualified type; array types are rebuilt around the
	/// unqualified element type when the element type changed.
	QualType ASTContext::getUnqualifiedArrayType(QualType type,
	                                             Qualifiers &quals) {
	  SplitQualType Split = type.getSplitUnqualifiedType();

	  // FIXME: getSplitUnqualifiedType() actually walks all the way to
	  // the unqualified desugared type and then drops it on the floor.
	  // We then have to strip that sugar back off with
	  // getUnqualifiedDesugaredType(), which is silly.
	  const auto *Array =
	      dyn_cast<ArrayType>(Split.Ty->getUnqualifiedDesugaredType());

	  // Not an array: the split already holds the answer.
	  if (!Array) {
	    quals = Split.Quals;
	    return QualType(Split.Ty, 0);
	  }

	  // Recurse on the array's element type.
	  QualType EltTy = Array->getElementType();
	  QualType UnqualEltTy = getUnqualifiedArrayType(EltTy, quals);

	  // An unchanged element type means the array carries no qualifiers, so the
	  // split result can be used directly.
	  if (EltTy == UnqualEltTy) {
	    assert(quals.empty()); // from the recursive call
	    quals = Split.Quals;
	    return QualType(Split.Ty, 0);
	  }

	  // Otherwise, add in the qualifiers from the outermost type, then
	  // build the type back up.
	  quals.addConsistentQualifiers(Split.Quals);

	  if (const auto *Constant = dyn_cast<ConstantArrayType>(Array))
	    return getConstantArrayType(UnqualEltTy, Constant->getSize(),
	                                Constant->getSizeModifier(), 0);

	  if (const auto *Incomplete = dyn_cast<IncompleteArrayType>(Array))
	    return getIncompleteArrayType(UnqualEltTy,
	                                  Incomplete->getSizeModifier(), 0);

	  if (const auto *Variable = dyn_cast<VariableArrayType>(Array))
	    return getVariableArrayType(UnqualEltTy,
	                                Variable->getSizeExpr(),
	                                Variable->getSizeModifier(),
	                                Variable->getIndexTypeCVRQualifiers(),
	                                Variable->getBracketsRange());

	  // The only remaining array kind is a dependent-sized array.
	  const auto *Dependent = cast<DependentSizedArrayType>(Array);
	  return getDependentSizedArrayType(UnqualEltTy, Dependent->getSizeExpr(),
	                                    Dependent->getSizeModifier(), 0,
	                                    SourceRange());
	}

	/// UnwrapSimilarPointerTypes - Strip one level of pointer from both T1 and T2
	/// when they may be similar (C++ 4.4) at this level.
	///
	/// On success T1 and T2 are replaced with their pointee types and true is
	/// returned. If they are not both pointers, both member pointers into the
	/// same (unqualified) class, or (in ObjC) both ObjC object pointers, false is
	/// returned and T1 and T2 are left unchanged. Top-level qualifiers on T1 and
	/// T2 are ignored. Callers typically invoke this in a loop to compare pointer
	/// types level by level.
	bool ASTContext::UnwrapSimilarPointerTypes(QualType &T1, QualType &T2) {
	  // Plain pointers.
	  const PointerType *Ptr1 = T1->getAs<PointerType>();
	  const PointerType *Ptr2 = T2->getAs<PointerType>();
	  if (Ptr1 && Ptr2) {
	    T1 = Ptr1->getPointeeType();
	    T2 = Ptr2->getPointeeType();
	    return true;
	  }

	  // Pointers to members of the same class.
	  const MemberPointerType *Mem1 = T1->getAs<MemberPointerType>();
	  const MemberPointerType *Mem2 = T2->getAs<MemberPointerType>();
	  if (Mem1 && Mem2 &&
	      hasSameUnqualifiedType(QualType(Mem1->getClass(), 0),
	                             QualType(Mem2->getClass(), 0))) {
	    T1 = Mem1->getPointeeType();
	    T2 = Mem2->getPointeeType();
	    return true;
	  }

	  // Objective-C object pointers.
	  if (getLangOpts().ObjC1) {
	    const ObjCObjectPointerType *Obj1 = T1->getAs<ObjCObjectPointerType>();
	    const ObjCObjectPointerType *Obj2 = T2->getAs<ObjCObjectPointerType>();
	    if (Obj1 && Obj2) {
	      T1 = Obj1->getPointeeType();
	      T2 = Obj2->getPointeeType();
	      return true;
	    }
	  }

	  // FIXME: Block pointers, too?

	  return false;
	}

	/// Build the DeclarationNameInfo naming the template referred to by \p Name,
	/// located at \p NameLoc.
	DeclarationNameInfo
	ASTContext::getNameForTemplate(TemplateName Name,
	                               SourceLocation NameLoc) const {
	  switch (Name.getKind()) {
	  case TemplateName::QualifiedTemplate:
	  case TemplateName::Template: {
	    // DNInfo work in progress: CHECKME: what about DNLoc?
	    auto *Template = Name.getAsTemplateDecl();
	    return DeclarationNameInfo(Template->getDeclName(), NameLoc);
	  }

	  case TemplateName::OverloadedTemplate: {
	    // Use the name of the first template in the overload set.
	    OverloadedTemplateStorage *Overloads = Name.getAsOverloadedTemplate();
	    // DNInfo work in progress: CHECKME: what about DNLoc?
	    auto *First = *Overloads->begin();
	    return DeclarationNameInfo(First->getDeclName(), NameLoc);
	  }

	  case TemplateName::DependentTemplate: {
	    DependentTemplateName *Dependent = Name.getAsDependentTemplateName();
	    if (Dependent->isIdentifier()) {
	      DeclarationName IdName =
	          DeclarationNames.getIdentifier(Dependent->getIdentifier());
	      return DeclarationNameInfo(IdName, NameLoc);
	    }

	    // Otherwise the dependent name is an overloaded operator.
	    DeclarationName OpName =
	        DeclarationNames.getCXXOperatorName(Dependent->getOperator());
	    // DNInfo work in progress: FIXME: source locations?
	    DeclarationNameLoc OpLoc;
	    OpLoc.CXXOperatorName.BeginOpNameLoc = SourceLocation().getRawEncoding();
	    OpLoc.CXXOperatorName.EndOpNameLoc = SourceLocation().getRawEncoding();
	    return DeclarationNameInfo(OpName, NameLoc, OpLoc);
	  }

	  case TemplateName::SubstTemplateTemplateParm: {
	    SubstTemplateTemplateParmStorage *Subst =
	        Name.getAsSubstTemplateTemplateParm();
	    return DeclarationNameInfo(Subst->getParameter()->getDeclName(), NameLoc);
	  }

	  case TemplateName::SubstTemplateTemplateParmPack: {
	    SubstTemplateTemplateParmPackStorage *SubstPack =
	        Name.getAsSubstTemplateTemplateParmPack();
	    return DeclarationNameInfo(SubstPack->getParameterPack()->getDeclName(),
	                               NameLoc);
	  }
	  }

	  llvm_unreachable("bad template name kind!");
	}

	/// Return the canonical form of the template name \p Name. Overloaded
	/// template names cannot be canonicalized and are treated as unreachable.
	TemplateName ASTContext::getCanonicalTemplateName(TemplateName Name) const {
	  switch (Name.getKind()) {
	  case TemplateName::QualifiedTemplate:
	  case TemplateName::Template: {
	    TemplateDecl *Decl = Name.getAsTemplateDecl();
	    // Template template parameters have their own canonical declaration.
	    if (auto *Param = dyn_cast<TemplateTemplateParmDecl>(Decl))
	      Decl = getCanonicalTemplateTemplateParmDecl(Param);

	    // The canonical template name is the canonical template declaration.
	    return TemplateName(cast<TemplateDecl>(Decl->getCanonicalDecl()));
	  }

	  case TemplateName::OverloadedTemplate:
	    llvm_unreachable("cannot canonicalize overloaded template");

	  case TemplateName::DependentTemplate: {
	    DependentTemplateName *DTN = Name.getAsDependentTemplateName();
	    assert(DTN && "Non-dependent template names must refer to template decls.");
	    return DTN->CanonicalTemplateName;
	  }

	  case TemplateName::SubstTemplateTemplateParm: {
	    // Canonicalize the replacement the parameter was substituted with.
	    SubstTemplateTemplateParmStorage *Subst =
	        Name.getAsSubstTemplateTemplateParm();
	    return getCanonicalTemplateName(Subst->getReplacement());
	  }

	  case TemplateName::SubstTemplateTemplateParmPack: {
	    // Canonicalize both the parameter pack and its argument pack.
	    SubstTemplateTemplateParmPackStorage *SubstPack =
	        Name.getAsSubstTemplateTemplateParmPack();
	    TemplateTemplateParmDecl *CanonParam =
	        getCanonicalTemplateTemplateParmDecl(SubstPack->getParameterPack());
	    TemplateArgument CanonArgs =
	        getCanonicalTemplateArgument(SubstPack->getArgumentPack());
	    return getSubstTemplateTemplateParmPack(CanonParam, CanonArgs);
	  }
	  }

	  llvm_unreachable("bad template name!");
	}

	/// Return true if \p X and \p Y have the same canonical template name,
	/// compared by their opaque pointer representation.
	bool ASTContext::hasSameTemplateName(TemplateName X, TemplateName Y) {
	  TemplateName CanonX = getCanonicalTemplateName(X);
	  TemplateName CanonY = getCanonicalTemplateName(Y);
	  return CanonX.getAsVoidPointer() == CanonY.getAsVoidPointer();
	}

	/// Return the canonical form of the template argument \p Arg.
	///
	/// Null and expression arguments are returned unchanged; declarations,
	/// types, template names and integral types are replaced by their canonical
	/// forms; a non-empty pack is canonicalized element by element into a newly
	/// allocated array.
	TemplateArgument
	ASTContext::getCanonicalTemplateArgument(const TemplateArgument &Arg) const {
	  switch (Arg.getKind()) {
	    case TemplateArgument::Null:
	      return Arg;

	    case TemplateArgument::Expression:
	      return Arg;

	    case TemplateArgument::Declaration: {
	      ValueDecl *D = cast<ValueDecl>(Arg.getAsDecl()->getCanonicalDecl());
	      return TemplateArgument(D, Arg.getParamTypeForDecl());
	    }

	    case TemplateArgument::NullPtr:
	      return TemplateArgument(getCanonicalType(Arg.getNullPtrType()),
	                              /*isNullPtr*/true);

	    case TemplateArgument::Template:
	      return TemplateArgument(getCanonicalTemplateName(Arg.getAsTemplate()));

	    case TemplateArgument::TemplateExpansion:
	      return TemplateArgument(getCanonicalTemplateName(
	                                         Arg.getAsTemplateOrTemplatePattern()),
	                              Arg.getNumTemplateExpansions());

	    case TemplateArgument::Integral:
	      return TemplateArgument(Arg, getCanonicalType(Arg.getIntegralType()));

	    case TemplateArgument::Type:
	      return TemplateArgument(getCanonicalType(Arg.getAsType()));

	    case TemplateArgument::Pack: {
	      // An empty pack is already canonical.
	      if (Arg.pack_size() == 0)
	        return Arg;

	      TemplateArgument *CanonArgs
	        = new (*this) TemplateArgument[Arg.pack_size()];
	      unsigned Idx = 0;
	      for (TemplateArgument::pack_iterator A = Arg.pack_begin(),
	                                        AEnd = Arg.pack_end();
	           A != AEnd; (void)++A, ++Idx)
	        CanonArgs[Idx] = getCanonicalTemplateArgument(*A);

	      return TemplateArgument(llvm::makeArrayRef(CanonArgs, Arg.pack_size()));
	    }
	  }

	  // Silence GCC warning
	  llvm_unreachable("Unhandled template argument kind");
	}

	/// Return the canonical form of the nested-name-specifier \p NNS, or nullptr
	/// when \p NNS is null.
	NestedNameSpecifier *
	ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const {
	if (!NNS)
	return nullptr;

	switch (NNS->getKind()) {
	case NestedNameSpecifier::Identifier:
	// Canonicalize the prefix but keep the identifier the same.
	return NestedNameSpecifier::Create(*this,
	getCanonicalNestedNameSpecifier(NNS->getPrefix()),
	NNS->getAsIdentifier());

	case NestedNameSpecifier::Namespace:
	// A namespace is canonical; build a nested-name-specifier with
	// this namespace and no prefix.
	return NestedNameSpecifier::Create(*this, nullptr,
	NNS->getAsNamespace()->getOriginalNamespace());

	case NestedNameSpecifier::NamespaceAlias:
	// Look through the alias: build a nested-name-specifier with the
	// original namespace of the aliased namespace and no prefix.
	return NestedNameSpecifier::Create(*this, nullptr,
	NNS->getAsNamespaceAlias()->getNamespace()
	->getOriginalNamespace());

	case NestedNameSpecifier::TypeSpec:
	case NestedNameSpecifier::TypeSpecWithTemplate: {
	QualType T = getCanonicalType(QualType(NNS->getAsType(), 0));

	// If we have some kind of dependent-named type (e.g., "typename T::type"),
	// break it apart into its prefix and identifier, then reconstitute those
	// as the canonical nested-name-specifier. This is required to canonicalize
	// a dependent nested-name-specifier involving typedefs of dependent-name
	// types, e.g.,
	// typedef typename T::type T1;
	// typedef typename T1::type T2;
	if (const DependentNameType *DNT = T->getAs<DependentNameType>())
	return NestedNameSpecifier::Create(*this, DNT->getQualifier(),
	const_cast<IdentifierInfo *>(DNT->getIdentifier()));

	// Otherwise, just canonicalize the type, and force it to be a TypeSpec.
	// FIXME: Why are TypeSpec and TypeSpecWithTemplate distinct in the
	// first place?
	return NestedNameSpecifier::Create(*this, nullptr, false,
	const_cast<Type *>(T.getTypePtr()));
	}

	case NestedNameSpecifier::Global:
	case NestedNameSpecifier::Super:
	// The global specifier and __super specifier are canonical and unique.
	return NNS;
	}

	llvm_unreachable("Invalid NestedNameSpecifier::Kind!");
	}

	/// Return \p T viewed as an array type, or nullptr if its canonical type is
	/// not an array.
	///
	/// Qualifiers written on the array type are pushed down onto the element
	/// type (C99 6.7.3p8), which may require building a new array type.
	const ArrayType *ASTContext::getAsArrayType(QualType T) const {
	  // Handle the non-qualified case efficiently.
	  if (!T.hasLocalQualifiers()) {
	    // Handle the common positive case fast.
	    if (const ArrayType *AT = dyn_cast<ArrayType>(T))
	      return AT;
	  }

	  // Handle the common negative case fast.
	  if (!isa<ArrayType>(T.getCanonicalType()))
	    return nullptr;

	  // Apply any qualifiers from the array type to the element type.  This
	  // implements C99 6.7.3p8: "If the specification of an array type includes
	  // any type qualifiers, the element type is so qualified, not the array type."

	  // If we get here, we either have type qualifiers on the type, or we have
	  // sugar such as a typedef in the way.  If we have type qualifiers on the type
	  // we must propagate them down into the element type.

	  SplitQualType split = T.getSplitDesugaredType();
	  Qualifiers qs = split.Quals;

	  // If we have a simple case, just return now.
	  const ArrayType *ATy = dyn_cast<ArrayType>(split.Ty);
	  if (!ATy || qs.empty())
	    return ATy;

	  // Otherwise, we have an array and we have qualifiers on it.  Push the
	  // qualifiers into the array element type and return a new array type.
	  QualType NewEltTy = getQualifiedType(ATy->getElementType(), qs);

	  if (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(ATy))
	    return cast<ArrayType>(getConstantArrayType(NewEltTy, CAT->getSize(),
	                                                CAT->getSizeModifier(),
	                                           CAT->getIndexTypeCVRQualifiers()));
	  if (const IncompleteArrayType *IAT = dyn_cast<IncompleteArrayType>(ATy))
	    return cast<ArrayType>(getIncompleteArrayType(NewEltTy,
	                                                  IAT->getSizeModifier(),
	                                           IAT->getIndexTypeCVRQualifiers()));

	  if (const DependentSizedArrayType *DSAT
	        = dyn_cast<DependentSizedArrayType>(ATy))
	    return cast<ArrayType>(
	                     getDependentSizedArrayType(NewEltTy,
	                                                DSAT->getSizeExpr(),
	                                                DSAT->getSizeModifier(),
	                                              DSAT->getIndexTypeCVRQualifiers(),
	                                                DSAT->getBracketsRange()));

	  // The only remaining array kind is a variable-length array.
	  const VariableArrayType *VAT = cast<VariableArrayType>(ATy);
	  return cast<ArrayType>(getVariableArrayType(NewEltTy,
	                                              VAT->getSizeExpr(),
	                                              VAT->getSizeModifier(),
	                                              VAT->getIndexTypeCVRQualifiers(),
	                                              VAT->getBracketsRange()));
	}

	/// Return the adjusted form of the parameter type \p T: array and function
	/// types are replaced by their decayed type, every other type is returned
	/// unchanged.
	QualType ASTContext::getAdjustedParameterType(QualType T) const {
	  if (T->isArrayType() || T->isFunctionType())
	    return getDecayedType(T);
	  return T;
	}

	/// Return the type of a parameter as it appears in a function signature:
	/// variable arrays are decayed, the parameter adjustment is applied, and
	/// top-level qualifiers are removed.
	QualType ASTContext::getSignatureParameterType(QualType T) const {
	  QualType Decayed = getVariableArrayDecayedType(T);
	  QualType Adjusted = getAdjustedParameterType(Decayed);
	  return Adjusted.getUnqualifiedType();
	}

	/// Return the type of the exception object created by throwing an operand
	/// of static type \p T: arrays and functions decay to pointers and top-level
	/// cv-qualifiers are removed.
	QualType ASTContext::getExceptionObjectType(QualType T) const {
	  // C++ [except.throw]p3:
	  //   A throw-expression initializes a temporary object, called the exception
	  //   object, the type of which is determined by removing any top-level
	  //   cv-qualifiers from the static type of the operand of throw and adjusting
	  //   the type from "array of T" or "function returning T" to "pointer to T"
	  //   or "pointer to function returning T", [...]
	  T = getVariableArrayDecayedType(T);
	  if (T->isArrayType() || T->isFunctionType())
	    T = getDecayedType(T);
	  return T.getUnqualifiedType();
	}

	/// getArrayDecayedType - Return the properly qualified pointer type that the
	/// given array type decays to. The canonical type of \p Ty must be an array
	/// type; typedefs in the element type are preserved.
	///
	/// See C99 6.7.5.3p7 and C99 6.3.2.1p3.
	QualType ASTContext::getArrayDecayedType(QualType Ty) const {
	  // Go through 'getAsArrayType' so that typedefs in the element type are kept
	  // and qualifiers on the array type are propagated into the element type
	  // (C99 6.7.3p8).
	  const ArrayType *PrettyArrayType = getAsArrayType(Ty);
	  assert(PrettyArrayType && "Not an array type!");

	  QualType ElementPtrTy = getPointerType(PrettyArrayType->getElementType());

	  // int x[restrict 4] ->  int *restrict
	  QualType Decayed =
	      getQualifiedType(ElementPtrTy,
	                       PrettyArrayType->getIndexTypeQualifiers());

	  // int x[_Nullable] -> int * _Nullable
	  auto Nullability = Ty->getNullability(*this);
	  if (!Nullability)
	    return Decayed;

	  return const_cast<ASTContext *>(this)->getAttributedType(
	      AttributedType::getNullabilityAttrKind(*Nullability), Decayed, Decayed);
	}

	/// Return the innermost element type of the given array type by delegating
	/// to the QualType overload on the array's element type.
	QualType ASTContext::getBaseElementType(const ArrayType *array) const {
	  QualType ElementTy = array->getElementType();
	  return getBaseElementType(ElementTy);
	}

	/// Strip every level of array type from \p type and return the innermost
	/// element type, qualified with the qualifiers collected from each array
	/// level that was peeled off.
	QualType ASTContext::getBaseElementType(QualType type) const {
	  Qualifiers Collected;
	  for (;;) {
	    SplitQualType Split = type.getSplitDesugaredType();
	    const ArrayType *Level = Split.Ty->getAsArrayTypeUnsafe();
	    if (!Level)
	      break;

	    // Descend into the element type and remember this level's qualifiers.
	    type = Level->getElementType();
	    Collected.addConsistentQualifiers(Split.Quals);
	  }

	  return getQualifiedType(type, Collected);
	}

	/// getConstantArrayElementCount - Return the total number of elements of a
	/// constant array, multiplying the sizes of all directly nested constant
	/// array dimensions.
	uint64_t
	ASTContext::getConstantArrayElementCount(const ConstantArrayType *CA) const {
	  uint64_t Total = 1;
	  const ConstantArrayType *Level = CA;
	  do {
	    Total *= Level->getSize().getZExtValue();
	    // Continue only while the element type is itself a constant array.
	    const ArrayType *Inner = Level->getElementType()->getAsArrayTypeUnsafe();
	    Level = dyn_cast_or_null<ConstantArrayType>(Inner);
	  } while (Level);
	  return Total;
	}

	/// getFloatingRank - Return a relative rank for floating point types. A
	/// complex type has the rank of its element type. Passing a built-in type
	/// that is not a floating type is treated as unreachable.
	static FloatingRank getFloatingRank(QualType T) {
	  // A complex type ranks the same as its element type.
	  if (const auto *Complex = T->getAs<ComplexType>())
	    return getFloatingRank(Complex->getElementType());

	  assert(T->getAs<BuiltinType>() && "getFloatingRank(): not a floating type");
	  const BuiltinType::Kind Kind = T->getAs<BuiltinType>()->getKind();
	  switch (Kind) {
	  case BuiltinType::Float16:    return Float16Rank;
	  case BuiltinType::Half:       return HalfRank;
	  case BuiltinType::Float:      return FloatRank;
	  case BuiltinType::Double:     return DoubleRank;
	  case BuiltinType::LongDouble: return LongDoubleRank;
	  case BuiltinType::Float128:   return Float128Rank;
	  default: llvm_unreachable("getFloatingRank(): not a floating type");
	  }
	}

	/// getFloatingTypeOfSizeWithinDomain - Return the real floating point or
	/// complex type that has the floating rank of \p Size and the domain (real or
	/// complex) of \p Domain.
	/// \p Size is a real floating point or complex type.
	/// \p Domain is a real floating point or complex type.
	QualType ASTContext::getFloatingTypeOfSizeWithinDomain(QualType Size,
	                                                       QualType Domain) const {
	  const FloatingRank Rank = getFloatingRank(Size);

	  // Complex domain: pick the complex type of matching rank.
	  if (Domain->isComplexType()) {
	    switch (Rank) {
	    case Float16Rank:
	    case HalfRank: llvm_unreachable("Complex half is not supported");
	    case FloatRank:      return FloatComplexTy;
	    case DoubleRank:     return DoubleComplexTy;
	    case LongDoubleRank: return LongDoubleComplexTy;
	    case Float128Rank:   return Float128ComplexTy;
	    }
	  }

	  // Real domain: pick the real floating type of matching rank.
	  assert(Domain->isRealFloatingType() && "Unknown domain!");
	  switch (Rank) {
	  case Float16Rank:
	  case HalfRank:       return HalfTy;
	  case FloatRank:      return FloatTy;
	  case DoubleRank:     return DoubleTy;
	  case LongDoubleRank: return LongDoubleTy;
	  case Float128Rank:   return Float128Ty;
	  }
	  llvm_unreachable("getFloatingRank(): illegal value for rank");
	}

	/// getFloatingTypeOrder - Compare the floating rank of the two given types,
	/// ignoring the domain of the type (i.e. 'double' == '_Complex double').
	/// Returns 1 if LHS > RHS, 0 if LHS == RHS, and -1 if LHS < RHS.
	int ASTContext::getFloatingTypeOrder(QualType LHS, QualType RHS) const {
	  const FloatingRank Left = getFloatingRank(LHS);
	  const FloatingRank Right = getFloatingRank(RHS);

	  if (Left > Right)
	    return 1;
	  return Left == Right ? 0 : -1;
	}

	/// getIntegerRank - Return an integer conversion rank (C99 6.3.1.1p1). This
	/// routine will assert if passed a built-in type that isn't an integer or enum,
	/// or if it is not canonicalized.
	unsigned ASTContext::getIntegerRank(const Type *T) const {
	assert(T->isCanonicalUnqualified() && "T should be canonicalized");

	switch (cast<BuiltinType>(T)->getKind()) {
	default: llvm_unreachable("getIntegerRank(): not a built-in integer");
	case BuiltinType::Bool:
	return 1 + (getIntWidth(BoolTy) << 3);
	case BuiltinType::Char_S:
	case BuiltinType::Char_U:
	case BuiltinType::SChar:
	case BuiltinType::UChar:
	return 2 + (getIntWidth(CharTy) << 3);
	case BuiltinType::Short:
	case BuiltinType::UShort:
	return 3 + (getIntWidth(ShortTy) << 3);
	case BuiltinType::Int:
	case BuiltinType::UInt:
	return 4 + (getIntWidth(IntTy) << 3);
	case BuiltinType::Long:
	case BuiltinType::ULong:
	return 5 + (getIntWidth(LongTy) << 3);
	case BuiltinType::LongLong:
	case BuiltinType::ULongLong:
	return 6 + (getIntWidth(LongLongTy) << 3);
	case BuiltinType::Int128:
	case BuiltinType::UInt128:
	return 7 + (getIntWidth(Int128Ty) << 3);
	}
	}

	/// \brief Whether this is a promotable bitfield reference according
	/// to C99 6.3.1.1p2, bullet 2 (and GCC extensions).
	///
	/// \param E the expression to inspect; type- or value-dependent expressions
	/// are never promoted here.
	/// \returns the type this bit-field will promote to, or a null QualType if
	/// no promotion occurs.
	QualType ASTContext::isPromotableBitField(Expr *E) const {
	  if (E->isTypeDependent() || E->isValueDependent())
	    return QualType();

	  // FIXME: We should not do this unless E->refersToBitField() is true. This
	  // matters in C where getSourceBitField() will find bit-fields for various
	  // cases where the source expression is not a bit-field designator.

	  FieldDecl *Field = E->getSourceBitField(); // FIXME: conditional bit-fields?
	  if (!Field)
	    return QualType();

	  QualType FT = Field->getType();

	  uint64_t BitWidth = Field->getBitWidthValue(*this);
	  uint64_t IntSize = getTypeSize(IntTy);
	  // C++ [conv.prom]p5:
	  //   A prvalue for an integral bit-field can be converted to a prvalue of type
	  //   int if int can represent all the values of the bit-field; otherwise, it
	  //   can be converted to unsigned int if unsigned int can represent all the
	  //   values of the bit-field. If the bit-field is larger yet, no integral
	  //   promotion applies to it.
	  // C11 6.3.1.1/2:
	  //   [For a bit-field of type _Bool, int, signed int, or unsigned int:]
	  //   If an int can represent all values of the original type (as restricted by
	  //   the width, for a bit-field), the value is converted to an int; otherwise,
	  //   it is converted to an unsigned int.
	  //
	  // FIXME: C does not permit promotion of a 'long : 3' bitfield to int.
	  //        We perform that promotion here to match GCC and C++.
	  if (BitWidth < IntSize)
	    return IntTy;

	  // Exactly as wide as int: signedness of the field type decides.
	  if (BitWidth == IntSize)
	    return FT->isSignedIntegerType() ? IntTy : UnsignedIntTy;

	  // Types bigger than int are not subject to promotions, and therefore act
	  // like the base type. GCC has some weird bugs in this area that we
	  // deliberately do not follow (GCC follows a pre-standard resolution to
	  // C's DR315 which treats bit-width as being part of the type, and this leaks
	  // into their semantics in some cases).
	  return QualType();
	}

	/// getPromotedIntegerType - Returns the type that Promotable will
	/// promote to: C99 6.3.1.1p2, assuming that Promotable is a promotable
	/// integer type.
	///
	/// Enumerations use the promotion type recorded on their declaration.
	/// wchar_t, char16_t and char32_t pick the first of int, unsigned int, long,
	/// unsigned long, long long, unsigned long long that is wider, or equally
	/// wide with matching signedness. Everything else promotes to int, or to
	/// unsigned int when it is unsigned and exactly as wide as int.
	QualType ASTContext::getPromotedIntegerType(QualType Promotable) const {
	  assert(!Promotable.isNull());
	  assert(Promotable->isPromotableIntegerType());
	  if (const EnumType *ET = Promotable->getAs<EnumType>())
	    return ET->getDecl()->getPromotionType();

	  if (const BuiltinType *BT = Promotable->getAs<BuiltinType>()) {
	    // C++ [conv.prom]: A prvalue of type char16_t, char32_t, or wchar_t
	    // (3.9.1) can be converted to a prvalue of the first of the following
	    // types that can represent all the values of its underlying type:
	    // int, unsigned int, long int, unsigned long int, long long int, or
	    // unsigned long long int [...]
	    // FIXME: Is there some better way to compute this?
	    if (BT->getKind() == BuiltinType::WChar_S ||
	        BT->getKind() == BuiltinType::WChar_U ||
	        BT->getKind() == BuiltinType::Char16 ||
	        BT->getKind() == BuiltinType::Char32) {
	      // Only WChar_S is treated as signed here.
	      bool FromIsSigned = BT->getKind() == BuiltinType::WChar_S;
	      uint64_t FromSize = getTypeSize(BT);
	      QualType PromoteTypes[] = { IntTy, UnsignedIntTy, LongTy, UnsignedLongTy,
	                                  LongLongTy, UnsignedLongLongTy };
	      for (size_t Idx = 0; Idx < llvm::array_lengthof(PromoteTypes); ++Idx) {
	        uint64_t ToSize = getTypeSize(PromoteTypes[Idx]);
	        if (FromSize < ToSize ||
	            (FromSize == ToSize &&
	             FromIsSigned == PromoteTypes[Idx]->isSignedIntegerType()))
	          return PromoteTypes[Idx];
	      }
	      llvm_unreachable("char type should fit into long long");
	    }
	  }

	  // At this point, we should have a signed or unsigned integer type.
	  if (Promotable->isSignedIntegerType())
	    return IntTy;
	  uint64_t PromotableSize = getIntWidth(Promotable);
	  uint64_t IntSize = getIntWidth(IntTy);
	  assert(Promotable->isUnsignedIntegerType() && PromotableSize <= IntSize);
	  return (PromotableSize != IntSize) ? IntTy : UnsignedIntTy;
	}

	/// \brief Peels array, pointer and reference layers off \p T until a type
	/// carrying an ObjC lifetime qualifier is reached, and returns that lifetime.
	///
	/// \returns Qualifiers::OCL_None when the walk ends (null type, or a type that
	/// is none of array/pointer/reference) without finding a lifetime qualifier.
	Qualifiers::ObjCLifetime ASTContext::getInnerObjCOwnership(QualType T) const {
	  for (QualType Cur = T; !Cur.isNull();) {
	    const Qualifiers::ObjCLifetime Lifetime = Cur.getObjCLifetime();
	    if (Lifetime != Qualifiers::OCL_None)
	      return Lifetime;

	    // Step one level inwards; arrays jump straight to their base element.
	    if (Cur->isArrayType()) {
	      Cur = getBaseElementType(Cur);
	      continue;
	    }
	    if (const PointerType *Ptr = Cur->getAs<PointerType>()) {
	      Cur = Ptr->getPointeeType();
	      continue;
	    }
	    if (const ReferenceType *Ref = Cur->getAs<ReferenceType>()) {
	      Cur = Ref->getPointeeType();
	      continue;
	    }
	    break;
	  }

	  return Qualifiers::OCL_None;
	}

	/// Returns the integer type of the enumeration \p ET, or nullptr when the
	/// enumeration is incomplete or scoped.
	static const Type *getIntegerTypeForEnum(const EnumType *ET) {
	  // Incomplete enum types are not treated as integer types.
	  // FIXME: In C++, enum types are never integer types.
	  if (ET->getDecl()->isComplete() && !ET->getDecl()->isScoped())
	    return ET->getDecl()->getIntegerType().getTypePtr();
	  return nullptr;
	}

	/// getIntegerTypeOrder - Orders two integer types for the usual arithmetic
	/// conversions (C99 6.3.1.8p1).
	///
	/// \returns 1 if LHS > RHS, 0 if LHS == RHS, and -1 if LHS < RHS.
	int ASTContext::getIntegerTypeOrder(QualType LHS, QualType RHS) const {
	  const Type *Left = getCanonicalType(LHS).getTypePtr();
	  const Type *Right = getCanonicalType(RHS).getTypePtr();

	  // Enumerations take part through the integer type that stands in for them.
	  if (const EnumType *LeftEnum = dyn_cast<EnumType>(Left))
	    Left = getIntegerTypeForEnum(LeftEnum);
	  if (const EnumType *RightEnum = dyn_cast<EnumType>(Right))
	    Right = getIntegerTypeForEnum(RightEnum);

	  if (Left == Right)
	    return 0;

	  const bool LeftUnsigned = Left->isUnsignedIntegerType();
	  const bool RightUnsigned = Right->isUnsignedIntegerType();

	  const unsigned LeftRank = getIntegerRank(Left);
	  const unsigned RightRank = getIntegerRank(Right);

	  // Same signedness on both sides: the rank alone decides.
	  if (LeftUnsigned == RightUnsigned) {
	    if (LeftRank == RightRank)
	      return 0;
	    return LeftRank > RightRank ? 1 : -1;
	  }

	  // Mixed signedness. The unsigned operand wins when its rank is at least
	  // that of the signed one; otherwise the signed type wins, because it can
	  // represent all values of the unsigned type (2's complement, and types
	  // that are powers of two larger than each other).
	  if (LeftUnsigned)
	    return LeftRank >= RightRank ? 1 : -1;

	  return RightRank >= LeftRank ? -1 : 1;
	}

	/// Returns the implicit typedef "__NSConstantString", building it and its
	/// underlying record "__NSConstantString_tag" on first use.
	///
	/// The record gets four public fields: isa, flags, str and length.
	TypedefDecl *ASTContext::getCFConstantStringDecl() const {
	  if (!CFConstantStringTypeDecl) {
	    assert(!CFConstantStringTagDecl &&
	           "tag and typedef should be initialized together");
	    CFConstantStringTagDecl = buildImplicitRecord("__NSConstantString_tag");
	    CFConstantStringTagDecl->startDefinition();

	    QualType FieldTypes[4];
	    const char *FieldNames[4];

	    // const int *isa;
	    FieldTypes[0] = getPointerType(IntTy.withConst());
	    FieldNames[0] = "isa";
	    // int flags;
	    FieldTypes[1] = IntTy;
	    FieldNames[1] = "flags";
	    // const char *str;
	    FieldTypes[2] = getPointerType(CharTy.withConst());
	    FieldNames[2] = "str";
	    // long length;
	    FieldTypes[3] = LongTy;
	    FieldNames[3] = "length";

	    // Create fields
	    for (unsigned i = 0; i < 4; ++i) {
	      FieldDecl *Field = FieldDecl::Create(*this, CFConstantStringTagDecl,
	                                           SourceLocation(),
	                                           SourceLocation(),
	                                           &Idents.get(FieldNames[i]),
	                                           FieldTypes[i], /*TInfo=*/nullptr,
	                                           /*BitWidth=*/nullptr,
	                                           /*Mutable=*/false,
	                                           ICIS_NoInit);
	      Field->setAccess(AS_public);
	      CFConstantStringTagDecl->addDecl(Field);
	    }

	    CFConstantStringTagDecl->completeDefinition();
	    // This type is designed to be compatible with NSConstantString, but cannot
	    // use the same name, since NSConstantString is an interface.
	    auto tagType = getTagDeclType(CFConstantStringTagDecl);
	    CFConstantStringTypeDecl =
	        buildImplicitTypedef(tagType, "__NSConstantString");
	  }

	  return CFConstantStringTypeDecl;
	}

	/// Returns the record declaration behind the constant CFString typedef,
	/// triggering construction of both the tag and the typedef when the tag has
	/// not been set yet.
	RecordDecl *ASTContext::getCFConstantStringTagDecl() const {
	  const bool NeedsBuild = !CFConstantStringTagDecl;
	  if (NeedsBuild) {
	    // Called for its side effect only; the returned typedef is not needed.
	    getCFConstantStringDecl();
	  }
	  return CFConstantStringTagDecl;
	}

	// getCFConstantStringType - Return the type used for constant CFStrings,
	// i.e. the typedef type of the declaration from getCFConstantStringDecl().
	QualType ASTContext::getCFConstantStringType() const {
	  TypedefDecl *StringDecl = getCFConstantStringDecl();
	  return getTypedefType(StringDecl);
	}

	/// Returns the type of the implicit record "objc_super". On first use the
	/// record is built, added to the translation unit declaration, and cached.
	QualType ASTContext::getObjCSuperType() const {
	  if (!ObjCSuperType.isNull())
	    return ObjCSuperType;

	  RecordDecl *SuperRecord = buildImplicitRecord("objc_super");
	  TUDecl->addDecl(SuperRecord);
	  ObjCSuperType = getTagDeclType(SuperRecord);
	  return ObjCSuperType;
	}

	/// Installs \p T as the constant CFString type. \p T must be a typedef type
	/// whose underlying type is a record type (both are asserted); the typedef
	/// declaration and the record declaration are stored.
	void ASTContext::setCFConstantStringType(QualType T) {
	  const TypedefType *Typedef = T->getAs<TypedefType>();
	  assert(Typedef && "Invalid CFConstantStringType");
	  CFConstantStringTypeDecl = cast<TypedefDecl>(Typedef->getDecl());

	  QualType Underlying = CFConstantStringTypeDecl->getUnderlyingType();
	  const RecordType *Record = Underlying->getAs<RecordType>();
	  assert(Record && "Invalid CFConstantStringType");
	  CFConstantStringTagDecl = Record->getDecl();
	}

	/// Returns the type of the implicit record "__block_descriptor", building
	/// and caching the record on first use.
	///
	/// The record has two public unsigned long fields: reserved and Size.
	QualType ASTContext::getBlockDescriptorType() const {
	  if (BlockDescriptorType)
	    return getTagDeclType(BlockDescriptorType);

	  RecordDecl *RD;
	  // FIXME: Needs the FlagAppleBlock bit.
	  RD = buildImplicitRecord("__block_descriptor");
	  RD->startDefinition();

	  QualType FieldTypes[] = {
	    UnsignedLongTy,
	    UnsignedLongTy,
	  };

	  static const char *const FieldNames[] = {
	    "reserved",
	    "Size"
	  };

	  for (size_t i = 0; i < 2; ++i) {
	    FieldDecl *Field = FieldDecl::Create(
	        *this, RD, SourceLocation(), SourceLocation(),
	        &Idents.get(FieldNames[i]), FieldTypes[i], /*TInfo=*/nullptr,
	        /*BitWidth=*/nullptr, /*Mutable=*/false, ICIS_NoInit);
	    Field->setAccess(AS_public);
	    RD->addDecl(Field);
	  }

	  RD->completeDefinition();

	  BlockDescriptorType = RD;

	  return getTagDeclType(BlockDescriptorType);
	}

	/// Returns the type of the implicit record
	/// "__block_descriptor_withcopydispose", building and caching the record on
	/// first use.
	///
	/// The record has four public fields: reserved and Size (unsigned long),
	/// then CopyFuncPtr and DestroyFuncPtr (pointer to VoidPtrTy).
	QualType ASTContext::getBlockDescriptorExtendedType() const {
	  if (BlockDescriptorExtendedType)
	    return getTagDeclType(BlockDescriptorExtendedType);

	  RecordDecl *RD;
	  // FIXME: Needs the FlagAppleBlock bit.
	  RD = buildImplicitRecord("__block_descriptor_withcopydispose");
	  RD->startDefinition();

	  QualType FieldTypes[] = {
	    UnsignedLongTy,
	    UnsignedLongTy,
	    getPointerType(VoidPtrTy),
	    getPointerType(VoidPtrTy)
	  };

	  static const char *const FieldNames[] = {
	    "reserved",
	    "Size",
	    "CopyFuncPtr",
	    "DestroyFuncPtr"
	  };

	  for (size_t i = 0; i < 4; ++i) {
	    FieldDecl *Field = FieldDecl::Create(
	        *this, RD, SourceLocation(), SourceLocation(),
	        &Idents.get(FieldNames[i]), FieldTypes[i], /*TInfo=*/nullptr,
	        /*BitWidth=*/nullptr,
	        /*Mutable=*/false, ICIS_NoInit);
	    Field->setAccess(AS_public);
	    RD->addDecl(Field);
	  }

	  RD->completeDefinition();

	  BlockDescriptorExtendedType = RD;
	  return getTagDeclType(BlockDescriptorExtendedType);
	}

	/// Classifies \p T into a TargetInfo::OpenCLTypeKind.
	///
	/// Non-builtin types are OCLTK_Pipe when they are a PipeType and
	/// OCLTK_Default otherwise. Builtin types are mapped by kind, with
	/// OCLTK_Default for every kind not listed in the switch.
	TargetInfo::OpenCLTypeKind ASTContext::getOpenCLTypeKind(const Type *T) const {
	  const BuiltinType *Builtin = dyn_cast<BuiltinType>(T);
	  if (!Builtin)
	    return isa<PipeType>(T) ? TargetInfo::OCLTK_Pipe
	                            : TargetInfo::OCLTK_Default;

	  switch (Builtin->getKind()) {
	  // Every image type listed in OpenCLImageTypes.def maps to OCLTK_Image.
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	  case BuiltinType::Id: \
	    return TargetInfo::OCLTK_Image;
	#include "clang/Basic/OpenCLImageTypes.def"

	  case BuiltinType::OCLClkEvent:
	    return TargetInfo::OCLTK_ClkEvent;

	  case BuiltinType::OCLEvent:
	    return TargetInfo::OCLTK_Event;

	  case BuiltinType::OCLQueue:
	    return TargetInfo::OCLTK_Queue;

	  case BuiltinType::OCLReserveID:
	    return TargetInfo::OCLTK_ReserveID;

	  case BuiltinType::OCLSampler:
	    return TargetInfo::OCLTK_Sampler;

	  default:
	    return TargetInfo::OCLTK_Default;
	  }
	}

	/// Asks the target for the address space associated with the OpenCL type
	/// kind that getOpenCLTypeKind() assigns to \p T.
	LangAS ASTContext::getOpenCLTypeAddrSpace(const Type *T) const {
	  const TargetInfo::OpenCLTypeKind Kind = getOpenCLTypeKind(T);
	  return Target->getOpenCLTypeAddrSpace(Kind);
	}

	/// BlockRequiresCopying - Returns true if byref variable "D" of type "Ty"
	/// requires copy/dispose. Note that this must match the logic
	/// in buildByrefHelpers.
	///
	/// C++ records need it unless they have no recorded copy initializer and a
	/// trivial destructor. Other types need it only when they are ObjC
	/// retainable: an explicit lifetime qualifier decides if present, otherwise
	/// block pointers, NSObject types and ObjC object pointers do.
	bool ASTContext::BlockRequiresCopying(QualType Ty,
	                                      const VarDecl *D) {
	  if (const CXXRecordDecl *record = Ty->getAsCXXRecordDecl()) {
	    const Expr *copyExpr = getBlockVarCopyInits(D);
	    if (!copyExpr && record->hasTrivialDestructor()) return false;

	    return true;
	  }

	  if (!Ty->isObjCRetainableType()) return false;

	  Qualifiers qs = Ty.getQualifiers();

	  // If we have lifetime, that dominates.
	  if (Qualifiers::ObjCLifetime lifetime = qs.getObjCLifetime()) {
	    switch (lifetime) {
	      case Qualifiers::OCL_None: llvm_unreachable("impossible");

	      // These are just bits as far as the runtime is concerned.
	      case Qualifiers::OCL_ExplicitNone:
	      case Qualifiers::OCL_Autoreleasing:
	        return false;

	      // Tell the runtime that this is ARC __weak, called by the
	      // byref routines.
	      case Qualifiers::OCL_Weak:
	      // ARC __strong __block variables need to be retained.
	      case Qualifiers::OCL_Strong:
	        return true;
	    }
	    llvm_unreachable("fell out of lifetime switch!");
	  }
	  return (Ty->isBlockPointerType() || isObjCNSObjectType(Ty) ||
	          Ty->isObjCObjectPointerType());
	}

	/// Computes the ObjC lifetime and extended-layout flag for a byref variable
	/// of type \p Ty.
	///
	/// \param LifeTime set to the computed lifetime when true is returned.
	/// \param HasByrefExtendedLayout set to true for record types and to false
	/// otherwise, when true is returned.
	/// \returns false, leaving both outputs untouched, unless ObjC1 is enabled
	/// and the GC mode is LangOptions::NonGC; true otherwise.
	bool ASTContext::getByrefLifetime(QualType Ty,
	                                  Qualifiers::ObjCLifetime &LifeTime,
	                                  bool &HasByrefExtendedLayout) const {
	  if (!getLangOpts().ObjC1 ||
	      getLangOpts().getGC() != LangOptions::NonGC)
	    return false;

	  HasByrefExtendedLayout = false;
	  if (Ty->isRecordType()) {
	    HasByrefExtendedLayout = true;
	    LifeTime = Qualifiers::OCL_None;
	  } else if ((LifeTime = Ty.getObjCLifetime())) {
	    // Honor the ARC qualifiers.
	  } else if (Ty->isObjCObjectPointerType() || Ty->isBlockPointerType()) {
	    // The MRR rule.
	    LifeTime = Qualifiers::OCL_ExplicitNone;
	  } else {
	    LifeTime = Qualifiers::OCL_None;
	  }
	  return true;
	}

	/// Returns the implicit typedef "instancetype", creating it as a typedef of
	/// getObjCIdType() the first time it is requested.
	TypedefDecl *ASTContext::getObjCInstanceTypeDecl() {
	  if (ObjCInstanceTypeDecl)
	    return ObjCInstanceTypeDecl;

	  ObjCInstanceTypeDecl = buildImplicitTypedef(getObjCIdType(), "instancetype");
	  return ObjCInstanceTypeDecl;
	}

	// Returns true when T is a typedef type whose declaration is named "BOOL":
	//   typedef <type> BOOL;
	// A typedef declaration with no identifier yields false.
	static bool isTypeTypedefedAsBOOL(QualType T) {
	  const TypedefType *Typedef = dyn_cast<TypedefType>(T);
	  if (!Typedef)
	    return false;

	  IdentifierInfo *Name = Typedef->getDecl()->getIdentifier();
	  return Name && Name->isStr("BOOL");
	}

	/// getObjCEncodingTypeSize - Size of \p type as counted by the Objective-C
	/// encoding.
	///
	/// Incomplete types other than incomplete arrays count as zero. Integral and
	/// enumeration types of positive size are widened to at least the size of
	/// int. Array types otherwise count as a void pointer. All other types use
	/// their own size.
	CharUnits ASTContext::getObjCEncodingTypeSize(QualType type) const {
	  if (!type->isIncompleteArrayType() && type->isIncompleteType())
	    return CharUnits::Zero();

	  const CharUnits OwnSize = getTypeSizeInChars(type);

	  // Make all integer and enum types at least as large as an int
	  if (OwnSize.isPositive() && type->isIntegralOrEnumerationType())
	    return std::max(OwnSize, getTypeSizeInChars(IntTy));

	  // Treat arrays as pointers, since that's how they're passed in.
	  if (type->isArrayType())
	    return getTypeSizeInChars(VoidPtrTy);

	  return OwnSize;
	}

	/// Returns true when all of the following hold: the target C++ ABI is
	/// Microsoft, \p VD is a static data member of integral or enumeration type,
	/// and its first declaration is not out-of-line and has an initializer.
	bool ASTContext::isMSStaticDataMemberInlineDefinition(const VarDecl *VD) const {
	  if (!getTargetInfo().getCXXABI().isMicrosoft())
	    return false;
	  if (!VD->isStaticDataMember())
	    return false;
	  if (!VD->getType()->isIntegralOrEnumerationType())
	    return false;
	  if (VD->getFirstDecl()->isOutOfLine())
	    return false;
	  return VD->getFirstDecl()->hasInit();
	}

	/// Classifies the definition of the inline variable \p VD.
	///
	/// \returns None when \p VD is not inline; Weak when its first declaration is
	/// explicitly inline or is not a static data member; Strong when some
	/// redeclaration is lexically at file context, is not explicitly inline, and
	/// it or the first declaration is constexpr; WeakUnknown otherwise.
	ASTContext::InlineVariableDefinitionKind
	ASTContext::getInlineVariableDefinitionKind(const VarDecl *VD) const {
	  if (!VD->isInline())
	    return InlineVariableDefinitionKind::None;

	  // In almost all cases, it's a weak definition.
	  auto *First = VD->getFirstDecl();
	  if (First->isInlineSpecified() || !First->isStaticDataMember())
	    return InlineVariableDefinitionKind::Weak;

	  // If there's a file-context declaration in this translation unit, it's a
	  // non-discardable definition.
	  for (auto *D : VD->redecls())
	    if (D->getLexicalDeclContext()->isFileContext() &&
	        !D->isInlineSpecified() && (D->isConstexpr() || First->isConstexpr()))
	      return InlineVariableDefinitionKind::Strong;

	  // If we've not seen one yet, we don't know.
	  return InlineVariableDefinitionKind::WeakUnknown;
	}

	/// Formats the quantity held by \p CU as a decimal string via llvm::itostr.
	static inline
	std::string charUnitsToString(const CharUnits &CU) {
	  const auto Quantity = CU.getQuantity();
	  return llvm::itostr(Quantity);
	}

	/// getObjCEncodingForBlock - Return the encoded type for this block
	/// declaration.
	///
	/// The string is built as: encoded return type, total argument frame size,
	/// "@?0" for the block pointer at offset 0, then each parameter's encoded
	/// type followed by its offset. With EncodeExtendedBlockSig enabled, types
	/// are encoded through getObjCEncodingForMethodParameter in extended mode.
	std::string ASTContext::getObjCEncodingForBlock(const BlockExpr *Expr) const {
	  std::string S;

	  const BlockDecl *Decl = Expr->getBlockDecl();
	  QualType BlockTy =
	      Expr->getType()->getAs<BlockPointerType>()->getPointeeType();
	  // Encode result type.
	  if (getLangOpts().EncodeExtendedBlockSig)
	    getObjCEncodingForMethodParameter(
	        Decl::OBJC_TQ_None, BlockTy->getAs<FunctionType>()->getReturnType(), S,
	        true /*Extended*/);
	  else
	    getObjCEncodingForType(BlockTy->getAs<FunctionType>()->getReturnType(), S);
	  // Compute size of all parameters.
	  // Start with computing size of a pointer in number of bytes.
	  // FIXME: There might(should) be a better way of doing this computation!
	  CharUnits PtrSize = getTypeSizeInChars(VoidPtrTy);
	  CharUnits ParmOffset = PtrSize;
	  for (auto PI : Decl->parameters()) {
	    QualType PType = PI->getType();
	    CharUnits sz = getObjCEncodingTypeSize(PType);
	    // Zero-sized parameters contribute nothing to the frame size.
	    if (sz.isZero())
	      continue;
	    assert(sz.isPositive() && "BlockExpr - Incomplete param type");
	    ParmOffset += sz;
	  }
	  // Size of the argument frame
	  S += charUnitsToString(ParmOffset);
	  // Block pointer and offset.
	  S += "@?0";

	  // Argument types.
	  ParmOffset = PtrSize;
	  for (auto PVDecl : Decl->parameters()) {
	    QualType PType = PVDecl->getOriginalType();
	    if (const ArrayType *AT =
	            dyn_cast<ArrayType>(PType->getCanonicalTypeInternal())) {
	      // Use array's original type only if it has known number of
	      // elements.
	      if (!isa<ConstantArrayType>(AT))
	        PType = PVDecl->getType();
	    } else if (PType->isFunctionType())
	      PType = PVDecl->getType();
	    if (getLangOpts().EncodeExtendedBlockSig)
	      getObjCEncodingForMethodParameter(Decl::OBJC_TQ_None, PType,
	                                        S, true /*Extended*/);
	    else
	      getObjCEncodingForType(PType, S);
	    S += charUnitsToString(ParmOffset);
	    ParmOffset += getObjCEncodingTypeSize(PType);
	  }

	  return S;
	}

	std::string
	ASTContext::getObjCEncodingForFunctionDecl(const FunctionDecl *Decl) const {
	std::string S;
	// Encode result type.
	getObjCEncodingForType(Decl->getReturnType(), S);
	CharUnits ParmOffset;
	// Compute size of all parameters.
	for (auto PI : Decl->parameters()) {
	QualType PType = PI->getType();
	CharUnits sz = getObjCEncodingTypeSize(PType);
	if (sz.isZero())
	continue;

	assert(sz.isPositive() &&
	"getObjCEncodingForFunctionDecl - Incomplete param type");
	ParmOffset += sz;
	}
	S += charUnitsToString(ParmOffset);
	ParmOffset = CharUnits::Zero();

	// Argument types.
	for (auto PVDecl : Decl->parameters()) {
	QualType PType = PVDecl->getOriginalType();
	if (const ArrayType *AT =
	dyn_cast<ArrayType>(PType->getCanonicalTypeInternal())) {
	// Use array's original type only if it has known number of
	// elements.
	if (!isa<ConstantArrayType>(AT))
	PType = PVDecl->getType();
	} else if (PType->isFunctionType())
	PType = PVDecl->getType();
	getObjCEncodingForType(PType, S);
	S += charUnitsToString(ParmOffset);
	ParmOffset += getObjCEncodingTypeSize(PType);
	}

	return S;
	}

	/// getObjCEncodingForMethodParameter - Return the encoded type for a single
	/// method parameter or return type. If Extended, include class names and
	/// block object types.
	///
	/// The qualifier encoding for \p QT is appended to \p S first, followed by
	/// the encoding of \p T.
	void ASTContext::getObjCEncodingForMethodParameter(Decl::ObjCDeclQualifier QT,
	                                                   QualType T, std::string& S,
	                                                   bool Extended) const {
	  // Encode type qualifier, 'in', 'inout', etc. for the parameter.
	  getObjCEncodingForTypeQualifier(QT, S);
	  // Encode parameter type.
	  getObjCEncodingForTypeImpl(T, S, true, true, nullptr,
	                             true     /*OutermostType*/,
	                             false    /*EncodingProperty*/,
	                             false    /*StructField*/,
	                             Extended /*EncodeBlockParameters*/,
	                             Extended /*EncodeClassNames*/);
	}

	/// getObjCEncodingForMethodDecl - Return the encoded type for this method
	/// declaration.
	///
	/// The string is built as: encoded return type, total argument frame size,
	/// "@0:" followed by the pointer size (self at offset 0, _cmd after it),
	/// then each selector parameter's encoded type followed by its offset.
	/// \p Extended is forwarded to getObjCEncodingForMethodParameter.
	std::string ASTContext::getObjCEncodingForMethodDecl(const ObjCMethodDecl *Decl,
	                                                     bool Extended) const {
	  // FIXME: This is not very efficient.
	  // Encode return type.
	  std::string S;
	  getObjCEncodingForMethodParameter(Decl->getObjCDeclQualifier(),
	                                    Decl->getReturnType(), S, Extended);
	  // Compute size of all parameters.
	  // Start with computing size of a pointer in number of bytes.
	  // FIXME: There might(should) be a better way of doing this computation!
	  CharUnits PtrSize = getTypeSizeInChars(VoidPtrTy);
	  // The first two arguments (self and _cmd) are pointers; account for
	  // their size.
	  CharUnits ParmOffset = 2 * PtrSize;
	  for (ObjCMethodDecl::param_const_iterator PI = Decl->param_begin(),
	       E = Decl->sel_param_end(); PI != E; ++PI) {
	    QualType PType = (*PI)->getType();
	    CharUnits sz = getObjCEncodingTypeSize(PType);
	    // Zero-sized parameters contribute nothing to the frame size.
	    if (sz.isZero())
	      continue;

	    assert(sz.isPositive() &&
	           "getObjCEncodingForMethodDecl - Incomplete param type");
	    ParmOffset += sz;
	  }
	  S += charUnitsToString(ParmOffset);
	  S += "@0:";
	  S += charUnitsToString(PtrSize);

	  // Argument types.
	  ParmOffset = 2 * PtrSize;
	  for (ObjCMethodDecl::param_const_iterator PI = Decl->param_begin(),
	       E = Decl->sel_param_end(); PI != E; ++PI) {
	    const ParmVarDecl *PVDecl = *PI;
	    QualType PType = PVDecl->getOriginalType();
	    if (const ArrayType *AT =
	            dyn_cast<ArrayType>(PType->getCanonicalTypeInternal())) {
	      // Use array's original type only if it has known number of
	      // elements.
	      if (!isa<ConstantArrayType>(AT))
	        PType = PVDecl->getType();
	    } else if (PType->isFunctionType())
	      PType = PVDecl->getType();
	    getObjCEncodingForMethodParameter(PVDecl->getObjCDeclQualifier(),
	                                      PType, S, Extended);
	    S += charUnitsToString(ParmOffset);
	    ParmOffset += getObjCEncodingTypeSize(PType);
	  }

	  return S;
	}

	/// Looks through the property implementations of \p Container for the one
	/// whose property declaration is \p PD.
	///
	/// \p Container is treated as an ObjCCategoryImplDecl when it is one, and is
	/// otherwise cast to ObjCImplementationDecl.
	/// \returns the matching implementation, or nullptr when \p Container is null
	/// or nothing matches.
	ObjCPropertyImplDecl *
	ASTContext::getObjCPropertyImplDeclForPropertyDecl(
	                                      const ObjCPropertyDecl *PD,
	                                      const Decl *Container) const {
	  if (!Container)
	    return nullptr;

	  if (const auto *CategoryImpl = dyn_cast<ObjCCategoryImplDecl>(Container)) {
	    for (auto *Impl : CategoryImpl->property_impls()) {
	      if (Impl->getPropertyDecl() == PD)
	        return Impl;
	    }
	    return nullptr;
	  }

	  const auto *ClassImpl = cast<ObjCImplementationDecl>(Container);
	  for (auto *Impl : ClassImpl->property_impls()) {
	    if (Impl->getPropertyDecl() == PD)
	      return Impl;
	  }
	  return nullptr;
	}

	/// getObjCEncodingForPropertyDecl - Return the encoded type for this
	/// property declaration. If non-NULL, Container must be either an
	/// ObjCCategoryImplDecl or ObjCImplementationDecl; it should only be
	/// NULL when getting encodings for protocol properties.
	/// Property attributes are stored as a comma-delimited C string. The simple
	/// attributes readonly and bycopy are encoded as single characters. The
	/// parametrized attributes, getter=name, setter=name, and ivar=name, are
	/// encoded as single characters, followed by an identifier. Property types
	/// are also encoded as a parametrized attribute. The characters used to encode
	/// these attributes are defined by the following enumeration:
	/// @code
	/// enum PropertyAttributes {
	/// kPropertyReadOnly = 'R', // property is read-only.
	/// kPropertyBycopy = 'C', // property is a copy of the value last assigned
	/// kPropertyByref = '&', // property is a reference to the value last assigned
	/// kPropertyDynamic = 'D', // property is dynamic
	/// kPropertyGetter = 'G', // followed by getter selector name
	/// kPropertySetter = 'S', // followed by setter selector name
	/// kPropertyInstanceVariable = 'V' // followed by instance variable name
	/// kPropertyType = 'T' // followed by old-style type encoding.
	/// kPropertyWeak = 'W' // 'weak' property
	/// kPropertyStrong = 'P' // property GC'able
	/// kPropertyNonAtomic = 'N' // property non-atomic
	/// };
	/// @endcode
	std::string
	ASTContext::getObjCEncodingForPropertyDecl(const ObjCPropertyDecl *PD,
	const Decl *Container) const {
	// Collect information from the property implementation decl(s).
	bool Dynamic = false;
	ObjCPropertyImplDecl *SynthesizePID = nullptr;

	if (ObjCPropertyImplDecl *PropertyImpDecl =
	getObjCPropertyImplDeclForPropertyDecl(PD, Container)) {
	if (PropertyImpDecl->getPropertyImplementation() == ObjCPropertyImplDecl::Dynamic)
	Dynamic = true;
	else
	SynthesizePID = PropertyImpDecl;
	}

	// FIXME: This is not very efficient.
	std::string S = "T";

	// Encode result type.
	// GCC has some special rules regarding encoding of properties which
	// closely resembles encoding of ivars.
	getObjCEncodingForPropertyType(PD->getType(), S);

	if (PD->isReadOnly()) {
	S += ",R";
	if (PD->getPropertyAttributes() & ObjCPropertyDecl::OBJC_PR_copy)
	S += ",C";
	if (PD->getPropertyAttributes() & ObjCPropertyDecl::OBJC_PR_retain)
	S += ",&";
	if (PD->getPropertyAttributes() & ObjCPropertyDecl::OBJC_PR_weak)
	S += ",W";
	} else {
	switch (PD->getSetterKind()) {
	case ObjCPropertyDecl::Assign: break;
	case ObjCPropertyDecl::Copy: S += ",C"; break;
	case ObjCPropertyDecl::Retain: S += ",&"; break;
	case ObjCPropertyDecl::Weak: S += ",W"; break;
	}
	}

	// It really isn't clear at all what this means, since properties
	// are "dynamic by default".
	if (Dynamic)
	S += ",D";

	if (PD->getPropertyAttributes() & ObjCPropertyDecl::OBJC_PR_nonatomic)
	S += ",N";

	if (PD->getPropertyAttributes() & ObjCPropertyDecl::OBJC_PR_getter) {
	S += ",G";
	S += PD->getGetterName().getAsString();
	}

	if (PD->getPropertyAttributes() & ObjCPropertyDecl::OBJC_PR_setter) {
	S += ",S";
	S += PD->getSetterName().getAsString();
	}

	if (SynthesizePID) {
	const ObjCIvarDecl *OID = SynthesizePID->getPropertyIvarDecl();
	S += ",V";
	S += OID->getNameAsString();
	}

	// FIXME: OBJCGC: weak & strong
	return S;
	}

	/// getLegacyIntegralTypeEncoding -
	/// Another legacy compatibility encoding: 32-bit longs are encoded as
	/// 'l' or 'L' , but not always. For typedefs, we need to use
	/// 'i' or 'I' instead if encoding a struct field, or a pointer!
	///
	/// When \p PointeeTy is a typedef of a 32-bit 'unsigned long' or 'long', it
	/// is replaced in place by 'unsigned int' or 'int'; otherwise it is left
	/// unchanged.
	void ASTContext::getLegacyIntegralTypeEncoding (QualType &PointeeTy) const {
	  if (!isa<TypedefType>(PointeeTy.getTypePtr()))
	    return;

	  const BuiltinType *Builtin = PointeeTy->getAs<BuiltinType>();
	  if (!Builtin)
	    return;

	  const bool IsULong = Builtin->getKind() == BuiltinType::ULong;
	  if (!IsULong && Builtin->getKind() != BuiltinType::Long)
	    return;

	  // Only 32-bit longs get the int-style encoding.
	  if (getIntWidth(PointeeTy) != 32)
	    return;

	  if (IsULong)
	    PointeeTy = UnsignedIntTy;
	  else
	    PointeeTy = IntTy;
	}

	/// Appends the Objective-C encoding of \p T to \p S by forwarding to
	/// getObjCEncodingForTypeImpl with \p T as the outermost type, passing
	/// \p Field and \p NotEncodedT through.
	void ASTContext::getObjCEncodingForType(QualType T, std::string& S,
	                                        const FieldDecl *Field,
	                                        QualType *NotEncodedT) const {
	  // We follow the behavior of gcc, expanding structures which are
	  // directly pointed to, and expanding embedded structures. Note that
	  // these rules are sufficient to prevent recursive encoding of the
	  // same type.
	  getObjCEncodingForTypeImpl(T, S,
	                             /*ExpandPointedToStructures=*/true,
	                             /*ExpandStructures=*/true,
	                             Field,
	                             /*OutermostType=*/true,
	                             /*EncodingProperty=*/false,
	                             /*StructField=*/false,
	                             /*EncodeBlockParameters=*/false,
	                             /*EncodeClassNames=*/false,
	                             /*EncodePointerToObjCTypedef=*/false,
	                             NotEncodedT);
	}

	/// Appends the encoding of property type \p T to \p S by forwarding to
	/// getObjCEncodingForTypeImpl as an outermost type in property-encoding mode,
	/// with no field.
	void ASTContext::getObjCEncodingForPropertyType(QualType T,
	                                                std::string& S) const {
	  // Encode result type.
	  // GCC has some special rules regarding encoding of properties which
	  // closely resembles encoding of ivars.
	  getObjCEncodingForTypeImpl(T, S,
	                             /*ExpandPointedToStructures=*/true,
	                             /*ExpandStructures=*/true,
	                             /*FD=*/nullptr,
	                             /*OutermostType=*/true,
	                             /*EncodingProperty=*/true);
	}

	/// Returns the single-character Objective-C encoding for the builtin type
	/// kind \p kind.
	///
	/// Long and ULong depend on the target's long width obtained through \p C.
	/// Float16, Float128 and Half currently yield a space. The ObjC id/Class/SEL
	/// kinds, the OpenCL kinds, Dependent and placeholder kinds are not
	/// encodable here and hit llvm_unreachable.
	static char getObjCEncodingForPrimitiveKind(const ASTContext *C,
	                                            BuiltinType::Kind kind) {
	    switch (kind) {
	    case BuiltinType::Void:       return 'v';
	    case BuiltinType::Bool:       return 'B';
	    case BuiltinType::Char_U:
	    case BuiltinType::UChar:      return 'C';
	    case BuiltinType::Char16:
	    case BuiltinType::UShort:     return 'S';
	    case BuiltinType::Char32:
	    case BuiltinType::UInt:       return 'I';
	    case BuiltinType::ULong:
	        return C->getTargetInfo().getLongWidth() == 32 ? 'L' : 'Q';
	    case BuiltinType::UInt128:    return 'T';
	    case BuiltinType::ULongLong:  return 'Q';
	    case BuiltinType::Char_S:
	    case BuiltinType::SChar:      return 'c';
	    case BuiltinType::Short:      return 's';
	    case BuiltinType::WChar_S:
	    case BuiltinType::WChar_U:
	    case BuiltinType::Int:        return 'i';
	    case BuiltinType::Long:
	      return C->getTargetInfo().getLongWidth() == 32 ? 'l' : 'q';
	    case BuiltinType::LongLong:   return 'q';
	    case BuiltinType::Int128:     return 't';
	    case BuiltinType::Float:      return 'f';
	    case BuiltinType::Double:     return 'd';
	    case BuiltinType::LongDouble: return 'D';
	    case BuiltinType::NullPtr:    return '*'; // like char*

	    case BuiltinType::Float16:
	    case BuiltinType::Float128:
	    case BuiltinType::Half:
	      // FIXME: potentially need @encodes for these!
	      return ' ';

	    case BuiltinType::ObjCId:
	    case BuiltinType::ObjCClass:
	    case BuiltinType::ObjCSel:
	      llvm_unreachable("@encoding ObjC primitive type");

	    // OpenCL and placeholder types don't need @encodings.
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	    case BuiltinType::Id:
	#include "clang/Basic/OpenCLImageTypes.def"
	    case BuiltinType::OCLEvent:
	    case BuiltinType::OCLClkEvent:
	    case BuiltinType::OCLQueue:
	    case BuiltinType::OCLReserveID:
	    case BuiltinType::OCLSampler:
	    case BuiltinType::Dependent:
	#define BUILTIN_TYPE(KIND, ID)
	#define PLACEHOLDER_TYPE(KIND, ID) \
	    case BuiltinType::KIND:
	#include "clang/AST/BuiltinTypes.def"
	      llvm_unreachable("invalid builtin type for @encode");
	    }
	    llvm_unreachable("invalid BuiltinType::Kind value");
	}

	static char ObjCEncodingForEnumType(const ASTContext C, const EnumType ET) {
	EnumDecl *Enum = ET->getDecl();

	// The encoding of an non-fixed enum type is always 'i', regardless of size.
	if (!Enum->isFixed())
	return 'i';

	// The encoding of a fixed enum type matches its fixed underlying type.
	const BuiltinType *BT = Enum->getIntegerType()->castAs<BuiltinType>();
	return getObjCEncodingForPrimitiveKind(C, BT->getKind());
	}

	static void EncodeBitField(const ASTContext *Ctx, std::string& S,
	QualType T, const FieldDecl *FD) {
	assert(FD->isBitField() && "not a bitfield - getObjCEncodingForTypeImpl");
	S += 'b';
	// The NeXT runtime encodes bit fields as b followed by the number of bits.
	// The GNU runtime requires more information; bitfields are encoded as b,
	// then the offset (in bits) of the first element, then the type of the
	// bitfield, then the size in bits. For example, in this structure:
	//
	// struct
	// {
	// int integer;
	// int flags:2;
	// };
	// On a 32-bit system, the encoding for flags would be b2 for the NeXT
	// runtime, but b32i2 for the GNU runtime. The reason for this extra
	// information is not especially sensible, but we're stuck with it for
	// compatibility with GCC, although providing it breaks anything that
	// actually uses runtime introspection and wants to work on both runtimes...
	if (Ctx->getLangOpts().ObjCRuntime.isGNUFamily()) {
	uint64_t Offset;

	if (const auto *IVD = dyn_cast<ObjCIvarDecl>(FD)) {
	Offset = Ctx->lookupFieldBitOffset(IVD->getContainingInterface(), nullptr,
	IVD);
	} else {
	const RecordDecl *RD = FD->getParent();
	const ASTRecordLayout &RL = Ctx->getASTRecordLayout(RD);
	Offset = RL.getFieldOffset(FD->getFieldIndex());
	}

	S += llvm::utostr(Offset);

	if (const EnumType *ET = T->getAs<EnumType>())
	S += ObjCEncodingForEnumType(Ctx, ET);
	else {
	const BuiltinType *BT = T->castAs<BuiltinType>();
	S += getObjCEncodingForPrimitiveKind(Ctx, BT->getKind());
	}
	}
	S += llvm::utostr(FD->getBitWidthValue(*Ctx));
	}

	// FIXME: Use SmallString for accumulating string.
	void ASTContext::getObjCEncodingForTypeImpl(QualType T, std::string& S,
	bool ExpandPointedToStructures,
	bool ExpandStructures,
	const FieldDecl *FD,
	bool OutermostType,
	bool EncodingProperty,
	bool StructField,
	bool EncodeBlockParameters,
	bool EncodeClassNames,
	bool EncodePointerToObjCTypedef,
	QualType *NotEncodedT) const {
	CanQualType CT = getCanonicalType(T);
	switch (CT->getTypeClass()) {
	case Type::Builtin:
	case Type::Enum:
	if (FD && FD->isBitField())
	return EncodeBitField(this, S, T, FD);
	if (const BuiltinType *BT = dyn_cast<BuiltinType>(CT))
	S += getObjCEncodingForPrimitiveKind(this, BT->getKind());
	else
	S += ObjCEncodingForEnumType(this, cast<EnumType>(CT));
	return;

	case Type::Complex: {
	const ComplexType *CT = T->castAs<ComplexType>();
	S += 'j';
	getObjCEncodingForTypeImpl(CT->getElementType(), S, false, false, nullptr);
	return;
	}

	case Type::Atomic: {
	const AtomicType *AT = T->castAs<AtomicType>();
	S += 'A';
	getObjCEncodingForTypeImpl(AT->getValueType(), S, false, false, nullptr);
	return;
	}

	// encoding for pointer or reference types.
	case Type::Pointer:
	case Type::LValueReference:
	case Type::RValueReference: {
	QualType PointeeTy;
	if (isa<PointerType>(CT)) {
	const PointerType *PT = T->castAs<PointerType>();
	if (PT->isObjCSelType()) {
	S += ':';
	return;
	}
	PointeeTy = PT->getPointeeType();
	} else {
	PointeeTy = T->castAs<ReferenceType>()->getPointeeType();
	}

	bool isReadOnly = false;
	// For historical/compatibility reasons, the read-only qualifier of the
	// pointee gets emitted _before_ the '^'. The read-only qualifier of
	// the pointer itself gets ignored, _unless_ we are looking at a typedef!
	// Also, do not emit the 'r' for anything but the outermost type!
	if (isa<TypedefType>(T.getTypePtr())) {
	if (OutermostType && T.isConstQualified()) {
	isReadOnly = true;
	S += 'r';
	}
	} else if (OutermostType) {
	QualType P = PointeeTy;
	while (P->getAs<PointerType>())
	P = P->getAs<PointerType>()->getPointeeType();
	if (P.isConstQualified()) {
	isReadOnly = true;
	S += 'r';
	}
	}
	if (isReadOnly) {
	// Another legacy compatibility encoding. Some ObjC qualifier and type
	// combinations need to be rearranged.
	// Rewrite "in const" from "nr" to "rn"
	if (StringRef(S).endswith("nr"))
	S.replace(S.end()-2, S.end(), "rn");
	}

	if (PointeeTy->isCharType()) {
	// char pointer types should be encoded as '*' unless it is a
	// type that has been typedef'd to 'BOOL'.
	if (!isTypeTypedefedAsBOOL(PointeeTy)) {
	S += '*';
	return;
	}
	} else if (const RecordType *RTy = PointeeTy->getAs<RecordType>()) {
	// GCC binary compat: Need to convert "struct objc_class *" to "#".
	if (RTy->getDecl()->getIdentifier() == &Idents.get("objc_class")) {
	S += '#';
	return;
	}
	// GCC binary compat: Need to convert "struct objc_object *" to "@".
	if (RTy->getDecl()->getIdentifier() == &Idents.get("objc_object")) {
	S += '@';
	return;
	}
	// fall through...
	}
	S += '^';
	getLegacyIntegralTypeEncoding(PointeeTy);

	getObjCEncodingForTypeImpl(PointeeTy, S, false, ExpandPointedToStructures,
	nullptr, false, false, false, false, false, false,
	NotEncodedT);
	return;
	}

	case Type::ConstantArray:
	case Type::IncompleteArray:
	case Type::VariableArray: {
	const ArrayType *AT = cast<ArrayType>(CT);

	if (isa<IncompleteArrayType>(AT) && !StructField) {
	// Incomplete arrays are encoded as a pointer to the array element.
	S += '^';

	getObjCEncodingForTypeImpl(AT->getElementType(), S,
	false, ExpandStructures, FD);
	} else {
	S += '[';

	if (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(AT))
	S += llvm::utostr(CAT->getSize().getZExtValue());
	else {
	//Variable length arrays are encoded as a regular array with 0 elements.
	assert((isa<VariableArrayType>(AT) \|\| isa<IncompleteArrayType>(AT)) &&
	"Unknown array type!");
	S += '0';
	}

	getObjCEncodingForTypeImpl(AT->getElementType(), S,
	false, ExpandStructures, FD,
	false, false, false, false, false, false,
	NotEncodedT);
	S += ']';
	}
	return;
	}

	case Type::FunctionNoProto:
	case Type::FunctionProto:
	S += '?';
	return;

	case Type::Record: {
	RecordDecl *RDecl = cast<RecordType>(CT)->getDecl();
	S += RDecl->isUnion() ? '(' : '{';
	// Anonymous structures print as '?'
	if (const IdentifierInfo *II = RDecl->getIdentifier()) {
	S += II->getName();
	if (ClassTemplateSpecializationDecl *Spec
	= dyn_cast<ClassTemplateSpecializationDecl>(RDecl)) {
	const TemplateArgumentList &TemplateArgs = Spec->getTemplateArgs();
	llvm::raw_string_ostream OS(S);
	printTemplateArgumentList(OS, TemplateArgs.asArray(),
	getPrintingPolicy());
	}
	} else {
	S += '?';
	}
	if (ExpandStructures) {
	S += '=';
	if (!RDecl->isUnion()) {
	getObjCEncodingForStructureImpl(RDecl, S, FD, true, NotEncodedT);
	} else {
	for (const auto *Field : RDecl->fields()) {
	if (FD) {
	S += '"';
	S += Field->getNameAsString();
	S += '"';
	}

	// Special case bit-fields.
	if (Field->isBitField()) {
	getObjCEncodingForTypeImpl(Field->getType(), S, false, true,
	Field);
	} else {
	QualType qt = Field->getType();
	getLegacyIntegralTypeEncoding(qt);
	getObjCEncodingForTypeImpl(qt, S, false, true,
	FD, /OutermostType/false,
	/EncodingProperty/false,
	/StructField/true,
	false, false, false, NotEncodedT);
	}
	}
	}
	}
	S += RDecl->isUnion() ? ')' : '}';
	return;
	}

	case Type::BlockPointer: {
	const BlockPointerType *BT = T->castAs<BlockPointerType>();
	S += "@?"; // Unlike a pointer-to-function, which is "^?".
	if (EncodeBlockParameters) {
	const FunctionType *FT = BT->getPointeeType()->castAs<FunctionType>();

	S += '<';
	// Block return type
	getObjCEncodingForTypeImpl(
	FT->getReturnType(), S, ExpandPointedToStructures, ExpandStructures,
	FD, false /* OutermostType */, EncodingProperty,
	false /* StructField */, EncodeBlockParameters, EncodeClassNames, false,
	NotEncodedT);
	// Block self
	S += "@?";
	// Block parameters
	if (const FunctionProtoType *FPT = dyn_cast<FunctionProtoType>(FT)) {
	for (const auto &I : FPT->param_types())
	getObjCEncodingForTypeImpl(
	I, S, ExpandPointedToStructures, ExpandStructures, FD,
	false /* OutermostType */, EncodingProperty,
	false /* StructField */, EncodeBlockParameters, EncodeClassNames,
	false, NotEncodedT);
	}
	S += '>';
	}
	return;
	}

	case Type::ObjCObject: {
	// hack to match legacy encoding of id and Class
	QualType Ty = getObjCObjectPointerType(CT);
	if (Ty->isObjCIdType()) {
	S += "{objc_object=}";
	return;
	}
	else if (Ty->isObjCClassType()) {
	S += "{objc_class=}";
	return;
	}
	// TODO: Double check to make sure this intentially falls through.
	LLVM_FALLTHROUGH;
	}

	case Type::ObjCInterface: {
	// Ignore protocol qualifiers when mangling at this level.
	// @encode(class_name)
	ObjCInterfaceDecl *OI = T->castAs<ObjCObjectType>()->getInterface();
	S += '{';
	S += OI->getObjCRuntimeNameAsString();
	if (ExpandStructures) {
	S += '=';
	SmallVector<const ObjCIvarDecl*, 32> Ivars;
	DeepCollectObjCIvars(OI, true, Ivars);
	for (unsigned i = 0, e = Ivars.size(); i != e; ++i) {
	const FieldDecl *Field = cast<FieldDecl>(Ivars[i]);
	if (Field->isBitField())
	getObjCEncodingForTypeImpl(Field->getType(), S, false, true, Field);
	else
	getObjCEncodingForTypeImpl(Field->getType(), S, false, true, FD,
	false, false, false, false, false,
	EncodePointerToObjCTypedef,
	NotEncodedT);
	}
	}
	S += '}';
	return;
	}

	case Type::ObjCObjectPointer: {
	const ObjCObjectPointerType *OPT = T->castAs<ObjCObjectPointerType>();
	if (OPT->isObjCIdType()) {
	S += '@';
	return;
	}

	if (OPT->isObjCClassType() \|\| OPT->isObjCQualifiedClassType()) {
	// FIXME: Consider if we need to output qualifiers for 'Class<p>'.
	// Since this is a binary compatibility issue, need to consult with runtime
	// folks. Fortunately, this is a very obsure construct.
	S += '#';
	return;
	}

	if (OPT->isObjCQualifiedIdType()) {
	getObjCEncodingForTypeImpl(getObjCIdType(), S,
	ExpandPointedToStructures,
	ExpandStructures, FD);
	if (FD \|\| EncodingProperty \|\| EncodeClassNames) {
	// Note that we do extended encoding of protocol qualifer list
	// Only when doing ivar or property encoding.
	S += '"';
	for (const auto *I : OPT->quals()) {
	S += '<';
	S += I->getObjCRuntimeNameAsString();
	S += '>';
	}
	S += '"';
	}
	return;
	}

	QualType PointeeTy = OPT->getPointeeType();
	if (!EncodingProperty &&
	isa<TypedefType>(PointeeTy.getTypePtr()) &&
	!EncodePointerToObjCTypedef) {
	// Another historical/compatibility reason.
	// We encode the underlying type which comes out as
	// {...};
	S += '^';
	if (FD && OPT->getInterfaceDecl()) {
	// Prevent recursive encoding of fields in some rare cases.
	ObjCInterfaceDecl *OI = OPT->getInterfaceDecl();
	SmallVector<const ObjCIvarDecl*, 32> Ivars;
	DeepCollectObjCIvars(OI, true, Ivars);
	for (unsigned i = 0, e = Ivars.size(); i != e; ++i) {
	if (cast<FieldDecl>(Ivars[i]) == FD) {
	S += '{';
	S += OI->getObjCRuntimeNameAsString();
	S += '}';
	return;
	}
	}
	}
	getObjCEncodingForTypeImpl(PointeeTy, S,
	false, ExpandPointedToStructures,
	nullptr,
	false, false, false, false, false,
	/EncodePointerToObjCTypedef/true);
	return;
	}

	S += '@';
	if (OPT->getInterfaceDecl() &&
	(FD \|\| EncodingProperty \|\| EncodeClassNames)) {
	S += '"';
	S += OPT->getInterfaceDecl()->getObjCRuntimeNameAsString();
	for (const auto *I : OPT->quals()) {
	S += '<';
	S += I->getObjCRuntimeNameAsString();
	S += '>';
	}
	S += '"';
	}
	return;
	}

	// gcc just blithely ignores member pointers.
	// FIXME: we shoul do better than that. 'M' is available.
	case Type::MemberPointer:
	// This matches gcc's encoding, even though technically it is insufficient.
	//FIXME. We should do a better job than gcc.
	case Type::Vector:
	case Type::ExtVector:
	// Until we have a coherent encoding of these three types, issue warning.
	if (NotEncodedT)
	*NotEncodedT = T;
	return;

	// We could see an undeduced auto type here during error recovery.
	// Just ignore it.
	case Type::Auto:
	case Type::DeducedTemplateSpecialization:
	return;

	case Type::Pipe:
	#define ABSTRACT_TYPE(KIND, BASE)
	#define TYPE(KIND, BASE)
	#define DEPENDENT_TYPE(KIND, BASE) \
	case Type::KIND:
	#define NON_CANONICAL_TYPE(KIND, BASE) \
	case Type::KIND:
	#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(KIND, BASE) \
	case Type::KIND:
	#include "clang/AST/TypeNodes.def"
	llvm_unreachable("@encode for dependent type!");
	}
	llvm_unreachable("bad type kind!");
	}

	/// Appends to \p S the Objective-C encoding of the members of the non-union
	/// record \p RDecl.
	///
	/// Non-empty non-virtual bases, fields and (when \p includeVBases is true)
	/// non-empty virtual bases are collected into a multimap keyed by bit offset
	/// and then encoded in increasing offset order. Nothing is appended when the
	/// record has no definition or its definition is invalid.
	///
	/// \param RDecl         Record to encode; must be non-null and not a union.
	/// \param S             String the encoding is appended to.
	/// \param FD            When non-null, each field (and the vptr slot) is
	///                      preceded by its quoted name.
	/// \param includeVBases Whether virtual bases are laid out in this record.
	/// \param NotEncodedT   Forwarded unchanged to the nested encoding calls.
	void ASTContext::getObjCEncodingForStructureImpl(RecordDecl *RDecl,
	                                                 std::string &S,
	                                                 const FieldDecl *FD,
	                                                 bool includeVBases,
	                                                 QualType *NotEncodedT) const {
	  assert(RDecl && "Expected non-null RecordDecl");
	  assert(!RDecl->isUnion() && "Should not be called for unions");
	  if (!RDecl->getDefinition() || RDecl->getDefinition()->isInvalidDecl())
	    return;

	  CXXRecordDecl *CXXRec = dyn_cast<CXXRecordDecl>(RDecl);
	  std::multimap<uint64_t, NamedDecl *> FieldOrBaseOffsets;
	  const ASTRecordLayout &layout = getASTRecordLayout(RDecl);

	  // Non-virtual, non-empty bases, keyed by their bit offset.
	  if (CXXRec) {
	    for (const auto &BI : CXXRec->bases()) {
	      if (!BI.isVirtual()) {
	        CXXRecordDecl *base = BI.getType()->getAsCXXRecordDecl();
	        if (base->isEmpty())
	          continue;
	        uint64_t offs = toBits(layout.getBaseClassOffset(base));
	        FieldOrBaseOffsets.insert(FieldOrBaseOffsets.upper_bound(offs),
	                                  std::make_pair(offs, base));
	      }
	    }
	  }

	  // Fields, keyed by the offset the layout reports for their index.
	  unsigned i = 0;
	  for (auto *Field : RDecl->fields()) {
	    uint64_t offs = layout.getFieldOffset(i);
	    FieldOrBaseOffsets.insert(FieldOrBaseOffsets.upper_bound(offs),
	                              std::make_pair(offs, Field));
	    ++i;
	  }

	  // Non-empty virtual bases that lie at or past the non-virtual size and do
	  // not share an offset with an entry that is already recorded.
	  if (CXXRec && includeVBases) {
	    for (const auto &BI : CXXRec->vbases()) {
	      CXXRecordDecl *base = BI.getType()->getAsCXXRecordDecl();
	      if (base->isEmpty())
	        continue;
	      uint64_t offs = toBits(layout.getVBaseClassOffset(base));
	      if (offs >= uint64_t(toBits(layout.getNonVirtualSize())) &&
	          FieldOrBaseOffsets.find(offs) == FieldOrBaseOffsets.end())
	        FieldOrBaseOffsets.insert(FieldOrBaseOffsets.end(),
	                                  std::make_pair(offs, base));
	    }
	  }

	  CharUnits size;
	  if (CXXRec) {
	    size = includeVBases ? layout.getSize() : layout.getNonVirtualSize();
	  } else {
	    size = layout.getSize();
	  }

	#ifndef NDEBUG
	  uint64_t CurOffs = 0;
	#endif
	  std::multimap<uint64_t, NamedDecl *>::iterator
	    CurLayObj = FieldOrBaseOffsets.begin();

	  // A dynamic class with nothing recorded at offset 0 gets an explicit
	  // vptr slot, encoded as "^^?".
	  if (CXXRec && CXXRec->isDynamicClass() &&
	      (CurLayObj == FieldOrBaseOffsets.end() || CurLayObj->first != 0)) {
	    if (FD) {
	      S += "\"_vptr$";
	      std::string recname = CXXRec->getNameAsString();
	      if (recname.empty()) recname = "?";
	      S += recname;
	      S += '"';
	    }
	    S += "^^?";
	#ifndef NDEBUG
	    CurOffs += getTypeSize(VoidPtrTy);
	#endif
	  }

	  if (!RDecl->hasFlexibleArrayMember()) {
	    // Mark the end of the structure.
	    uint64_t offs = toBits(size);
	    FieldOrBaseOffsets.insert(FieldOrBaseOffsets.upper_bound(offs),
	                              std::make_pair(offs, nullptr));
	  }

	  for (; CurLayObj != FieldOrBaseOffsets.end(); ++CurLayObj) {
	#ifndef NDEBUG
	    assert(CurOffs <= CurLayObj->first);
	    if (CurOffs < CurLayObj->first) {
	      uint64_t padding = CurLayObj->first - CurOffs;
	      // FIXME: There doesn't seem to be a way to indicate in the encoding that
	      // packing/alignment of members is different than normal, in which case
	      // the encoding will be out-of-sync with the real layout.
	      // If the runtime switches to just consider the size of types without
	      // taking into account alignment, we could make padding explicit in the
	      // encoding (e.g. using arrays of chars). The encoding strings would be
	      // longer then though.
	      CurOffs += padding;
	    }
	#endif

	    NamedDecl *dcl = CurLayObj->second;
	    if (!dcl)
	      break; // reached end of structure.

	    if (CXXRecordDecl *base = dyn_cast<CXXRecordDecl>(dcl)) {
	      // We expand the bases without their virtual bases since those are going
	      // in the initial structure. Note that this differs from gcc which
	      // expands virtual bases each time one is encountered in the hierarchy,
	      // making the encoding type bigger than it really is.
	      getObjCEncodingForStructureImpl(base, S, FD, /*includeVBases*/false,
	                                      NotEncodedT);
	      assert(!base->isEmpty());
	#ifndef NDEBUG
	      CurOffs += toBits(getASTRecordLayout(base).getNonVirtualSize());
	#endif
	    } else {
	      FieldDecl *field = cast<FieldDecl>(dcl);
	      if (FD) {
	        S += '"';
	        S += field->getNameAsString();
	        S += '"';
	      }

	      if (field->isBitField()) {
	        EncodeBitField(this, S, field->getType(), field);
	#ifndef NDEBUG
	        CurOffs += field->getBitWidthValue(*this);
	#endif
	      } else {
	        QualType qt = field->getType();
	        getLegacyIntegralTypeEncoding(qt);
	        getObjCEncodingForTypeImpl(qt, S, false, true, FD,
	                                   /*OutermostType*/false,
	                                   /*EncodingProperty*/false,
	                                   /*StructField*/true,
	                                   false, false, false, NotEncodedT);
	#ifndef NDEBUG
	        CurOffs += getTypeSize(field->getType());
	#endif
	      }
	    }
	  }
	}

	/// Appends to \p S one letter for every Objective-C declaration qualifier
	/// set in \p QT, always in the order in, inout, out, bycopy, byref, oneway
	/// ('n', 'N', 'o', 'O', 'R', 'V').
	void ASTContext::getObjCEncodingForTypeQualifier(Decl::ObjCDeclQualifier QT,
	                                                 std::string& S) const {
	  static const struct {
	    Decl::ObjCDeclQualifier Flag;
	    char Code;
	  } QualifierCodes[] = {
	    { Decl::OBJC_TQ_In,     'n' },
	    { Decl::OBJC_TQ_Inout,  'N' },
	    { Decl::OBJC_TQ_Out,    'o' },
	    { Decl::OBJC_TQ_Bycopy, 'O' },
	    { Decl::OBJC_TQ_Byref,  'R' },
	    { Decl::OBJC_TQ_Oneway, 'V' },
	  };

	  // The table order is the emission order.
	  for (const auto &Entry : QualifierCodes) {
	    if (QT & Entry.Flag)
	      S += Entry.Code;
	  }
	}

	/// Returns the implicit typedef named "id", building and caching it on the
	/// first call.
	TypedefDecl *ASTContext::getObjCIdDecl() const {
	  if (ObjCIdDecl)
	    return ObjCIdDecl;

	  // "id" is an object pointer to the builtin id object type.
	  QualType IdObjectTy = getObjCObjectType(ObjCBuiltinIdTy, {}, {});
	  QualType IdPointerTy = getObjCObjectPointerType(IdObjectTy);
	  ObjCIdDecl = buildImplicitTypedef(IdPointerTy, "id");
	  return ObjCIdDecl;
	}

	/// Returns the implicit typedef named "SEL" (a pointer to the builtin
	/// selector type), building and caching it on the first call.
	TypedefDecl *ASTContext::getObjCSelDecl() const {
	  if (ObjCSelDecl)
	    return ObjCSelDecl;

	  QualType SelPointerTy = getPointerType(ObjCBuiltinSelTy);
	  ObjCSelDecl = buildImplicitTypedef(SelPointerTy, "SEL");
	  return ObjCSelDecl;
	}

	/// Returns the implicit typedef named "Class", building and caching it on
	/// the first call.
	TypedefDecl *ASTContext::getObjCClassDecl() const {
	  if (ObjCClassDecl)
	    return ObjCClassDecl;

	  // "Class" is an object pointer to the builtin Class object type.
	  QualType ClassObjectTy = getObjCObjectType(ObjCBuiltinClassTy, {}, {});
	  QualType ClassPointerTy = getObjCObjectPointerType(ClassObjectTy);
	  ObjCClassDecl = buildImplicitTypedef(ClassPointerTy, "Class");
	  return ObjCClassDecl;
	}

	/// Returns the interface declaration named "Protocol", creating it in the
	/// translation unit and caching it on the first call.
	ObjCInterfaceDecl *ASTContext::getObjCProtocolDecl() const {
	  if (!ObjCProtocolClassDecl) {
	    ObjCProtocolClassDecl
	      = ObjCInterfaceDecl::Create(*this, getTranslationUnitDecl(),
	                                  SourceLocation(),
	                                  &Idents.get("Protocol"),
	                                  /*typeParamList=*/nullptr,
	                                  /*PrevDecl=*/nullptr,
	                                  SourceLocation(), true);
	  }

	  return ObjCProtocolClassDecl;
	}

	//===----------------------------------------------------------------------===//
	// __builtin_va_list Construction Functions
	//===----------------------------------------------------------------------===//

	/// Builds an implicit typedef called \p Name whose underlying type is
	/// `char *`.
	static TypedefDecl *CreateCharPtrNamedVaListDecl(const ASTContext *Context,
	                                                 StringRef Name) {
	  // typedef char* __builtin[_ms]_va_list;
	  QualType T = Context->getPointerType(Context->CharTy);
	  return Context->buildImplicitTypedef(T, Name);
	}

	/// Builds the `char *` typedef named "__builtin_ms_va_list".
	static TypedefDecl *CreateMSVaListDecl(const ASTContext *Context) {
	  return CreateCharPtrNamedVaListDecl(Context, "__builtin_ms_va_list");
	}

	/// Builds the `char *` typedef named "__builtin_va_list".
	static TypedefDecl *CreateCharPtrBuiltinVaListDecl(const ASTContext *Context) {
	  return CreateCharPtrNamedVaListDecl(Context, "__builtin_va_list");
	}

	/// Builds the `void *` typedef named "__builtin_va_list".
	static TypedefDecl *CreateVoidPtrBuiltinVaListDecl(const ASTContext *Context) {
	  // typedef void* __builtin_va_list;
	  QualType T = Context->getPointerType(Context->VoidTy);
	  return Context->buildImplicitTypedef(T, "__builtin_va_list");
	}

	/// Builds the AArch64 "__builtin_va_list": an implicit record `__va_list`
	/// (placed in an implicit namespace `std` when compiling C++) with five
	/// public fields, plus a typedef of that record. The record is also stored
	/// in Context->VaListTagDecl.
	static TypedefDecl *
	CreateAArch64ABIBuiltinVaListDecl(const ASTContext *Context) {
	  // struct __va_list
	  RecordDecl *VaListTagDecl = Context->buildImplicitRecord("__va_list");
	  if (Context->getLangOpts().CPlusPlus) {
	    // namespace std { struct __va_list {
	    NamespaceDecl *NS;
	    NS = NamespaceDecl::Create(const_cast<ASTContext &>(*Context),
	                               Context->getTranslationUnitDecl(),
	                               /*Inline*/ false, SourceLocation(),
	                               SourceLocation(), &Context->Idents.get("std"),
	                               /*PrevDecl*/ nullptr);
	    NS->setImplicit();
	    VaListTagDecl->setDeclContext(NS);
	  }

	  VaListTagDecl->startDefinition();

	  const size_t NumFields = 5;
	  QualType FieldTypes[NumFields];
	  const char *FieldNames[NumFields];

	  // void *__stack;
	  FieldTypes[0] = Context->getPointerType(Context->VoidTy);
	  FieldNames[0] = "__stack";

	  // void *__gr_top;
	  FieldTypes[1] = Context->getPointerType(Context->VoidTy);
	  FieldNames[1] = "__gr_top";

	  // void *__vr_top;
	  FieldTypes[2] = Context->getPointerType(Context->VoidTy);
	  FieldNames[2] = "__vr_top";

	  // int __gr_offs;
	  FieldTypes[3] = Context->IntTy;
	  FieldNames[3] = "__gr_offs";

	  // int __vr_offs;
	  FieldTypes[4] = Context->IntTy;
	  FieldNames[4] = "__vr_offs";

	  // Create fields
	  for (unsigned i = 0; i < NumFields; ++i) {
	    FieldDecl *Field = FieldDecl::Create(const_cast<ASTContext &>(*Context),
	                                         VaListTagDecl,
	                                         SourceLocation(),
	                                         SourceLocation(),
	                                         &Context->Idents.get(FieldNames[i]),
	                                         FieldTypes[i], /*TInfo=*/nullptr,
	                                         /*BitWidth=*/nullptr,
	                                         /*Mutable=*/false,
	                                         ICIS_NoInit);
	    Field->setAccess(AS_public);
	    VaListTagDecl->addDecl(Field);
	  }
	  VaListTagDecl->completeDefinition();
	  Context->VaListTagDecl = VaListTagDecl;
	  QualType VaListTagType = Context->getRecordType(VaListTagDecl);

	  // } __builtin_va_list;
	  return Context->buildImplicitTypedef(VaListTagType, "__builtin_va_list");
	}

	/// Builds the Power ABI "__builtin_va_list": an implicit record
	/// `__va_list_tag` with five public fields, a typedef `__va_list_tag` of that
	/// record, and a typedef of a one-element array of the latter. The record is
	/// also stored in Context->VaListTagDecl.
	static TypedefDecl *CreatePowerABIBuiltinVaListDecl(const ASTContext *Context) {
	  // typedef struct __va_list_tag {
	  RecordDecl *VaListTagDecl;

	  VaListTagDecl = Context->buildImplicitRecord("__va_list_tag");
	  VaListTagDecl->startDefinition();

	  const size_t NumFields = 5;
	  QualType FieldTypes[NumFields];
	  const char *FieldNames[NumFields];

	  // unsigned char gpr;
	  FieldTypes[0] = Context->UnsignedCharTy;
	  FieldNames[0] = "gpr";

	  // unsigned char fpr;
	  FieldTypes[1] = Context->UnsignedCharTy;
	  FieldNames[1] = "fpr";

	  // unsigned short reserved;
	  FieldTypes[2] = Context->UnsignedShortTy;
	  FieldNames[2] = "reserved";

	  // void* overflow_arg_area;
	  FieldTypes[3] = Context->getPointerType(Context->VoidTy);
	  FieldNames[3] = "overflow_arg_area";

	  // void* reg_save_area;
	  FieldTypes[4] = Context->getPointerType(Context->VoidTy);
	  FieldNames[4] = "reg_save_area";

	  // Create fields
	  for (unsigned i = 0; i < NumFields; ++i) {
	    FieldDecl *Field = FieldDecl::Create(*Context, VaListTagDecl,
	                                         SourceLocation(),
	                                         SourceLocation(),
	                                         &Context->Idents.get(FieldNames[i]),
	                                         FieldTypes[i], /*TInfo=*/nullptr,
	                                         /*BitWidth=*/nullptr,
	                                         /*Mutable=*/false,
	                                         ICIS_NoInit);
	    Field->setAccess(AS_public);
	    VaListTagDecl->addDecl(Field);
	  }
	  VaListTagDecl->completeDefinition();
	  Context->VaListTagDecl = VaListTagDecl;
	  QualType VaListTagType = Context->getRecordType(VaListTagDecl);

	  // } __va_list_tag;
	  TypedefDecl *VaListTagTypedefDecl =
	      Context->buildImplicitTypedef(VaListTagType, "__va_list_tag");

	  QualType VaListTagTypedefType =
	    Context->getTypedefType(VaListTagTypedefDecl);

	  // typedef __va_list_tag __builtin_va_list[1];
	  llvm::APInt Size(Context->getTypeSize(Context->getSizeType()), 1);
	  QualType VaListTagArrayType
	    = Context->getConstantArrayType(VaListTagTypedefType,
	                                    Size, ArrayType::Normal, 0);
	  return Context->buildImplicitTypedef(VaListTagArrayType, "__builtin_va_list");
	}

	/// Builds the x86-64 ABI "__builtin_va_list": an implicit record
	/// `__va_list_tag` with four public fields and a typedef of a one-element
	/// array of that record. The record is also stored in
	/// Context->VaListTagDecl.
	static TypedefDecl *
	CreateX86_64ABIBuiltinVaListDecl(const ASTContext *Context) {
	  // struct __va_list_tag {
	  RecordDecl *VaListTagDecl;
	  VaListTagDecl = Context->buildImplicitRecord("__va_list_tag");
	  VaListTagDecl->startDefinition();

	  const size_t NumFields = 4;
	  QualType FieldTypes[NumFields];
	  const char *FieldNames[NumFields];

	  // unsigned gp_offset;
	  FieldTypes[0] = Context->UnsignedIntTy;
	  FieldNames[0] = "gp_offset";

	  // unsigned fp_offset;
	  FieldTypes[1] = Context->UnsignedIntTy;
	  FieldNames[1] = "fp_offset";

	  // void* overflow_arg_area;
	  FieldTypes[2] = Context->getPointerType(Context->VoidTy);
	  FieldNames[2] = "overflow_arg_area";

	  // void* reg_save_area;
	  FieldTypes[3] = Context->getPointerType(Context->VoidTy);
	  FieldNames[3] = "reg_save_area";

	  // Create fields
	  for (unsigned i = 0; i < NumFields; ++i) {
	    FieldDecl *Field = FieldDecl::Create(const_cast<ASTContext &>(*Context),
	                                         VaListTagDecl,
	                                         SourceLocation(),
	                                         SourceLocation(),
	                                         &Context->Idents.get(FieldNames[i]),
	                                         FieldTypes[i], /*TInfo=*/nullptr,
	                                         /*BitWidth=*/nullptr,
	                                         /*Mutable=*/false,
	                                         ICIS_NoInit);
	    Field->setAccess(AS_public);
	    VaListTagDecl->addDecl(Field);
	  }
	  VaListTagDecl->completeDefinition();
	  Context->VaListTagDecl = VaListTagDecl;
	  QualType VaListTagType = Context->getRecordType(VaListTagDecl);

	  // };

	  // typedef struct __va_list_tag __builtin_va_list[1];
	  llvm::APInt Size(Context->getTypeSize(Context->getSizeType()), 1);
	  QualType VaListTagArrayType =
	      Context->getConstantArrayType(VaListTagType, Size, ArrayType::Normal, 0);
	  return Context->buildImplicitTypedef(VaListTagArrayType, "__builtin_va_list");
	}

	/// Builds the PNaCl "__builtin_va_list": a typedef of `int[4]`.
	static TypedefDecl *CreatePNaClABIBuiltinVaListDecl(const ASTContext *Context) {
	  // typedef int __builtin_va_list[4];
	  llvm::APInt Size(Context->getTypeSize(Context->getSizeType()), 4);
	  QualType IntArrayType =
	      Context->getConstantArrayType(Context->IntTy, Size, ArrayType::Normal, 0);
	  return Context->buildImplicitTypedef(IntArrayType, "__builtin_va_list");
	}

	/// Builds the AAPCS "__builtin_va_list": an implicit record `__va_list`
	/// (placed in an implicit namespace `std` when compiling C++) holding a
	/// single public `void *__ap` field, plus a typedef of that record. The
	/// record is also stored in Context->VaListTagDecl.
	static TypedefDecl *
	CreateAAPCSABIBuiltinVaListDecl(const ASTContext *Context) {
	  // struct __va_list
	  RecordDecl *VaListDecl = Context->buildImplicitRecord("__va_list");
	  if (Context->getLangOpts().CPlusPlus) {
	    // namespace std { struct __va_list {
	    NamespaceDecl *NS;
	    NS = NamespaceDecl::Create(const_cast<ASTContext &>(*Context),
	                               Context->getTranslationUnitDecl(),
	                               /*Inline*/false, SourceLocation(),
	                               SourceLocation(), &Context->Idents.get("std"),
	                               /*PrevDecl*/ nullptr);
	    NS->setImplicit();
	    VaListDecl->setDeclContext(NS);
	  }

	  VaListDecl->startDefinition();

	  // void * __ap;
	  FieldDecl *Field = FieldDecl::Create(const_cast<ASTContext &>(*Context),
	                                       VaListDecl,
	                                       SourceLocation(),
	                                       SourceLocation(),
	                                       &Context->Idents.get("__ap"),
	                                       Context->getPointerType(Context->VoidTy),
	                                       /*TInfo=*/nullptr,
	                                       /*BitWidth=*/nullptr,
	                                       /*Mutable=*/false,
	                                       ICIS_NoInit);
	  Field->setAccess(AS_public);
	  VaListDecl->addDecl(Field);

	  // };
	  VaListDecl->completeDefinition();
	  Context->VaListTagDecl = VaListDecl;

	  // typedef struct __va_list __builtin_va_list;
	  QualType T = Context->getRecordType(VaListDecl);
	  return Context->buildImplicitTypedef(T, "__builtin_va_list");
	}

	/// Builds the SystemZ "__builtin_va_list": an implicit record
	/// `__va_list_tag` with four public fields and a typedef of a one-element
	/// array of that record. The record is also stored in
	/// Context->VaListTagDecl.
	static TypedefDecl *
	CreateSystemZBuiltinVaListDecl(const ASTContext *Context) {
	  // struct __va_list_tag {
	  RecordDecl *VaListTagDecl;
	  VaListTagDecl = Context->buildImplicitRecord("__va_list_tag");
	  VaListTagDecl->startDefinition();

	  const size_t NumFields = 4;
	  QualType FieldTypes[NumFields];
	  const char *FieldNames[NumFields];

	  // long __gpr;
	  FieldTypes[0] = Context->LongTy;
	  FieldNames[0] = "__gpr";

	  // long __fpr;
	  FieldTypes[1] = Context->LongTy;
	  FieldNames[1] = "__fpr";

	  // void *__overflow_arg_area;
	  FieldTypes[2] = Context->getPointerType(Context->VoidTy);
	  FieldNames[2] = "__overflow_arg_area";

	  // void *__reg_save_area;
	  FieldTypes[3] = Context->getPointerType(Context->VoidTy);
	  FieldNames[3] = "__reg_save_area";

	  // Create fields
	  for (unsigned i = 0; i < NumFields; ++i) {
	    FieldDecl *Field = FieldDecl::Create(const_cast<ASTContext &>(*Context),
	                                         VaListTagDecl,
	                                         SourceLocation(),
	                                         SourceLocation(),
	                                         &Context->Idents.get(FieldNames[i]),
	                                         FieldTypes[i], /*TInfo=*/nullptr,
	                                         /*BitWidth=*/nullptr,
	                                         /*Mutable=*/false,
	                                         ICIS_NoInit);
	    Field->setAccess(AS_public);
	    VaListTagDecl->addDecl(Field);
	  }
	  VaListTagDecl->completeDefinition();
	  Context->VaListTagDecl = VaListTagDecl;
	  QualType VaListTagType = Context->getRecordType(VaListTagDecl);

	  // };

	  // typedef __va_list_tag __builtin_va_list[1];
	  llvm::APInt Size(Context->getTypeSize(Context->getSizeType()), 1);
	  QualType VaListTagArrayType =
	      Context->getConstantArrayType(VaListTagType, Size, ArrayType::Normal, 0);

	  return Context->buildImplicitTypedef(VaListTagArrayType, "__builtin_va_list");
	}

	/// Dispatches on \p Kind to the matching Create*BuiltinVaListDecl helper and
	/// returns the typedef it builds. Reaching the end of the switch without a
	/// match is treated as unreachable.
	static TypedefDecl *CreateVaListDecl(const ASTContext *Context,
	                                     TargetInfo::BuiltinVaListKind Kind) {
	  switch (Kind) {
	  case TargetInfo::CharPtrBuiltinVaList:
	    return CreateCharPtrBuiltinVaListDecl(Context);
	  case TargetInfo::VoidPtrBuiltinVaList:
	    return CreateVoidPtrBuiltinVaListDecl(Context);
	  case TargetInfo::AArch64ABIBuiltinVaList:
	    return CreateAArch64ABIBuiltinVaListDecl(Context);
	  case TargetInfo::PowerABIBuiltinVaList:
	    return CreatePowerABIBuiltinVaListDecl(Context);
	  case TargetInfo::X86_64ABIBuiltinVaList:
	    return CreateX86_64ABIBuiltinVaListDecl(Context);
	  case TargetInfo::PNaClABIBuiltinVaList:
	    return CreatePNaClABIBuiltinVaListDecl(Context);
	  case TargetInfo::AAPCSABIBuiltinVaList:
	    return CreateAAPCSABIBuiltinVaListDecl(Context);
	  case TargetInfo::SystemZBuiltinVaList:
	    return CreateSystemZBuiltinVaListDecl(Context);
	  }

	  llvm_unreachable("Unhandled __builtin_va_list type kind");
	}

	/// Returns the "__builtin_va_list" typedef for the current target, creating
	/// and caching it on the first call.
	TypedefDecl *ASTContext::getBuiltinVaListDecl() const {
	  if (BuiltinVaListDecl)
	    return BuiltinVaListDecl;

	  // The target decides which flavour of va_list gets built.
	  BuiltinVaListDecl = CreateVaListDecl(this, Target->getBuiltinVaListKind());
	  assert(BuiltinVaListDecl->isImplicit());
	  return BuiltinVaListDecl;
	}

	/// Returns VaListTagDecl. If it is not set yet, the __builtin_va_list
	/// declaration is built first, since building it is what sets the tag
	/// declaration for the record-based va_list kinds.
	Decl *ASTContext::getVaListTagDecl() const {
	  if (!VaListTagDecl) {
	    // Only the side effect matters here; the typedef itself is not needed.
	    (void)getBuiltinVaListDecl();
	  }

	  return VaListTagDecl;
	}

	/// Returns the "__builtin_ms_va_list" typedef, creating and caching it on
	/// the first call.
	TypedefDecl *ASTContext::getBuiltinMSVaListDecl() const {
	  if (BuiltinMSVaListDecl)
	    return BuiltinMSVaListDecl;

	  BuiltinMSVaListDecl = CreateMSVaListDecl(this);
	  return BuiltinMSVaListDecl;
	}

	/// Records the interface type of \p Decl as the constant-string type.
	/// Asserts that no constant-string type has been recorded before.
	void ASTContext::setObjCConstantStringInterface(ObjCInterfaceDecl *Decl) {
	  assert(ObjCConstantStringType.isNull() &&
	         "'NSConstantString' type already set!");

	  QualType ConstantStringTy = getObjCInterfaceType(Decl);
	  ObjCConstantStringType = ConstantStringTy;
	}

	/// \brief Retrieve the template name that corresponds to a non-empty
	/// lookup.
	///
	/// Allocates an OverloadedTemplateStorage followed by room for one
	/// declaration pointer per element of [Begin, End) and copies the
	/// declarations into it. The range must contain more than one declaration,
	/// each of which is asserted to be a function template or a using-shadow
	/// declaration of one.
	TemplateName
	ASTContext::getOverloadedTemplateName(UnresolvedSetIterator Begin,
	                                      UnresolvedSetIterator End) const {
	  unsigned size = End - Begin;
	  assert(size > 1 && "set is not overloaded!");

	  void *memory = Allocate(sizeof(OverloadedTemplateStorage) +
	                          size * sizeof(FunctionTemplateDecl*));
	  OverloadedTemplateStorage *OT = new(memory) OverloadedTemplateStorage(size);

	  NamedDecl **Storage = OT->getStorage();
	  for (UnresolvedSetIterator I = Begin; I != End; ++I) {
	    NamedDecl *D = *I;
	    assert(isa<FunctionTemplateDecl>(D) ||
	           (isa<UsingShadowDecl>(D) &&
	            isa<FunctionTemplateDecl>(D->getUnderlyingDecl())));
	    *Storage++ = D;
	  }

	  return TemplateName(OT);
	}

	/// \brief Retrieve the template name that represents a qualified
	/// template name such as \c std::vector.
	///
	/// Names are uniqued through QualifiedTemplateNames: an existing node with
	/// the same profile is reused, otherwise a new one is allocated and inserted.
	TemplateName
	ASTContext::getQualifiedTemplateName(NestedNameSpecifier *NNS,
	                                     bool TemplateKeyword,
	                                     TemplateDecl *Template) const {
	  assert(NNS && "Missing nested-name-specifier in qualified template name");

	  // FIXME: Canonicalization?
	  llvm::FoldingSetNodeID Key;
	  QualifiedTemplateName::Profile(Key, NNS, TemplateKeyword, Template);

	  void *Where = nullptr;
	  if (QualifiedTemplateName *Existing =
	          QualifiedTemplateNames.FindNodeOrInsertPos(Key, Where))
	    return TemplateName(Existing);

	  QualifiedTemplateName *Created =
	      new (*this, alignof(QualifiedTemplateName))
	          QualifiedTemplateName(NNS, TemplateKeyword, Template);
	  QualifiedTemplateNames.InsertNode(Created, Where);
	  return TemplateName(Created);
	}

	/// \brief Retrieve the template name that represents a dependent
	/// template name such as \c MetaFun::template apply.
	///
	/// Names are uniqued through DependentTemplateNames. When \p NNS is not
	/// already canonical, the name built on the canonical specifier is obtained
	/// first and recorded as the canonical form of the new node.
	TemplateName
	ASTContext::getDependentTemplateName(NestedNameSpecifier *NNS,
	                                     const IdentifierInfo *Name) const {
	  assert((!NNS || NNS->isDependent()) &&
	         "Nested name specifier must be dependent");

	  llvm::FoldingSetNodeID ID;
	  DependentTemplateName::Profile(ID, NNS, Name);

	  void *InsertPos = nullptr;
	  DependentTemplateName *QTN =
	    DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos);

	  if (QTN)
	    return TemplateName(QTN);

	  NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS);
	  if (CanonNNS == NNS) {
	    QTN = new (*this, alignof(DependentTemplateName))
	        DependentTemplateName(NNS, Name);
	  } else {
	    TemplateName Canon = getDependentTemplateName(CanonNNS, Name);
	    QTN = new (*this, alignof(DependentTemplateName))
	        DependentTemplateName(NNS, Name, Canon);
	    // Look the ID up again after the recursive call so that InsertPos is
	    // recomputed before the insertion below; the node must still be absent.
	    DependentTemplateName *CheckQTN =
	      DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!CheckQTN && "Dependent template name canonicalization broken");
	    (void)CheckQTN;
	  }

	  DependentTemplateNames.InsertNode(QTN, InsertPos);
	  return TemplateName(QTN);
	}

	/// \brief Retrieve the template name that represents a dependent
	/// template name such as \c MetaFun::template operator+.
	///
	/// Names are uniqued through DependentTemplateNames. When \p NNS is not
	/// already canonical, the name built on the canonical specifier is obtained
	/// first and recorded as the canonical form of the new node.
	TemplateName
	ASTContext::getDependentTemplateName(NestedNameSpecifier *NNS,
	                                     OverloadedOperatorKind Operator) const {
	  assert((!NNS || NNS->isDependent()) &&
	         "Nested name specifier must be dependent");

	  llvm::FoldingSetNodeID ID;
	  DependentTemplateName::Profile(ID, NNS, Operator);

	  void *InsertPos = nullptr;
	  DependentTemplateName *QTN
	    = DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos);

	  if (QTN)
	    return TemplateName(QTN);

	  NestedNameSpecifier *CanonNNS = getCanonicalNestedNameSpecifier(NNS);
	  if (CanonNNS == NNS) {
	    QTN = new (*this, alignof(DependentTemplateName))
	        DependentTemplateName(NNS, Operator);
	  } else {
	    TemplateName Canon = getDependentTemplateName(CanonNNS, Operator);
	    QTN = new (*this, alignof(DependentTemplateName))
	        DependentTemplateName(NNS, Operator, Canon);

	    // Look the ID up again after the recursive call so that InsertPos is
	    // recomputed before the insertion below; the node must still be absent.
	    DependentTemplateName *CheckQTN
	      = DependentTemplateNames.FindNodeOrInsertPos(ID, InsertPos);
	    assert(!CheckQTN && "Dependent template name canonicalization broken");
	    (void)CheckQTN;
	  }

	  DependentTemplateNames.InsertNode(QTN, InsertPos);
	  return TemplateName(QTN);
	}

	/// Returns the template name recording that \p param was substituted by
	/// \p replacement. Storage nodes are uniqued through
	/// SubstTemplateTemplateParms.
	TemplateName
	ASTContext::getSubstTemplateTemplateParm(TemplateTemplateParmDecl *param,
	                                         TemplateName replacement) const {
	  llvm::FoldingSetNodeID Key;
	  SubstTemplateTemplateParmStorage::Profile(Key, param, replacement);

	  void *Where = nullptr;
	  if (SubstTemplateTemplateParmStorage *Existing =
	          SubstTemplateTemplateParms.FindNodeOrInsertPos(Key, Where))
	    return TemplateName(Existing);

	  SubstTemplateTemplateParmStorage *Created =
	      new (*this) SubstTemplateTemplateParmStorage(param, replacement);
	  SubstTemplateTemplateParms.InsertNode(Created, Where);
	  return TemplateName(Created);
	}

	/// Returns the template name recording that the parameter pack \p Param was
	/// substituted by the argument pack \p ArgPack. Storage nodes are uniqued
	/// through SubstTemplateTemplateParmPacks.
	TemplateName
	ASTContext::getSubstTemplateTemplateParmPack(TemplateTemplateParmDecl *Param,
	                                       const TemplateArgument &ArgPack) const {
	  // Profile() takes a non-const context.
	  ASTContext &MutableThis = const_cast<ASTContext &>(*this);
	  llvm::FoldingSetNodeID Key;
	  SubstTemplateTemplateParmPackStorage::Profile(Key, MutableThis, Param,
	                                                ArgPack);

	  void *Where = nullptr;
	  if (SubstTemplateTemplateParmPackStorage *Existing =
	          SubstTemplateTemplateParmPacks.FindNodeOrInsertPos(Key, Where))
	    return TemplateName(Existing);

	  SubstTemplateTemplateParmPackStorage *Created =
	      new (*this) SubstTemplateTemplateParmPackStorage(Param,
	                                                       ArgPack.pack_size(),
	                                                       ArgPack.pack_begin());
	  SubstTemplateTemplateParmPacks.InsertNode(Created, Where);
	  return TemplateName(Created);
	}

	/// getFromTargetType - Map a TargetInfo::IntType value, passed as the
	/// unsigned @p Type, to the corresponding canonical type of this context.
	/// TargetInfo::NoInt maps to a null CanQualType; any value outside the
	/// listed enumerators is treated as unreachable.
	CanQualType ASTContext::getFromTargetType(unsigned Type) const {
	  switch (Type) {
	  case TargetInfo::NoInt:
	    return CanQualType();
	  case TargetInfo::SignedChar:
	    return SignedCharTy;
	  case TargetInfo::UnsignedChar:
	    return UnsignedCharTy;
	  case TargetInfo::SignedShort:
	    return ShortTy;
	  case TargetInfo::UnsignedShort:
	    return UnsignedShortTy;
	  case TargetInfo::SignedInt:
	    return IntTy;
	  case TargetInfo::UnsignedInt:
	    return UnsignedIntTy;
	  case TargetInfo::SignedLong:
	    return LongTy;
	  case TargetInfo::UnsignedLong:
	    return UnsignedLongTy;
	  case TargetInfo::SignedLongLong:
	    return LongLongTy;
	  case TargetInfo::UnsignedLongLong:
	    return UnsignedLongLongTy;
	  }

	  llvm_unreachable("Unhandled TargetInfo::IntType value");
	}

	//===----------------------------------------------------------------------===//
	// Type Predicates.
	//===----------------------------------------------------------------------===//

	/// getObjCGCAttrKind - Returns one of GCNone, Weak or Strong objc's
	/// garbage collection attribute for \p Ty.
	///
	/// Always GCNone when the language options select NonGC. Otherwise an
	/// explicit attribute on \p Ty wins; without one, ObjC object pointers and
	/// block pointers are Strong, and ordinary pointers take the kind of their
	/// pointee.
	Qualifiers::GC ASTContext::getObjCGCAttrKind(QualType Ty) const {
	  if (getLangOpts().getGC() == LangOptions::NonGC)
	    return Qualifiers::GCNone;

	  assert(getLangOpts().ObjC1);
	  Qualifiers::GC GCAttrs = Ty.getObjCGCAttr();

	  // Default behaviour under objective-C's gc is for ObjC pointers
	  // (or pointers to them) be treated as though they were declared
	  // as __strong.
	  if (GCAttrs == Qualifiers::GCNone) {
	    if (Ty->isObjCObjectPointerType() || Ty->isBlockPointerType())
	      return Qualifiers::Strong;
	    else if (Ty->isPointerType())
	      return getObjCGCAttrKind(Ty->getAs<PointerType>()->getPointeeType());
	  } else {
	    // It's not valid to set GC attributes on anything that isn't a
	    // pointer.
	#ifndef NDEBUG
	    QualType CT = Ty->getCanonicalTypeInternal();
	    while (const ArrayType *AT = dyn_cast<ArrayType>(CT))
	      CT = AT->getElementType();
	    assert(CT->isAnyPointerType() || CT->isBlockPointerType());
	#endif
	  }
	  return GCAttrs;
	}

	//===----------------------------------------------------------------------===//
	// Type Compatibility Testing
	//===----------------------------------------------------------------------===//

	/// areCompatVectorTypes - Return true if the two specified vector types
	/// have the same element type and the same number of elements. Both must be
	/// canonical and unqualified.
	static bool areCompatVectorTypes(const VectorType *LHS,
	                                 const VectorType *RHS) {
	  assert(LHS->isCanonicalUnqualified() && RHS->isCanonicalUnqualified());
	  if (!(LHS->getElementType() == RHS->getElementType()))
	    return false;
	  return LHS->getNumElements() == RHS->getNumElements();
	}

	/// Return true if the two vector types can be used interchangeably: either
	/// they are the same type ignoring qualifiers, or they have the same shape
	/// and neither is an AltiVec 'pixel' or 'bool' vector.
	bool ASTContext::areCompatibleVectorTypes(QualType FirstVec,
	                                          QualType SecondVec) {
	  assert(FirstVec->isVectorType() && "FirstVec should be a vector type");
	  assert(SecondVec->isVectorType() && "SecondVec should be a vector type");

	  if (hasSameUnqualifiedType(FirstVec, SecondVec))
	    return true;

	  // Treat Neon vector types and most AltiVec vector types as if they are the
	  // equivalent GCC vector types.
	  const VectorType *VecA = FirstVec->getAs<VectorType>();
	  const VectorType *VecB = SecondVec->getAs<VectorType>();

	  // The shapes (element count and element type) have to agree.
	  if (VecA->getNumElements() != VecB->getNumElements())
	    return false;
	  if (!hasSameType(VecA->getElementType(), VecB->getElementType()))
	    return false;

	  // AltiVec pixel and bool vectors are excluded from this equivalence.
	  auto IsExcludedAltiVecKind = [](const VectorType *VT) -> bool {
	    return VT->getVectorKind() == VectorType::AltiVecPixel ||
	           VT->getVectorKind() == VectorType::AltiVecBool;
	  };
	  return !IsExcludedAltiVecKind(VecA) && !IsExcludedAltiVecKind(VecB);
	}

	//===----------------------------------------------------------------------===//
	// ObjCQualifiedIdTypesAreCompatible - Compatibility testing for qualified id's.
	//===----------------------------------------------------------------------===//

	/// ProtocolCompatibleWithProtocol - return 'true' if 'lProto' is in the
	/// inheritance hierarchy of 'rProto', i.e. 'rProto' declares the same entity
	/// as 'lProto' or (transitively) lists a protocol that does.
	bool
	ASTContext::ProtocolCompatibleWithProtocol(ObjCProtocolDecl *lProto,
	                                           ObjCProtocolDecl *rProto) const {
	  if (!declaresSameEntity(lProto, rProto)) {
	    // Not the same protocol: search the protocols rProto itself adopts.
	    for (auto *Adopted : rProto->protocols()) {
	      if (ProtocolCompatibleWithProtocol(lProto, Adopted))
	        return true;
	    }
	    return false;
	  }
	  return true;
	}

	/// ObjCQualifiedClassTypesAreCompatible - compare Class<pr,...> and
	/// Class<pr1, ...>. Returns true when every protocol qualifier on \p lhs is
	/// matched by some protocol qualifier on \p rhs.
	bool ASTContext::ObjCQualifiedClassTypesAreCompatible(QualType lhs,
	                                                      QualType rhs) {
	  const auto *lhsQID = lhs->getAs<ObjCObjectPointerType>();
	  const auto *rhsOPT = rhs->getAs<ObjCObjectPointerType>();
	  assert((lhsQID && rhsOPT) && "ObjCQualifiedClassTypesAreCompatible");

	  // True if some RHS qualifier is, or inherits from, the wanted protocol.
	  auto SatisfiedByRHS = [&](ObjCProtocolDecl *Wanted) -> bool {
	    for (auto *Offered : rhsOPT->quals()) {
	      if (ProtocolCompatibleWithProtocol(Wanted, Offered))
	        return true;
	    }
	    return false;
	  };

	  for (auto *Wanted : lhsQID->quals()) {
	    if (!SatisfiedByRHS(Wanted))
	      return false;
	  }
	  return true;
	}

	/// ObjCQualifiedIdTypesAreCompatible - We know that one of lhs/rhs is an
	/// ObjCQualifiedIDType.
	///
	/// \param lhs      the left-hand type.
	/// \param rhs      the right-hand type.
	/// \param compare  when true, a pair of protocols is also accepted if they
	///                 are related in the opposite direction of the protocol
	///                 inheritance hierarchy (see the
	///                 ProtocolCompatibleWithProtocol calls below).
	/// \returns true if the two types are considered compatible.
	bool ASTContext::ObjCQualifiedIdTypesAreCompatible(QualType lhs, QualType rhs,
	                                                   bool compare) {
	  // Allow id<P..> and an 'id' or void* type in all cases.
	  if (lhs->isVoidPointerType() ||
	      lhs->isObjCIdType() || lhs->isObjCClassType())
	    return true;
	  else if (rhs->isVoidPointerType() ||
	           rhs->isObjCIdType() || rhs->isObjCClassType())
	    return true;

	  // Case 1: the left-hand side is the qualified id.
	  if (const ObjCObjectPointerType *lhsQID = lhs->getAsObjCQualifiedIdType()) {
	    const ObjCObjectPointerType *rhsOPT = rhs->getAs<ObjCObjectPointerType>();

	    if (!rhsOPT) return false;

	    if (rhsOPT->qual_empty()) {
	      // If the RHS is an unqualified interface pointer "NSString*",
	      // make sure we check the class hierarchy.
	      if (ObjCInterfaceDecl *rhsID = rhsOPT->getInterfaceDecl()) {
	        for (auto *I : lhsQID->quals()) {
	          // when comparing an id<P> on lhs with a static type on rhs,
	          // see if static class implements all of id's protocols, directly or
	          // through its super class and categories.
	          if (!rhsID->ClassImplementsProtocol(I, true))
	            return false;
	        }
	      }
	      // If there are no qualifiers and no interface, we have an 'id'.
	      return true;
	    }
	    // Both the right and left sides have qualifiers.
	    for (auto *lhsProto : lhsQID->quals()) {
	      bool match = false;

	      // First look for the LHS protocol among the RHS qualifiers.
	      for (auto *rhsProto : rhsOPT->quals()) {
	        if (ProtocolCompatibleWithProtocol(lhsProto, rhsProto) ||
	            (compare && ProtocolCompatibleWithProtocol(rhsProto, lhsProto))) {
	          match = true;
	          break;
	        }
	      }
	      // If the RHS is a qualified interface pointer "NSString<P>*",
	      // make sure we check the class hierarchy.
	      // NOTE(review): this loop sets 'match' as soon as the RHS class
	      // implements *any* LHS qualifier, not specifically 'lhsProto' --
	      // confirm that this is intended.
	      if (ObjCInterfaceDecl *rhsID = rhsOPT->getInterfaceDecl()) {
	        for (auto *I : lhsQID->quals()) {
	          // when comparing an id<P> on lhs with a static type on rhs,
	          // see if static class implements all of id's protocols, directly or
	          // through its super class and categories.
	          if (rhsID->ClassImplementsProtocol(I, true)) {
	            match = true;
	            break;
	          }
	        }
	      }
	      if (!match)
	        return false;
	    }

	    return true;
	  }

	  // Case 2: the right-hand side is the qualified id.
	  const ObjCObjectPointerType *rhsQID = rhs->getAsObjCQualifiedIdType();
	  assert(rhsQID && "One of the LHS/RHS should be id<x>");

	  if (const ObjCObjectPointerType *lhsOPT =
	          lhs->getAsObjCInterfacePointerType()) {
	    // If both the right and left sides have qualifiers.
	    for (auto *lhsProto : lhsOPT->quals()) {
	      bool match = false;

	      // when comparing an id<P> on rhs with a static type on lhs,
	      // see if static class implements all of id's protocols, directly or
	      // through its super class and categories.
	      // First, lhs protocols in the qualifier list must be found, direct
	      // or indirect in rhs's qualifier list or it is a mismatch.
	      for (auto *rhsProto : rhsQID->quals()) {
	        if (ProtocolCompatibleWithProtocol(lhsProto, rhsProto) ||
	            (compare && ProtocolCompatibleWithProtocol(rhsProto, lhsProto))) {
	          match = true;
	          break;
	        }
	      }
	      if (!match)
	        return false;
	    }

	    // Static class's protocols, or its super class or category protocols
	    // must be found, direct or indirect in rhs's qualifier list or it is a
	    // mismatch.
	    if (ObjCInterfaceDecl *lhsID = lhsOPT->getInterfaceDecl()) {
	      llvm::SmallPtrSet<ObjCProtocolDecl *, 8> LHSInheritedProtocols;
	      CollectInheritedProtocols(lhsID, LHSInheritedProtocols);
	      // This is rather dubious but matches gcc's behavior. If lhs has
	      // no type qualifier and its class has no static protocol(s)
	      // assume that it is mismatch.
	      if (LHSInheritedProtocols.empty() && lhsOPT->qual_empty())
	        return false;
	      for (auto *lhsProto : LHSInheritedProtocols) {
	        bool match = false;
	        for (auto *rhsProto : rhsQID->quals()) {
	          if (ProtocolCompatibleWithProtocol(lhsProto, rhsProto) ||
	              (compare && ProtocolCompatibleWithProtocol(rhsProto, lhsProto))) {
	            match = true;
	            break;
	          }
	        }
	        if (!match)
	          return false;
	      }
	    }
	    return true;
	  }
	  return false;
	}

	/// canAssignObjCInterfaces - Return true if the two interface types are
	/// compatible for assignment from RHS to LHS. This handles validation of any
	/// protocol qualifiers on the LHS or RHS.
	///
	/// If the direct check fails and the RHS is a __kindof type, the check is
	/// retried in the opposite direction after stripping __kindof and protocol
	/// qualifiers from both sides.
	bool ASTContext::canAssignObjCInterfaces(const ObjCObjectPointerType *LHSOPT,
	                                         const ObjCObjectPointerType *RHSOPT) {
	  const ObjCObjectType* LHS = LHSOPT->getObjectType();
	  const ObjCObjectType* RHS = RHSOPT->getObjectType();

	  // If either type represents the built-in 'id' or 'Class' types, return true.
	  if (LHS->isObjCUnqualifiedIdOrClass() ||
	      RHS->isObjCUnqualifiedIdOrClass())
	    return true;

	  // Function object that propagates a successful result or handles
	  // __kindof types.
	  auto finish = [&](bool succeeded) -> bool {
	    if (succeeded)
	      return true;

	    if (!RHS->isKindOfType())
	      return false;

	    // Strip off __kindof and protocol qualifiers, then check whether
	    // we can assign the other way.
	    return canAssignObjCInterfaces(RHSOPT->stripObjCKindOfTypeAndQuals(*this),
	                                   LHSOPT->stripObjCKindOfTypeAndQuals(*this));
	  };

	  // At least one side is id<...>.
	  if (LHS->isObjCQualifiedId() || RHS->isObjCQualifiedId()) {
	    return finish(ObjCQualifiedIdTypesAreCompatible(QualType(LHSOPT,0),
	                                                    QualType(RHSOPT,0),
	                                                    false));
	  }

	  // Both sides are Class<...>.
	  if (LHS->isObjCQualifiedClass() && RHS->isObjCQualifiedClass()) {
	    return finish(ObjCQualifiedClassTypesAreCompatible(QualType(LHSOPT,0),
	                                                       QualType(RHSOPT,0)));
	  }

	  // If we have 2 user-defined types, fall into that path.
	  if (LHS->getInterface() && RHS->getInterface()) {
	    return finish(canAssignObjCInterfaces(LHS, RHS));
	  }

	  return false;
	}

	/// canAssignObjCInterfacesInBlockPointer - This routine is specifically written
	/// for providing type-safety for objective-c pointers used to pass/return
	/// arguments in block literals. When passed as arguments, passing 'A*' where
	/// 'id' is expected is not OK. Passing 'Sub *' where 'Super *' is expected is
	/// not OK. For the return type, the opposite is not OK.
	///
	/// \param BlockReturnType  true when the types being compared are block
	///                         return types rather than block parameter types.
	bool ASTContext::canAssignObjCInterfacesInBlockPointer(
	    const ObjCObjectPointerType *LHSOPT,
	    const ObjCObjectPointerType *RHSOPT,
	    bool BlockReturnType) {

	  // Function object that propagates a successful result or handles
	  // __kindof types.
	  auto finish = [&](bool succeeded) -> bool {
	    if (succeeded)
	      return true;

	    // Which side must be __kindof for the retry depends on whether this is
	    // a return type or a parameter type.
	    const ObjCObjectPointerType *Expected = BlockReturnType ? RHSOPT : LHSOPT;
	    if (!Expected->isKindOfType())
	      return false;

	    // Strip off __kindof and protocol qualifiers, then check whether
	    // we can assign the other way.
	    return canAssignObjCInterfacesInBlockPointer(
	        RHSOPT->stripObjCKindOfTypeAndQuals(*this),
	        LHSOPT->stripObjCKindOfTypeAndQuals(*this),
	        BlockReturnType);
	  };

	  if (RHSOPT->isObjCBuiltinType() || LHSOPT->isObjCIdType())
	    return true;

	  if (LHSOPT->isObjCBuiltinType()) {
	    return finish(RHSOPT->isObjCBuiltinType() ||
	                  RHSOPT->isObjCQualifiedIdType());
	  }

	  if (LHSOPT->isObjCQualifiedIdType() || RHSOPT->isObjCQualifiedIdType())
	    return finish(ObjCQualifiedIdTypesAreCompatible(QualType(LHSOPT,0),
	                                                    QualType(RHSOPT,0),
	                                                    false));

	  const ObjCInterfaceType* LHS = LHSOPT->getInterfaceType();
	  const ObjCInterfaceType* RHS = RHSOPT->getInterfaceType();
	  if (LHS && RHS) { // We have 2 user-defined types.
	    if (LHS != RHS) {
	      // Which subclass direction is accepted flips between return types
	      // and parameter types.
	      if (LHS->getDecl()->isSuperClassOf(RHS->getDecl()))
	        return finish(BlockReturnType);
	      if (RHS->getDecl()->isSuperClassOf(LHS->getDecl()))
	        return finish(!BlockReturnType);
	    }
	    else
	      return true;
	  }
	  return false;
	}

	/// Comparison routine for Objective-C protocols to be used with
	/// llvm::array_pod_sort.
	///
	/// \param lhs  pointer to the first array element (itself a protocol
	///             declaration pointer).
	/// \param rhs  pointer to the second array element.
	/// \returns the result of comparing the two protocols' names.
	static int compareObjCProtocolsByName(ObjCProtocolDecl * const *lhs,
	                                      ObjCProtocolDecl * const *rhs) {
	  // The arguments point at array slots, so dereference once to reach the
	  // protocol declarations themselves.
	  return (*lhs)->getName().compare((*rhs)->getName());
	}

	/// getIntersectionOfProtocols - Computes the protocols common to two
	/// Objective-C object pointer types, minus those already implied by their
	/// common base class, and appends them to \p IntersectionSet sorted by name.
	/// It is used to build the composite qualifier list of the composite type of
	/// a conditional expression involving two objective-c pointer objects.
	static
	void getIntersectionOfProtocols(ASTContext &Context,
	                                const ObjCInterfaceDecl *CommonBase,
	                                const ObjCObjectPointerType *LHSOPT,
	                                const ObjCObjectPointerType *RHSOPT,
	      SmallVectorImpl<ObjCProtocolDecl *> &IntersectionSet) {
	  typedef llvm::SmallPtrSet<ObjCProtocolDecl *, 8> ProtocolSet;

	  const ObjCObjectType *LeftObj = LHSOPT->getObjectType();
	  const ObjCObjectType *RightObj = RHSOPT->getObjectType();
	  assert(LeftObj->getInterface() && "LHS must have an interface base");
	  assert(RightObj->getInterface() && "RHS must have an interface base");

	  // Gathers everything one side provides: first its protocol qualifiers,
	  // then the protocols associated with its interface.
	  auto CollectAll = [&Context](const ObjCObjectType *Obj, ProtocolSet &Out) {
	    for (auto Qual : Obj->quals())
	      Context.CollectInheritedProtocols(Qual, Out);
	    Context.CollectInheritedProtocols(Obj->getInterface(), Out);
	  };

	  ProtocolSet LeftProtos;
	  CollectAll(LeftObj, LeftProtos);

	  ProtocolSet RightProtos;
	  CollectAll(RightObj, RightProtos);

	  // Keep only the protocols present on both sides.
	  for (auto Candidate : LeftProtos) {
	    if (RightProtos.count(Candidate))
	      IntersectionSet.push_back(Candidate);
	  }

	  // Protocols implied by the common base class are redundant in the
	  // result, so drop them.
	  ProtocolSet Implied;
	  Context.CollectInheritedProtocols(CommonBase, Implied);
	  if (!Implied.empty()) {
	    auto NewEnd = std::remove_if(IntersectionSet.begin(),
	                                 IntersectionSet.end(),
	                                 [&](ObjCProtocolDecl *P) -> bool {
	                                   return Implied.count(P) > 0;
	                                 });
	    IntersectionSet.erase(NewEnd, IntersectionSet.end());
	  }

	  // Sort the remaining protocols by name.
	  llvm::array_pod_sort(IntersectionSet.begin(), IntersectionSet.end(),
	                       compareObjCProtocolsByName);
	}

	/// Determine whether a value of type \p rhs may be assigned to \p lhs, for
	/// the purposes of comparing Objective-C type arguments.
	///
	/// Handles two object pointers (via ASTContext::canAssignObjCInterfaces),
	/// two block pointers (via ASTContext::typesAreBlockPointerCompatible), and
	/// the mixed case of an unqualified 'id' with a block pointer.
	static bool canAssignObjCObjectTypes(ASTContext &ctx, QualType lhs,
	                                     QualType rhs) {
	  // Common case: two object pointers.
	  const ObjCObjectPointerType *lhsOPT = lhs->getAs<ObjCObjectPointerType>();
	  const ObjCObjectPointerType *rhsOPT = rhs->getAs<ObjCObjectPointerType>();
	  if (lhsOPT && rhsOPT)
	    return ctx.canAssignObjCInterfaces(lhsOPT, rhsOPT);

	  // Two block pointers.
	  const BlockPointerType *lhsBlock = lhs->getAs<BlockPointerType>();
	  const BlockPointerType *rhsBlock = rhs->getAs<BlockPointerType>();
	  if (lhsBlock && rhsBlock)
	    return ctx.typesAreBlockPointerCompatible(lhs, rhs);

	  // If either is an unqualified 'id' and the other is a block, it's
	  // acceptable.
	  if ((lhsOPT && lhsOPT->isObjCIdType() && rhsBlock) ||
	      (rhsOPT && rhsOPT->isObjCIdType() && lhsBlock))
	    return true;

	  return false;
	}

	// Check that the given Objective-C type argument lists are equivalent.
	//
	// The lists must have the same length. Arguments that are not the same type
	// are then judged by the variance of the corresponding type parameter of
	// 'iface': invariant parameters need identical types (optionally after
	// stripping __kindof when 'stripKindOf' is set), covariant parameters need
	// the RHS argument to be assignable to the LHS argument, and contravariant
	// parameters need the reverse.
	static bool sameObjCTypeArgs(ASTContext &ctx,
	                             const ObjCInterfaceDecl *iface,
	                             ArrayRef<QualType> lhsArgs,
	                             ArrayRef<QualType> rhsArgs,
	                             bool stripKindOf) {
	  if (lhsArgs.size() != rhsArgs.size())
	    return false;

	  ObjCTypeParamList *typeParams = iface->getTypeParamList();
	  for (unsigned i = 0, n = lhsArgs.size(); i != n; ++i) {
	    // Identical arguments always match, whatever the variance.
	    if (ctx.hasSameType(lhsArgs[i], rhsArgs[i]))
	      continue;

	    switch (typeParams->begin()[i]->getVariance()) {
	    case ObjCTypeParamVariance::Invariant:
	      if (!stripKindOf ||
	          !ctx.hasSameType(lhsArgs[i].stripObjCKindOfType(ctx),
	                           rhsArgs[i].stripObjCKindOfType(ctx))) {
	        return false;
	      }
	      break;

	    case ObjCTypeParamVariance::Covariant:
	      if (!canAssignObjCObjectTypes(ctx, lhsArgs[i], rhsArgs[i]))
	        return false;
	      break;

	    case ObjCTypeParamVariance::Contravariant:
	      if (!canAssignObjCObjectTypes(ctx, rhsArgs[i], lhsArgs[i]))
	        return false;
	      break;
	    }
	  }

	  return true;
	}

	/// Find the closest common base class of two Objective-C object pointer
	/// types and return a pointer type to it.
	///
	/// The result carries the protocols common to both operands (minus those the
	/// common base already implies), keeps type arguments only when both sides
	/// are specialized and their arguments agree, and is a __kindof type if
	/// either operand is. Returns a null QualType when either side has no
	/// interface, when the type arguments are incompatible, or when no common
	/// base class exists.
	QualType ASTContext::areCommonBaseCompatible(
	           const ObjCObjectPointerType *Lptr,
	           const ObjCObjectPointerType *Rptr) {
	  const ObjCObjectType *LHS = Lptr->getObjectType();
	  const ObjCObjectType *RHS = Rptr->getObjectType();
	  const ObjCInterfaceDecl* LDecl = LHS->getInterface();
	  const ObjCInterfaceDecl* RDecl = RHS->getInterface();

	  if (!LDecl || !RDecl)
	    return QualType();

	  // When either LHS or RHS is a kindof type, we should return a kindof type.
	  // For example, for common base of kindof(ASub1) and kindof(ASub2), we return
	  // kindof(A).
	  bool anyKindOf = LHS->isKindOfType() || RHS->isKindOfType();

	  // Follow the left-hand side up the class hierarchy until we either hit a
	  // root or find the RHS. Record the ancestors in case we don't find it.
	  llvm::SmallDenseMap<const ObjCInterfaceDecl *, const ObjCObjectType *, 4>
	    LHSAncestors;
	  while (true) {
	    // Record this ancestor. We'll need this if the common type isn't in the
	    // path from the LHS to the root.
	    LHSAncestors[LHS->getInterface()->getCanonicalDecl()] = LHS;

	    if (declaresSameEntity(LHS->getInterface(), RDecl)) {
	      // Get the type arguments.
	      ArrayRef<QualType> LHSTypeArgs = LHS->getTypeArgsAsWritten();
	      bool anyChanges = false;
	      if (LHS->isSpecialized() && RHS->isSpecialized()) {
	        // Both have type arguments, compare them.
	        if (!sameObjCTypeArgs(*this, LHS->getInterface(),
	                              LHS->getTypeArgs(), RHS->getTypeArgs(),
	                              /*stripKindOf=*/true))
	          return QualType();
	      } else if (LHS->isSpecialized() != RHS->isSpecialized()) {
	        // If only one has type arguments, the result will not have type
	        // arguments.
	        LHSTypeArgs = {};
	        anyChanges = true;
	      }

	      // Compute the intersection of protocols.
	      SmallVector<ObjCProtocolDecl *, 8> Protocols;
	      getIntersectionOfProtocols(*this, LHS->getInterface(), Lptr, Rptr,
	                                 Protocols);
	      if (!Protocols.empty())
	        anyChanges = true;

	      // If anything in the LHS will have changed, build a new result type.
	      // If we need to return a kindof type but LHS is not a kindof type, we
	      // build a new result type.
	      if (anyChanges || LHS->isKindOfType() != anyKindOf) {
	        QualType Result = getObjCInterfaceType(LHS->getInterface());
	        Result = getObjCObjectType(Result, LHSTypeArgs, Protocols,
	                                   anyKindOf || LHS->isKindOfType());
	        return getObjCObjectPointerType(Result);
	      }

	      return getObjCObjectPointerType(QualType(LHS, 0));
	    }

	    // Find the superclass.
	    QualType LHSSuperType = LHS->getSuperClassType();
	    if (LHSSuperType.isNull())
	      break;

	    LHS = LHSSuperType->castAs<ObjCObjectType>();
	  }

	  // We didn't find anything by following the LHS to its root; now check
	  // the RHS against the cached set of ancestors.
	  while (true) {
	    auto KnownLHS = LHSAncestors.find(RHS->getInterface()->getCanonicalDecl());
	    if (KnownLHS != LHSAncestors.end()) {
	      LHS = KnownLHS->second;

	      // Get the type arguments.
	      ArrayRef<QualType> RHSTypeArgs = RHS->getTypeArgsAsWritten();
	      bool anyChanges = false;
	      if (LHS->isSpecialized() && RHS->isSpecialized()) {
	        // Both have type arguments, compare them.
	        if (!sameObjCTypeArgs(*this, LHS->getInterface(),
	                              LHS->getTypeArgs(), RHS->getTypeArgs(),
	                              /*stripKindOf=*/true))
	          return QualType();
	      } else if (LHS->isSpecialized() != RHS->isSpecialized()) {
	        // If only one has type arguments, the result will not have type
	        // arguments.
	        RHSTypeArgs = {};
	        anyChanges = true;
	      }

	      // Compute the intersection of protocols.
	      SmallVector<ObjCProtocolDecl *, 8> Protocols;
	      getIntersectionOfProtocols(*this, RHS->getInterface(), Lptr, Rptr,
	                                 Protocols);
	      if (!Protocols.empty())
	        anyChanges = true;

	      // If we need to return a kindof type but RHS is not a kindof type, we
	      // build a new result type.
	      if (anyChanges || RHS->isKindOfType() != anyKindOf) {
	        QualType Result = getObjCInterfaceType(RHS->getInterface());
	        Result = getObjCObjectType(Result, RHSTypeArgs, Protocols,
	                                   anyKindOf || RHS->isKindOfType());
	        return getObjCObjectPointerType(Result);
	      }

	      return getObjCObjectPointerType(QualType(RHS, 0));
	    }

	    // Find the superclass of the RHS.
	    QualType RHSSuperType = RHS->getSuperClassType();
	    if (RHSSuperType.isNull())
	      break;

	    RHS = RHSSuperType->castAs<ObjCObjectType>();
	  }

	  return QualType();
	}

	/// Return true if an object of type \p RHS can be assigned to \p LHS, where
	/// both are asserted to have an interface.
	///
	/// The LHS interface must be a superclass of the RHS interface, every
	/// protocol qualifier on the LHS must be found among the protocols of the
	/// RHS (its class's inherited protocols plus its own qualifiers), and, when
	/// both are specialized, their type arguments must agree.
	bool ASTContext::canAssignObjCInterfaces(const ObjCObjectType *LHS,
	                                         const ObjCObjectType *RHS) {
	  assert(LHS->getInterface() && "LHS is not an interface type");
	  assert(RHS->getInterface() && "RHS is not an interface type");

	  // Verify that the base decls are compatible: the RHS must be a subclass of
	  // the LHS.
	  ObjCInterfaceDecl *LHSInterface = LHS->getInterface();
	  bool IsSuperClass = LHSInterface->isSuperClassOf(RHS->getInterface());
	  if (!IsSuperClass)
	    return false;

	  // If the LHS has protocol qualifiers, determine whether all of them are
	  // satisfied by the RHS (i.e., the RHS has a superset of the protocols in the
	  // LHS).
	  if (LHS->getNumProtocols() > 0) {
	    // OK if conversion of LHS to SuperClass results in narrowing of types
	    // ; i.e., SuperClass may implement at least one of the protocols
	    // in LHS's protocol list. Example, SuperObj<P1> = lhs<P1,P2> is ok.
	    // But not SuperObj<P1,P2,P3> = lhs<P1,P2>.
	    llvm::SmallPtrSet<ObjCProtocolDecl *, 8> SuperClassInheritedProtocols;
	    CollectInheritedProtocols(RHS->getInterface(), SuperClassInheritedProtocols);
	    // Also, if RHS has explicit qualifiers, include them for comparing with
	    // LHS's qualifiers.
	    for (auto *RHSPI : RHS->quals())
	      CollectInheritedProtocols(RHSPI, SuperClassInheritedProtocols);
	    // If there are no protocols associated with RHS, it is not a match.
	    if (SuperClassInheritedProtocols.empty())
	      return false;

	    for (const auto *LHSProto : LHS->quals()) {
	      bool SuperImplementsProtocol = false;
	      for (auto *SuperClassProto : SuperClassInheritedProtocols)
	        if (SuperClassProto->lookupProtocolNamed(LHSProto->getIdentifier())) {
	          SuperImplementsProtocol = true;
	          break;
	        }
	      if (!SuperImplementsProtocol)
	        return false;
	    }
	  }

	  // If the LHS is specialized, we may need to check type arguments.
	  if (LHS->isSpecialized()) {
	    // Follow the superclass chain until we've matched the LHS class in the
	    // hierarchy. This substitutes type arguments through.
	    const ObjCObjectType *RHSSuper = RHS;
	    while (!declaresSameEntity(RHSSuper->getInterface(), LHSInterface))
	      RHSSuper = RHSSuper->getSuperClassType()->castAs<ObjCObjectType>();

	    // If the RHS is specialized, compare type arguments.
	    if (RHSSuper->isSpecialized() &&
	        !sameObjCTypeArgs(*this, LHS->getInterface(),
	                          LHS->getTypeArgs(), RHSSuper->getTypeArgs(),
	                          /*stripKindOf=*/true)) {
	      return false;
	    }
	  }

	  return true;
	}

	/// Return true if \p LHS and \p RHS are both Objective-C object pointer
	/// types and one can be assigned to the other, in either direction.
	bool ASTContext::areComparableObjCPointerTypes(QualType LHS, QualType RHS) {
	  // get the "pointed to" types
	  const ObjCObjectPointerType *LHSOPT = LHS->getAs<ObjCObjectPointerType>();
	  const ObjCObjectPointerType *RHSOPT = RHS->getAs<ObjCObjectPointerType>();

	  if (!LHSOPT || !RHSOPT)
	    return false;

	  return canAssignObjCInterfaces(LHSOPT, RHSOPT) ||
	         canAssignObjCInterfaces(RHSOPT, LHSOPT);
	}

	/// Return true if a pointer to \p From can be assigned to a pointer to
	/// \p To, as decided by canAssignObjCInterfaces on the two pointer types.
	bool ASTContext::canBindObjCObjectType(QualType To, QualType From) {
	  const auto *ToPtr =
	      getObjCObjectPointerType(To)->getAs<ObjCObjectPointerType>();
	  const auto *FromPtr =
	      getObjCObjectPointerType(From)->getAs<ObjCObjectPointerType>();
	  return canAssignObjCInterfaces(ToPtr, FromPtr);
	}

	/// typesAreCompatible - C99 6.7.3p9: For two qualified types to be compatible,
	/// both shall have the identically qualified version of a compatible type.
	/// C99 6.2.7p1: Two types have compatible types if their types are the
	/// same. See 6.7.[2,3,5] for additional rules.
	///
	/// In C++ mode this reduces to a same-type check; otherwise the types are
	/// compatible exactly when they can be merged.
	bool ASTContext::typesAreCompatible(QualType LHS, QualType RHS,
	                                    bool CompareUnqualified) {
	  if (!getLangOpts().CPlusPlus) {
	    QualType Merged = mergeTypes(LHS, RHS, false, CompareUnqualified);
	    return !Merged.isNull();
	  }
	  return hasSameType(LHS, RHS);
	}

	/// Property types are compatible exactly when typesAreCompatible says the
	/// two types are.
	bool ASTContext::propertyTypesAreCompatible(QualType LHS, QualType RHS) {
	  const bool Compatible = typesAreCompatible(LHS, RHS);
	  return Compatible;
	}

	/// Two types are block-pointer compatible when merging them in
	/// block-pointer mode yields a non-null type.
	bool ASTContext::typesAreBlockPointerCompatible(QualType LHS, QualType RHS) {
	  QualType Merged = mergeTypes(LHS, RHS, true);
	  return !Merged.isNull();
	}

	/// mergeTransparentUnionType - if T is a transparent union type and a member
	/// of T is compatible with SubType, return the merged type, else return
	/// QualType(). Members are tried in declaration order and the first
	/// successful merge wins.
	QualType ASTContext::mergeTransparentUnionType(QualType T, QualType SubType,
	                                               bool OfBlockPointer,
	                                               bool Unqualified) {
	  const RecordType *UnionTy = T->getAsUnionType();
	  if (!UnionTy)
	    return QualType();

	  RecordDecl *UnionDecl = UnionTy->getDecl();
	  if (!UnionDecl->hasAttr<TransparentUnionAttr>())
	    return QualType();

	  for (const auto *Field : UnionDecl->fields()) {
	    // Qualifiers on the member itself are ignored for the merge.
	    QualType FieldTy = Field->getType().getUnqualifiedType();
	    QualType Merged = mergeTypes(FieldTy, SubType, OfBlockPointer, Unqualified);
	    if (!Merged.isNull())
	      return Merged;
	  }

	  return QualType();
	}

	/// mergeFunctionParameterTypes - merge two types which appear as function
	/// parameter types. Returns the merged type, or a null QualType if the two
	/// cannot be merged.
	QualType ASTContext::mergeFunctionParameterTypes(QualType lhs, QualType rhs,
	                                                 bool OfBlockPointer,
	                                                 bool Unqualified) {
	  // GNU extension: two types are compatible if they appear as a function
	  // argument, one of the types is a transparent union type and the other
	  // type is compatible with a union member. Try lhs as the union first,
	  // then rhs.
	  QualType Merged = mergeTransparentUnionType(lhs, rhs, OfBlockPointer,
	                                              Unqualified);
	  if (Merged.isNull())
	    Merged = mergeTransparentUnionType(rhs, lhs, OfBlockPointer,
	                                       Unqualified);

	  // Neither side is a matching transparent union: plain merge.
	  if (Merged.isNull())
	    Merged = mergeTypes(lhs, rhs, OfBlockPointer, Unqualified);

	  return Merged;
	}

	/// Merge two function types, returning the composite type or a null
	/// QualType if they are incompatible.
	///
	/// \param lhs, rhs        the function types to merge.
	/// \param OfBlockPointer  true when merging the pointee types of block
	///                        pointers; return types are then merged with the
	///                        block-return-type rules.
	/// \param Unqualified     when true, qualifiers on return and parameter
	///                        types are dropped before comparison.
	///
	/// Where possible \p lhs or \p rhs is returned unchanged (tracked by
	/// allLTypes / allRTypes) rather than building a new function type.
	QualType ASTContext::mergeFunctionTypes(QualType lhs, QualType rhs,
	                                        bool OfBlockPointer,
	                                        bool Unqualified) {
	  const FunctionType *lbase = lhs->getAs<FunctionType>();
	  const FunctionType *rbase = rhs->getAs<FunctionType>();
	  const FunctionProtoType *lproto = dyn_cast<FunctionProtoType>(lbase);
	  const FunctionProtoType *rproto = dyn_cast<FunctionProtoType>(rbase);
	  // These stay true while the merged type is still identical to the
	  // corresponding input.
	  bool allLTypes = true;
	  bool allRTypes = true;

	  // Check return type
	  QualType retType;
	  if (OfBlockPointer) {
	    QualType RHS = rbase->getReturnType();
	    QualType LHS = lbase->getReturnType();
	    bool UnqualifiedResult = Unqualified;
	    if (!UnqualifiedResult)
	      UnqualifiedResult = (!RHS.hasQualifiers() && LHS.hasQualifiers());
	    retType = mergeTypes(LHS, RHS, true, UnqualifiedResult, true);
	  }
	  else
	    retType = mergeTypes(lbase->getReturnType(), rbase->getReturnType(), false,
	                         Unqualified);
	  if (retType.isNull()) return QualType();

	  if (Unqualified)
	    retType = retType.getUnqualifiedType();

	  CanQualType LRetType = getCanonicalType(lbase->getReturnType());
	  CanQualType RRetType = getCanonicalType(rbase->getReturnType());
	  if (Unqualified) {
	    LRetType = LRetType.getUnqualifiedType();
	    RRetType = RRetType.getUnqualifiedType();
	  }

	  if (getCanonicalType(retType) != LRetType)
	    allLTypes = false;
	  if (getCanonicalType(retType) != RRetType)
	    allRTypes = false;

	  // FIXME: double check this
	  // FIXME: should we error if lbase->getRegParmAttr() != 0 &&
	  //                           rbase->getRegParmAttr() != 0 &&
	  //                           lbase->getRegParmAttr() != rbase->getRegParmAttr()?
	  FunctionType::ExtInfo lbaseInfo = lbase->getExtInfo();
	  FunctionType::ExtInfo rbaseInfo = rbase->getExtInfo();

	  // Compatible functions must have compatible calling conventions
	  if (lbaseInfo.getCC() != rbaseInfo.getCC())
	    return QualType();

	  // Regparm is part of the calling convention.
	  if (lbaseInfo.getHasRegParm() != rbaseInfo.getHasRegParm())
	    return QualType();
	  if (lbaseInfo.getRegParm() != rbaseInfo.getRegParm())
	    return QualType();

	  if (lbaseInfo.getProducesResult() != rbaseInfo.getProducesResult())
	    return QualType();
	  if (lbaseInfo.getNoCallerSavedRegs() != rbaseInfo.getNoCallerSavedRegs())
	    return QualType();

	  // FIXME: some uses, e.g. conditional exprs, really want this to be 'both'.
	  bool NoReturn = lbaseInfo.getNoReturn() || rbaseInfo.getNoReturn();

	  if (lbaseInfo.getNoReturn() != NoReturn)
	    allLTypes = false;
	  if (rbaseInfo.getNoReturn() != NoReturn)
	    allRTypes = false;

	  FunctionType::ExtInfo einfo = lbaseInfo.withNoReturn(NoReturn);

	  if (lproto && rproto) { // two C99 style function prototypes
	    assert(!lproto->hasExceptionSpec() && !rproto->hasExceptionSpec() &&
	           "C++ shouldn't be here");
	    // Compatible functions must have the same number of parameters
	    if (lproto->getNumParams() != rproto->getNumParams())
	      return QualType();

	    // Variadic and non-variadic functions aren't compatible
	    if (lproto->isVariadic() != rproto->isVariadic())
	      return QualType();

	    if (lproto->getTypeQuals() != rproto->getTypeQuals())
	      return QualType();

	    SmallVector<FunctionProtoType::ExtParameterInfo, 4> newParamInfos;
	    bool canUseLeft, canUseRight;
	    if (!mergeExtParameterInfo(lproto, rproto, canUseLeft, canUseRight,
	                               newParamInfos))
	      return QualType();

	    if (!canUseLeft)
	      allLTypes = false;
	    if (!canUseRight)
	      allRTypes = false;

	    // Check parameter type compatibility
	    SmallVector<QualType, 10> types;
	    for (unsigned i = 0, n = lproto->getNumParams(); i < n; i++) {
	      QualType lParamType = lproto->getParamType(i).getUnqualifiedType();
	      QualType rParamType = rproto->getParamType(i).getUnqualifiedType();
	      QualType paramType = mergeFunctionParameterTypes(
	          lParamType, rParamType, OfBlockPointer, Unqualified);
	      if (paramType.isNull())
	        return QualType();

	      if (Unqualified)
	        paramType = paramType.getUnqualifiedType();

	      types.push_back(paramType);
	      if (Unqualified) {
	        lParamType = lParamType.getUnqualifiedType();
	        rParamType = rParamType.getUnqualifiedType();
	      }

	      if (getCanonicalType(paramType) != getCanonicalType(lParamType))
	        allLTypes = false;
	      if (getCanonicalType(paramType) != getCanonicalType(rParamType))
	        allRTypes = false;
	    }

	    if (allLTypes) return lhs;
	    if (allRTypes) return rhs;

	    FunctionProtoType::ExtProtoInfo EPI = lproto->getExtProtoInfo();
	    EPI.ExtInfo = einfo;
	    EPI.ExtParameterInfos =
	        newParamInfos.empty() ? nullptr : newParamInfos.data();
	    return getFunctionType(retType, types, EPI);
	  }

	  // At most one side has a prototype from here on; the side without one
	  // cannot be returned as-is.
	  if (lproto) allRTypes = false;
	  if (rproto) allLTypes = false;

	  const FunctionProtoType *proto = lproto ? lproto : rproto;
	  if (proto) {
	    assert(!proto->hasExceptionSpec() && "C++ shouldn't be here");
	    if (proto->isVariadic()) return QualType();
	    // Check that the types are compatible with the types that
	    // would result from default argument promotions (C99 6.7.5.3p15).
	    // The only types actually affected are promotable integer
	    // types and floats, which would be passed as a different
	    // type depending on whether the prototype is visible.
	    for (unsigned i = 0, n = proto->getNumParams(); i < n; ++i) {
	      QualType paramTy = proto->getParamType(i);

	      // Look at the converted type of enum types, since that is the type used
	      // to pass enum values.
	      if (const EnumType *Enum = paramTy->getAs<EnumType>()) {
	        paramTy = Enum->getDecl()->getIntegerType();
	        if (paramTy.isNull())
	          return QualType();
	      }

	      if (paramTy->isPromotableIntegerType() ||
	          getCanonicalType(paramTy).getUnqualifiedType() == FloatTy)
	        return QualType();
	    }

	    if (allLTypes) return lhs;
	    if (allRTypes) return rhs;

	    FunctionProtoType::ExtProtoInfo EPI = proto->getExtProtoInfo();
	    EPI.ExtInfo = einfo;
	    return getFunctionType(retType, proto->getParamTypes(), EPI);
	  }

	  // Neither side has a prototype.
	  if (allLTypes) return lhs;
	  if (allRTypes) return rhs;
	  return getFunctionNoProtoType(retType, einfo);
	}

	/// Given that we have an enum type and a non-enum type, try to merge them.
	/// Returns \p other on success and a null QualType otherwise.
	static QualType mergeEnumWithInteger(ASTContext &Context, const EnumType *ET,
	                                     QualType other, bool isBlockReturnType) {
	  // C99 6.7.2.2p4: Each enumerated type shall be compatible with char,
	  // a signed integer type, or an unsigned integer type.
	  // Compatibility is based on the underlying type, not the promotion
	  // type.
	  QualType Underlying = ET->getDecl()->getIntegerType();
	  if (Underlying.isNull())
	    return QualType();

	  if (Context.hasSameType(Underlying, other))
	    return other;

	  // In block return types, we're more permissive and accept any
	  // integral type of the same size. Everything else is a mismatch.
	  if (!isBlockReturnType || !other->isIntegerType())
	    return QualType();
	  if (Context.getTypeSize(Underlying) != Context.getTypeSize(other))
	    return QualType();

	  return other;
	}

	/// mergeTypes - Try to merge the two types LHS and RHS into a single type.
	///
	/// Returns LHS, RHS, or a newly built type when the two types can be merged,
	/// and a null QualType when they cannot.
	///
	/// \param OfBlockPointer forwarded to the function-type and block-pointer
	/// merges; when set (and BlockReturnType is not) a block pointer is allowed
	/// to match an 'id' type, and Objective-C object pointers are checked with
	/// canAssignObjCInterfacesInBlockPointer instead of canAssignObjCInterfaces.
	/// \param Unqualified when set, qualifiers are stripped from LHS and RHS (and
	/// from pointee / value / element types) before they are compared.
	/// \param BlockReturnType forwarded to mergeEnumWithInteger (for an enum on
	/// the RHS) and to canAssignObjCInterfacesInBlockPointer.
	QualType ASTContext::mergeTypes(QualType LHS, QualType RHS,
	bool OfBlockPointer,
	bool Unqualified, bool BlockReturnType) {
	// C++ [expr]: If an expression initially has the type "reference to T", the
	// type is adjusted to "T" prior to any further analysis, the expression
	// designates the object or function denoted by the reference, and the
	// expression is an lvalue unless the reference is an rvalue reference and
	// the expression is a function call (possibly inside parentheses).
	assert(!LHS->getAs<ReferenceType>() && "LHS is a reference type?");
	assert(!RHS->getAs<ReferenceType>() && "RHS is a reference type?");

	if (Unqualified) {
	LHS = LHS.getUnqualifiedType();
	RHS = RHS.getUnqualifiedType();
	}

	QualType LHSCan = getCanonicalType(LHS),
	RHSCan = getCanonicalType(RHS);

	// If two types are identical, they are compatible.
	if (LHSCan == RHSCan)
	return LHS;

	// If the qualifiers are different, the types aren't compatible... mostly.
	Qualifiers LQuals = LHSCan.getLocalQualifiers();
	Qualifiers RQuals = RHSCan.getLocalQualifiers();
	if (LQuals != RQuals) {
	// If any of these qualifiers are different, we have a type
	// mismatch.
	if (LQuals.getCVRQualifiers() != RQuals.getCVRQualifiers() \|\|
	LQuals.getAddressSpace() != RQuals.getAddressSpace() \|\|
	LQuals.getObjCLifetime() != RQuals.getObjCLifetime() \|\|
	LQuals.hasUnaligned() != RQuals.hasUnaligned())
	return QualType();

	// Exactly one GC qualifier difference is allowed: __strong is
	// okay if the other type has no GC qualifier but is an Objective
	// C object pointer (i.e. implicitly strong by default). We fix
	// this by pretending that the unqualified type was actually
	// qualified __strong.
	Qualifiers::GC GC_L = LQuals.getObjCGCAttr();
	Qualifiers::GC GC_R = RQuals.getObjCGCAttr();
	assert((GC_L != GC_R) && "unequal qualifier sets had only equal elements");

	if (GC_L == Qualifiers::Weak \|\| GC_R == Qualifiers::Weak)
	return QualType();

	// NOTE(review): the recursive calls below pass only two arguments, so
	// OfBlockPointer / Unqualified / BlockReturnType are not forwarded.
	if (GC_L == Qualifiers::Strong && RHSCan->isObjCObjectPointerType()) {
	return mergeTypes(LHS, getObjCGCQualType(RHS, Qualifiers::Strong));
	}
	if (GC_R == Qualifiers::Strong && LHSCan->isObjCObjectPointerType()) {
	return mergeTypes(getObjCGCQualType(LHS, Qualifiers::Strong), RHS);
	}
	return QualType();
	}

	// Okay, qualifiers are equal.

	Type::TypeClass LHSClass = LHSCan->getTypeClass();
	Type::TypeClass RHSClass = RHSCan->getTypeClass();

	// We want to consider the two function types to be the same for these
	// comparisons, just force one to the other.
	if (LHSClass == Type::FunctionProto) LHSClass = Type::FunctionNoProto;
	if (RHSClass == Type::FunctionProto) RHSClass = Type::FunctionNoProto;

	// Same as above for arrays
	if (LHSClass == Type::VariableArray \|\| LHSClass == Type::IncompleteArray)
	LHSClass = Type::ConstantArray;
	if (RHSClass == Type::VariableArray \|\| RHSClass == Type::IncompleteArray)
	RHSClass = Type::ConstantArray;

	// ObjCInterfaces are just specialized ObjCObjects.
	if (LHSClass == Type::ObjCInterface) LHSClass = Type::ObjCObject;
	if (RHSClass == Type::ObjCInterface) RHSClass = Type::ObjCObject;

	// Canonicalize ExtVector -> Vector.
	if (LHSClass == Type::ExtVector) LHSClass = Type::Vector;
	if (RHSClass == Type::ExtVector) RHSClass = Type::Vector;

	// If the canonical type classes don't match.
	if (LHSClass != RHSClass) {
	// Note that we only have special rules for turning block enum
	// returns into block int returns, not vice-versa.
	if (const EnumType* ETy = LHS->getAs<EnumType>()) {
	return mergeEnumWithInteger(*this, ETy, RHS, false);
	}
	if (const EnumType* ETy = RHS->getAs<EnumType>()) {
	return mergeEnumWithInteger(*this, ETy, LHS, BlockReturnType);
	}
	// allow block pointer type to match an 'id' type.
	if (OfBlockPointer && !BlockReturnType) {
	if (LHS->isObjCIdType() && RHS->isBlockPointerType())
	return LHS;
	if (RHS->isObjCIdType() && LHS->isBlockPointerType())
	return RHS;
	}

	return QualType();
	}

	// The canonical type classes match.
	switch (LHSClass) {
	#define TYPE(Class, Base)
	#define ABSTRACT_TYPE(Class, Base)
	#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(Class, Base) case Type::Class:
	#define NON_CANONICAL_TYPE(Class, Base) case Type::Class:
	#define DEPENDENT_TYPE(Class, Base) case Type::Class:
	#include "clang/AST/TypeNodes.def"
	llvm_unreachable("Non-canonical and dependent types shouldn't get here");

	case Type::Auto:
	case Type::DeducedTemplateSpecialization:
	case Type::LValueReference:
	case Type::RValueReference:
	case Type::MemberPointer:
	llvm_unreachable("C++ should never be in mergeTypes");

	case Type::ObjCInterface:
	case Type::IncompleteArray:
	case Type::VariableArray:
	case Type::FunctionProto:
	case Type::ExtVector:
	llvm_unreachable("Types are eliminated above");

	case Type::Pointer:
	{
	// Merge two pointer types, while trying to preserve typedef info
	QualType LHSPointee = LHS->getAs<PointerType>()->getPointeeType();
	QualType RHSPointee = RHS->getAs<PointerType>()->getPointeeType();
	if (Unqualified) {
	LHSPointee = LHSPointee.getUnqualifiedType();
	RHSPointee = RHSPointee.getUnqualifiedType();
	}
	QualType ResultType = mergeTypes(LHSPointee, RHSPointee, false,
	Unqualified);
	if (ResultType.isNull()) return QualType();
	// Prefer returning one of the inputs unchanged when its pointee already
	// has the merged canonical type; only build a new type otherwise.
	if (getCanonicalType(LHSPointee) == getCanonicalType(ResultType))
	return LHS;
	if (getCanonicalType(RHSPointee) == getCanonicalType(ResultType))
	return RHS;
	return getPointerType(ResultType);
	}
	case Type::BlockPointer:
	{
	// Merge two block pointer types, while trying to preserve typedef info
	QualType LHSPointee = LHS->getAs<BlockPointerType>()->getPointeeType();
	QualType RHSPointee = RHS->getAs<BlockPointerType>()->getPointeeType();
	if (Unqualified) {
	LHSPointee = LHSPointee.getUnqualifiedType();
	RHSPointee = RHSPointee.getUnqualifiedType();
	}
	if (getLangOpts().OpenCL) {
	Qualifiers LHSPteeQual = LHSPointee.getQualifiers();
	Qualifiers RHSPteeQual = RHSPointee.getQualifiers();
	// Blocks can't be an expression in a ternary operator (OpenCL v2.0
	// 6.12.5) thus the following check is asymmetric.
	if (!LHSPteeQual.isAddressSpaceSupersetOf(RHSPteeQual))
	return QualType();
	// The address spaces are dropped from both pointees before the
	// recursive merge below.
	LHSPteeQual.removeAddressSpace();
	RHSPteeQual.removeAddressSpace();
	LHSPointee =
	QualType(LHSPointee.getTypePtr(), LHSPteeQual.getAsOpaqueValue());
	RHSPointee =
	QualType(RHSPointee.getTypePtr(), RHSPteeQual.getAsOpaqueValue());
	}
	QualType ResultType = mergeTypes(LHSPointee, RHSPointee, OfBlockPointer,
	Unqualified);
	if (ResultType.isNull()) return QualType();
	if (getCanonicalType(LHSPointee) == getCanonicalType(ResultType))
	return LHS;
	if (getCanonicalType(RHSPointee) == getCanonicalType(ResultType))
	return RHS;
	return getBlockPointerType(ResultType);
	}
	case Type::Atomic:
	{
	// Merge two atomic types, while trying to preserve typedef info
	QualType LHSValue = LHS->getAs<AtomicType>()->getValueType();
	QualType RHSValue = RHS->getAs<AtomicType>()->getValueType();
	if (Unqualified) {
	LHSValue = LHSValue.getUnqualifiedType();
	RHSValue = RHSValue.getUnqualifiedType();
	}
	QualType ResultType = mergeTypes(LHSValue, RHSValue, false,
	Unqualified);
	if (ResultType.isNull()) return QualType();
	if (getCanonicalType(LHSValue) == getCanonicalType(ResultType))
	return LHS;
	if (getCanonicalType(RHSValue) == getCanonicalType(ResultType))
	return RHS;
	return getAtomicType(ResultType);
	}
	case Type::ConstantArray:
	{
	// Variable and incomplete arrays were folded into this case above, so
	// LCAT / RCAT may be null here.
	const ConstantArrayType* LCAT = getAsConstantArrayType(LHS);
	const ConstantArrayType* RCAT = getAsConstantArrayType(RHS);
	if (LCAT && RCAT && RCAT->getSize() != LCAT->getSize())
	return QualType();

	QualType LHSElem = getAsArrayType(LHS)->getElementType();
	QualType RHSElem = getAsArrayType(RHS)->getElementType();
	if (Unqualified) {
	LHSElem = LHSElem.getUnqualifiedType();
	RHSElem = RHSElem.getUnqualifiedType();
	}

	QualType ResultType = mergeTypes(LHSElem, RHSElem, false, Unqualified);
	if (ResultType.isNull()) return QualType();
	// Constant-size arrays are tried first, then variable-size arrays, and
	// finally the incomplete-array fallback.
	if (LCAT && getCanonicalType(LHSElem) == getCanonicalType(ResultType))
	return LHS;
	if (RCAT && getCanonicalType(RHSElem) == getCanonicalType(ResultType))
	return RHS;
	if (LCAT) return getConstantArrayType(ResultType, LCAT->getSize(),
	ArrayType::ArraySizeModifier(), 0);
	if (RCAT) return getConstantArrayType(ResultType, RCAT->getSize(),
	ArrayType::ArraySizeModifier(), 0);
	const VariableArrayType* LVAT = getAsVariableArrayType(LHS);
	const VariableArrayType* RVAT = getAsVariableArrayType(RHS);
	if (LVAT && getCanonicalType(LHSElem) == getCanonicalType(ResultType))
	return LHS;
	if (RVAT && getCanonicalType(RHSElem) == getCanonicalType(ResultType))
	return RHS;
	if (LVAT) {
	// FIXME: This isn't correct! But tricky to implement because
	// the array's size has to be the size of LHS, but the type
	// has to be different.
	return LHS;
	}
	if (RVAT) {
	// FIXME: This isn't correct! But tricky to implement because
	// the array's size has to be the size of RHS, but the type
	// has to be different.
	return RHS;
	}
	if (getCanonicalType(LHSElem) == getCanonicalType(ResultType)) return LHS;
	if (getCanonicalType(RHSElem) == getCanonicalType(ResultType)) return RHS;
	return getIncompleteArrayType(ResultType,
	ArrayType::ArraySizeModifier(), 0);
	}
	case Type::FunctionNoProto:
	return mergeFunctionTypes(LHS, RHS, OfBlockPointer, Unqualified);
	case Type::Record:
	case Type::Enum:
	// Identical canonical types were already accepted above, so two types
	// reaching this point are distinct.
	return QualType();
	case Type::Builtin:
	// Only exactly equal builtin types are compatible, which is tested above.
	return QualType();
	case Type::Complex:
	// Distinct complex types are incompatible.
	return QualType();
	case Type::Vector:
	// FIXME: The merged type should be an ExtVector!
	if (areCompatVectorTypes(LHSCan->getAs<VectorType>(),
	RHSCan->getAs<VectorType>()))
	return LHS;
	return QualType();
	case Type::ObjCObject: {
	// Check if the types are assignment compatible.
	// FIXME: This should be type compatibility, e.g. whether
	// "LHS x; RHS x;" at global scope is legal.
	const ObjCObjectType* LHSIface = LHS->getAs<ObjCObjectType>();
	const ObjCObjectType* RHSIface = RHS->getAs<ObjCObjectType>();
	if (canAssignObjCInterfaces(LHSIface, RHSIface))
	return LHS;

	return QualType();
	}
	case Type::ObjCObjectPointer:
	if (OfBlockPointer) {
	if (canAssignObjCInterfacesInBlockPointer(
	LHS->getAs<ObjCObjectPointerType>(),
	RHS->getAs<ObjCObjectPointerType>(),
	BlockReturnType))
	return LHS;
	return QualType();
	}
	if (canAssignObjCInterfaces(LHS->getAs<ObjCObjectPointerType>(),
	RHS->getAs<ObjCObjectPointerType>()))
	return LHS;

	return QualType();
	case Type::Pipe:
	assert(LHS != RHS &&
	"Equivalent pipe types should have already been handled!");
	return QualType();
	}

	llvm_unreachable("Invalid Type::Class!");
	}

	bool ASTContext::mergeExtParameterInfo(
	const FunctionProtoType FirstFnType, const FunctionProtoType SecondFnType,
	bool &CanUseFirst, bool &CanUseSecond,
	SmallVectorImpl<FunctionProtoType::ExtParameterInfo> &NewParamInfos) {
	assert(NewParamInfos.empty() && "param info list not empty");
	CanUseFirst = CanUseSecond = true;
	bool FirstHasInfo = FirstFnType->hasExtParameterInfos();
	bool SecondHasInfo = SecondFnType->hasExtParameterInfos();

	// Fast path: if the first type doesn't have ext parameter infos,
	// we match if and only if the second type also doesn't have them.
	if (!FirstHasInfo && !SecondHasInfo)
	return true;

	bool NeedParamInfo = false;
	size_t E = FirstHasInfo ? FirstFnType->getExtParameterInfos().size()
	: SecondFnType->getExtParameterInfos().size();

	for (size_t I = 0; I < E; ++I) {
	FunctionProtoType::ExtParameterInfo FirstParam, SecondParam;
	if (FirstHasInfo)
	FirstParam = FirstFnType->getExtParameterInfo(I);
	if (SecondHasInfo)
	SecondParam = SecondFnType->getExtParameterInfo(I);

	// Cannot merge unless everything except the noescape flag matches.
	if (FirstParam.withIsNoEscape(false) != SecondParam.withIsNoEscape(false))
	return false;

	bool FirstNoEscape = FirstParam.isNoEscape();
	bool SecondNoEscape = SecondParam.isNoEscape();
	bool IsNoEscape = FirstNoEscape && SecondNoEscape;
	NewParamInfos.push_back(FirstParam.withIsNoEscape(IsNoEscape));
	if (NewParamInfos.back().getOpaqueValue())
	NeedParamInfo = true;
	if (FirstNoEscape != IsNoEscape)
	CanUseFirst = false;
	if (SecondNoEscape != IsNoEscape)
	CanUseSecond = false;
	}

	if (!NeedParamInfo)
	NewParamInfos.clear();

	return true;
	}

	/// Reset the ObjCLayouts entry for the container \p CD to null.
	void ASTContext::ResetObjCLayout(const ObjCContainerDecl *CD) {
	  // operator[] creates the entry if it is missing, exactly as a direct
	  // assignment through ObjCLayouts[CD] would.
	  auto &LayoutSlot = ObjCLayouts[CD];
	  LayoutSlot = nullptr;
	}

	/// mergeObjCGCQualifiers - This routine merges ObjC's GC attribute of 'LHS' and
	/// 'RHS' attributes and returns the merged version; including for function
	/// return types.
	///
	/// Returns a null QualType when the two types cannot be merged. For function
	/// types only prototyped functions are handled; when LHS is a function
	/// without a prototype the result is a null QualType.
	QualType ASTContext::mergeObjCGCQualifiers(QualType LHS, QualType RHS) {
	  QualType LHSCan = getCanonicalType(LHS),
	           RHSCan = getCanonicalType(RHS);
	  // If two types are identical, they are compatible.
	  if (LHSCan == RHSCan)
	    return LHS;
	  if (RHSCan->isFunctionType()) {
	    if (!LHSCan->isFunctionType())
	      return QualType();
	    // RHS is treated as the old declaration and LHS as the new one.
	    QualType OldReturnType =
	        cast<FunctionType>(RHSCan.getTypePtr())->getReturnType();
	    QualType NewReturnType =
	        cast<FunctionType>(LHSCan.getTypePtr())->getReturnType();
	    QualType ResReturnType =
	        mergeObjCGCQualifiers(NewReturnType, OldReturnType);
	    if (ResReturnType.isNull())
	      return QualType();
	    if (ResReturnType == NewReturnType || ResReturnType == OldReturnType) {
	      // id foo(); ... __strong id foo(); or: __strong id foo(); ... id foo();
	      // In either case, use OldReturnType to build the new function type.
	      const FunctionType *F = LHS->getAs<FunctionType>();
	      // dyn_cast, not cast: cast<> asserts on a function type without a
	      // prototype and can never yield null, which made this test vacuous.
	      if (const FunctionProtoType *FPT = dyn_cast<FunctionProtoType>(F)) {
	        FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
	        EPI.ExtInfo = getFunctionExtInfo(LHS);
	        QualType ResultType =
	            getFunctionType(OldReturnType, FPT->getParamTypes(), EPI);
	        return ResultType;
	      }
	    }
	    return QualType();
	  }

	  // If the qualifiers are different, the types can still be merged.
	  Qualifiers LQuals = LHSCan.getLocalQualifiers();
	  Qualifiers RQuals = RHSCan.getLocalQualifiers();
	  if (LQuals != RQuals) {
	    // If any of these qualifiers are different, we have a type mismatch.
	    if (LQuals.getCVRQualifiers() != RQuals.getCVRQualifiers() ||
	        LQuals.getAddressSpace() != RQuals.getAddressSpace())
	      return QualType();

	    // Exactly one GC qualifier difference is allowed: __strong is
	    // okay if the other type has no GC qualifier but is an Objective
	    // C object pointer (i.e. implicitly strong by default). We fix
	    // this by pretending that the unqualified type was actually
	    // qualified __strong.
	    Qualifiers::GC GC_L = LQuals.getObjCGCAttr();
	    Qualifiers::GC GC_R = RQuals.getObjCGCAttr();
	    assert((GC_L != GC_R) && "unequal qualifier sets had only equal elements");

	    if (GC_L == Qualifiers::Weak || GC_R == Qualifiers::Weak)
	      return QualType();

	    // Whichever side is explicitly __strong wins.
	    if (GC_L == Qualifiers::Strong)
	      return LHS;
	    if (GC_R == Qualifiers::Strong)
	      return RHS;
	    return QualType();
	  }

	  // Same qualifiers: for two Objective-C object pointers, merge the pointees
	  // and return whichever side's pointee equals the merged result.
	  if (LHSCan->isObjCObjectPointerType() && RHSCan->isObjCObjectPointerType()) {
	    QualType LHSBaseQT = LHS->getAs<ObjCObjectPointerType>()->getPointeeType();
	    QualType RHSBaseQT = RHS->getAs<ObjCObjectPointerType>()->getPointeeType();
	    QualType ResQT = mergeObjCGCQualifiers(LHSBaseQT, RHSBaseQT);
	    if (ResQT == LHSBaseQT)
	      return LHS;
	    if (ResQT == RHSBaseQT)
	      return RHS;
	  }
	  return QualType();
	}

	//===----------------------------------------------------------------------===//
	// Integer Predicates
	//===----------------------------------------------------------------------===//

	/// Return the width of \p T: 1 for a boolean type, otherwise the result of
	/// getTypeSize(). An enum type is first replaced by the type its declaration
	/// reports through getIntegerType().
	unsigned ASTContext::getIntWidth(QualType T) const {
	  const EnumType *Enum = T->getAs<EnumType>();
	  QualType Measured = Enum ? Enum->getDecl()->getIntegerType() : T;

	  // Booleans are reported as a single bit regardless of their storage size.
	  if (Measured->isBooleanType())
	    return 1;

	  // For builtin types, just use the standard type sizing method.
	  return static_cast<unsigned>(getTypeSize(Measured));
	}

	/// Return the unsigned counterpart of the signed integer type \p T.
	///
	/// Vectors are converted element-wise, and enums are converted through the
	/// type their declaration reports via getIntegerType(). \p T must have a
	/// signed integer representation.
	QualType ASTContext::getCorrespondingUnsignedType(QualType T) const {
	  assert(T->hasSignedIntegerRepresentation() && "Unexpected type");

	  // Turn <4 x signed int> -> <4 x unsigned int>, keeping the element count
	  // and the vector kind.
	  if (const VectorType *Vec = T->getAs<VectorType>()) {
	    QualType UnsignedElem = getCorrespondingUnsignedType(Vec->getElementType());
	    return getVectorType(UnsignedElem, Vec->getNumElements(),
	                         Vec->getVectorKind());
	  }

	  // For enums, we return the unsigned version of the base type.
	  QualType Scalar = T;
	  if (const EnumType *Enum = Scalar->getAs<EnumType>())
	    Scalar = Enum->getDecl()->getIntegerType();

	  const BuiltinType *BT = Scalar->getAs<BuiltinType>();
	  assert(BT && "Unexpected signed integer type");

	  switch (BT->getKind()) {
	  case BuiltinType::Int128:
	    return UnsignedInt128Ty;
	  case BuiltinType::LongLong:
	    return UnsignedLongLongTy;
	  case BuiltinType::Long:
	    return UnsignedLongTy;
	  case BuiltinType::Int:
	    return UnsignedIntTy;
	  case BuiltinType::Short:
	    return UnsignedShortTy;
	  case BuiltinType::SChar:
	  case BuiltinType::Char_S:
	    return UnsignedCharTy;
	  default:
	    llvm_unreachable("Unexpected signed integer type");
	  }
	}

	// Out-of-line defaulted destructor for ASTMutationListener.
	ASTMutationListener::~ASTMutationListener() = default;

	// Default implementation of the DeducedReturnType callback: does nothing.
	void ASTMutationListener::DeducedReturnType(const FunctionDecl *FD,
	QualType ReturnType) {}

	//===----------------------------------------------------------------------===//
	// Builtin Type Computation
	//===----------------------------------------------------------------------===//

	/// DecodeTypeFromStr - This decodes one type descriptor from Str, advancing the
	/// pointer over the consumed characters. This returns the resultant type. If
	/// AllowTypeModifiers is false then modifier like * are not parsed, just basic
	/// types. This allows "v2i*" to be parsed as a pointer to a v2i instead of
	/// a vector of "i*".
	///
	/// RequiresICE is filled in on return to indicate whether the value is required
	/// to be an Integer Constant Expression.
	///
	/// A descriptor is read in three steps: prefix modifiers (I, S, U, L, N, W),
	/// one base-type letter, and then, if AllowTypeModifiers is set, suffix
	/// modifiers (*, &, C, D, R).
	///
	/// Error is written only on failure: when the FILE, jmp_buf / sigjmp_buf or
	/// ucontext_t type is not available in Context, Error is set to
	/// GE_Missing_stdio, GE_Missing_setjmp or GE_Missing_ucontext respectively
	/// and a null QualType is returned.
	static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
	ASTContext::GetBuiltinTypeError &Error,
	bool &RequiresICE,
	bool AllowTypeModifiers) {
	// Modifiers.
	int HowLong = 0;
	bool Signed = false, Unsigned = false;
	RequiresICE = false;

	// Read the prefixed modifiers first.
	bool Done = false;
	// IsSpecialLong is only read by assertions, so it exists only in builds
	// where assertions are enabled.
	#ifndef NDEBUG
	bool IsSpecialLong = false;
	#endif
	while (!Done) {
	switch (*Str++) {
	// Not a prefix modifier: put the character back and stop.
	default: Done = true; --Str; break;
	case 'I':
	RequiresICE = true;
	break;
	case 'S':
	assert(!Unsigned && "Can't use both 'S' and 'U' modifiers!");
	assert(!Signed && "Can't use 'S' modifier multiple times!");
	Signed = true;
	break;
	case 'U':
	assert(!Signed && "Can't use both 'S' and 'U' modifiers!");
	assert(!Unsigned && "Can't use 'U' modifier multiple times!");
	Unsigned = true;
	break;
	case 'L':
	assert(!IsSpecialLong && "Can't use 'L' with 'W' or 'N' modifiers");
	assert(HowLong <= 2 && "Can't have LLLL modifier");
	++HowLong;
	break;
	case 'N':
	// 'N' behaves like 'L' for all non LP64 targets and 'int' otherwise.
	assert(!IsSpecialLong && "Can't use two 'N' or 'W' modifiers!");
	assert(HowLong == 0 && "Can't use both 'L' and 'N' modifiers!");
	#ifndef NDEBUG
	IsSpecialLong = true;
	#endif
	if (Context.getTargetInfo().getLongWidth() == 32)
	++HowLong;
	break;
	case 'W':
	// This modifier represents int64 type.
	assert(!IsSpecialLong && "Can't use two 'N' or 'W' modifiers!");
	assert(HowLong == 0 && "Can't use both 'L' and 'W' modifiers!");
	#ifndef NDEBUG
	IsSpecialLong = true;
	#endif
	switch (Context.getTargetInfo().getInt64Type()) {
	default:
	llvm_unreachable("Unexpected integer type");
	case TargetInfo::SignedLong:
	HowLong = 1;
	break;
	case TargetInfo::SignedLongLong:
	HowLong = 2;
	break;
	}
	break;
	}
	}

	QualType Type;

	// Read the base type.
	switch (*Str++) {
	default: llvm_unreachable("Unknown builtin type letter!");
	case 'v':
	assert(HowLong == 0 && !Signed && !Unsigned &&
	"Bad modifiers used with 'v'!");
	Type = Context.VoidTy;
	break;
	case 'h':
	assert(HowLong == 0 && !Signed && !Unsigned &&
	"Bad modifiers used with 'h'!");
	Type = Context.HalfTy;
	break;
	case 'f':
	assert(HowLong == 0 && !Signed && !Unsigned &&
	"Bad modifiers used with 'f'!");
	Type = Context.FloatTy;
	break;
	case 'd':
	assert(HowLong < 2 && !Signed && !Unsigned &&
	"Bad modifiers used with 'd'!");
	if (HowLong)
	Type = Context.LongDoubleTy;
	else
	Type = Context.DoubleTy;
	break;
	case 's':
	assert(HowLong == 0 && "Bad modifiers used with 's'!");
	if (Unsigned)
	Type = Context.UnsignedShortTy;
	else
	Type = Context.ShortTy;
	break;
	case 'i':
	// HowLong selects int (0), long (1), long long (2) or __int128 (3).
	if (HowLong == 3)
	Type = Unsigned ? Context.UnsignedInt128Ty : Context.Int128Ty;
	else if (HowLong == 2)
	Type = Unsigned ? Context.UnsignedLongLongTy : Context.LongLongTy;
	else if (HowLong == 1)
	Type = Unsigned ? Context.UnsignedLongTy : Context.LongTy;
	else
	Type = Unsigned ? Context.UnsignedIntTy : Context.IntTy;
	break;
	case 'c':
	assert(HowLong == 0 && "Bad modifiers used with 'c'!");
	if (Signed)
	Type = Context.SignedCharTy;
	else if (Unsigned)
	Type = Context.UnsignedCharTy;
	else
	Type = Context.CharTy;
	break;
	case 'b': // boolean
	assert(HowLong == 0 && !Signed && !Unsigned && "Bad modifiers for 'b'!");
	Type = Context.BoolTy;
	break;
	case 'z': // size_t.
	assert(HowLong == 0 && !Signed && !Unsigned && "Bad modifiers for 'z'!");
	Type = Context.getSizeType();
	break;
	case 'w': // wchar_t.
	assert(HowLong == 0 && !Signed && !Unsigned && "Bad modifiers for 'w'!");
	Type = Context.getWideCharType();
	break;
	case 'F':
	Type = Context.getCFConstantStringType();
	break;
	case 'G':
	Type = Context.getObjCIdType();
	break;
	case 'H':
	Type = Context.getObjCSelType();
	break;
	case 'M':
	Type = Context.getObjCSuperType();
	break;
	case 'a':
	Type = Context.getBuiltinVaListType();
	assert(!Type.isNull() && "builtin va list type not initialized!");
	break;
	case 'A':
	// This is a "reference" to a va_list; however, what exactly
	// this means depends on how va_list is defined. There are two
	// different kinds of va_list: ones passed by value, and ones
	// passed by reference. An example of a by-value va_list is
	// x86, where va_list is a char*. An example of by-ref va_list
	// is x86-64, where va_list is a __va_list_tag[1]. For x86,
	// we want this argument to be a char*&; for x86-64, we want
	// it to be a __va_list_tag*.
	Type = Context.getBuiltinVaListType();
	assert(!Type.isNull() && "builtin va list type not initialized!");
	if (Type->isArrayType())
	Type = Context.getArrayDecayedType(Type);
	else
	Type = Context.getLValueReferenceType(Type);
	break;
	case 'V': {
	// 'V' <decimal element count> <element type>: a generic vector.
	char *End;
	unsigned NumElements = strtoul(Str, &End, 10);
	assert(End != Str && "Missing vector size");
	Str = End;

	// The element type is parsed without suffix modifiers, so a following
	// '*' applies to the vector as a whole.
	QualType ElementType = DecodeTypeFromStr(Str, Context, Error,
	RequiresICE, false);
	assert(!RequiresICE && "Can't require vector ICE");

	// TODO: No way to make AltiVec vectors in builtins yet.
	Type = Context.getVectorType(ElementType, NumElements,
	VectorType::GenericVector);
	break;
	}
	case 'E': {
	// 'E' <decimal element count> <element type>: an ext-vector.
	char *End;

	unsigned NumElements = strtoul(Str, &End, 10);
	assert(End != Str && "Missing vector size");

	Str = End;

	QualType ElementType = DecodeTypeFromStr(Str, Context, Error, RequiresICE,
	false);
	Type = Context.getExtVectorType(ElementType, NumElements);
	break;
	}
	case 'X': {
	// 'X' <element type>: a complex type.
	QualType ElementType = DecodeTypeFromStr(Str, Context, Error, RequiresICE,
	false);
	assert(!RequiresICE && "Can't require complex ICE");
	Type = Context.getComplexType(ElementType);
	break;
	}
	case 'Y':
	Type = Context.getPointerDiffType();
	break;
	case 'P':
	Type = Context.getFILEType();
	if (Type.isNull()) {
	Error = ASTContext::GE_Missing_stdio;
	return QualType();
	}
	break;
	case 'J':
	// The 'S' prefix selects sigjmp_buf instead of jmp_buf.
	if (Signed)
	Type = Context.getsigjmp_bufType();
	else
	Type = Context.getjmp_bufType();

	if (Type.isNull()) {
	Error = ASTContext::GE_Missing_setjmp;
	return QualType();
	}
	break;
	case 'K':
	assert(HowLong == 0 && !Signed && !Unsigned && "Bad modifiers for 'K'!");
	Type = Context.getucontext_tType();

	if (Type.isNull()) {
	Error = ASTContext::GE_Missing_ucontext;
	return QualType();
	}
	break;
	case 'p':
	Type = Context.getProcessIDType();
	break;
	}

	// If there are modifiers and if we're allowed to parse them, go for it.
	Done = !AllowTypeModifiers;
	while (!Done) {
	switch (char c = *Str++) {
	// Not a suffix modifier: put the character back and stop.
	default: Done = true; --Str; break;
	case '*':
	case '&': {
	// Both pointers and references can have their pointee types
	// qualified with an address space.
	char *End;
	unsigned AddrSpace = strtoul(Str, &End, 10);
	// NOTE(review): when digits are present but parse to 0, Str is not
	// advanced past them, so they are left for the next switch iteration
	// -- confirm no type string relies on an explicit address space 0.
	if (End != Str && AddrSpace != 0) {
	Type = Context.getAddrSpaceQualType(Type,
	getLangASFromTargetAS(AddrSpace));
	Str = End;
	}
	if (c == '*')
	Type = Context.getPointerType(Type);
	else
	Type = Context.getLValueReferenceType(Type);
	break;
	}
	// FIXME: There's no way to have a built-in with an rvalue ref arg.
	case 'C':
	Type = Type.withConst();
	break;
	case 'D':
	Type = Context.getVolatileType(Type);
	break;
	case 'R':
	Type = Type.withRestrict();
	break;
	}
	}

	assert((!RequiresICE \|\| Type->isIntegralOrEnumerationType()) &&
	"Integer constant 'I' type must be an integer");

	return Type;
	}

	/// GetBuiltinType - Return the type for the specified builtin.
	///
	/// The type is decoded from the builtin's type string: the first descriptor
	/// is the result type, the following ones are the argument types, and a
	/// trailing '.' marks the builtin as variadic.
	///
	/// \param Id the builtin to build a type for.
	/// \param Error set to GE_None on entry and to the decoding error when a
	/// descriptor cannot be decoded.
	/// \param IntegerConstantArgs if non-null, bit N is OR-ed in for every
	/// argument N that must be an integer constant expression. Bits are only
	/// ever added, so the caller has to initialise the value.
	/// \returns the function type of the builtin, or a null QualType on a
	/// decoding error or for Builtin::BI__GetExceptionInfo.
	QualType ASTContext::GetBuiltinType(unsigned Id,
	                                    GetBuiltinTypeError &Error,
	                                    unsigned *IntegerConstantArgs) const {
	  const char *TypeStr = BuiltinInfo.getTypeString(Id);

	  SmallVector<QualType, 8> ArgTypes;

	  bool RequiresICE = false;
	  Error = GE_None;
	  QualType ResType = DecodeTypeFromStr(TypeStr, *this, Error,
	                                       RequiresICE, true);
	  if (Error != GE_None)
	    return QualType();

	  assert(!RequiresICE && "Result of intrinsic cannot be required to be an ICE");

	  while (TypeStr[0] && TypeStr[0] != '.') {
	    QualType Ty = DecodeTypeFromStr(TypeStr, *this, Error, RequiresICE, true);
	    if (Error != GE_None)
	      return QualType();

	    // If this argument is required to be an IntegerConstantExpression and the
	    // caller cares, fill in the bitmask we return. The mask is unsigned, so
	    // shift an unsigned one to keep the top bit well-defined.
	    if (RequiresICE && IntegerConstantArgs)
	      *IntegerConstantArgs |= 1u << ArgTypes.size();

	    // Do array -> pointer decay. The builtin should use the decayed type.
	    if (Ty->isArrayType())
	      Ty = getArrayDecayedType(Ty);

	    ArgTypes.push_back(Ty);
	  }

	  // No function type is built for this builtin; the arguments were still
	  // decoded above, so IntegerConstantArgs has been filled in.
	  if (Id == Builtin::BI__GetExceptionInfo)
	    return QualType();

	  assert((TypeStr[0] != '.' || TypeStr[1] == 0) &&
	         "'.' should only occur at end of builtin type list!");

	  FunctionType::ExtInfo EI(CC_C);
	  if (BuiltinInfo.isNoReturn(Id)) EI = EI.withNoReturn(true);

	  bool Variadic = (TypeStr[0] == '.');

	  // We really shouldn't be making a no-proto type here.
	  if (ArgTypes.empty() && Variadic && !getLangOpts().CPlusPlus)
	    return getFunctionNoProtoType(ResType, EI);

	  FunctionProtoType::ExtProtoInfo EPI;
	  EPI.ExtInfo = EI;
	  EPI.Variadic = Variadic;
	  // In C++, builtins that cannot throw get a non-throwing exception
	  // specification: noexcept from C++11 on, throw() before that.
	  if (getLangOpts().CPlusPlus && BuiltinInfo.isNoThrow(Id))
	    EPI.ExceptionSpec.Type =
	        getLangOpts().CPlusPlus11 ? EST_BasicNoexcept : EST_DynamicNone;

	  return getFunctionType(ResType, ArgTypes, EPI);
	}

	/// Compute the linkage of the function \p FD from its visibility, template
	/// specialization kind and inline semantics, before any attribute-based or
	/// external-source adjustments are applied.
	static GVALinkage basicGVALinkageForFunction(const ASTContext &Context,
	                                             const FunctionDecl *FD) {
	  if (!FD->isExternallyVisible())
	    return GVA_Internal;

	  // Non-user-provided functions get emitted as weak definitions with every
	  // use, no matter whether they've been explicitly instantiated etc.
	  if (auto *Method = dyn_cast<CXXMethodDecl>(FD)) {
	    if (!Method->isUserProvided())
	      return GVA_DiscardableODR;
	  }

	  // Linkage to use when the function turns out not to need inline handling.
	  GVALinkage NonInlineLinkage;
	  switch (FD->getTemplateSpecializationKind()) {
	  case TSK_ExplicitInstantiationDefinition:
	    return GVA_StrongODR;

	  // C++11 [temp.explicit]p10:
	  //   [ Note: The intent is that an inline function that is the subject of
	  //   an explicit instantiation declaration will still be implicitly
	  //   instantiated when used so that the body can be considered for
	  //   inlining, but that no out-of-line copy of the inline function would be
	  //   generated in the translation unit. -- end note ]
	  case TSK_ExplicitInstantiationDeclaration:
	    return GVA_AvailableExternally;

	  case TSK_Undeclared:
	  case TSK_ExplicitSpecialization:
	    NonInlineLinkage = GVA_StrongExternal;
	    break;

	  case TSK_ImplicitInstantiation:
	    NonInlineLinkage = GVA_DiscardableODR;
	    break;
	  }

	  if (!FD->isInlined())
	    return NonInlineLinkage;

	  // GNU or C99 inline semantics apply outside C++ (unless targeting the
	  // Microsoft ABI or the function is dllexport), and always for gnu_inline.
	  // FIXME: This doesn't match gcc's behavior for dllexport inline functions.
	  const bool UsesGNUOrC99Inline =
	      (!Context.getLangOpts().CPlusPlus &&
	       !Context.getTargetInfo().getCXXABI().isMicrosoft() &&
	       !FD->hasAttr<DLLExportAttr>()) ||
	      FD->hasAttr<GNUInlineAttr>();
	  if (UsesGNUOrC99Inline) {
	    // Determine whether this symbol should be externally visible; under C99
	    // inline semantics it may not be.
	    return FD->isInlineDefinitionExternallyVisible() ? NonInlineLinkage
	                                                     : GVA_AvailableExternally;
	  }

	  // Functions specified with extern and inline in -fms-compatibility mode
	  // forcibly get emitted. While the body of the function cannot be later
	  // replaced, the function definition cannot be discarded.
	  return FD->isMSExternInline() ? GVA_StrongODR : GVA_DiscardableODR;
	}

	/// Adjust the linkage \p L of \p D for dllimport / dllexport attributes and
	/// for CUDA device-side __global__ functions. Returns \p L unchanged when no
	/// rule applies.
	static GVALinkage adjustGVALinkageForAttributes(const ASTContext &Context,
	                                                const Decl *D, GVALinkage L) {
	  // See http://msdn.microsoft.com/en-us/library/xa0d9ste.aspx
	  // dllexport/dllimport on inline functions.
	  if (D->hasAttr<DLLImportAttr>()) {
	    const bool IsODR = L == GVA_DiscardableODR || L == GVA_StrongODR;
	    return IsODR ? GVA_AvailableExternally : L;
	  }

	  if (D->hasAttr<DLLExportAttr>())
	    return L == GVA_DiscardableODR ? GVA_StrongODR : L;

	  const LangOptions &Opts = Context.getLangOpts();
	  if (Opts.CUDA && Opts.CUDAIsDevice && D->hasAttr<CUDAGlobalAttr>()) {
	    // Device-side functions with __global__ attribute must always be
	    // visible externally so they can be launched from host.
	    if (L == GVA_DiscardableODR || L == GVA_Internal)
	      return GVA_StrongODR;
	  }

	  return L;
	}

	/// Refine the linkage \p L of \p D using what the external AST source, if
	/// there is one, reports about definitions of \p D existing elsewhere.
	static GVALinkage
	adjustGVALinkageForExternalDefinitionKind(const ASTContext &Ctx, const Decl *D,
	                                          GVALinkage L) {
	  if (ExternalASTSource *Ext = Ctx.getExternalSource()) {
	    switch (Ext->hasExternalDefinitions(D)) {
	    case ExternalASTSource::EK_Always:
	      return GVA_AvailableExternally;

	    case ExternalASTSource::EK_Never:
	      // Other translation units rely on us to provide the definition.
	      return L == GVA_DiscardableODR ? GVA_StrongODR : L;

	    case ExternalASTSource::EK_ReplyHazy:
	      return L;
	    }
	  }

	  // No external source (or an unrecognised answer): keep the linkage as is.
	  return L;
	}

	/// Compute the linkage of \p FD: the basic linkage, adjusted first for
	/// attributes and then for the external definition kind.
	GVALinkage ASTContext::GetGVALinkageForFunction(const FunctionDecl *FD) const {
	  GVALinkage Basic = basicGVALinkageForFunction(*this, FD);
	  GVALinkage WithAttrs = adjustGVALinkageForAttributes(*this, FD, Basic);
	  return adjustGVALinkageForExternalDefinitionKind(*this, FD, WithAttrs);
	}

	/// Compute the linkage of the variable \p VD from its visibility, whether it
	/// is a static local, its inline-variable definition kind and its template
	/// specialization kind, before any attribute-based or external-source
	/// adjustments are applied.
	static GVALinkage basicGVALinkageForVariable(const ASTContext &Context,
	                                             const VarDecl *VD) {
	  if (!VD->isExternallyVisible())
	    return GVA_Internal;

	  if (VD->isStaticLocal()) {
	    // Walk outwards through the lexical parents to the nearest function.
	    const DeclContext *Enclosing = VD->getParentFunctionOrMethod();
	    for (; Enclosing && !isa<FunctionDecl>(Enclosing);
	         Enclosing = Enclosing->getLexicalParent()) {
	    }

	    // ObjC Blocks can create local variables that don't have a FunctionDecl
	    // LexicalContext.
	    if (!Enclosing)
	      return GVA_DiscardableODR;

	    // Otherwise, let the static local variable inherit its linkage from the
	    // nearest enclosing function.
	    GVALinkage FnLinkage =
	        Context.GetGVALinkageForFunction(cast<FunctionDecl>(Enclosing));

	    // Itanium ABI 5.2.2: "Each COMDAT group [for a static local variable] must
	    // be emitted in any object with references to the symbol for the object it
	    // contains, whether inline or out-of-line."
	    // Similar behavior is observed with MSVC. An alternative ABI could use
	    // StrongODR/AvailableExternally to match the function, but none are
	    // known/supported currently.
	    switch (FnLinkage) {
	    case GVA_StrongODR:
	    case GVA_AvailableExternally:
	      return GVA_DiscardableODR;
	    default:
	      return FnLinkage;
	    }
	  }

	  // MSVC treats in-class initialized static data members as definitions.
	  // By giving them non-strong linkage, out-of-line definitions won't
	  // cause link errors.
	  if (Context.isMSStaticDataMemberInlineDefinition(VD))
	    return GVA_DiscardableODR;

	  // Most non-template variables have strong linkage; inline variables are
	  // linkonce_odr or (occasionally, for compatibility) weak_odr.
	  GVALinkage NonTemplateLinkage;
	  switch (Context.getInlineVariableDefinitionKind(VD)) {
	  case ASTContext::InlineVariableDefinitionKind::Strong:
	    NonTemplateLinkage = GVA_StrongODR;
	    break;
	  case ASTContext::InlineVariableDefinitionKind::Weak:
	  case ASTContext::InlineVariableDefinitionKind::WeakUnknown:
	    NonTemplateLinkage = GVA_DiscardableODR;
	    break;
	  case ASTContext::InlineVariableDefinitionKind::None:
	    NonTemplateLinkage = GVA_StrongExternal;
	    break;
	  }

	  switch (VD->getTemplateSpecializationKind()) {
	  case TSK_ImplicitInstantiation:
	    return GVA_DiscardableODR;

	  case TSK_ExplicitInstantiationDeclaration:
	    return GVA_AvailableExternally;

	  case TSK_ExplicitInstantiationDefinition:
	    return GVA_StrongODR;

	  case TSK_ExplicitSpecialization:
	    // Under the Microsoft ABI an explicitly specialised static data member
	    // is strong ODR; everything else keeps the non-template linkage.
	    if (Context.getTargetInfo().getCXXABI().isMicrosoft() &&
	        VD->isStaticDataMember())
	      return GVA_StrongODR;
	    return NonTemplateLinkage;

	  case TSK_Undeclared:
	    return NonTemplateLinkage;
	  }

	  llvm_unreachable("Invalid Linkage!");
	}

	/// Compute the linkage of \p VD: the basic linkage, adjusted first for
	/// attributes and then for the external definition kind.
	GVALinkage ASTContext::GetGVALinkageForVariable(const VarDecl *VD) {
	  GVALinkage Basic = basicGVALinkageForVariable(*this, VD);
	  GVALinkage WithAttrs = adjustGVALinkageForAttributes(*this, VD, Basic);
	  return adjustGVALinkageForExternalDefinitionKind(*this, VD, WithAttrs);
	}

	bool ASTContext::DeclMustBeEmitted(const Decl *D) {
	if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
	if (!VD->isFileVarDecl())
	return false;
	// Global named register variables (GNU extension) are never emitted.
	if (VD->getStorageClass() == SC_Register)
	return false;
	if (VD->getDescribedVarTemplate() \|\|
	isa<VarTemplatePartialSpecializationDecl>(VD))
	return false;
	} else if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
	// We never need to emit an uninstantiated function template.
	if (FD->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate)
	return false;
	} else if (isa<PragmaCommentDecl>(D))
	return true;
	else if (isa<OMPThreadPrivateDecl>(D) \|\|
	D->hasAttr<OMPDeclareTargetDeclAttr>())
	return true;
	else if (isa<PragmaDetectMismatchDecl>(D))
	return true;
	else if (isa<OMPThreadPrivateDecl>(D))
	return !D->getDeclContext()->isDependentContext();
	else if (isa<OMPDeclareReductionDecl>(D))
	return !D->getDeclContext()->isDependentContext();
	else if (isa<ImportDecl>(D))
	return true;
	else
	return false;

	// If this is a member of a class template, we do not need to emit it.
	if (D->getDeclContext()->isDependentContext())
	return false;

	// Weak references don't produce any output by themselves.
	if (D->hasAttr<WeakRefAttr>())
	return false;

	// Aliases and used decls are required.
	if (D->hasAttr<AliasAttr>() \|\| D->hasAttr<UsedAttr>())
	return true;

	if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
	// Forward declarations aren't required.
	if (!FD->doesThisDeclarationHaveABody())
	return FD->doesDeclarationForceExternallyVisibleDefinition();

	// Constructors and destructors are required.
	if (FD->hasAttr<ConstructorAttr>() \|\| FD->hasAttr<DestructorAttr>())
	return true;

	// The key function for a class is required. This rule only comes
	// into play when inline functions can be key functions, though.
	if (getTargetInfo().getCXXABI().canKeyFunctionBeInline()) {
	if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD)) {
	const CXXRecordDecl *RD = MD->getParent();
	if (MD->isOutOfLine() && RD->isDynamicClass()) {
	const CXXMethodDecl *KeyFunc = getCurrentKeyFunction(RD);
	if (KeyFunc && KeyFunc->getCanonicalDecl() == MD->getCanonicalDecl())
	return true;
	}
	}
	}

	GVALinkage Linkage = GetGVALinkageForFunction(FD);

	// static, static inline, always_inline, and extern inline functions can
	// always be deferred. Normal inline functions can be deferred in C99/C++.
	// Implicit template instantiations can also be deferred in C++.
	return !isDiscardableGVALinkage(Linkage);
	}

	const VarDecl *VD = cast<VarDecl>(D);
	assert(VD->isFileVarDecl() && "Expected file scoped var");

	if (VD->isThisDeclarationADefinition() == VarDecl::DeclarationOnly &&
	!isMSStaticDataMemberInlineDefinition(VD))
	return false;

	// Variables that can be needed in other TUs are required.
	auto Linkage = GetGVALinkageForVariable(VD);
	if (!isDiscardableGVALinkage(Linkage))
	return true;

	// We never need to emit a variable that is available in another TU.
	if (Linkage == GVA_AvailableExternally)
	return false;

	// Variables that have destruction with side-effects are required.
	if (VD->getType().isDestructedType())
	return true;

	// Variables that have initialization with side-effects are required.
	if (VD->getInit() && VD->getInit()->HasSideEffects(*this) &&
	// We can get a value-dependent initializer during error recovery.
	(VD->getInit()->isValueDependent() \|\| !VD->evaluateValue()))
	return true;

	// Likewise, variables with tuple-like bindings are required if their
	// bindings have side-effects.
	if (auto *DD = dyn_cast<DecompositionDecl>(VD))
	for (auto *BD : DD->bindings())
	if (auto *BindingVD = BD->getHoldingVar())
	if (DeclMustBeEmitted(BindingVD))
	return true;

	return false;
	}

	/// Pick the default calling convention for a function.
	///
	/// C++ methods are delegated to the C++ ABI object.  For everything else the
	/// language-option default is honoured when it is applicable to the function
	/// (several conventions are skipped for variadic functions); otherwise the
	/// target's default for TargetInfo::CCMT_Unknown is returned.
	CallingConv ASTContext::getDefaultCallingConvention(bool IsVariadic,
	                                                    bool IsCXXMethod) const {
	  // Pass through to the C++ ABI object
	  if (IsCXXMethod)
	    return ABI->getDefaultMethodCallConv(IsVariadic);

	  const bool FixedArgs = !IsVariadic;
	  switch (LangOpts.getDefaultCallingConv()) {
	  case LangOptions::DCC_None:
	    break;
	  case LangOptions::DCC_CDecl:
	    return CC_C;
	  case LangOptions::DCC_FastCall: {
	    const bool HasSSE2 = getTargetInfo().hasFeature("sse2");
	    if (HasSSE2 && FixedArgs)
	      return CC_X86FastCall;
	    break;
	  }
	  case LangOptions::DCC_StdCall:
	    if (FixedArgs)
	      return CC_X86StdCall;
	    break;
	  case LangOptions::DCC_VectorCall:
	    // __vectorcall cannot be applied to variadic functions.
	    if (FixedArgs)
	      return CC_X86VectorCall;
	    break;
	  case LangOptions::DCC_RegCall:
	    // __regcall cannot be applied to variadic functions.
	    if (FixedArgs)
	      return CC_X86RegCall;
	    break;
	  }
	  // No applicable language-level default: ask the target.
	  return Target->getDefaultCallingConv(TargetInfo::CCMT_Unknown);
	}

	/// Ask the C++ ABI object whether \p RD is "nearly empty"; the answer is
	/// forwarded unchanged.
	bool ASTContext::isNearlyEmpty(const CXXRecordDecl *RD) const {
	  const bool NearlyEmpty = ABI->isNearlyEmpty(RD);
	  return NearlyEmpty;
	}

	/// Return the vtable context, creating it on first use.  The Microsoft
	/// flavour is created when the target C++ ABI reports isMicrosoft(), the
	/// Itanium flavour otherwise.
	VTableContextBase *ASTContext::getVTableContext() {
	  if (VTContext.get())
	    return VTContext.get();

	  if (Target->getCXXABI().isMicrosoft())
	    VTContext.reset(new MicrosoftVTableContext(*this));
	  else
	    VTContext.reset(new ItaniumVTableContext(*this));
	  return VTContext.get();
	}

	/// Create a mangle context matching the target's C++ ABI kind: the Microsoft
	/// mangler for TargetCXXABI::Microsoft and the Itanium mangler for every
	/// other kind listed below.
	MangleContext *ASTContext::createMangleContext() {
	  const auto ABIKind = Target->getCXXABI().getKind();
	  switch (ABIKind) {
	  case TargetCXXABI::Microsoft:
	    return MicrosoftMangleContext::create(*this, getDiagnostics());
	  case TargetCXXABI::GenericAArch64:
	  case TargetCXXABI::GenericItanium:
	  case TargetCXXABI::GenericARM:
	  case TargetCXXABI::GenericMIPS:
	  case TargetCXXABI::iOS:
	  case TargetCXXABI::iOS64:
	  case TargetCXXABI::WebAssembly:
	  case TargetCXXABI::WatchOS:
	    return ItaniumMangleContext::create(*this, getDiagnostics());
	  }
	  llvm_unreachable("Unsupported ABI");
	}

	// Out-of-line, compiler-generated destructor for CXXABI.
	CXXABI::~CXXABI() = default;

	/// Sum the memory reported by the record-layout table and the capacity, in
	/// bytes, of each side table listed below.
	size_t ASTContext::getSideTableAllocatedMemory() const {
	  size_t Total = ASTRecordLayouts.getMemorySize();
	  Total += llvm::capacity_in_bytes(ObjCLayouts);
	  Total += llvm::capacity_in_bytes(KeyFunctions);
	  Total += llvm::capacity_in_bytes(ObjCImpls);
	  Total += llvm::capacity_in_bytes(BlockVarCopyInits);
	  Total += llvm::capacity_in_bytes(DeclAttrs);
	  Total += llvm::capacity_in_bytes(TemplateOrInstantiation);
	  Total += llvm::capacity_in_bytes(InstantiatedFromUsingDecl);
	  Total += llvm::capacity_in_bytes(InstantiatedFromUsingShadowDecl);
	  Total += llvm::capacity_in_bytes(InstantiatedFromUnnamedFieldDecl);
	  Total += llvm::capacity_in_bytes(OverriddenMethods);
	  Total += llvm::capacity_in_bytes(Types);
	  Total += llvm::capacity_in_bytes(VariableArrayTypes);
	  Total += llvm::capacity_in_bytes(ClassScopeSpecializationPattern);
	  return Total;
	}

	/// getIntTypeForBitwidth -
	/// Look up the integer type with the requested bit width and signedness.
	/// The target is asked first; if it has no such type and 128 bits were
	/// requested, the 128-bit integer types are used instead.
	/// Returns an empty type if there is no appropriate target type.
	QualType ASTContext::getIntTypeForBitwidth(unsigned DestWidth,
	                                           unsigned Signed) const {
	  TargetInfo::IntType TargetTy =
	      getTargetInfo().getIntTypeByWidth(DestWidth, Signed);
	  CanQualType Found = getFromTargetType(TargetTy);
	  const bool FallBackToInt128 = !Found && DestWidth == 128;
	  if (!FallBackToInt128)
	    return Found;
	  return Signed ? Int128Ty : UnsignedInt128Ty;
	}

	/// getRealTypeForBitwidth -
	/// Look up the floating-point type with the requested bit width by asking
	/// the target which real type has that width.
	/// Returns an empty type if the target reports TargetInfo::NoFloat.
	QualType ASTContext::getRealTypeForBitwidth(unsigned DestWidth) const {
	  switch (getTargetInfo().getRealTypeByWidth(DestWidth)) {
	  case TargetInfo::NoFloat:
	    return QualType();
	  case TargetInfo::Float:
	    return FloatTy;
	  case TargetInfo::Double:
	    return DoubleTy;
	  case TargetInfo::LongDouble:
	    return LongDoubleTy;
	  case TargetInfo::Float128:
	    return Float128Ty;
	  }

	  llvm_unreachable("Unhandled TargetInfo::RealType value");
	}

	/// Record the mangling number of \p ND.  Numbers of 1 or less are not
	/// stored.
	void ASTContext::setManglingNumber(const NamedDecl *ND, unsigned Number) {
	  if (Number <= 1)
	    return;
	  MangleNumbers[ND] = Number;
	}

	/// Return the mangling number recorded for \p ND, or 1 when none was stored.
	unsigned ASTContext::getManglingNumber(const NamedDecl *ND) const {
	  auto Pos = MangleNumbers.find(ND);
	  if (Pos == MangleNumbers.end())
	    return 1;
	  return Pos->second;
	}

	/// Record the static-local number of \p VD.  Numbers of 1 or less are not
	/// stored.
	void ASTContext::setStaticLocalNumber(const VarDecl *VD, unsigned Number) {
	  if (Number <= 1)
	    return;
	  StaticLocalNumbers[VD] = Number;
	}

	/// Return the static-local number recorded for \p VD, or 1 when none was
	/// stored.
	unsigned ASTContext::getStaticLocalNumber(const VarDecl *VD) const {
	  auto Pos = StaticLocalNumbers.find(VD);
	  if (Pos == StaticLocalNumbers.end())
	    return 1;
	  return Pos->second;
	}

	/// Return the mangle-numbering context associated with \p DC, creating and
	/// caching one the first time \p DC is seen.
	MangleNumberingContext &
	ASTContext::getManglingNumberContext(const DeclContext *DC) {
	  assert(LangOpts.CPlusPlus);  // We don't need mangling numbers for plain C.
	  std::unique_ptr<MangleNumberingContext> &Slot = MangleNumberingContexts[DC];
	  if (Slot)
	    return *Slot;
	  Slot = createMangleNumberingContext();
	  return *Slot;
	}

	/// Create a fresh mangle-numbering context by delegating to the C++ ABI
	/// object.
	std::unique_ptr<MangleNumberingContext>
	ASTContext::createMangleNumberingContext() const {
	  auto Fresh = ABI->createMangleNumberingContext();
	  return Fresh;
	}

	/// Ask the C++ ABI object for the copy constructor registered for exception
	/// objects of class \p RD.  The lookup is keyed on the first declaration of
	/// the class.
	const CXXConstructorDecl *
	ASTContext::getCopyConstructorForExceptionObject(CXXRecordDecl *RD) {
	  auto *FirstRD = cast<CXXRecordDecl>(RD->getFirstDecl());
	  return ABI->getCopyConstructorForExceptionObject(FirstRD);
	}

	/// Register \p CD with the C++ ABI object as the copy constructor used for
	/// exception objects of class \p RD.  Both are passed on as their first
	/// declarations.
	void ASTContext::addCopyConstructorForExceptionObject(CXXRecordDecl *RD,
	                                                      CXXConstructorDecl *CD) {
	  auto *FirstRD = cast<CXXRecordDecl>(RD->getFirstDecl());
	  auto *FirstCD = cast<CXXConstructorDecl>(CD->getFirstDecl());
	  ABI->addCopyConstructorForExceptionObject(FirstRD, FirstCD);
	}

	/// Forward the association between the unnamed tag \p TD and the typedef
	/// name \p DD to the C++ ABI object.
	void ASTContext::addTypedefNameForUnnamedTagDecl(TagDecl *TD,
	                                                 TypedefNameDecl *DD) {
	  ABI->addTypedefNameForUnnamedTagDecl(TD, DD);
	}

	/// Ask the C++ ABI object for the typedef name associated with the unnamed
	/// tag \p TD.
	TypedefNameDecl *
	ASTContext::getTypedefNameForUnnamedTagDecl(const TagDecl *TD) {
	  TypedefNameDecl *Name = ABI->getTypedefNameForUnnamedTagDecl(TD);
	  return Name;
	}

	/// Forward the association between the unnamed tag \p TD and the declarator
	/// \p DD to the C++ ABI object.
	void ASTContext::addDeclaratorForUnnamedTagDecl(TagDecl *TD,
	                                                DeclaratorDecl *DD) {
	  ABI->addDeclaratorForUnnamedTagDecl(TD, DD);
	}

	/// Ask the C++ ABI object for the declarator associated with the unnamed tag
	/// \p TD.
	///
	/// Both the return type and the parameter are pointers, matching the sibling
	/// getTypedefNameForUnnamedTagDecl(); the pointer declarators had been lost
	/// from this definition.
	DeclaratorDecl *ASTContext::getDeclaratorForUnnamedTagDecl(const TagDecl *TD) {
	  return ABI->getDeclaratorForUnnamedTagDecl(TD);
	}

	/// Store \p index as the parameter index of \p D, overwriting any earlier
	/// entry.
	void ASTContext::setParameterIndex(const ParmVarDecl *D, unsigned int index) {
	  auto &Slot = ParamIndices[D];
	  Slot = index;
	}

	unsigned ASTContext::getParameterIndex(const ParmVarDecl *D) const {
	ParameterIndexTable::const_iterator I = ParamIndices.find(D);
	assert(I != ParamIndices.end() &&
	"ParmIndices lacks entry set by ParmVarDecl");
	return I->second;
	}

	/// Return the cached value slot for the static-storage temporary \p E.
	///
	/// With \p MayCreate set, a missing slot is allocated from this context and
	/// cached; otherwise the cache is only looked up and the lookup result is
	/// returned as is.
	APValue *
	ASTContext::getMaterializedTemporaryValue(const MaterializeTemporaryExpr *E,
	                                          bool MayCreate) {
	  assert(E && E->getStorageDuration() == SD_Static &&
	         "don't need to cache the computed value for this temporary");
	  if (!MayCreate)
	    return MaterializedTemporaryValues.lookup(E);

	  APValue *&Slot = MaterializedTemporaryValues[E];
	  if (!Slot)
	    Slot = new (*this) APValue;
	  return Slot;
	}

	/// Report whether the atomic expression \p E would need a libcall on a
	/// Darwin target that is too old.
	///
	/// Only Darwin targets older than iOS 7 or macOS 10.9 are considered; for
	/// those, the answer is true when the atomic type's size differs from its
	/// alignment or exceeds the target's maximum inline atomic width.
	bool ASTContext::AtomicUsesUnsupportedLibcall(const AtomicExpr *E) const {
	  const llvm::Triple &T = getTargetInfo().getTriple();
	  if (!T.isOSDarwin())
	    return false;

	  if (!(T.isiOS() && T.isOSVersionLT(7)) &&
	      !(T.isMacOSX() && T.isOSVersionLT(10, 9)))
	    return false;

	  QualType AtomicTy = E->getPtr()->getType()->getPointeeType();
	  CharUnits sizeChars = getTypeSizeInChars(AtomicTy);
	  uint64_t Size = sizeChars.getQuantity();
	  CharUnits alignChars = getTypeAlignInChars(AtomicTy);
	  unsigned Align = alignChars.getQuantity();
	  unsigned MaxInlineWidthInBits = getTargetInfo().getMaxAtomicInlineWidth();
	  return (Size != Align || toBits(sizeChars) > MaxInlineWidthInBits);
	}

	/// Convert a single-parent map entry \p U into a DynTypedNode.
	///
	/// The entry is tried as a Decl pointer, then as a Stmt pointer; otherwise it
	/// is taken to be a pointer to a stored DynTypedNode, which is copied out.
	/// The pointer types in the casts and the final dereference had been lost
	/// from this definition.
	static ast_type_traits::DynTypedNode getSingleDynTypedNodeFromParentMap(
	    ASTContext::ParentMapPointers::mapped_type U) {
	  if (const auto *D = U.dyn_cast<const Decl *>())
	    return ast_type_traits::DynTypedNode::create(*D);
	  if (const auto *S = U.dyn_cast<const Stmt *>())
	    return ast_type_traits::DynTypedNode::create(*S);
	  return *U.get<ast_type_traits::DynTypedNode *>();
	}

	namespace {

	/// Template specializations to abstract away from pointers and TypeLocs.
	/// @{
	// Generic case: \p Node is dereferenced before being wrapped, so it must be
	// a pointer-like value.
	template <typename T>
	ast_type_traits::DynTypedNode createDynTypedNode(const T &Node) {
	return ast_type_traits::DynTypedNode::create(*Node);
	}
	// TypeLoc is wrapped directly, without a dereference.
	template <>
	ast_type_traits::DynTypedNode createDynTypedNode(const TypeLoc &Node) {
	return ast_type_traits::DynTypedNode::create(Node);
	}
	// NestedNameSpecifierLoc is wrapped directly, without a dereference.
	template <>
	ast_type_traits::DynTypedNode
	createDynTypedNode(const NestedNameSpecifierLoc &Node) {
	return ast_type_traits::DynTypedNode::create(Node);
	}
	/// @}

	/// \brief A \c RecursiveASTVisitor that builds a map from nodes to their
	/// parents as defined by the \c RecursiveASTVisitor.
	///
	/// Note that the relationship described here is purely in terms of AST
	/// traversal - there are other relationships (for example declaration context)
	/// in the AST that are better modeled by special matchers.
	///
	/// FIXME: Currently only builds up the map using \c Stmt and \c Decl nodes.
	class ParentMapASTVisitor : public RecursiveASTVisitor<ParentMapASTVisitor> {
	public:
	/// \brief Builds and returns the translation unit's parent map.
	///
	/// Both maps are allocated with \c new and returned as raw pointers.
	/// The caller takes ownership of the returned \c ParentMap.
	static std::pair<ASTContext::ParentMapPointers *,
	ASTContext::ParentMapOtherNodes *>
	buildMap(TranslationUnitDecl &TU) {
	ParentMapASTVisitor Visitor(new ASTContext::ParentMapPointers,
	new ASTContext::ParentMapOtherNodes);
	Visitor.TraverseDecl(&TU);
	return std::make_pair(Visitor.Parents, Visitor.OtherParents);
	}

	private:
	friend class RecursiveASTVisitor<ParentMapASTVisitor>;

	using VisitorBase = RecursiveASTVisitor<ParentMapASTVisitor>;

	// Stores the two map pointers; the visitor itself never frees them.
	ParentMapASTVisitor(ASTContext::ParentMapPointers *Parents,
	ASTContext::ParentMapOtherNodes *OtherParents)
	: Parents(Parents), OtherParents(OtherParents) {}

	// Traversal policy: template instantiations are visited.
	bool shouldVisitTemplateInstantiations() const {
	return true;
	}

	// Traversal policy: implicit code is visited.
	bool shouldVisitImplicitCode() const {
	return true;
	}

	// Shared implementation of the Traverse* overrides below.
	//
	// If a parent is on the stack, it is recorded as a parent of \p MapNode in
	// \p Parents.  Then \p Node is pushed on the parent stack, \p BaseTraverse
	// is run, and the stack is popped again.  A null \p Node is skipped and
	// reported as success.
	template <typename T, typename MapNodeTy, typename BaseTraverseFn,
	typename MapTy>
	bool TraverseNode(T Node, MapNodeTy MapNode,
	BaseTraverseFn BaseTraverse, MapTy *Parents) {
	if (!Node)
	return true;
	if (ParentStack.size() > 0) {
	// FIXME: Currently we add the same parent multiple times, but only
	// when no memoization data is available for the type.
	// For example when we visit all subexpressions of template
	// instantiations; this is suboptimal, but benign: the only way to
	// visit those is with hasAncestor / hasParent, and those do not create
	// new matches.
	// The plan is to enable DynTypedNode to be storable in a map or hash
	// map. The main problem there is to implement hash functions /
	// comparison operators for all types that DynTypedNode supports that
	// do not have pointer identity.
	auto &NodeOrVector = (*Parents)[MapNode];
	if (NodeOrVector.isNull()) {
	// First parent seen for this node: store it inline as a Decl or Stmt
	// pointer when possible, otherwise as a heap-allocated DynTypedNode.
	if (const auto *D = ParentStack.back().get<Decl>())
	NodeOrVector = D;
	else if (const auto *S = ParentStack.back().get<Stmt>())
	NodeOrVector = S;
	else
	NodeOrVector =
	new ast_type_traits::DynTypedNode(ParentStack.back());
	} else {
	// A further parent: make sure the entry is a vector, converting the
	// single stored parent into a one-element vector if necessary.
	if (!NodeOrVector.template is<ASTContext::ParentVector *>()) {
	auto *Vector = new ASTContext::ParentVector(
	1, getSingleDynTypedNodeFromParentMap(NodeOrVector));
	// Frees the heap-allocated DynTypedNode, if that is what was stored;
	// dyn_cast yields null (a no-op delete) for the Decl/Stmt cases.
	delete NodeOrVector
	.template dyn_cast<ast_type_traits::DynTypedNode *>();
	NodeOrVector = Vector;
	}

	auto *Vector =
	NodeOrVector.template get<ASTContext::ParentVector *>();
	// Skip duplicates for types that have memoization data.
	// We must check that the type has memoization data before calling
	// std::find() because DynTypedNode::operator== can't compare all
	// types.
	bool Found = ParentStack.back().getMemoizationData() &&
	std::find(Vector->begin(), Vector->end(),
	ParentStack.back()) != Vector->end();
	if (!Found)
	Vector->push_back(ParentStack.back());
	}
	}
	ParentStack.push_back(createDynTypedNode(Node));
	bool Result = BaseTraverse();
	ParentStack.pop_back();
	return Result;
	}

	// Declarations are keyed by pointer and recorded in the pointer map.
	bool TraverseDecl(Decl *DeclNode) {
	return TraverseNode(DeclNode, DeclNode,
	[&] { return VisitorBase::TraverseDecl(DeclNode); },
	Parents);
	}

	// Statements are keyed by pointer and recorded in the pointer map.
	bool TraverseStmt(Stmt *StmtNode) {
	return TraverseNode(StmtNode, StmtNode,
	[&] { return VisitorBase::TraverseStmt(StmtNode); },
	Parents);
	}

	// TypeLocs are keyed by DynTypedNode and recorded in the other-nodes map.
	bool TraverseTypeLoc(TypeLoc TypeLocNode) {
	return TraverseNode(
	TypeLocNode, ast_type_traits::DynTypedNode::create(TypeLocNode),
	[&] { return VisitorBase::TraverseTypeLoc(TypeLocNode); },
	OtherParents);
	}

	// NestedNameSpecifierLocs are keyed by DynTypedNode and recorded in the
	// other-nodes map.
	bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc NNSLocNode) {
	return TraverseNode(
	NNSLocNode, ast_type_traits::DynTypedNode::create(NNSLocNode),
	[&] {
	return VisitorBase::TraverseNestedNameSpecifierLoc(NNSLocNode);
	},
	OtherParents);
	}

	// Map for nodes keyed by pointer (Decl, Stmt).
	ASTContext::ParentMapPointers *Parents;
	// Map for nodes keyed by DynTypedNode (TypeLoc, NestedNameSpecifierLoc).
	ASTContext::ParentMapOtherNodes *OtherParents;
	// Nodes currently being traversed, innermost last.
	llvm::SmallVector<ast_type_traits::DynTypedNode, 16> ParentStack;
	};

	} // namespace

	/// Look up the parents of \p Node in \p Map.
	///
	/// Returns an empty list when the node has no entry, the stored vector when
	/// the entry holds several parents, and a single converted node otherwise.
	/// The pointer type in the dyn_cast had been lost, leaving `*V` without a
	/// pointer to dereference.
	template <typename NodeTy, typename MapTy>
	static ASTContext::DynTypedNodeList getDynNodeFromMap(const NodeTy &Node,
	                                                      const MapTy &Map) {
	  auto I = Map.find(Node);
	  if (I == Map.end()) {
	    return llvm::ArrayRef<ast_type_traits::DynTypedNode>();
	  }
	  if (auto *V = I->second.template dyn_cast<ASTContext::ParentVector *>()) {
	    return llvm::makeArrayRef(*V);
	  }
	  return getSingleDynTypedNodeFromParentMap(I->second);
	}

	/// Return the parents of \p Node, building both parent maps for the whole
	/// translation unit on the first call.
	ASTContext::DynTypedNodeList
	ASTContext::getParents(const ast_type_traits::DynTypedNode &Node) {
	  if (!PointerParents) {
	    // We always need to run over the whole translation unit, as
	    // hasAncestor can escape any subtree.
	    auto Built = ParentMapASTVisitor::buildMap(*getTranslationUnitDecl());
	    PointerParents.reset(Built.first);
	    OtherParents.reset(Built.second);
	  }

	  // Nodes without pointer identity live in the DynTypedNode-keyed map.
	  if (!Node.getNodeKind().hasPointerIdentity())
	    return getDynNodeFromMap(Node, *OtherParents);
	  return getDynNodeFromMap(Node.getMemoizationData(), *PointerParents);
	}

	/// Compare an Objective-C method declaration with an implementation.
	///
	/// The two match when the declaration is neither unavailable nor deprecated
	/// and both agree on decl qualifier, return type, parameter count, every
	/// parameter's decl qualifier and type, and variadic-ness.
	/// The pointer declarators and iterator dereferences in the loop had been
	/// lost from this definition.
	bool
	ASTContext::ObjCMethodsAreEqual(const ObjCMethodDecl *MethodDecl,
	                                const ObjCMethodDecl *MethodImpl) {
	  // No point trying to match an unavailable/deprecated method.
	  if (MethodDecl->hasAttr<UnavailableAttr>()
	      || MethodDecl->hasAttr<DeprecatedAttr>())
	    return false;
	  if (MethodDecl->getObjCDeclQualifier() !=
	      MethodImpl->getObjCDeclQualifier())
	    return false;
	  if (!hasSameType(MethodDecl->getReturnType(), MethodImpl->getReturnType()))
	    return false;

	  if (MethodDecl->param_size() != MethodImpl->param_size())
	    return false;

	  // Walk both parameter lists in lock step.
	  for (ObjCMethodDecl::param_const_iterator IM = MethodImpl->param_begin(),
	       IF = MethodDecl->param_begin(), EM = MethodImpl->param_end(),
	       EF = MethodDecl->param_end();
	       IM != EM && IF != EF; ++IM, ++IF) {
	    const ParmVarDecl *DeclVar = (*IF);
	    const ParmVarDecl *ImplVar = (*IM);
	    if (ImplVar->getObjCDeclQualifier() != DeclVar->getObjCDeclQualifier())
	      return false;
	    if (!hasSameType(DeclVar->getType(), ImplVar->getType()))
	      return false;
	  }

	  return (MethodDecl->isVariadic() == MethodImpl->isVariadic());
	}

	/// Return the target's null pointer value for \p QT.  A null-pointer type
	/// uses the default address space; any other type uses the address space of
	/// its pointee.
	uint64_t ASTContext::getTargetNullPointerValue(QualType QT) const {
	  const bool IsNullPtrTy = QT->getUnqualifiedDesugaredType()->isNullPtrType();
	  const LangAS AS =
	      IsNullPtrTy ? LangAS::Default : QT->getPointeeType().getAddressSpace();
	  return getTargetInfo().getNullPointerValue(AS);
	}

	/// Map \p AS to a target address space number: target address spaces are
	/// converted directly, all others are looked up in the address-space map.
	unsigned ASTContext::getTargetAddressSpace(LangAS AS) const {
	  if (!isTargetAddressSpace(AS))
	    return (*AddrSpaceMap)[(unsigned)AS];
	  return toTargetAddressSpace(AS);
	}

	// Explicitly instantiate this in case a Redeclarable<T> is used from a TU that
	// doesn't include ASTContext.h
	// The first two template arguments are pointer types; their `*` had been
	// lost from this declaration.
	template
	clang::LazyGenerationalUpdatePtr<
	    const Decl *, Decl *, &ExternalASTSource::CompleteRedeclChain>::ValueType
	clang::LazyGenerationalUpdatePtr<
	    const Decl *, Decl *, &ExternalASTSource::CompleteRedeclChain>::makeValue(
	        const clang::ASTContext &Ctx, Decl *Value);
	Index: head/contrib/llvm/tools/clang/lib/AST/DeclBase.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/AST/DeclBase.cpp (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/AST/DeclBase.cpp (revision 329410)
	@@ -1,1879 +1,1892 @@
	//===- DeclBase.cpp - Declaration AST Node Implementation -----------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the Decl and DeclContext classes.
	//
	//===----------------------------------------------------------------------===//

	#include "clang/AST/DeclBase.h"
	#include "clang/AST/ASTContext.h"
	#include "clang/AST/ASTMutationListener.h"
	#include "clang/AST/Attr.h"
	#include "clang/AST/AttrIterator.h"
	#include "clang/AST/Decl.h"
	#include "clang/AST/DeclCXX.h"
	#include "clang/AST/DeclContextInternals.h"
	#include "clang/AST/DeclFriend.h"
	#include "clang/AST/DeclObjC.h"
	#include "clang/AST/DeclOpenMP.h"
	#include "clang/AST/DeclTemplate.h"
	#include "clang/AST/DependentDiagnostic.h"
	#include "clang/AST/ExternalASTSource.h"
	#include "clang/AST/Stmt.h"
	#include "clang/AST/Type.h"
	#include "clang/Basic/IdentifierTable.h"
	#include "clang/Basic/LLVM.h"
	#include "clang/Basic/LangOptions.h"
	#include "clang/Basic/ObjCRuntime.h"
	#include "clang/Basic/PartialDiagnostic.h"
	#include "clang/Basic/SourceLocation.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Basic/VersionTuple.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/PointerIntPair.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <string>
	#include <tuple>
	#include <utility>

	using namespace clang;

	//===----------------------------------------------------------------------===//
	// Statistics
	//===----------------------------------------------------------------------===//

	// Define one file-local counter, n<Kind>s, for every DECL entry in
	// DeclNodes.inc; ABSTRACT_DECL entries expand to nothing here.
	#define DECL(DERIVED, BASE) static int n##DERIVED##s = 0;
	#define ABSTRACT_DECL(DECL)
	#include "clang/AST/DeclNodes.inc"

	/// Ask the AST context's external source to update the out-of-date
	/// identifier \p II.
	void Decl::updateOutOfDate(IdentifierInfo &II) const {
	  auto *Source = getASTContext().getExternalSource();
	  Source->updateOutOfDateIdentifier(II);
	}

	// For every DECL entry in DeclNodes.inc, check at compile time that Decl is
	// at least as strictly aligned as the derived <Kind>Decl class.
	#define DECL(DERIVED, BASE) \
	static_assert(alignof(Decl) >= alignof(DERIVED##Decl), \
	"Alignment sufficient after objects prepended to " #DERIVED);
	#define ABSTRACT_DECL(DECL)
	#include "clang/AST/DeclNodes.inc"

	/// Allocate storage for a declaration identified by a global declaration ID.
	///
	/// An 8-byte prefix is placed in front of the returned object: the first
	/// 4 bytes are zeroed (owning module ID) and the second 4 bytes hold \p ID.
	/// The pointer declarators of \c Result and \c PrefixPtr had been lost,
	/// which made the prefix arithmetic below meaningless.
	///
	/// \param Size    size of the declaration object.
	/// \param Context AST context that provides the memory.
	/// \param ID      global declaration ID to store in the prefix.
	/// \param Extra   additional bytes to allocate after the object.
	/// \returns a pointer just past the 8-byte prefix.
	void *Decl::operator new(std::size_t Size, const ASTContext &Context,
	                         unsigned ID, std::size_t Extra) {
	  // Allocate an extra 8 bytes worth of storage, which ensures that the
	  // resulting pointer will still be 8-byte aligned.
	  static_assert(sizeof(unsigned) * 2 >= alignof(Decl),
	                "Decl won't be misaligned");
	  void *Start = Context.Allocate(Size + Extra + 8);
	  void *Result = (char*)Start + 8;

	  // The prefix occupies the two unsigned slots immediately before Result.
	  unsigned *PrefixPtr = (unsigned *)Result - 2;

	  // Zero out the first 4 bytes; this is used to store the owning module ID.
	  PrefixPtr[0] = 0;

	  // Store the global declaration ID in the second 4 bytes.
	  PrefixPtr[1] = ID;

	  return Result;
	}

	/// Allocate storage for a declaration created locally under \p Parent.
	///
	/// When local owning modules are tracked, or there is no parent, a
	/// \c Module* slot holding the parent's owning module (or null) is placed
	/// directly in front of the returned object, with padding before it so the
	/// object keeps Decl's alignment.  Otherwise the object is allocated plainly.
	/// The pointer declarators of \c Buffer had been lost from this definition.
	///
	/// \param Size   size of the declaration object.
	/// \param Ctx    AST context that provides the memory.
	/// \param Parent declaration context the new declaration belongs to; may be
	///               null.
	/// \param Extra  additional bytes to allocate after the object.
	void *Decl::operator new(std::size_t Size, const ASTContext &Ctx,
	                         DeclContext *Parent, std::size_t Extra) {
	  assert(!Parent || &Parent->getParentASTContext() == &Ctx);
	  // With local visibility enabled, we track the owning module even for local
	  // declarations. We create the TU decl early and may not yet know what the
	  // LangOpts are, so conservatively allocate the storage.
	  if (Ctx.getLangOpts().trackLocalOwningModule() || !Parent) {
	    // Ensure required alignment of the resulting object by adding extra
	    // padding at the start if required.
	    size_t ExtraAlign =
	        llvm::OffsetToAlignment(sizeof(Module *), alignof(Decl));
	    char *Buffer = reinterpret_cast<char *>(
	        ::operator new(ExtraAlign + sizeof(Module *) + Size + Extra, Ctx));
	    Buffer += ExtraAlign;
	    auto *ParentModule =
	        Parent ? cast<Decl>(Parent)->getOwningModule() : nullptr;
	    // Construct the Module* slot in place and return the address after it.
	    return new (Buffer) Module*(ParentModule) + 1;
	  }
	  return ::operator new(Size + Extra, Ctx);
	}

	/// Resolve the owning module of a declaration loaded from an AST file by
	/// handing its owning-module ID to the external source.
	Module *Decl::getOwningModuleSlow() const {
	  assert(isFromASTFile() && "Not from AST file?");
	  auto *Source = getASTContext().getExternalSource();
	  return Source->getModule(getOwningModuleID());
	}

	/// Report whether the language options ask for local owning modules to be
	/// tracked.
	bool Decl::hasLocalOwningModuleStorage() const {
	  const auto &Opts = getASTContext().getLangOpts();
	  return Opts.trackLocalOwningModule();
	}

	/// Return the name of this declaration's kind as a string literal.
	const char *Decl::getDeclKindName() const {
	switch (DeclKind) {
	default: llvm_unreachable("Declaration not in DeclNodes.inc!");
	// One case per DECL entry in DeclNodes.inc, returning the stringized kind
	// name; ABSTRACT_DECL entries expand to nothing.
	#define DECL(DERIVED, BASE) case DERIVED: return #DERIVED;
	#define ABSTRACT_DECL(DECL)
	#include "clang/AST/DeclNodes.inc"
	}
	}

	/// Set or clear the invalid flag on this declaration.
	///
	/// When marking as invalid, the access specifier of anything other than a
	/// ParmVarDecl is reset to public, and the bindings of a DecompositionDecl
	/// are marked invalid as well.  Clearing the flag does nothing further.
	void Decl::setInvalidDecl(bool Invalid) {
	  InvalidDecl = Invalid;
	  assert(!isa<TagDecl>(this) || !cast<TagDecl>(this)->isCompleteDefinition());
	  if (!Invalid) {
	    return;
	  }

	  if (!isa<ParmVarDecl>(this)) {
	    // Defensive maneuver for ill-formed code: we're likely not to make it to
	    // a point where we set the access specifier, so default it to "public"
	    // to avoid triggering asserts elsewhere in the front end.
	    setAccess(AS_public);
	  }

	  // Marking a DecompositionDecl as invalid implies all the child BindingDecl's
	  // are invalid too.
	  if (DecompositionDecl *DD = dyn_cast<DecompositionDecl>(this)) {
	    for (BindingDecl *Binding : DD->bindings()) {
	      Binding->setInvalidDecl();
	    }
	  }
	}

	/// Return the name of this declaration context's kind as a string literal.
	const char *DeclContext::getDeclKindName() const {
	switch (DeclKind) {
	default: llvm_unreachable("Declaration context not in DeclNodes.inc!");
	// One case per DECL entry in DeclNodes.inc, returning the stringized kind
	// name; ABSTRACT_DECL entries expand to nothing.
	#define DECL(DERIVED, BASE) case Decl::DERIVED: return #DERIVED;
	#define ABSTRACT_DECL(DECL)
	#include "clang/AST/DeclNodes.inc"
	}
	}

	// Statistics are off until EnableStatistics() is called.
	bool Decl::StatisticsEnabled = false;

	/// Turn on statistics.
	void Decl::EnableStatistics() { StatisticsEnabled = true; }

	/// Print the per-kind declaration counters to llvm::errs(): first the total
	/// number of declarations, then one line per kind with a non-zero count, and
	/// finally the total number of bytes.
	void Decl::PrintStats() {
	llvm::errs() << "\n*** Decl Stats:\n";

	// First pass over DeclNodes.inc: sum every n<Kind>s counter.
	int totalDecls = 0;
	#define DECL(DERIVED, BASE) totalDecls += n##DERIVED##s;
	#define ABSTRACT_DECL(DECL)
	#include "clang/AST/DeclNodes.inc"
	llvm::errs() << " " << totalDecls << " decls total.\n";

	// Second pass: for each kind that was counted at least once, print the
	// count, sizeof(<Kind>Decl) and their product, and add the product to
	// totalBytes.
	int totalBytes = 0;
	#define DECL(DERIVED, BASE) \
	if (n##DERIVED##s > 0) { \
	totalBytes += (int)(n##DERIVED##s * sizeof(DERIVED##Decl)); \
	llvm::errs() << " " << n##DERIVED##s << " " #DERIVED " decls, " \
	<< sizeof(DERIVED##Decl) << " each (" \
	<< n##DERIVED##s * sizeof(DERIVED##Decl) \
	<< " bytes)\n"; \
	}
	#define ABSTRACT_DECL(DECL)
	#include "clang/AST/DeclNodes.inc"

	llvm::errs() << "Total bytes = " << totalBytes << "\n";
	}

	/// Increment the statistics counter that belongs to declaration kind \p k.
	void Decl::add(Kind k) {
	switch (k) {
	// One case per DECL entry in DeclNodes.inc, bumping its n<Kind>s counter.
	#define DECL(DERIVED, BASE) case DERIVED: ++n##DERIVED##s; break;
	#define ABSTRACT_DECL(DECL)
	#include "clang/AST/DeclNodes.inc"
	}
	}

	bool Decl::isTemplateParameterPack() const {
	if (const TemplateTypeParmDecl *TTP = dyn_cast<TemplateTypeParmDecl>(this))
	return TTP->isParameterPack();
	if (const NonTypeTemplateParmDecl *NTTP
	= dyn_cast<NonTypeTemplateParmDecl>(this))
	return NTTP->isParameterPack();
	if (const TemplateTemplateParmDecl *TTP
	= dyn_cast<TemplateTemplateParmDecl>(this))
	return TTP->isParameterPack();
	return false;
	}

	bool Decl::isParameterPack() const {
	if (const ParmVarDecl *Parm = dyn_cast<ParmVarDecl>(this))
	return Parm->isParameterPack();

	return isTemplateParameterPack();
	}

	/// Return this declaration as a function: itself if it is a FunctionDecl,
	/// the templated declaration if it is a FunctionTemplateDecl, and null
	/// otherwise.
	FunctionDecl *Decl::getAsFunction() {
	  if (auto *Fn = dyn_cast<FunctionDecl>(this))
	    return Fn;
	  if (const auto *FnTemplate = dyn_cast<FunctionTemplateDecl>(this))
	    return FnTemplate->getTemplatedDecl();
	  return nullptr;
	}

	/// Report whether this declaration is a TemplateDecl.
	bool Decl::isTemplateDecl() const {
	  const bool IsTemplate = isa<TemplateDecl>(this);
	  return IsTemplate;
	}

	/// Return the template described by this declaration, or null if there is
	/// none.  Functions, C++ records, variables and type aliases are each asked
	/// for their described template; all other kinds yield null.
	TemplateDecl *Decl::getDescribedTemplate() const {
	if (auto *FD = dyn_cast<FunctionDecl>(this))
	return FD->getDescribedFunctionTemplate();
	else if (auto *RD = dyn_cast<CXXRecordDecl>(this))
	return RD->getDescribedClassTemplate();
	else if (auto *VD = dyn_cast<VarDecl>(this))
	return VD->getDescribedVarTemplate();
	+ else if (auto *AD = dyn_cast<TypeAliasDecl>(this))
	+ return AD->getDescribedAliasTemplate();

	return nullptr;
	+}
	+
	+/// Report whether this declaration is templated.  A declaration that is
	+/// itself a DeclContext answers with its own dependence; any other
	+/// declaration is templated if its context is dependent or if it is, or
	+/// describes, a template.
	+bool Decl::isTemplated() const {
	+ // A declaration is dependent if it is a template or a template pattern, or
	+ // is within (lexically for a friend, semantically otherwise) a dependent
	+ // context.
	+ // FIXME: Should local extern declarations be treated like friends?
	+ if (auto *AsDC = dyn_cast<DeclContext>(this))
	+ return AsDC->isDependentContext();
	+ auto *DC = getFriendObjectKind() ? getLexicalDeclContext() : getDeclContext();
	+ return DC->isDependentContext() || isTemplateDecl() || getDescribedTemplate();
	}

	/// Walk outwards from this declaration's context and return the first
	/// enclosing function or method.  The walk stops at a translation unit or
	/// namespace, in which case null is returned.
	const DeclContext *Decl::getParentFunctionOrMethod() const {
	  const DeclContext *Ctx = getDeclContext();
	  while (Ctx && !Ctx->isTranslationUnit() && !Ctx->isNamespace()) {
	    if (Ctx->isFunctionOrMethod())
	      return Ctx;
	    Ctx = Ctx->getParent();
	  }

	  return nullptr;
	}

	//===----------------------------------------------------------------------===//
	// PrettyStackTraceDecl Implementation
	//===----------------------------------------------------------------------===//

	/// Write one stack-trace line to \p OS: an optional source location, the
	/// message, and, for named declarations, the quoted qualified name.
	void PrettyStackTraceDecl::print(raw_ostream &OS) const {
	  // Fall back to the declaration's own location when none was supplied.
	  SourceLocation Where =
	      (Loc.isInvalid() && TheDecl) ? TheDecl->getLocation() : Loc;

	  if (Where.isValid()) {
	    Where.print(OS, SM);
	    OS << ": ";
	  }

	  OS << Message;

	  if (const auto *Named = dyn_cast_or_null<NamedDecl>(TheDecl)) {
	    OS << " '";
	    Named->printQualifiedName(OS);
	    OS << '\'';
	  }
	  OS << '\n';
	}

	//===----------------------------------------------------------------------===//
	// Decl Implementation
	//===----------------------------------------------------------------------===//

	// Out-of-line virtual method providing a home for Decl.
	// The destructor itself is compiler-generated.
	Decl::~Decl() = default;

	/// Replace the stored declaration context with \p DC.
	void Decl::setDeclContext(DeclContext *DC) { DeclCtx = DC; }

	/// Change the lexical declaration context to \p DC.
	///
	/// Nothing happens if \p DC already is the lexical context.  Otherwise the
	/// context storage is updated and, for declarations not loaded from an AST
	/// file, the module ownership kind and local owning module are recomputed
	/// from \p DC.
	void Decl::setLexicalDeclContext(DeclContext *DC) {
	  if (DC == getLexicalDeclContext())
	    return;

	  if (isInSemaDC()) {
	    setDeclContextsImpl(getDeclContext(), DC, getASTContext());
	  } else {
	    getMultipleDC()->LexicalDC = DC;
	  }

	  // FIXME: We shouldn't be changing the lexical context of declarations
	  // imported from AST files.
	  if (!isFromASTFile()) {
	    setModuleOwnershipKind(getModuleOwnershipKindForChildOf(DC));
	    if (hasOwningModule())
	      setLocalOwningModule(cast<Decl>(DC)->getOwningModule());
	  }

	  assert(
	      (getModuleOwnershipKind() != ModuleOwnershipKind::VisibleWhenImported ||
	       getOwningModule()) &&
	      "hidden declaration has no owning module");
	}

	void Decl::setDeclContextsImpl(DeclContext SemaDC, DeclContext LexicalDC,
	ASTContext &Ctx) {
	if (SemaDC == LexicalDC) {
	DeclCtx = SemaDC;
	} else {
	Decl::MultipleDC *MDC = new (Ctx) Decl::MultipleDC();
	MDC->SemanticDC = SemaDC;
	MDC->LexicalDC = LexicalDC;
	DeclCtx = MDC;
	}
	}

	bool Decl::isLexicallyWithinFunctionOrMethod() const {
	const DeclContext *LDC = getLexicalDeclContext();
	while (true) {
	if (LDC->isFunctionOrMethod())
	return true;
	if (!isa<TagDecl>(LDC))
	return false;
	LDC = LDC->getLexicalParent();
	}
	return false;
	}

	bool Decl::isInAnonymousNamespace() const {
	for (const DeclContext *DC = getDeclContext(); DC; DC = DC->getParent()) {
	if (const NamespaceDecl *ND = dyn_cast<NamespaceDecl>(DC))
	if (ND->isAnonymousNamespace())
	return true;
	}

	return false;
	}

	bool Decl::isInStdNamespace() const {
	return getDeclContext()->isStdNamespace();
	}

	/// Return the translation unit that contains this declaration: the
	/// declaration itself if it is one, otherwise the outermost enclosing
	/// context.
	TranslationUnitDecl *Decl::getTranslationUnitDecl() {
	  if (auto *Self = dyn_cast<TranslationUnitDecl>(this))
	    return Self;

	  DeclContext *Ctx = getDeclContext();
	  assert(Ctx && "This decl is not contained in a translation unit!");

	  // Climb until the translation unit is reached.
	  while (!Ctx->isTranslationUnit()) {
	    Ctx = Ctx->getParent();
	    assert(Ctx && "This decl is not contained in a translation unit!");
	  }

	  return cast<TranslationUnitDecl>(Ctx);
	}

	/// Return the AST context of the translation unit that contains this
	/// declaration.
	ASTContext &Decl::getASTContext() const {
	  auto *TU = getTranslationUnitDecl();
	  return TU->getASTContext();
	}

	/// Return the mutation listener registered on this declaration's AST
	/// context.
	ASTMutationListener *Decl::getASTMutationListener() const {
	  ASTContext &Ctx = getASTContext();
	  return Ctx.getASTMutationListener();
	}

	unsigned Decl::getMaxAlignment() const {
	if (!hasAttrs())
	return 0;

	unsigned Align = 0;
	const AttrVec &V = getAttrs();
	ASTContext &Ctx = getASTContext();
	specific_attr_iterator<AlignedAttr> I(V.begin()), E(V.end());
	for (; I != E; ++I)
	Align = std::max(Align, I->getAlignment(Ctx));
	return Align;
	}

	bool Decl::isUsed(bool CheckUsedAttr) const {
	const Decl *CanonD = getCanonicalDecl();
	if (CanonD->Used)
	return true;

	// Check for used attribute.
	// Ask the most recent decl, since attributes accumulate in the redecl chain.
	if (CheckUsedAttr && getMostRecentDecl()->hasAttr<UsedAttr>())
	return true;

	// The information may have not been deserialized yet. Force deserialization
	// to complete the needed information.
	return getMostRecentDecl()->getCanonicalDecl()->Used;
	}

	/// Mark this declaration as used.  If it is not used yet, the mutation
	/// listener of \p C (if any) is told before the flag is set.
	void Decl::markUsed(ASTContext &C) {
	  const bool AlreadyUsed = isUsed(false);
	  if (AlreadyUsed)
	    return;

	  if (auto *Listener = C.getASTMutationListener())
	    Listener->DeclarationMarkedUsed(this);

	  setIsUsed();
	}

	bool Decl::isReferenced() const {
	if (Referenced)
	return true;

	// Check redeclarations.
	for (auto I : redecls())
	if (I->Referenced)
	return true;

	return false;
	}

	/// Report whether this declaration is exported.
	///
	/// Module-private declarations are never exported; translation units and
	/// namespaces always are.  Anything else is exported when an ExportDecl is
	/// found among its lexical ancestors before a module-private one.
	bool Decl::isExported() const {
	  if (isModulePrivate())
	    return false;
	  // Namespaces are always exported.
	  if (isa<TranslationUnitDecl>(this) || isa<NamespaceDecl>(this))
	    return true;
	  // Otherwise, this is a strictly lexical check.
	  for (auto *DC = getLexicalDeclContext(); DC; DC = DC->getLexicalParent()) {
	    if (cast<Decl>(DC)->isModulePrivate())
	      return false;
	    if (isa<ExportDecl>(DC))
	      return true;
	  }
	  return false;
	}

	/// Looks for an ExternalSourceSymbolAttr that applies to this declaration.
	///
	/// For Objective-C interfaces, Objective-C protocols and tags the definition
	/// (when there is one) is consulted instead of this declaration. If nothing
	/// is found there, the declaration context is consulted when it is itself a
	/// Decl. Returns nullptr when neither carries the attribute.
	ExternalSourceSymbolAttr *Decl::getExternalSourceSymbolAttr() const {
	  const Decl *Def = nullptr;
	  if (auto Iface = dyn_cast<ObjCInterfaceDecl>(this))
	    Def = Iface->getDefinition();
	  else if (auto Proto = dyn_cast<ObjCProtocolDecl>(this))
	    Def = Proto->getDefinition();
	  else if (auto Tag = dyn_cast<TagDecl>(this))
	    Def = Tag->getDefinition();

	  // No separate definition available: inspect this declaration itself.
	  if (!Def)
	    Def = this;

	  if (auto *ESSA = Def->getAttr<ExternalSourceSymbolAttr>())
	    return ESSA;

	  auto *Enclosing = dyn_cast<Decl>(getDeclContext());
	  return Enclosing ? Enclosing->getAttr<ExternalSourceSymbolAttr>() : nullptr;
	}

	/// Reports whether this declaration carries an AliasAttr or an IFuncAttr.
	bool Decl::hasDefiningAttr() const {
	  return hasAttr<AliasAttr>() || hasAttr<IFuncAttr>();
	}

	/// Returns this declaration's AliasAttr if it has one, otherwise its
	/// IFuncAttr, otherwise nullptr.
	const Attr *Decl::getDefiningAttr() const {
	  const Attr *Defining = getAttr<AliasAttr>();
	  if (!Defining)
	    Defining = getAttr<IFuncAttr>();
	  return Defining;
	}

	/// Returns the platform name of \p A to use when matching against the target
	/// platform. When the AppExt language option is set, the part of the name
	/// starting at the last "_app_extension" occurrence is chopped off;
	/// otherwise the name is returned as is.
	static StringRef getRealizedPlatform(const AvailabilityAttr *A,
	                                     const ASTContext &Context) {
	  StringRef Platform = A->getPlatform()->getName();
	  if (Context.getLangOpts().AppExt) {
	    size_t SuffixPos = Platform.rfind("_app_extension");
	    if (SuffixPos != StringRef::npos)
	      return Platform.slice(0, SuffixPos);
	  }
	  return Platform;
	}

	/// \brief Evaluate one availability attribute against a platform version.
	///
	/// \param Context  AST context supplying target info and language options.
	/// \param A  the availability attribute to evaluate.
	/// \param Message  optional out-parameter; when non-null and the result is
	/// anything other than \c AR_Available, it is overwritten with a string
	/// describing that result.
	/// \param EnclosingVersion  version to compare against; when empty, the
	/// target's minimum platform version is used instead.
	///
	/// \return \c AR_Available when no version is known or the attribute names
	/// another platform. Otherwise the checks run in this order: marked
	/// unavailable, not yet introduced (reported as unavailable when the
	/// attribute is strict), obsoleted, deprecated.
	///
	/// FIXME: Make these strings localizable, since they end up in
	/// diagnostics.
	static AvailabilityResult CheckAvailability(ASTContext &Context,
	                                            const AvailabilityAttr *A,
	                                            std::string *Message,
	                                            VersionTuple EnclosingVersion) {
	  if (EnclosingVersion.empty()) {
	    EnclosingVersion = Context.getTargetInfo().getPlatformMinVersion();
	    // Still nothing to compare against: treat as available.
	    if (EnclosingVersion.empty())
	      return AR_Available;
	  }

	  StringRef AttrPlatform = A->getPlatform()->getName();
	  StringRef TargetPlatform = Context.getTargetInfo().getPlatformName();

	  // Match the platform name.
	  if (getRealizedPlatform(A, Context) != TargetPlatform)
	    return AR_Available;

	  StringRef PrettyName = AvailabilityAttr::getPrettyPlatformName(AttrPlatform);
	  if (PrettyName.empty())
	    PrettyName = AttrPlatform;

	  // Suffix appended to every description below when the attribute carries
	  // its own message.
	  std::string Hint;
	  if (!A->getMessage().empty()) {
	    Hint = " - ";
	    Hint += A->getMessage();
	  }

	  // Make sure that this declaration has not been marked 'unavailable'.
	  if (A->getUnavailable()) {
	    if (Message) {
	      Message->clear();
	      llvm::raw_string_ostream OS(*Message);
	      OS << "not available on " << PrettyName << Hint;
	    }
	    return AR_Unavailable;
	  }

	  // Make sure that this declaration has already been introduced.
	  const bool NotYetIntroduced =
	      !A->getIntroduced().empty() && EnclosingVersion < A->getIntroduced();
	  if (NotYetIntroduced) {
	    if (Message) {
	      Message->clear();
	      llvm::raw_string_ostream OS(*Message);
	      VersionTuple Introduced(A->getIntroduced());
	      Introduced.UseDotAsSeparator();
	      OS << "introduced in " << PrettyName << ' ' << Introduced << Hint;
	    }
	    return A->getStrict() ? AR_Unavailable : AR_NotYetIntroduced;
	  }

	  // Make sure that this declaration hasn't been obsoleted.
	  const bool IsObsoleted =
	      !A->getObsoleted().empty() && EnclosingVersion >= A->getObsoleted();
	  if (IsObsoleted) {
	    if (Message) {
	      Message->clear();
	      llvm::raw_string_ostream OS(*Message);
	      VersionTuple Obsoleted(A->getObsoleted());
	      Obsoleted.UseDotAsSeparator();
	      OS << "obsoleted in " << PrettyName << ' ' << Obsoleted << Hint;
	    }
	    return AR_Unavailable;
	  }

	  // Make sure that this declaration hasn't been deprecated.
	  const bool IsDeprecated =
	      !A->getDeprecated().empty() && EnclosingVersion >= A->getDeprecated();
	  if (IsDeprecated) {
	    if (Message) {
	      Message->clear();
	      llvm::raw_string_ostream OS(*Message);
	      VersionTuple Deprecated(A->getDeprecated());
	      Deprecated.UseDotAsSeparator();
	      OS << "first deprecated in " << PrettyName << ' ' << Deprecated << Hint;
	    }
	    return AR_Deprecated;
	  }

	  return AR_Available;
	}

	/// Computes this declaration's availability by scanning its attributes.
	///
	/// Function templates forward to their templated declaration. A
	/// DeprecatedAttr raises the running result to AR_Deprecated unless it is
	/// already at least that. An UnavailableAttr, or an AvailabilityAttr that
	/// evaluates to AR_Unavailable, ends the scan immediately. Any other
	/// AvailabilityAttr result replaces the running result only when it compares
	/// greater.
	///
	/// \param Message  optional out-parameter receiving the message that goes
	/// with the returned result.
	/// \param EnclosingVersion  forwarded to CheckAvailability for every
	/// AvailabilityAttr.
	AvailabilityResult Decl::getAvailability(std::string *Message,
	VersionTuple EnclosingVersion) const {
	if (auto *FTD = dyn_cast<FunctionTemplateDecl>(this))
	return FTD->getTemplatedDecl()->getAvailability(Message, EnclosingVersion);
	
	// Running result and the message that belongs to it.
	AvailabilityResult Result = AR_Available;
	std::string ResultMessage;
	
	for (const auto *A : attrs()) {
	if (const auto *Deprecated = dyn_cast<DeprecatedAttr>(A)) {
	// Keep an earlier result that is already at least AR_Deprecated.
	if (Result >= AR_Deprecated)
	continue;
	
	if (Message)
	ResultMessage = Deprecated->getMessage();
	
	Result = AR_Deprecated;
	continue;
	}
	
	if (const auto *Unavailable = dyn_cast<UnavailableAttr>(A)) {
	// Unavailable wins outright; write the message directly and stop.
	if (Message)
	*Message = Unavailable->getMessage();
	return AR_Unavailable;
	}
	
	if (const auto *Availability = dyn_cast<AvailabilityAttr>(A)) {
	AvailabilityResult AR = CheckAvailability(getASTContext(), Availability,
	Message, EnclosingVersion);
	
	// CheckAvailability has already filled *Message for this result.
	if (AR == AR_Unavailable)
	return AR_Unavailable;
	
	if (AR > Result) {
	Result = AR;
	// Stash the text CheckAvailability wrote into *Message as the message
	// for the new running result.
	if (Message)
	ResultMessage.swap(*Message);
	}
	continue;
	}
	}
	
	// Hand the message of the final result to the caller.
	if (Message)
	Message->swap(ResultMessage);
	return Result;
	}

	/// Returns the "introduced" version of the first AvailabilityAttr that
	/// matches the target platform and has a non-empty introduced version, or an
	/// empty VersionTuple when there is none.
	VersionTuple Decl::getVersionIntroduced() const {
	  const ASTContext &Context = getASTContext();
	  StringRef TargetPlatform = Context.getTargetInfo().getPlatformName();
	  for (const auto *A : attrs()) {
	    const auto *Avail = dyn_cast<AvailabilityAttr>(A);
	    if (!Avail)
	      continue;
	    // Attributes written for other platforms are irrelevant here.
	    if (getRealizedPlatform(Avail, Context) != TargetPlatform)
	      continue;
	    if (!Avail->getIntroduced().empty())
	      return Avail->getIntroduced();
	  }
	  return VersionTuple();
	}

	bool Decl::canBeWeakImported(bool &IsDefinition) const {
	IsDefinition = false;

	// Variables, if they aren't definitions.
	if (const VarDecl *Var = dyn_cast<VarDecl>(this)) {
	if (Var->isThisDeclarationADefinition()) {
	IsDefinition = true;
	return false;
	}
	return true;

	// Functions, if they aren't definitions.
	} else if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(this)) {
	if (FD->hasBody()) {
	IsDefinition = true;
	return false;
	}
	return true;

	// Objective-C classes, if this is the non-fragile runtime.
	} else if (isa<ObjCInterfaceDecl>(this) &&
	getASTContext().getLangOpts().ObjCRuntime.hasWeakClassImport()) {
	return true;

	// Nothing else.
	} else {
	return false;
	}
	}

	bool Decl::isWeakImported() const {
	bool IsDefinition;
	if (!canBeWeakImported(IsDefinition))
	return false;

	for (const auto *A : attrs()) {
	if (isa<WeakImportAttr>(A))
	return true;

	if (const auto *Availability = dyn_cast<AvailabilityAttr>(A)) {
	if (CheckAvailability(getASTContext(), Availability, nullptr,
	VersionTuple()) == AR_NotYetIntroduced)
	return true;
	}
	}

	return false;
	}

	/// Maps a declaration kind to the bitmask of IDNS_* identifier namespaces
	/// that declarations of that kind belong to. Kinds that never have names or
	/// are never looked up by name map to 0.
	unsigned Decl::getIdentifierNamespaceForKind(Kind DeclKind) {
	  switch (DeclKind) {
	    case Function:
	    case CXXDeductionGuide:
	    case CXXMethod:
	    case CXXConstructor:
	    case ConstructorUsingShadow:
	    case CXXDestructor:
	    case CXXConversion:
	    case EnumConstant:
	    case Var:
	    case ImplicitParam:
	    case ParmVar:
	    case ObjCMethod:
	    case ObjCProperty:
	    case MSProperty:
	      return IDNS_Ordinary;
	    case Label:
	      return IDNS_Label;
	    case IndirectField:
	      return IDNS_Ordinary | IDNS_Member;

	    case Binding:
	    case NonTypeTemplateParm:
	    case VarTemplate:
	      // These (C++-only) declarations are found by redeclaration lookup for
	      // tag types, so we include them in the tag namespace.
	      return IDNS_Ordinary | IDNS_Tag;

	    case ObjCCompatibleAlias:
	    case ObjCInterface:
	      return IDNS_Ordinary | IDNS_Type;

	    case Typedef:
	    case TypeAlias:
	    case TemplateTypeParm:
	    case ObjCTypeParam:
	      return IDNS_Ordinary | IDNS_Type;

	    case UnresolvedUsingTypename:
	      return IDNS_Ordinary | IDNS_Type | IDNS_Using;

	    case UsingShadow:
	      return 0; // we'll actually overwrite this later

	    case UnresolvedUsingValue:
	      return IDNS_Ordinary | IDNS_Using;

	    case Using:
	    case UsingPack:
	      return IDNS_Using;

	    case ObjCProtocol:
	      return IDNS_ObjCProtocol;

	    case Field:
	    case ObjCAtDefsField:
	    case ObjCIvar:
	      return IDNS_Member;

	    case Record:
	    case CXXRecord:
	    case Enum:
	      return IDNS_Tag | IDNS_Type;

	    case Namespace:
	    case NamespaceAlias:
	      return IDNS_Namespace;

	    case FunctionTemplate:
	      return IDNS_Ordinary;

	    case ClassTemplate:
	    case TemplateTemplateParm:
	    case TypeAliasTemplate:
	      return IDNS_Ordinary | IDNS_Tag | IDNS_Type;

	    case OMPDeclareReduction:
	      return IDNS_OMPReduction;

	    // Never have names.
	    case Friend:
	    case FriendTemplate:
	    case AccessSpec:
	    case LinkageSpec:
	    case Export:
	    case FileScopeAsm:
	    case StaticAssert:
	    case ObjCPropertyImpl:
	    case PragmaComment:
	    case PragmaDetectMismatch:
	    case Block:
	    case Captured:
	    case TranslationUnit:
	    case ExternCContext:
	    case Decomposition:

	    case UsingDirective:
	    case BuiltinTemplate:
	    case ClassTemplateSpecialization:
	    case ClassTemplatePartialSpecialization:
	    case ClassScopeFunctionSpecialization:
	    case VarTemplateSpecialization:
	    case VarTemplatePartialSpecialization:
	    case ObjCImplementation:
	    case ObjCCategory:
	    case ObjCCategoryImpl:
	    case Import:
	    case OMPThreadPrivate:
	    case OMPCapturedExpr:
	    case Empty:
	      // Never looked up by name.
	      return 0;
	  }

	  llvm_unreachable("Invalid DeclKind!");
	}

	void Decl::setAttrsImpl(const AttrVec &attrs, ASTContext &Ctx) {
	assert(!HasAttrs && "Decl already contains attrs.");

	AttrVec &AttrBlank = Ctx.getDeclAttrs(this);
	assert(AttrBlank.empty() && "HasAttrs was wrong?");

	AttrBlank = attrs;
	HasAttrs = true;
	}

	/// Clears the HasAttrs flag and erases this declaration's attributes from
	/// its ASTContext. Does nothing when the flag is not set.
	void Decl::dropAttrs() {
	  if (HasAttrs) {
	    HasAttrs = false;
	    getASTContext().eraseDeclAttrs(this);
	  }
	}

	/// Returns this declaration's attribute vector as stored in its ASTContext.
	/// Asserts that the HasAttrs flag is set.
	const AttrVec &Decl::getAttrs() const {
	  assert(HasAttrs && "No attrs to get!");
	  ASTContext &Ctx = getASTContext();
	  return Ctx.getDeclAttrs(this);
	}

	/// Converts a DeclContext pointer to the Decl pointer of the same object.
	///
	/// The switch is generated from DeclNodes.inc: each DECL_CONTEXT kind gets
	/// its own case that casts through the concrete NAME##Decl type, and each
	/// DECL_CONTEXT_BASE kind is matched by its first/last kind range under
	/// the default label. Constness of \p D is cast away.
	Decl *Decl::castFromDeclContext (const DeclContext *D) {
	  Decl::Kind DK = D->getDeclKind();
	  switch(DK) {
	#define DECL(NAME, BASE)
	#define DECL_CONTEXT(NAME) \
	    case Decl::NAME:       \
	      return static_cast<NAME##Decl*>(const_cast<DeclContext*>(D));
	#define DECL_CONTEXT_BASE(NAME)
	#include "clang/AST/DeclNodes.inc"
	    default:
	#define DECL(NAME, BASE)
	#define DECL_CONTEXT_BASE(NAME)                  \
	      if (DK >= first##NAME && DK <= last##NAME) \
	        return static_cast<NAME##Decl*>(const_cast<DeclContext*>(D));
	#include "clang/AST/DeclNodes.inc"
	      llvm_unreachable("a decl that inherits DeclContext isn't handled");
	  }
	}

	/// Converts a Decl pointer to the DeclContext pointer of the same object.
	///
	/// The switch is generated from DeclNodes.inc: each DECL_CONTEXT kind gets
	/// its own case that casts through the concrete NAME##Decl type, and each
	/// DECL_CONTEXT_BASE kind is matched by its first/last kind range under
	/// the default label. Constness of \p D is cast away.
	DeclContext *Decl::castToDeclContext(const Decl *D) {
	  Decl::Kind DK = D->getKind();
	  switch(DK) {
	#define DECL(NAME, BASE)
	#define DECL_CONTEXT(NAME) \
	    case Decl::NAME:       \
	      return static_cast<NAME##Decl*>(const_cast<Decl*>(D));
	#define DECL_CONTEXT_BASE(NAME)
	#include "clang/AST/DeclNodes.inc"
	    default:
	#define DECL(NAME, BASE)
	#define DECL_CONTEXT_BASE(NAME)                  \
	      if (DK >= first##NAME && DK <= last##NAME) \
	        return static_cast<NAME##Decl*>(const_cast<Decl*>(D));
	#include "clang/AST/DeclNodes.inc"
	      llvm_unreachable("a decl that inherits DeclContext isn't handled");
	  }
	}

	/// Returns the end location of this declaration's body, or an invalid
	/// SourceLocation when there is no body.
	SourceLocation Decl::getBodyRBrace() const {
	  // Special handling of FunctionDecl to avoid de-serializing the body from PCH.
	  // FunctionDecl stores EndRangeLoc for this purpose.
	  if (const auto *FD = dyn_cast<FunctionDecl>(this)) {
	    const FunctionDecl *Def;
	    return FD->hasBody(Def) ? Def->getSourceRange().getEnd()
	                            : SourceLocation();
	  }

	  Stmt *Body = getBody();
	  return Body ? Body->getSourceRange().getEnd() : SourceLocation();
	}

	/// Debug-build sanity check that a declaration inside a C++ record has an
	/// access specifier other than AS_none. Always returns true; in builds with
	/// NDEBUG defined the check compiles away entirely.
	bool Decl::AccessDeclContextSanity() const {
	#ifndef NDEBUG
	  // Suppress this check if any of the following hold:
	  // 1. this is the translation unit (and thus has no parent)
	  // 2. this is a template parameter (and thus doesn't belong to its context)
	  // 3. this is a non-type template parameter
	  // 4. the context is not a record
	  // 5. it's invalid
	  // 6. it's a C++0x static_assert.
	  // 7. it's a block literal declaration
	  if (isa<TranslationUnitDecl>(this) ||
	      isa<TemplateTypeParmDecl>(this) ||
	      isa<NonTypeTemplateParmDecl>(this) ||
	      !isa<CXXRecordDecl>(getDeclContext()) ||
	      isInvalidDecl() ||
	      isa<StaticAssertDecl>(this) ||
	      isa<BlockDecl>(this) ||
	      // FIXME: a ParmVarDecl can have ClassTemplateSpecialization
	      // as DeclContext (?).
	      isa<ParmVarDecl>(this) ||
	      // FIXME: a ClassTemplateSpecialization or CXXRecordDecl can have
	      // AS_none as access specifier.
	      isa<CXXRecordDecl>(this) ||
	      isa<ClassScopeFunctionSpecializationDecl>(this))
	    return true;

	  assert(Access != AS_none &&
	         "Access specifier is AS_none inside a record decl");
	#endif
	  return true;
	}

	// File-local overloads that return the Decl::Kind of either a Decl (via
	// getKind()) or a DeclContext (via getDeclKind()), so that code templated on
	// the pointee type can query the kind uniformly.
	static Decl::Kind getKind(const Decl *D) { return D->getKind(); }
	static Decl::Kind getKind(const DeclContext *DC) { return DC->getDeclKind(); }

	/// Returns the function type behind this declaration, or nullptr.
	///
	/// The starting type is the declared type of a ValueDecl or the underlying
	/// type of a TypedefNameDecl; other declarations yield nullptr. A function
	/// pointer is looked through, and so is a block pointer when \p BlocksToo
	/// is set. The result is whatever getAs<FunctionType>() then reports.
	const FunctionType *Decl::getFunctionType(bool BlocksToo) const {
	  QualType Ty;
	  if (const auto *VD = dyn_cast<ValueDecl>(this)) {
	    Ty = VD->getType();
	  } else if (const auto *TND = dyn_cast<TypedefNameDecl>(this)) {
	    Ty = TND->getUnderlyingType();
	  } else {
	    return nullptr;
	  }

	  if (Ty->isFunctionPointerType()) {
	    Ty = Ty->getAs<PointerType>()->getPointeeType();
	  } else if (BlocksToo && Ty->isBlockPointerType()) {
	    Ty = Ty->getAs<BlockPointerType>()->getPointeeType();
	  }

	  return Ty->getAs<FunctionType>();
	}

	/// Starting at a given context (a Decl or DeclContext), look for a
	/// code context that is not a closure (a lambda, block, etc.).
	///
	/// A lambda's call operator, a block and a captured declaration are skipped
	/// by recursing into their parent context. Any other C++ method, function or
	/// Objective-C method is returned as is. Every other kind yields nullptr.
	template <class T> static Decl *getNonClosureContext(T *D) {
	  if (getKind(D) == Decl::CXXMethod) {
	    CXXMethodDecl *MD = cast<CXXMethodDecl>(D);
	    // operator() of a lambda class: continue from the context that encloses
	    // the lambda class.
	    if (MD->getOverloadedOperator() == OO_Call &&
	        MD->getParent()->isLambda())
	      return getNonClosureContext(MD->getParent()->getParent());
	    return MD;
	  } else if (FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
	    return FD;
	  } else if (ObjCMethodDecl *MD = dyn_cast<ObjCMethodDecl>(D)) {
	    return MD;
	  } else if (BlockDecl *BD = dyn_cast<BlockDecl>(D)) {
	    return getNonClosureContext(BD->getParent());
	  } else if (CapturedDecl *CD = dyn_cast<CapturedDecl>(D)) {
	    return getNonClosureContext(CD->getParent());
	  } else {
	    return nullptr;
	  }
	}

	/// Returns the result of the file-local ::getNonClosureContext helper
	/// applied to this declaration.
	Decl *Decl::getNonClosureContext() {
	  Decl *NonClosure = ::getNonClosureContext(this);
	  return NonClosure;
	}

	/// Returns the result of the file-local ::getNonClosureContext helper
	/// applied to this declaration context.
	Decl *DeclContext::getNonClosureAncestor() {
	  Decl *NonClosure = ::getNonClosureContext(this);
	  return NonClosure;
	}

	//===----------------------------------------------------------------------===//
	// DeclContext Implementation
	//===----------------------------------------------------------------------===//

	/// Reports whether the kind of \p D is a DeclContext kind.
	///
	/// The switch is generated from DeclNodes.inc. Every DECL_CONTEXT entry
	/// expands to a case label, and all of them share the single "return true"
	/// that follows the first include. Every DECL_CONTEXT_BASE entry expands,
	/// under the default label, to a first/last kind range test.
	bool DeclContext::classof(const Decl *D) {
	switch (D->getKind()) {
	#define DECL(NAME, BASE)
	#define DECL_CONTEXT(NAME) case Decl::NAME:
	#define DECL_CONTEXT_BASE(NAME)
	#include "clang/AST/DeclNodes.inc"
	// Shared by every case label generated above.
	return true;
	default:
	#define DECL(NAME, BASE)
	#define DECL_CONTEXT_BASE(NAME) \
	if (D->getKind() >= Decl::first##NAME && \
	D->getKind() <= Decl::last##NAME) \
	return true;
	#include "clang/AST/DeclNodes.inc"
	// No exact kind and no kind range matched.
	return false;
	}
	}

	// Out-of-line definition of the destructor, explicitly defaulted.
	DeclContext::~DeclContext() = default;

	/// \brief Find the parent context of this context that will be
	/// used for unqualified name lookup.
	///
	/// Normally this is the semantic parent. The lexical parent is returned
	/// instead for a function whose semantic parent's redeclaration context is a
	/// file context while its lexical parent's redeclaration context is a record,
	/// which is how friend functions declared inside a class are recognized
	/// here.
	DeclContext *DeclContext::getLookupParent() {
	  // FIXME: Find a better way to identify friends
	  const bool LooksLikeFriend =
	      isa<FunctionDecl>(this) &&
	      getParent()->getRedeclContext()->isFileContext() &&
	      getLexicalParent()->getRedeclContext()->isRecord();

	  return LooksLikeFriend ? getLexicalParent() : getParent();
	}

	/// Reports whether this context is a namespace whose NamespaceDecl is inline.
	bool DeclContext::isInlineNamespace() const {
	  if (!isNamespace())
	    return false;
	  return cast<NamespaceDecl>(this)->isInline();
	}

	bool DeclContext::isStdNamespace() const {
	if (!isNamespace())
	return false;

	const NamespaceDecl *ND = cast<NamespaceDecl>(this);
	if (ND->isInline()) {
	return ND->getParent()->isStdNamespace();
	}

	if (!getParent()->getRedeclContext()->isTranslationUnit())
	return false;

	const IdentifierInfo *II = ND->getIdentifier();
	return II && II->isStr("std");
	}

	bool DeclContext::isDependentContext() const {
	if (isFileContext())
	return false;

	if (isa<ClassTemplatePartialSpecializationDecl>(this))
	return true;

	if (const CXXRecordDecl *Record = dyn_cast<CXXRecordDecl>(this)) {
	if (Record->getDescribedClassTemplate())
	return true;

	if (Record->isDependentLambda())
	return true;
	}

	if (const FunctionDecl *Function = dyn_cast<FunctionDecl>(this)) {
	if (Function->getDescribedFunctionTemplate())
	return true;

	// Friend function declarations are dependent if their lexical
	// context is dependent.
	if (cast<Decl>(this)->getFriendObjectKind())
	return getLexicalParent()->isDependentContext();
	}

	// FIXME: A variable template is a dependent context, but is not a
	// DeclContext. A context within it (such as a lambda-expression)
	// should be considered dependent.

	return getParent() && getParent()->isDependentContext();
	}

	/// Reports whether this context is transparent: an enum that is not scoped,
	/// a linkage specification, or an export declaration.
	bool DeclContext::isTransparentContext() const {
	  if (DeclKind == Decl::Enum)
	    return !cast<EnumDecl>(this)->isScoped();

	  return DeclKind == Decl::LinkageSpec || DeclKind == Decl::Export;
	}

	/// Walks lexical parents from \p DC up to the translation unit and reports
	/// whether the first linkage specification met on the way has language
	/// \p ID. Returns false when no linkage specification is found.
	static bool isLinkageSpecContext(const DeclContext *DC,
	                                 LinkageSpecDecl::LanguageIDs ID) {
	  for (; DC->getDeclKind() != Decl::TranslationUnit;
	       DC = DC->getLexicalParent()) {
	    if (DC->getDeclKind() == Decl::LinkageSpec)
	      return cast<LinkageSpecDecl>(DC)->getLanguage() == ID;
	  }
	  return false;
	}

	/// Reports whether the innermost lexically enclosing linkage specification
	/// (starting with this context itself) has language lang_c.
	bool DeclContext::isExternCContext() const {
	  const bool InExternC = isLinkageSpecContext(this, LinkageSpecDecl::lang_c);
	  return InExternC;
	}

	/// Walks lexical parents from this context up to the translation unit and
	/// returns the first linkage specification with language lang_c, or nullptr
	/// when there is none.
	const LinkageSpecDecl *DeclContext::getExternCContext() const {
	  for (const DeclContext *DC = this;
	       DC->getDeclKind() != Decl::TranslationUnit;
	       DC = DC->getLexicalParent()) {
	    if (DC->getDeclKind() != Decl::LinkageSpec)
	      continue;
	    const auto *Spec = cast<LinkageSpecDecl>(DC);
	    if (Spec->getLanguage() == LinkageSpecDecl::lang_c)
	      return Spec;
	  }
	  return nullptr;
	}

	/// Reports whether the innermost lexically enclosing linkage specification
	/// (starting with this context itself) has language lang_cxx.
	bool DeclContext::isExternCXXContext() const {
	  const bool InExternCXX =
	      isLinkageSpecContext(this, LinkageSpecDecl::lang_cxx);
	  return InExternCXX;
	}

	/// Reports whether \p DC, or one of its semantic ancestors, has this
	/// context's primary context as its own primary context. A non-primary
	/// context forwards the question to its primary context.
	bool DeclContext::Encloses(const DeclContext *DC) const {
	  auto *Primary = getPrimaryContext();
	  if (Primary != this)
	    return Primary->Encloses(DC);

	  while (DC) {
	    if (DC->getPrimaryContext() == this)
	      return true;
	    DC = DC->getParent();
	  }
	  return false;
	}

	/// Returns the primary DeclContext for the entity this context belongs to.
	///
	/// Depending on DeclKind the result is this context itself, the original
	/// namespace, the definition of an Objective-C interface or protocol, or
	/// (for tag kinds) the tag's definition or the declaration currently being
	/// defined. Kinds not handled explicitly are asserted to be function kinds
	/// and return this context.
	DeclContext *DeclContext::getPrimaryContext() {
	switch (DeclKind) {
	case Decl::TranslationUnit:
	case Decl::ExternCContext:
	case Decl::LinkageSpec:
	case Decl::Export:
	case Decl::Block:
	case Decl::Captured:
	case Decl::OMPDeclareReduction:
	// There is only one DeclContext for these entities.
	return this;
	
	case Decl::Namespace:
	// The original namespace is our primary context.
	return static_cast<NamespaceDecl*>(this)->getOriginalNamespace();
	
	case Decl::ObjCMethod:
	return this;
	
	case Decl::ObjCInterface:
	// Prefer the definition when one exists; otherwise this declaration.
	if (ObjCInterfaceDecl *Def = cast<ObjCInterfaceDecl>(this)->getDefinition())
	return Def;
	return this;
	
	case Decl::ObjCProtocol:
	// Prefer the definition when one exists; otherwise this declaration.
	if (ObjCProtocolDecl *Def = cast<ObjCProtocolDecl>(this)->getDefinition())
	return Def;
	return this;
	
	case Decl::ObjCCategory:
	return this;
	
	case Decl::ObjCImplementation:
	case Decl::ObjCCategoryImpl:
	return this;
	
	default:
	// Tag kinds form a contiguous range rather than individual cases.
	if (DeclKind >= Decl::firstTag && DeclKind <= Decl::lastTag) {
	// If this is a tag type that has a definition or is currently
	// being defined, that definition is our primary context.
	TagDecl *Tag = cast<TagDecl>(this);
	
	if (TagDecl *Def = Tag->getDefinition())
	return Def;
	
	if (const TagType *TagTy = dyn_cast<TagType>(Tag->getTypeForDecl())) {
	// Note, TagType::getDecl returns the (partial) definition one exists.
	TagDecl *PossiblePartialDef = TagTy->getDecl();
	if (PossiblePartialDef->isBeingDefined())
	return PossiblePartialDef;
	} else {
	// The only non-TagType expected here is an injected class name.
	assert(isa<InjectedClassNameType>(Tag->getTypeForDecl()));
	}
	
	// No definition, finished or in progress: the tag is its own primary.
	return Tag;
	}
	
	assert(DeclKind >= Decl::firstFunction && DeclKind <= Decl::lastFunction &&
	"Unknown DeclContext kind");
	return this;
	}
	}

	/// Fills \p Contexts with every DeclContext that makes up this entity.
	///
	/// \p Contexts is cleared first. For anything but a namespace the result is
	/// just this context. For a namespace the redeclaration chain is walked
	/// backwards from the most recent declaration and then reversed, so the
	/// result runs from the earliest declaration to the most recent one.
	void
	DeclContext::collectAllContexts(SmallVectorImpl<DeclContext *> &Contexts){
	  Contexts.clear();

	  if (DeclKind != Decl::Namespace) {
	    Contexts.push_back(this);
	    return;
	  }

	  NamespaceDecl *Self = static_cast<NamespaceDecl *>(this);
	  for (NamespaceDecl *N = Self->getMostRecentDecl(); N;
	       N = N->getPreviousDecl())
	    Contexts.push_back(N);

	  std::reverse(Contexts.begin(), Contexts.end());
	}

	/// Links the declarations in \p Decls into a singly linked chain through
	/// their NextInContextAndBits pointers.
	///
	/// \param Decls  declarations to link, in order.
	/// \param FieldsAlreadyLoaded  when true, FieldDecls in \p Decls are skipped
	/// and left out of the chain.
	/// \return the first and last declaration of the new chain; both are null
	/// when nothing was linked.
	std::pair<Decl *, Decl *>
	DeclContext::BuildDeclChain(ArrayRef<Decl*> Decls,
	                            bool FieldsAlreadyLoaded) {
	  // Build up a chain of declarations via the Decl::NextInContextAndBits field.
	  Decl *FirstNewDecl = nullptr;
	  Decl *PrevDecl = nullptr;
	  for (unsigned I = 0, N = Decls.size(); I != N; ++I) {
	    if (FieldsAlreadyLoaded && isa<FieldDecl>(Decls[I]))
	      continue;

	    Decl *D = Decls[I];
	    if (PrevDecl)
	      PrevDecl->NextInContextAndBits.setPointer(D);
	    else
	      FirstNewDecl = D;

	    PrevDecl = D;
	  }

	  return std::make_pair(FirstNewDecl, PrevDecl);
	}

	/// \brief Called once external visible storage has been acquired while a
	/// lookup map already exists. Clears the pending-reconcile flag and marks
	/// every entry of the lookup map as having external declarations.
	void DeclContext::reconcileExternalVisibleStorage() const {
	  assert(NeedToReconcileExternalVisibleStorage && LookupPtr);
	  NeedToReconcileExternalVisibleStorage = false;

	  for (auto &Entry : *LookupPtr) {
	    Entry.second.setHasExternalDecls();
	  }
	}

	/// \brief Load the declarations within this lexical storage from an
	/// external source.
	///
	/// Clears the ExternalLexicalStorage flag, asks the external source for the
	/// lexical declarations of this context and splices them in front of the
	/// existing declaration list.
	///
	/// \return \c true if any declarations were added.
	bool
	DeclContext::LoadLexicalDeclsFromExternalStorage() const {
	  ExternalASTSource *Source = getParentASTContext().getExternalSource();
	  assert(hasExternalLexicalStorage() && Source && "No external storage?");

	  // Notify that we have a DeclContext that is initializing.
	  ExternalASTSource::Deserializing ADeclContext(Source);

	  // Load the external declarations, if any.
	  SmallVector<Decl*, 64> Decls;
	  ExternalLexicalStorage = false;
	  Source->FindExternalLexicalDecls(this, Decls);

	  if (Decls.empty())
	    return false;

	  // We may have already loaded just the fields of this record, in which case
	  // we need to ignore them.
	  bool FieldsAlreadyLoaded = false;
	  if (const RecordDecl *RD = dyn_cast<RecordDecl>(this))
	    FieldsAlreadyLoaded = RD->LoadedFieldsFromExternalStorage;

	  // Splice the newly-read declarations into the beginning of the list
	  // of declarations.
	  Decl *ExternalFirst, *ExternalLast;
	  std::tie(ExternalFirst, ExternalLast) =
	      BuildDeclChain(Decls, FieldsAlreadyLoaded);
	  ExternalLast->NextInContextAndBits.setPointer(FirstDecl);
	  FirstDecl = ExternalFirst;
	  if (!LastDecl)
	    LastDecl = ExternalLast;
	  return true;
	}

	/// Records that \p DC has no external visible declarations named \p Name.
	///
	/// Creates the lookup map if \p DC has none, performs any pending
	/// reconciliation, removes the external declarations from the entry for
	/// \p Name (creating that entry if needed) and returns an empty result.
	DeclContext::lookup_result
	ExternalASTSource::SetNoExternalVisibleDeclsForName(const DeclContext *DC,
	                                                    DeclarationName Name) {
	  ASTContext &Context = DC->getParentASTContext();
	  StoredDeclsMap *Map = DC->LookupPtr;
	  if (!Map)
	    Map = DC->CreateStoredDeclsMap(Context);

	  if (DC->NeedToReconcileExternalVisibleStorage)
	    DC->reconcileExternalVisibleStorage();

	  StoredDeclsList &Entry = (*Map)[Name];
	  Entry.removeExternalDecls();

	  return DeclContext::lookup_result();
	}

	/// Installs \p Decls as the external visible declarations of \p Name in
	/// \p DC and returns the resulting lookup result.
	///
	/// Creates the lookup map if \p DC has none and performs any pending
	/// reconciliation. Old external declarations for the name are dropped first.
	/// If other declarations remain, each new declaration is first offered as a
	/// redeclaration of an existing one and is appended only when that is
	/// declined.
	DeclContext::lookup_result
	ExternalASTSource::SetExternalVisibleDeclsForName(const DeclContext *DC,
	                                                  DeclarationName Name,
	                                                  ArrayRef<NamedDecl*> Decls) {
	  ASTContext &Context = DC->getParentASTContext();
	  StoredDeclsMap *Map;
	  if (!(Map = DC->LookupPtr))
	    Map = DC->CreateStoredDeclsMap(Context);
	  if (DC->NeedToReconcileExternalVisibleStorage)
	    DC->reconcileExternalVisibleStorage();

	  StoredDeclsList &List = (*Map)[Name];

	  // Clear out any old external visible declarations, to avoid quadratic
	  // performance in the redeclaration checks below.
	  List.removeExternalDecls();

	  if (!List.isNull()) {
	    // We have both existing declarations and new declarations for this name.
	    // Some of the declarations may simply replace existing ones. Handle those
	    // first.
	    llvm::SmallVector<unsigned, 8> Skip;
	    for (unsigned I = 0, N = Decls.size(); I != N; ++I)
	      if (List.HandleRedeclaration(Decls[I], /*IsKnownNewer*/false))
	        Skip.push_back(I);
	    // Sentinel index, so Skip[SkipPos] below is always in bounds.
	    Skip.push_back(Decls.size());

	    // Add in any new declarations.
	    unsigned SkipPos = 0;
	    for (unsigned I = 0, N = Decls.size(); I != N; ++I) {
	      if (I == Skip[SkipPos])
	        ++SkipPos;
	      else
	        List.AddSubsequentDecl(Decls[I]);
	    }
	  } else {
	    // Convert the array to a StoredDeclsList.
	    for (ArrayRef<NamedDecl*>::iterator
	           I = Decls.begin(), E = Decls.end(); I != E; ++I) {
	      if (List.isNull())
	        List.setOnlyValue(*I);
	      else
	        List.AddSubsequentDecl(*I);
	    }
	  }

	  return List.getLookupResult();
	}

	/// Returns an iterator at the first declaration of this context, loading
	/// lexical declarations from external storage first when that is pending.
	DeclContext::decl_iterator DeclContext::decls_begin() const {
	  if (hasExternalLexicalStorage()) {
	    LoadLexicalDeclsFromExternalStorage();
	  }
	  return decl_iterator(FirstDecl);
	}

	/// Reports whether this context has no declarations, loading lexical
	/// declarations from external storage first when that is pending.
	bool DeclContext::decls_empty() const {
	  if (hasExternalLexicalStorage()) {
	    LoadLexicalDeclsFromExternalStorage();
	  }
	  return FirstDecl == nullptr;
	}

	/// Reports whether \p D is currently in this context's declaration chain:
	/// its lexical context must be this context, and it must either have a
	/// successor in the chain or be the last declaration.
	bool DeclContext::containsDecl(Decl *D) const {
	  return (D->getLexicalDeclContext() == this &&
	          (D->NextInContextAndBits.getPointer() || D == LastDecl));
	}

	/// Removes \p D from this context's declaration chain and, for a named
	/// declaration, from the lookup tables it may have been entered into.
	///
	/// \p D must have this context as its lexical context and must currently be
	/// in the chain. The lookup cleanup starts at D's semantic context and keeps
	/// moving to the parent for as long as the current context is transparent.
	void DeclContext::removeDecl(Decl *D) {
	  assert(D->getLexicalDeclContext() == this &&
	         "decl being removed from non-lexical context");
	  assert((D->NextInContextAndBits.getPointer() || D == LastDecl) &&
	         "decl is not in decls list");

	  // Remove D from the decl chain.  This is O(n) but hopefully rare.
	  if (D == FirstDecl) {
	    if (D == LastDecl)
	      FirstDecl = LastDecl = nullptr;
	    else
	      FirstDecl = D->NextInContextAndBits.getPointer();
	  } else {
	    // Find D's predecessor and unlink D from it.
	    for (Decl *I = FirstDecl; true; I = I->NextInContextAndBits.getPointer()) {
	      assert(I && "decl not found in linked list");
	      if (I->NextInContextAndBits.getPointer() == D) {
	        I->NextInContextAndBits.setPointer(D->NextInContextAndBits.getPointer());
	        if (D == LastDecl) LastDecl = I;
	        break;
	      }
	    }
	  }

	  // Mark that D is no longer in the decl chain.
	  D->NextInContextAndBits.setPointer(nullptr);

	  // Remove D from the lookup table if necessary.
	  if (isa<NamedDecl>(D)) {
	    NamedDecl *ND = cast<NamedDecl>(D);

	    // Remove only decls that have a name
	    if (!ND->getDeclName()) return;

	    auto *DC = D->getDeclContext();
	    do {
	      StoredDeclsMap *Map = DC->getPrimaryContext()->LookupPtr;
	      if (Map) {
	        StoredDeclsMap::iterator Pos = Map->find(ND->getDeclName());
	        assert(Pos != Map->end() && "no lookup entry for decl");
	        if (Pos->second.getAsVector() || Pos->second.getAsDecl() == ND)
	          Pos->second.remove(ND);
	      }
	    } while (DC->isTransparentContext() && (DC = DC->getParent()));
	  }
	}

	/// Appends \p D to this context's declaration chain. Nothing here touches
	/// the lookup tables.
	///
	/// \p D must have this context as its lexical context and must not already
	/// be in a chain. A CXXRecordDecl context is told about the new member, and
	/// an ImportDecl that does not come from an AST file is registered with the
	/// ASTContext as a local import.
	void DeclContext::addHiddenDecl(Decl *D) {
	  assert(D->getLexicalDeclContext() == this &&
	         "Decl inserted into wrong lexical context");
	  assert(!D->getNextDeclInContext() && D != LastDecl &&
	         "Decl already inserted into a DeclContext");

	  if (!FirstDecl) {
	    // First declaration in this context.
	    FirstDecl = LastDecl = D;
	  } else {
	    LastDecl->NextInContextAndBits.setPointer(D);
	    LastDecl = D;
	  }

	  // Notify a C++ record declaration that we've added a member, so it can
	  // update its class-specific state.
	  if (auto *Record = dyn_cast<CXXRecordDecl>(this))
	    Record->addedMember(D);

	  // If this is a newly-created (not de-serialized) import declaration, wire
	  // it in to the list of local import declarations.
	  if (D->isFromASTFile())
	    return;
	  if (auto *Import = dyn_cast<ImportDecl>(D))
	    D->getASTContext().addedLocalImportDecl(Import);
	}

	/// Appends \p D to this context and, when it is a NamedDecl, makes it
	/// visible in the primary context of its semantic declaration context
	/// (flags passed: false, true).
	void DeclContext::addDecl(Decl *D) {
	  addHiddenDecl(D);

	  if (auto *ND = dyn_cast<NamedDecl>(D)) {
	    DeclContext *Primary = ND->getDeclContext()->getPrimaryContext();
	    Primary->makeDeclVisibleInContextWithFlags(ND, false, true);
	  }
	}

	/// Appends \p D to this context and, when it is a NamedDecl, makes it
	/// visible in the primary context of its semantic declaration context
	/// (flags passed: true, true).
	void DeclContext::addDeclInternal(Decl *D) {
	  addHiddenDecl(D);

	  if (auto *ND = dyn_cast<NamedDecl>(D)) {
	    DeclContext *Primary = ND->getDeclContext()->getPrimaryContext();
	    Primary->makeDeclVisibleInContextWithFlags(ND, true, true);
	  }
	}

	/// shouldBeHidden - Determine whether a declaration which was declared
	/// within its semantic context should be invisible to qualified name lookup.
	///
	/// Hidden are: unnamed declarations, declarations with an empty identifier
	/// namespace (other than using directives), template parameters, class
	/// template specializations and function template specializations.
	static bool shouldBeHidden(NamedDecl *D) {
	  // Skip unnamed declarations.
	  if (!D->getDeclName())
	    return true;

	  // Skip entities that can't be found by name lookup into a particular
	  // context.
	  if ((D->getIdentifierNamespace() == 0 && !isa<UsingDirectiveDecl>(D)) ||
	      D->isTemplateParameter())
	    return true;

	  // Skip template specializations.
	  // FIXME: This feels like a hack. Should DeclarationName support
	  // template-ids, or is there a better way to keep specializations
	  // from being visible?
	  if (isa<ClassTemplateSpecializationDecl>(D))
	    return true;
	  if (FunctionDecl *FD = dyn_cast<FunctionDecl>(D))
	    if (FD->isFunctionTemplateSpecialization())
	      return true;

	  return false;
	}

	/// buildLookup - Build the lookup data structure with all of the
	/// declarations in this DeclContext (and any other contexts linked
	/// to it or transparent contexts nested within it) and return it.
	///
	/// Must be called on the primary context. Returns the existing LookupPtr
	/// right away when no lazy lookups are pending.
	///
	/// Note that the produced map may miss out declarations from an
	/// external source. If it does, those entries will be marked with
	/// the 'hasExternalDecls' flag.
	StoredDeclsMap *DeclContext::buildLookup() {
	  assert(this == getPrimaryContext() && "buildLookup called on non-primary DC");

	  if (!HasLazyLocalLexicalLookups && !HasLazyExternalLexicalLookups)
	    return LookupPtr;

	  SmallVector<DeclContext *, 2> Contexts;
	  collectAllContexts(Contexts);

	  if (HasLazyExternalLexicalLookups) {
	    HasLazyExternalLexicalLookups = false;
	    // Loading external lexical decls may add local decls that still need
	    // to be entered into the lookup map.
	    for (auto *DC : Contexts) {
	      if (DC->hasExternalLexicalStorage())
	        HasLazyLocalLexicalLookups |=
	            DC->LoadLexicalDeclsFromExternalStorage();
	    }

	    if (!HasLazyLocalLexicalLookups)
	      return LookupPtr;
	  }

	  for (auto *DC : Contexts)
	    buildLookupImpl(DC, hasExternalVisibleStorage());

	  // We no longer have any lazy decls.
	  HasLazyLocalLexicalLookups = false;
	  return LookupPtr;
	}

	/// buildLookupImpl - Build part of the lookup data structure for the
	/// declarations contained within DCtx, which will either be this
	/// DeclContext, a DeclContext linked to it, or a transparent context
	/// nested within it.
	///
	/// \param DCtx  the context whose already-loaded declarations are visited.
	/// \param Internal  forwarded unchanged to makeDeclVisibleInContextImpl and
	/// to the recursive calls.
	void DeclContext::buildLookupImpl(DeclContext *DCtx, bool Internal) {
	  for (Decl *D : DCtx->noload_decls()) {
	    // Insert this declaration into the lookup structure, but only if
	    // it's semantically within its decl context. Any other decls which
	    // should be found in this context are added eagerly.
	    //
	    // If it's from an AST file, don't add it now. It'll get handled by
	    // FindExternalVisibleDeclsByName if needed. Exception: if we're not
	    // in C++, we do not track external visible decls for the TU, so in
	    // that case we need to collect them all here.
	    if (NamedDecl *ND = dyn_cast<NamedDecl>(D))
	      if (ND->getDeclContext() == DCtx && !shouldBeHidden(ND) &&
	          (!ND->isFromASTFile() ||
	           (isTranslationUnit() &&
	            !getParentASTContext().getLangOpts().CPlusPlus)))
	        makeDeclVisibleInContextImpl(ND, Internal);

	    // If this declaration is itself a transparent declaration context
	    // or inline namespace, add the members of this declaration of that
	    // context (recursively).
	    if (DeclContext *InnerCtx = dyn_cast<DeclContext>(D))
	      if (InnerCtx->isTransparentContext() || InnerCtx->isInlineNamespace())
	        buildLookupImpl(InnerCtx, Internal);
	  }
	}

	// Out-of-line definition of the static member SingleElementDummyList, a
	// constant null NamedDecl pointer.
	NamedDecl *const DeclContextLookupResult::SingleElementDummyList = nullptr;

	/// Finds the declarations named \p Name in this context.
	///
	/// A non-primary context forwards to its primary context. With external
	/// visible storage the lookup map is built or created as needed, and the
	/// external source is consulted unless the map already holds an entry for
	/// the name with no external declarations outstanding. Without external
	/// visible storage only the (lazily built) local map is consulted. Must not
	/// be called on linkage-spec or export contexts.
	DeclContext::lookup_result
	DeclContext::lookup(DeclarationName Name) const {
	  assert(DeclKind != Decl::LinkageSpec && DeclKind != Decl::Export &&
	         "should not perform lookups into transparent contexts");

	  const DeclContext *PrimaryContext = getPrimaryContext();
	  if (PrimaryContext != this)
	    return PrimaryContext->lookup(Name);

	  // If we have an external source, ensure that any later redeclarations of this
	  // context have been loaded, since they may add names to the result of this
	  // lookup (or add external visible storage).
	  ExternalASTSource *Source = getParentASTContext().getExternalSource();
	  if (Source)
	    (void)cast<Decl>(this)->getMostRecentDecl();

	  if (hasExternalVisibleStorage()) {
	    assert(Source && "external visible storage but no external source?");

	    if (NeedToReconcileExternalVisibleStorage)
	      reconcileExternalVisibleStorage();

	    StoredDeclsMap *Map = LookupPtr;

	    if (HasLazyLocalLexicalLookups || HasLazyExternalLexicalLookups)
	      // FIXME: Make buildLookup const?
	      Map = const_cast<DeclContext*>(this)->buildLookup();

	    if (!Map)
	      Map = CreateStoredDeclsMap(getParentASTContext());

	    // If we have a lookup result with no external decls, we are done.
	    std::pair<StoredDeclsMap::iterator, bool> R =
	        Map->insert(std::make_pair(Name, StoredDeclsList()));
	    if (!R.second && !R.first->second.hasExternalDecls())
	      return R.first->second.getLookupResult();

	    // Re-read LookupPtr and look the name up again rather than reusing R
	    // after the external source has been consulted.
	    if (Source->FindExternalVisibleDeclsByName(this, Name) || !R.second) {
	      if (StoredDeclsMap *Map = LookupPtr) {
	        StoredDeclsMap::iterator I = Map->find(Name);
	        if (I != Map->end())
	          return I->second.getLookupResult();
	      }
	    }

	    return lookup_result();
	  }

	  StoredDeclsMap *Map = LookupPtr;
	  if (HasLazyLocalLexicalLookups || HasLazyExternalLexicalLookups)
	    Map = const_cast<DeclContext*>(this)->buildLookup();

	  if (!Map)
	    return lookup_result();

	  StoredDeclsMap::iterator I = Map->find(Name);
	  if (I == Map->end())
	    return lookup_result();

	  return I->second.getLookupResult();
	}

	/// Look up \p Name using only declarations that are already available,
	/// without importing anything from an external source.
	///
	/// \returns the stored lookup result, or an empty result if there is no
	/// lookup table or no entry for \p Name.
	DeclContext::lookup_result
	DeclContext::noload_lookup(DeclarationName Name) {
	  assert(DeclKind != Decl::LinkageSpec && DeclKind != Decl::Export &&
	         "should not perform lookups into transparent contexts");

	  // Non-primary contexts defer to their primary context.
	  DeclContext *Primary = getPrimaryContext();
	  if (Primary != this)
	    return Primary->noload_lookup(Name);

	  // Fold any pending local lexical declarations into the lookup map. External
	  // declarations are deliberately not imported, not even when some are known
	  // to be missing from the external visible lookups.
	  if (HasLazyLocalLexicalLookups) {
	    SmallVector<DeclContext *, 2> AllContexts;
	    collectAllContexts(AllContexts);
	    for (DeclContext *Ctx : AllContexts)
	      buildLookupImpl(Ctx, hasExternalVisibleStorage());
	    HasLazyLocalLexicalLookups = false;
	  }

	  if (StoredDeclsMap *Table = LookupPtr) {
	    StoredDeclsMap::iterator Entry = Table->find(Name);
	    if (Entry != Table->end())
	      return Entry->second.getLookupResult();
	  }

	  return lookup_result();
	}

	/// Collect into \p Results the declarations named \p Name in this context.
	/// Whatever \p Results held on entry is discarded first.
	void DeclContext::localUncachedLookup(DeclarationName Name,
	                                      SmallVectorImpl<NamedDecl *> &Results) {
	  Results.clear();

	  // With no external storage of either kind (and a non-empty name), an
	  // ordinary lookup gives the answer; just copy its results.
	  const bool HasExternalStorage =
	      hasExternalVisibleStorage() || hasExternalLexicalStorage();
	  if (!HasExternalStorage && Name) {
	    lookup_result Found = lookup(Name);
	    Results.insert(Results.end(), Found.begin(), Found.end());
	    return;
	  }

	  // If we have a lookup table with no lazy lookups pending, try it first.
	  // FIXME: Should we be checking these flags on the primary context?
	  const bool HasLazyLookups =
	      HasLazyLocalLexicalLookups || HasLazyExternalLexicalLookups;
	  if (Name && !HasLazyLookups) {
	    if (StoredDeclsMap *Table = LookupPtr) {
	      StoredDeclsMap::iterator Entry = Table->find(Name);
	      if (Entry != Table->end()) {
	        lookup_result Cached = Entry->second.getLookupResult();
	        Results.insert(Results.end(), Cached.begin(), Cached.end());
	        return;
	      }
	    }
	  }

	  // Slow path: walk the declaration chain of this context and pick out every
	  // named declaration whose name matches.
	  // FIXME: If we have lazy external declarations, this will not find them!
	  // FIXME: Should we CollectAllContexts and walk them all here?
	  for (Decl *Cur = FirstDecl; Cur; Cur = Cur->getNextDeclInContext()) {
	    NamedDecl *Named = dyn_cast<NamedDecl>(Cur);
	    if (Named && Named->getDeclName() == Name)
	      Results.push_back(Named);
	  }
	}

	/// Return the nearest context, starting with this one and moving outward
	/// through parents, that is not a transparent context.
	DeclContext *DeclContext::getRedeclContext() {
	  DeclContext *Current = this;
	  for (; Current->isTransparentContext(); Current = Current->getParent()) {
	    // Nothing to do: the loop header steps past each transparent context.
	  }
	  return Current;
	}

	/// Return the primary context of the nearest file context (namespace or
	/// translation unit), starting the search at this context itself.
	DeclContext *DeclContext::getEnclosingNamespaceContext() {
	  DeclContext *Current = this;
	  for (; !Current->isFileContext(); Current = Current->getParent()) {
	    // Nothing to do: the loop header steps past each non-file context.
	  }
	  return Current->getPrimaryContext();
	}

	/// Walk outward through lexical parents for as long as the context is a
	/// record, and return the last record seen.
	///
	/// \returns the outermost such record, or nullptr if this context is not a
	/// record at all.
	RecordDecl *DeclContext::getOuterLexicalRecordContext() {
	  RecordDecl *Outermost = nullptr;
	  for (DeclContext *Cur = this; Cur->isRecord(); Cur = Cur->getLexicalParent())
	    Outermost = cast<RecordDecl>(Cur);
	  return Outermost;
	}

	/// Determine whether this context equals \p O, or is reached from \p O by
	/// walking outward through inline namespaces only.
	///
	/// \param O the context whose enclosing namespace set is examined.
	/// \returns true if a match is found; false once a context that is not an
	/// inline namespace is reached or the parent chain ends.
	bool DeclContext::InEnclosingNamespaceSetOf(const DeclContext *O) const {
	  // For non-file contexts, this is equivalent to Equals.
	  if (!isFileContext())
	    return O->Equals(this);

	  do {
	    if (O->Equals(this))
	      return true;

	    // Only inline namespaces extend the set outward; stop at anything else.
	    const NamespaceDecl *NS = dyn_cast<NamespaceDecl>(O);
	    if (!NS || !NS->isInline())
	      break;
	    O = NS->getParent();
	  } while (O);

	  return false;
	}

	/// Make \p D visible to name lookup in the primary context of this context.
	void DeclContext::makeDeclVisibleInContext(NamedDecl *D) {
	  DeclContext *Primary = this->getPrimaryContext();
	  DeclContext *SemanticPrimary = D->getDeclContext()->getPrimaryContext();

	  // A declaration added outside of its semantic context must have its lookup
	  // information built eagerly, so it is only "recoverable" when the two
	  // primary contexts coincide.
	  const bool Recoverable = (Primary == SemanticPrimary);
	  Primary->makeDeclVisibleInContextWithFlags(D, false, Recoverable);
	}

	/// Make \p D findable by name lookup in this context and, when this context
	/// is transparent or an inline namespace, in its parent's primary context as
	/// well (recursively). Must be called on a primary context.
	///
	/// \param D the declaration to expose; skipped if shouldBeHidden(D).
	/// \param Internal forwarded unchanged to makeDeclVisibleInContextImpl.
	/// \param Recoverable when false, lookup information is built eagerly rather
	///        than deferred (subject to the translation-unit exception in the
	///        condition below).
	void DeclContext::makeDeclVisibleInContextWithFlags(NamedDecl *D, bool Internal,
	                                                    bool Recoverable) {
	  assert(this == getPrimaryContext() && "expected a primary DC");

	  // Contexts that are not lookup contexts store nothing themselves; a
	  // transparent one hands the declaration to its parent.
	  if (!isLookupContext()) {
	    if (isTransparentContext())
	      getParent()->getPrimaryContext()
	        ->makeDeclVisibleInContextWithFlags(D, Internal, Recoverable);
	    return;
	  }

	  // Skip declarations which should be invisible to name lookup.
	  if (shouldBeHidden(D))
	    return;

	  // If we already have a lookup data structure, perform the insertion into
	  // it. If we might have externally-stored decls with this name, look them
	  // up and perform the insertion. If this decl was declared outside its
	  // semantic context, buildLookup won't add it, so add it now.
	  //
	  // FIXME: As a performance hack, don't add such decls into the translation
	  // unit unless we're in C++, since qualified lookup into the TU is never
	  // performed.
	  if (LookupPtr || hasExternalVisibleStorage() ||
	      ((!Recoverable || D->getDeclContext() != D->getLexicalDeclContext()) &&
	       (getParentASTContext().getLangOpts().CPlusPlus ||
	        !isTranslationUnit()))) {
	    // If we have lazily omitted any decls, they might have the same name as
	    // the decl which we are adding, so build a full lookup table before adding
	    // this decl.
	    buildLookup();
	    makeDeclVisibleInContextImpl(D, Internal);
	  } else {
	    HasLazyLocalLexicalLookups = true;
	  }

	  // If we are a transparent context or inline namespace, insert into our
	  // parent context, too. This operation is recursive.
	  if (isTransparentContext() || isInlineNamespace())
	    getParent()->getPrimaryContext()->
	        makeDeclVisibleInContextWithFlags(D, Internal, Recoverable);

	  Decl *DCAsDecl = cast<Decl>(this);
	  // Notify that a decl was made visible unless we are a Tag being defined.
	  if (!(isa<TagDecl>(DCAsDecl) && cast<TagDecl>(DCAsDecl)->isBeingDefined()))
	    if (ASTMutationListener *L = DCAsDecl->getASTMutationListener())
	      L->AddedVisibleDecl(this, D);
	}

	/// Insert \p D into this context's stored declaration map, creating the map
	/// if it does not exist yet.
	///
	/// \param D the declaration to record under its own name.
	/// \param Internal true when \p D is being added as part of loading an
	///        external declaration: the external source is then not queried,
	///        the entry is flagged as having external decls, and no existing
	///        declaration is replaced.
	void DeclContext::makeDeclVisibleInContextImpl(NamedDecl *D, bool Internal) {
	  // Find or create the stored declaration map.
	  StoredDeclsMap *Map = LookupPtr;
	  if (!Map)
	    Map = CreateStoredDeclsMap(getParentASTContext());

	  // If there is an external AST source, load any declarations it knows about
	  // with this declaration's name.
	  // If the lookup table contains an entry about this name it means that we
	  // have already checked the external source.
	  if (!Internal)
	    if (ExternalASTSource *Source = getParentASTContext().getExternalSource())
	      if (hasExternalVisibleStorage() &&
	          Map->find(D->getDeclName()) == Map->end())
	        Source->FindExternalVisibleDeclsByName(this, D->getDeclName());

	  // Insert this declaration into the map.
	  StoredDeclsList &DeclNameEntries = (*Map)[D->getDeclName()];

	  if (Internal) {
	    // If this is being added as part of loading an external declaration,
	    // this may not be the only external declaration with this name.
	    // In this case, we never try to replace an existing declaration; we'll
	    // handle that when we finalize the list of declarations for this name.
	    DeclNameEntries.setHasExternalDecls();
	    DeclNameEntries.AddSubsequentDecl(D);
	    return;
	  }

	  if (DeclNameEntries.isNull()) {
	    DeclNameEntries.setOnlyValue(D);
	    return;
	  }

	  if (DeclNameEntries.HandleRedeclaration(D, /*IsKnownNewer*/!Internal)) {
	    // This declaration has replaced an existing one for which
	    // declarationReplaces returns true.
	    return;
	  }

	  // Put this declaration into the appropriate slot.
	  DeclNameEntries.AddSubsequentDecl(D);
	}

	/// Dereference the iterator: the element the wrapped iterator currently
	/// points at, cast to a UsingDirectiveDecl.
	UsingDirectiveDecl *DeclContext::udir_iterator::operator*() const {
	  return cast<UsingDirectiveDecl>(*I);
	}

	/// Returns the half-open iterator range [First, Last) over the
	/// UsingDirectiveDecls stored within this context.
	DeclContext::udir_range DeclContext::using_directives() const {
	  // Using directives are found through an ordinary lookup of their shared
	  // name.
	  // FIXME: Use something more efficient than normal lookup for using
	  // directives. In C++, using directives are looked up more than anything else.
	  lookup_result Directives = lookup(UsingDirectiveDecl::getName());
	  return udir_range(Directives.begin(), Directives.end());
	}

	//===----------------------------------------------------------------------===//
	// Creation and Destruction of StoredDeclsMaps. //
	//===----------------------------------------------------------------------===//

	/// Allocate the stored declaration map for this (primary) context, install it
	/// as LookupPtr, and link it at the head of the chain of maps recorded in
	/// \p C.
	///
	/// \returns the newly created map: a DependentStoredDeclsMap for dependent
	/// contexts, a plain StoredDeclsMap otherwise.
	StoredDeclsMap *DeclContext::CreateStoredDeclsMap(ASTContext &C) const {
	  assert(!LookupPtr && "context already has a decls map");
	  assert(getPrimaryContext() == this &&
	         "creating decls map on non-primary context");

	  const bool IsDependent = isDependentContext();
	  StoredDeclsMap *NewMap = nullptr;
	  if (IsDependent)
	    NewMap = new DependentStoredDeclsMap();
	  else
	    NewMap = new StoredDeclsMap();

	  // Push the map onto C's list; the int bit remembers which kind it is.
	  NewMap->Previous = C.LastSDM;
	  C.LastSDM = llvm::PointerIntPair<StoredDeclsMap*,1>(NewMap, IsDependent);

	  LookupPtr = NewMap;
	  return NewMap;
	}

	void ASTContext::ReleaseDeclContextMaps() {
	// It's okay to delete DependentStoredDeclsMaps via a StoredDeclsMap
	// pointer because the subclass doesn't add anything that needs to
	// be deleted.
	StoredDeclsMap::DestroyAll(LastSDM.getPointer(), LastSDM.getInt());
	}

	/// Delete \p Map and every map linked behind it through Previous.
	///
	/// \param Map head of the chain; may be null, in which case nothing happens.
	/// \param Dependent whether \p Map is a DependentStoredDeclsMap; for later
	///        maps the flag is taken from the link that leads to them.
	void StoredDeclsMap::DestroyAll(StoredDeclsMap *Map, bool Dependent) {
	  while (Map != nullptr) {
	    // Read the link first: it lives inside the map about to be deleted.
	    const llvm::PointerIntPair<StoredDeclsMap*,1> Link = Map->Previous;

	    if (!Dependent)
	      delete Map;
	    else
	      delete static_cast<DependentStoredDeclsMap*>(Map);

	    Map = Link.getPointer();
	    Dependent = Link.getInt();
	  }
	}

	/// Create a DependentDiagnostic for \p PDiag and attach it to the primary
	/// context of \p Parent, which must be a dependent context.
	///
	/// \param C context used for allocation and, if needed, for creating the
	///        parent's stored declaration map.
	/// \param Parent the dependent context the diagnostic belongs to.
	/// \param PDiag the diagnostic to record.
	/// \returns the new diagnostic, now first in the parent's diagnostic list.
	DependentDiagnostic *DependentDiagnostic::Create(ASTContext &C,
	                                                 DeclContext *Parent,
	                                                 const PartialDiagnostic &PDiag) {
	  assert(Parent->isDependentContext()
	         && "cannot iterate dependent diagnostics of non-dependent context");

	  // The diagnostic list hangs off the primary context's decls map; make sure
	  // that map exists.
	  Parent = Parent->getPrimaryContext();
	  if (!Parent->LookupPtr)
	    Parent->CreateStoredDeclsMap(C);
	  auto *DepMap = static_cast<DependentStoredDeclsMap *>(Parent->LookupPtr);

	  // Allocate the copy of the PartialDiagnostic via the ASTContext's
	  // BumpPtrAllocator, rather than the ASTContext itself.
	  PartialDiagnostic::Storage *Storage =
	      PDiag.hasStorage() ? (new (C) PartialDiagnostic::Storage) : nullptr;

	  DependentDiagnostic *NewDiag = new (C) DependentDiagnostic(PDiag, Storage);

	  // Prepend to the list.
	  // TODO: Maybe we shouldn't reverse the order during insertion.
	  NewDiag->NextDiagnostic = DepMap->FirstDiagnostic;
	  DepMap->FirstDiagnostic = NewDiag;

	  return NewDiag;
	}
	Index: head/contrib/llvm/tools/clang/lib/AST/MicrosoftMangle.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/AST/MicrosoftMangle.cpp (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/AST/MicrosoftMangle.cpp (revision 329410)
	@@ -1,3179 +1,3179 @@
	//===--- MicrosoftMangle.cpp - Microsoft Visual C++ Name Mangling ---------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This provides C++ name mangling targeting the Microsoft Visual C++ ABI.
	//
	//===----------------------------------------------------------------------===//

	#include "clang/AST/Mangle.h"
	#include "clang/AST/ASTContext.h"
	#include "clang/AST/Attr.h"
	#include "clang/AST/CXXInheritance.h"
	#include "clang/AST/CharUnits.h"
	#include "clang/AST/Decl.h"
	#include "clang/AST/DeclCXX.h"
	#include "clang/AST/DeclObjC.h"
	#include "clang/AST/DeclOpenMP.h"
	#include "clang/AST/DeclTemplate.h"
	#include "clang/AST/Expr.h"
	#include "clang/AST/ExprCXX.h"
	#include "clang/AST/VTableBuilder.h"
	#include "clang/Basic/ABI.h"
	#include "clang/Basic/DiagnosticOptions.h"
	#include "clang/Basic/TargetInfo.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/Support/JamCRC.h"
	#include "llvm/Support/MD5.h"
	#include "llvm/Support/MathExtras.h"

	using namespace clang;

	namespace {

	/// Stream that collects a mangled name in a local buffer and forwards it to
	/// the wrapped stream when destroyed. Names longer than 4096 characters (not
	/// counting a leading '\01') are replaced by "??@" + MD5 hex digest + "@".
	struct msvc_hashing_ostream : public llvm::raw_svector_ostream {
	  raw_ostream &OS;
	  llvm::SmallString<64> Buffer;

	  msvc_hashing_ostream(raw_ostream &OS)
	      : llvm::raw_svector_ostream(Buffer), OS(OS) {}

	  ~msvc_hashing_ostream() override {
	    // Measure and hash the name without its leading escape byte, if any.
	    StringRef Name = str();
	    const bool HasEscape = Name.startswith("\01");
	    if (HasEscape)
	      Name = Name.drop_front(1);

	    if (Name.size() > 4096) {
	      llvm::MD5 Hasher;
	      llvm::MD5::MD5Result Digest;
	      Hasher.update(Name);
	      Hasher.final(Digest);

	      SmallString<32> Hex;
	      llvm::MD5::stringifyResult(Digest, Hex);

	      // Re-emit the escape byte that was stripped before hashing.
	      if (HasEscape)
	        OS << '\01';
	      OS << "??@" << Hex << '@';
	      return;
	    }

	    // Short enough: pass the buffered text through untouched.
	    OS << str();
	  }
	};

	/// If \p D is a lambda closure type whose lambda context declaration is a
	/// function parameter, return that parameter's declaration context;
	/// otherwise return nullptr.
	static const DeclContext *
	getLambdaDefaultArgumentDeclContext(const Decl *D) {
	  const auto *Record = dyn_cast<CXXRecordDecl>(D);
	  if (!Record)
	    return nullptr;
	  if (!Record->isLambda())
	    return nullptr;

	  const auto *Param =
	      dyn_cast_or_null<ParmVarDecl>(Record->getLambdaContextDecl());
	  if (!Param)
	    return nullptr;
	  return Param->getDeclContext();
	}

	/// \brief Retrieve the declaration context that should be used when mangling
	/// the given declaration.
	///
	/// \param D the declaration being mangled.
	/// \returns the function's context for lambdas and blocks that appear in a
	/// parameter's default argument; otherwise the redeclaration context of
	/// \p D's declaration context, looking through captured-statement and OpenMP
	/// declare-reduction contexts.
	static const DeclContext *getEffectiveDeclContext(const Decl *D) {
	  // The ABI assumes that lambda closure types that occur within
	  // default arguments live in the context of the function. However, due to
	  // the way in which Clang parses and creates function declarations, this is
	  // not the case: the lambda closure type ends up living in the context
	  // where the function itself resides, because the function declaration itself
	  // had not yet been created. Fix the context here.
	  if (const auto *LDADC = getLambdaDefaultArgumentDeclContext(D))
	    return LDADC;

	  // Perform the same check for block literals.
	  if (const BlockDecl *BD = dyn_cast<BlockDecl>(D)) {
	    if (ParmVarDecl *ContextParam =
	            dyn_cast_or_null<ParmVarDecl>(BD->getBlockManglingContextDecl()))
	      return ContextParam->getDeclContext();
	  }

	  // Captured statements and declare-reduction declarations are looked
	  // through: use the effective context of the enclosing declaration instead.
	  const DeclContext *DC = D->getDeclContext();
	  if (isa<CapturedDecl>(DC) || isa<OMPDeclareReductionDecl>(DC)) {
	    return getEffectiveDeclContext(cast<Decl>(DC));
	  }

	  return DC->getRedeclContext();
	}

	/// Return the effective declaration context of the declaration that \p DC
	/// itself is, i.e. the context that should be treated as \p DC's parent for
	/// mangling.
	static const DeclContext *getEffectiveParentContext(const DeclContext *DC) {
	  return getEffectiveDeclContext(cast<Decl>(DC));
	}

	/// Map \p ND to the canonical function declaration used to identify it:
	/// for a function template (or a function that has a primary template) this
	/// is the canonical declaration of the templated function, otherwise the
	/// function's own canonical declaration.
	///
	/// \param ND a FunctionTemplateDecl or a FunctionDecl (enforced by cast<>).
	static const FunctionDecl *getStructor(const NamedDecl *ND) {
	  if (const auto *FTD = dyn_cast<FunctionTemplateDecl>(ND))
	    return FTD->getTemplatedDecl()->getCanonicalDecl();

	  const auto *FD = cast<FunctionDecl>(ND);
	  if (const auto *FTD = FD->getPrimaryTemplate())
	    return FTD->getTemplatedDecl()->getCanonicalDecl();

	  return FD->getCanonicalDecl();
	}

	/// MicrosoftMangleContextImpl - Overrides the default MangleContext for the
	/// Microsoft Visual C++ ABI.
	class MicrosoftMangleContextImpl : public MicrosoftMangleContext {
	typedef std::pair<const DeclContext , IdentifierInfo > DiscriminatorKeyTy;
	llvm::DenseMap<DiscriminatorKeyTy, unsigned> Discriminator;
	llvm::DenseMap<const NamedDecl *, unsigned> Uniquifier;
	llvm::DenseMap<const CXXRecordDecl *, unsigned> LambdaIds;
	llvm::DenseMap<const NamedDecl *, unsigned> SEHFilterIds;
	llvm::DenseMap<const NamedDecl *, unsigned> SEHFinallyIds;

	public:
	MicrosoftMangleContextImpl(ASTContext &Context, DiagnosticsEngine &Diags)
	: MicrosoftMangleContext(Context, Diags) {}
	bool shouldMangleCXXName(const NamedDecl *D) override;
	bool shouldMangleStringLiteral(const StringLiteral *SL) override;
	void mangleCXXName(const NamedDecl *D, raw_ostream &Out) override;
	void mangleVirtualMemPtrThunk(const CXXMethodDecl *MD,
	raw_ostream &) override;
	void mangleThunk(const CXXMethodDecl *MD, const ThunkInfo &Thunk,
	raw_ostream &) override;
	void mangleCXXDtorThunk(const CXXDestructorDecl *DD, CXXDtorType Type,
	const ThisAdjustment &ThisAdjustment,
	raw_ostream &) override;
	void mangleCXXVFTable(const CXXRecordDecl *Derived,
	ArrayRef<const CXXRecordDecl *> BasePath,
	raw_ostream &Out) override;
	void mangleCXXVBTable(const CXXRecordDecl *Derived,
	ArrayRef<const CXXRecordDecl *> BasePath,
	raw_ostream &Out) override;
	void mangleCXXVirtualDisplacementMap(const CXXRecordDecl *SrcRD,
	const CXXRecordDecl *DstRD,
	raw_ostream &Out) override;
	void mangleCXXThrowInfo(QualType T, bool IsConst, bool IsVolatile,
	bool IsUnaligned, uint32_t NumEntries,
	raw_ostream &Out) override;
	void mangleCXXCatchableTypeArray(QualType T, uint32_t NumEntries,
	raw_ostream &Out) override;
	void mangleCXXCatchableType(QualType T, const CXXConstructorDecl *CD,
	CXXCtorType CT, uint32_t Size, uint32_t NVOffset,
	int32_t VBPtrOffset, uint32_t VBIndex,
	raw_ostream &Out) override;
	void mangleCXXRTTI(QualType T, raw_ostream &Out) override;
	void mangleCXXRTTIName(QualType T, raw_ostream &Out) override;
	void mangleCXXRTTIBaseClassDescriptor(const CXXRecordDecl *Derived,
	uint32_t NVOffset, int32_t VBPtrOffset,
	uint32_t VBTableOffset, uint32_t Flags,
	raw_ostream &Out) override;
	void mangleCXXRTTIBaseClassArray(const CXXRecordDecl *Derived,
	raw_ostream &Out) override;
	void mangleCXXRTTIClassHierarchyDescriptor(const CXXRecordDecl *Derived,
	raw_ostream &Out) override;
	void
	mangleCXXRTTICompleteObjectLocator(const CXXRecordDecl *Derived,
	ArrayRef<const CXXRecordDecl *> BasePath,
	raw_ostream &Out) override;
	void mangleTypeName(QualType T, raw_ostream &) override;
	void mangleCXXCtor(const CXXConstructorDecl *D, CXXCtorType Type,
	raw_ostream &) override;
	void mangleCXXDtor(const CXXDestructorDecl *D, CXXDtorType Type,
	raw_ostream &) override;
	void mangleReferenceTemporary(const VarDecl *, unsigned ManglingNumber,
	raw_ostream &) override;
	void mangleStaticGuardVariable(const VarDecl *D, raw_ostream &Out) override;
	void mangleThreadSafeStaticGuardVariable(const VarDecl *D, unsigned GuardNum,
	raw_ostream &Out) override;
	void mangleDynamicInitializer(const VarDecl *D, raw_ostream &Out) override;
	void mangleDynamicAtExitDestructor(const VarDecl *D,
	raw_ostream &Out) override;
	void mangleSEHFilterExpression(const NamedDecl *EnclosingDecl,
	raw_ostream &Out) override;
	void mangleSEHFinallyBlock(const NamedDecl *EnclosingDecl,
	raw_ostream &Out) override;
	void mangleStringLiteral(const StringLiteral *SL, raw_ostream &Out) override;
	bool getNextDiscriminator(const NamedDecl *ND, unsigned &disc) {
	const DeclContext *DC = getEffectiveDeclContext(ND);
	if (!DC->isFunctionOrMethod())
	return false;

	// Lambda closure types are already numbered, give out a phony number so
	// that they demangle nicely.
	if (const auto *RD = dyn_cast<CXXRecordDecl>(ND)) {
	if (RD->isLambda()) {
	disc = 1;
	return true;
	}
	}

	// Use the canonical number for externally visible decls.
	if (ND->isExternallyVisible()) {
	disc = getASTContext().getManglingNumber(ND);
	return true;
	}

	// Anonymous tags are already numbered.
	if (const TagDecl *Tag = dyn_cast<TagDecl>(ND)) {
	if (!Tag->hasNameForLinkage() &&
	!getASTContext().getDeclaratorForUnnamedTagDecl(Tag) &&
	!getASTContext().getTypedefNameForUnnamedTagDecl(Tag))
	return false;
	}

	// Make up a reasonable number for internal decls.
	unsigned &discriminator = Uniquifier[ND];
	if (!discriminator)
	discriminator = ++Discriminator[std::make_pair(DC, ND->getIdentifier())];
	disc = discriminator + 1;
	return true;
	}

	unsigned getLambdaId(const CXXRecordDecl *RD) {
	assert(RD->isLambda() && "RD must be a lambda!");
	assert(!RD->isExternallyVisible() && "RD must not be visible!");
	assert(RD->getLambdaManglingNumber() == 0 &&
	"RD must not have a mangling number!");
	std::pair<llvm::DenseMap<const CXXRecordDecl *, unsigned>::iterator, bool>
	Result = LambdaIds.insert(std::make_pair(RD, LambdaIds.size()));
	return Result.first->second;
	}

	private:
	void mangleInitFiniStub(const VarDecl *D, char CharCode, raw_ostream &Out);
	};

	/// MicrosoftCXXNameMangler - Manage the mangling of a single name for the
	/// Microsoft Visual C++ ABI.
	class MicrosoftCXXNameMangler {
	  MicrosoftMangleContextImpl &Context;
	  raw_ostream &Out;

	  /// The "structor" is the top-level declaration being mangled, if
	  /// that's not a template specialization; otherwise it's the pattern
	  /// for that specialization.
	  const NamedDecl *Structor;
	  unsigned StructorType;

	  typedef llvm::SmallVector<std::string, 10> BackRefVec;
	  BackRefVec NameBackReferences;

	  typedef llvm::DenseMap<const void *, unsigned> ArgBackRefMap;
	  ArgBackRefMap TypeBackReferences;

	  typedef std::set<int> PassObjectSizeArgsSet;
	  PassObjectSizeArgsSet PassObjectSizeArgs;

	  ASTContext &getASTContext() const { return Context.getASTContext(); }

	  // FIXME: If we add support for __ptr32/64 qualifiers, then we should push
	  // this check into mangleQualifiers().
	  const bool PointersAre64Bit;

	public:
	  enum QualifierMangleMode { QMM_Drop, QMM_Mangle, QMM_Escape, QMM_Result };

	  /// Mangler with no structor: Structor is null and StructorType is -1.
	  MicrosoftCXXNameMangler(MicrosoftMangleContextImpl &C, raw_ostream &Out_)
	      : Context(C), Out(Out_), Structor(nullptr), StructorType(-1),
	        PointersAre64Bit(C.getASTContext().getTargetInfo().getPointerWidth(0) ==
	                         64) {}

	  /// Mangler for constructor \p D of kind \p Type.
	  MicrosoftCXXNameMangler(MicrosoftMangleContextImpl &C, raw_ostream &Out_,
	                          const CXXConstructorDecl *D, CXXCtorType Type)
	      : Context(C), Out(Out_), Structor(getStructor(D)), StructorType(Type),
	        PointersAre64Bit(C.getASTContext().getTargetInfo().getPointerWidth(0) ==
	                         64) {}

	  /// Mangler for destructor \p D of kind \p Type.
	  MicrosoftCXXNameMangler(MicrosoftMangleContextImpl &C, raw_ostream &Out_,
	                          const CXXDestructorDecl *D, CXXDtorType Type)
	      : Context(C), Out(Out_), Structor(getStructor(D)), StructorType(Type),
	        PointersAre64Bit(C.getASTContext().getTargetInfo().getPointerWidth(0) ==
	                         64) {}

	  raw_ostream &getStream() const { return Out; }

	  void mangle(const NamedDecl *D, StringRef Prefix = "\01?");
	  void mangleName(const NamedDecl *ND);
	  void mangleFunctionEncoding(const FunctionDecl *FD, bool ShouldMangle);
	  void mangleVariableEncoding(const VarDecl *VD);
	  void mangleMemberDataPointer(const CXXRecordDecl *RD, const ValueDecl *VD);
	  void mangleMemberFunctionPointer(const CXXRecordDecl *RD,
	                                   const CXXMethodDecl *MD);
	  void mangleVirtualMemPtrThunk(
	      const CXXMethodDecl *MD,
	      const MicrosoftVTableContext::MethodVFTableLocation &ML);
	  void mangleNumber(int64_t Number);
	  void mangleTagTypeKind(TagTypeKind TK);
	  void mangleArtificalTagType(TagTypeKind TK, StringRef UnqualifiedName,
	                              ArrayRef<StringRef> NestedNames = None);
	  void mangleType(QualType T, SourceRange Range,
	                  QualifierMangleMode QMM = QMM_Mangle);
	  void mangleFunctionType(const FunctionType *T,
	                          const FunctionDecl *D = nullptr,
	                          bool ForceThisQuals = false);
	  void mangleNestedName(const NamedDecl *ND);

	private:
	  /// True if \p ND is the structor being mangled, either directly or after
	  /// mapping it through getStructor().
	  bool isStructorDecl(const NamedDecl *ND) const {
	    return ND == Structor || getStructor(ND) == Structor;
	  }

	  void mangleUnqualifiedName(const NamedDecl *ND) {
	    mangleUnqualifiedName(ND, ND->getDeclName());
	  }
	  void mangleUnqualifiedName(const NamedDecl *ND, DeclarationName Name);
	  void mangleSourceName(StringRef Name);
	  void mangleOperatorName(OverloadedOperatorKind OO, SourceLocation Loc);
	  void mangleCXXDtorType(CXXDtorType T);
	  void mangleQualifiers(Qualifiers Quals, bool IsMember);
	  void mangleRefQualifier(RefQualifierKind RefQualifier);
	  void manglePointerCVQualifiers(Qualifiers Quals);
	  void manglePointerExtQualifiers(Qualifiers Quals, QualType PointeeType);

	  void mangleUnscopedTemplateName(const TemplateDecl *ND);
	  void
	  mangleTemplateInstantiationName(const TemplateDecl *TD,
	                                  const TemplateArgumentList &TemplateArgs);
	  void mangleObjCMethodName(const ObjCMethodDecl *MD);

	  void mangleArgumentType(QualType T, SourceRange Range);
	  void manglePassObjectSizeArg(const PassObjectSizeAttr *POSA);

	  // Declare manglers for every type class.
	#define ABSTRACT_TYPE(CLASS, PARENT)
	#define NON_CANONICAL_TYPE(CLASS, PARENT)
	#define TYPE(CLASS, PARENT) void mangleType(const CLASS##Type *T, \
	                                            Qualifiers Quals, \
	                                            SourceRange Range);
	#include "clang/AST/TypeNodes.def"
	#undef ABSTRACT_TYPE
	#undef NON_CANONICAL_TYPE
	#undef TYPE

	  void mangleType(const TagDecl *TD);
	  void mangleDecayedArrayType(const ArrayType *T);
	  void mangleArrayType(const ArrayType *T);
	  void mangleFunctionClass(const FunctionDecl *FD);
	  void mangleCallingConvention(CallingConv CC);
	  void mangleCallingConvention(const FunctionType *T);
	  void mangleIntegerLiteral(const llvm::APSInt &Number, bool IsBoolean);
	  void mangleExpression(const Expr *E);
	  void mangleThrowSpecification(const FunctionProtoType *T);

	  void mangleTemplateArgs(const TemplateDecl *TD,
	                          const TemplateArgumentList &TemplateArgs);
	  void mangleTemplateArg(const TemplateDecl *TD, const TemplateArgument &TA,
	                         const NamedDecl *Parm);
	};
	}

	/// Decide whether the name of \p D must be mangled.
	///
	/// \returns false for MSVCRT entry points, for functions with C language
	/// linkage (unless overloadable or not named by a simple identifier), for
	/// everything outside C++ mode that was not already accepted, for extern "C"
	/// variables, and for named internal-linkage variables at translation-unit
	/// scope that are not variable template specializations; true otherwise.
	bool MicrosoftMangleContextImpl::shouldMangleCXXName(const NamedDecl *D) {
	  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
	    LanguageLinkage L = FD->getLanguageLinkage();
	    // Overloadable functions need mangling.
	    if (FD->hasAttr<OverloadableAttr>())
	      return true;

	    // The ABI expects that we would never mangle "typical" user-defined entry
	    // points regardless of visibility or freestanding-ness.
	    //
	    // N.B. This is distinct from asking about "main". "main" has a lot of
	    // special rules associated with it in the standard while these
	    // user-defined entry points are outside of the purview of the standard.
	    // For example, there can be only one definition for "main" in a standards
	    // compliant program; however nothing forbids the existence of wmain and
	    // WinMain in the same translation unit.
	    if (FD->isMSVCRTEntryPoint())
	      return false;

	    // C++ functions and those whose names are not a simple identifier need
	    // mangling.
	    if (!FD->getDeclName().isIdentifier() || L == CXXLanguageLinkage)
	      return true;

	    // C functions are not mangled.
	    if (L == CLanguageLinkage)
	      return false;
	  }

	  // Otherwise, no mangling is done outside C++ mode.
	  if (!getASTContext().getLangOpts().CPlusPlus)
	    return false;

	  const VarDecl *VD = dyn_cast<VarDecl>(D);
	  if (VD && !isa<DecompositionDecl>(D)) {
	    // C variables are not mangled.
	    if (VD->isExternC())
	      return false;

	    // Variables at global scope with non-internal linkage are not mangled.
	    const DeclContext *DC = getEffectiveDeclContext(D);
	    // Check for extern variable declared locally.
	    if (DC->isFunctionOrMethod() && D->hasLinkage())
	      while (!DC->isNamespace() && !DC->isTranslationUnit())
	        DC = getEffectiveParentContext(DC);

	    if (DC->isTranslationUnit() && D->getFormalLinkage() == InternalLinkage &&
	        !isa<VarTemplateSpecializationDecl>(D) &&
	        D->getIdentifier() != nullptr)
	      return false;
	  }

	  return true;
	}

	/// This mangle context asks for every string literal to be mangled; \p SL is
	/// not inspected.
	bool MicrosoftMangleContextImpl::shouldMangleStringLiteral(
	    const StringLiteral *SL) {
	  return true;
	}

	void MicrosoftCXXNameMangler::mangle(const NamedDecl *D, StringRef Prefix) {
	// MSVC doesn't mangle C++ names the same way it mangles extern "C" names.
	// Therefore it's really important that we don't decorate the
	// name with leading underscores or leading/trailing at signs. So, by
	// default, we emit an asm marker at the start so we get the name right.
	// Callers can override this with a custom prefix.

	// <mangled-name> ::= ? <name> <type-encoding>
	Out << Prefix;
	mangleName(D);
	if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D))
	mangleFunctionEncoding(FD, Context.shouldMangleDeclName(FD));
	else if (const VarDecl *VD = dyn_cast<VarDecl>(D))
	mangleVariableEncoding(VD);
	else
	llvm_unreachable("Tried to mangle unexpected NamedDecl!");
	}

	void MicrosoftCXXNameMangler::mangleFunctionEncoding(const FunctionDecl *FD,
	bool ShouldMangle) {
	// <type-encoding> ::= <function-class> <function-type>

	// Since MSVC operates on the type as written and not the canonical type, it
	// actually matters which decl we have here. MSVC appears to choose the
	// first, since it is most likely to be the declaration in a header file.
	FD = FD->getFirstDecl();

	// We should never ever see a FunctionNoProtoType at this point.
	// We don't even know how to mangle their types anyway :).
	const FunctionProtoType *FT = FD->getType()->castAs<FunctionProtoType>();

	// extern "C" functions can hold entities that must be mangled.
	// As it stands, these functions still need to get expressed in the full
	// external name. They have their class and type omitted, replaced with '9'.
	if (ShouldMangle) {
	// We would like to mangle all extern "C" functions using this additional
	// component but this would break compatibility with MSVC's behavior.
	// Instead, do this when we know that compatibility isn't important (in
	// other words, when it is an overloaded extern "C" function).
	if (FD->isExternC() && FD->hasAttr<OverloadableAttr>())
	Out << "$$J0";

	mangleFunctionClass(FD);

	mangleFunctionType(FT, FD);
	} else {
	Out << '9';
	}
	}

	/// Emit the type encoding of variable \p VD: one storage-class character
	/// followed by the variable's type and qualifiers.
	void MicrosoftCXXNameMangler::mangleVariableEncoding(const VarDecl *VD) {
	  // <type-encoding> ::= <storage-class> <variable-type>
	  // <storage-class> ::= 0  # private static member
	  //                 ::= 1  # protected static member
	  //                 ::= 2  # public static member
	  //                 ::= 3  # global
	  //                 ::= 4  # static local

	  // The first character in the encoding (after the name) is the storage class.
	  if (VD->isStaticDataMember()) {
	    // If it's a static member, it also encodes the access level.
	    // Any access value other than the three listed is encoded like private.
	    switch (VD->getAccess()) {
	      default:
	      case AS_private: Out << '0'; break;
	      case AS_protected: Out << '1'; break;
	      case AS_public: Out << '2'; break;
	    }
	  }
	  else if (!VD->isStaticLocal())
	    Out << '3';
	  else
	    Out << '4';
	  // Now mangle the type.
	  // <variable-type> ::= <type> <cvr-qualifiers>
	  //                 ::= <type> <pointee-cvr-qualifiers> # pointers, references
	  // Pointers and references are odd. The type of 'int * const foo;' gets
	  // mangled as 'QAHA' instead of 'PAHB', for example.
	  SourceRange SR = VD->getSourceRange();
	  QualType Ty = VD->getType();
	  if (Ty->isPointerType() || Ty->isReferenceType() ||
	      Ty->isMemberPointerType()) {
	    mangleType(Ty, SR, QMM_Drop);
	    manglePointerExtQualifiers(
	        Ty.getDesugaredType(getASTContext()).getLocalQualifiers(), QualType());
	    if (const MemberPointerType *MPT = Ty->getAs<MemberPointerType>()) {
	      mangleQualifiers(MPT->getPointeeType().getQualifiers(), true);
	      // Member pointers are suffixed with a back reference to the member
	      // pointer's class name.
	      mangleName(MPT->getClass()->getAsCXXRecordDecl());
	    } else
	      mangleQualifiers(Ty->getPointeeType().getQualifiers(), false);
	  } else if (const ArrayType *AT = getASTContext().getAsArrayType(Ty)) {
	    // Global arrays are funny, too.
	    mangleDecayedArrayType(AT);
	    if (AT->getElementType()->isArrayType())
	      Out << 'A';
	    else
	      mangleQualifiers(Ty.getQualifiers(), false);
	  } else {
	    mangleType(Ty, SR, QMM_Drop);
	    mangleQualifiers(Ty.getQualifiers(), false);
	  }
	}

	void MicrosoftCXXNameMangler::mangleMemberDataPointer(const CXXRecordDecl *RD,
	const ValueDecl *VD) {
	// <member-data-pointer> ::= <integer-literal>
	// ::= $F <number> <number>
	// ::= $G <number> <number> <number>

	int64_t FieldOffset;
	int64_t VBTableOffset;
	MSInheritanceAttr::Spelling IM = RD->getMSInheritanceModel();
	if (VD) {
	FieldOffset = getASTContext().getFieldOffset(VD);
	assert(FieldOffset % getASTContext().getCharWidth() == 0 &&
	"cannot take address of bitfield");
	FieldOffset /= getASTContext().getCharWidth();

	VBTableOffset = 0;

	if (IM == MSInheritanceAttr::Keyword_virtual_inheritance)
	FieldOffset -= getASTContext().getOffsetOfBaseWithVBPtr(RD).getQuantity();
	} else {
	FieldOffset = RD->nullFieldOffsetIsZero() ? 0 : -1;

	VBTableOffset = -1;
	}

	char Code = '\0';
	switch (IM) {
	case MSInheritanceAttr::Keyword_single_inheritance: Code = '0'; break;
	case MSInheritanceAttr::Keyword_multiple_inheritance: Code = '0'; break;
	case MSInheritanceAttr::Keyword_virtual_inheritance: Code = 'F'; break;
	case MSInheritanceAttr::Keyword_unspecified_inheritance: Code = 'G'; break;
	}

	Out << '$' << Code;

	mangleNumber(FieldOffset);

	// The C++ standard doesn't allow base-to-derived member pointer conversions
	// in template parameter contexts, so the vbptr offset of data member pointers
	// is always zero.
	if (MSInheritanceAttr::hasVBPtrOffsetField(IM))
	mangleNumber(0);
	if (MSInheritanceAttr::hasVBTableOffsetField(IM))
	mangleNumber(VBTableOffset);
	}

	void
	MicrosoftCXXNameMangler::mangleMemberFunctionPointer(const CXXRecordDecl *RD,
	const CXXMethodDecl *MD) {
	// <member-function-pointer> ::= $1? <name>
	// ::= $H? <name> <number>
	// ::= $I? <name> <number> <number>
	// ::= $J? <name> <number> <number> <number>

	MSInheritanceAttr::Spelling IM = RD->getMSInheritanceModel();

	char Code = '\0';
	switch (IM) {
	case MSInheritanceAttr::Keyword_single_inheritance: Code = '1'; break;
	case MSInheritanceAttr::Keyword_multiple_inheritance: Code = 'H'; break;
	case MSInheritanceAttr::Keyword_virtual_inheritance: Code = 'I'; break;
	case MSInheritanceAttr::Keyword_unspecified_inheritance: Code = 'J'; break;
	}

	// If non-virtual, mangle the name. If virtual, mangle as a virtual memptr
	// thunk.
	uint64_t NVOffset = 0;
	uint64_t VBTableOffset = 0;
	uint64_t VBPtrOffset = 0;
	if (MD) {
	Out << '$' << Code << '?';
	if (MD->isVirtual()) {
	MicrosoftVTableContext *VTContext =
	cast<MicrosoftVTableContext>(getASTContext().getVTableContext());
	const MicrosoftVTableContext::MethodVFTableLocation &ML =
	VTContext->getMethodVFTableLocation(GlobalDecl(MD));
	mangleVirtualMemPtrThunk(MD, ML);
	NVOffset = ML.VFPtrOffset.getQuantity();
	VBTableOffset = ML.VBTableIndex * 4;
	if (ML.VBase) {
	const ASTRecordLayout &Layout = getASTContext().getASTRecordLayout(RD);
	VBPtrOffset = Layout.getVBPtrOffset().getQuantity();
	}
	} else {
	mangleName(MD);
	mangleFunctionEncoding(MD, /ShouldMangle=/true);
	}

	if (VBTableOffset == 0 &&
	IM == MSInheritanceAttr::Keyword_virtual_inheritance)
	NVOffset -= getASTContext().getOffsetOfBaseWithVBPtr(RD).getQuantity();
	} else {
	// Null single inheritance member functions are encoded as a simple nullptr.
	if (IM == MSInheritanceAttr::Keyword_single_inheritance) {
	Out << "$0A@";
	return;
	}
	if (IM == MSInheritanceAttr::Keyword_unspecified_inheritance)
	VBTableOffset = -1;
	Out << '$' << Code;
	}

	if (MSInheritanceAttr::hasNVOffsetField(/IsMemberFunction=/true, IM))
	mangleNumber(static_cast<uint32_t>(NVOffset));
	if (MSInheritanceAttr::hasVBPtrOffsetField(IM))
	mangleNumber(VBPtrOffset);
	if (MSInheritanceAttr::hasVBTableOffsetField(IM))
	mangleNumber(VBTableOffset);
	}

	// Emit the virtual member pointer thunk encoding for MD:
	//   ?_9 <name of MD's parent> $B <vftable byte offset> A <calling convention>
	// ML supplies the method's index within its vftable.
	void MicrosoftCXXNameMangler::mangleVirtualMemPtrThunk(
	    const CXXMethodDecl *MD,
	    const MicrosoftVTableContext::MethodVFTableLocation &ML) {
	  // Byte offset of the slot = slot index * pointer size in chars.
	  const CharUnits PtrSize = getASTContext().toCharUnitsFromBits(
	      getASTContext().getTargetInfo().getPointerWidth(0));
	  const uint64_t SlotOffset = ML.Index * PtrSize.getQuantity();

	  Out << "?_9";
	  mangleName(MD->getParent());

	  Out << "$B";
	  mangleNumber(SlotOffset);

	  Out << 'A';
	  mangleCallingConvention(MD->getType()->getAs<FunctionProtoType>());
	}

	// Mangle the full name of ND.
	//
	// <name> ::= <unscoped-name> {[<named-scope>]+ | [<nested-name>]}? @
	void MicrosoftCXXNameMangler::mangleName(const NamedDecl *ND) {
	  mangleUnqualifiedName(ND); // The unqualified name always comes first,
	  mangleNestedName(ND);      // followed by the enclosing scopes,
	  Out << '@';                // and a terminating '@' for the whole name.
	}

	// Write Number to Out in the MS ABI <number> encoding.
	//
	// <non-negative integer> ::= A@              # when Number == 0
	//                        ::= <decimal digit> # when 1 <= Number <= 10
	//                        ::= <hex digit>+ @  # when Number > 10
	//
	// <number> ::= [?] <non-negative integer>
	//
	// A leading '?' marks a negative value; the magnitude follows.
	void MicrosoftCXXNameMangler::mangleNumber(int64_t Number) {
	  // Work on the magnitude as an unsigned value so that negating the most
	  // negative int64_t is well defined.
	  uint64_t Magnitude = static_cast<uint64_t>(Number);
	  if (Number < 0) {
	    Magnitude = 0 - Magnitude;
	    Out << '?';
	  }

	  if (Magnitude == 0) {
	    Out << "A@";
	    return;
	  }

	  // 1..10 are written as the single decimal digit (Magnitude - 1).
	  if (Magnitude <= 10) {
	    Out << (Magnitude - 1);
	    return;
	  }

	  // Everything else is written as nibbles mapped onto the ASCII characters
	  // 'A' to 'P', most significant nibble first, terminated by '@'.
	  // The number 0x123450 would be encoded as 'BCDEFA'.
	  char Nibbles[sizeof(uint64_t) * 2];
	  char *const BufferEnd = Nibbles + sizeof(Nibbles);
	  char *Cursor = BufferEnd;
	  while (Magnitude != 0) {
	    // Fill from the back so the characters end up in output order.
	    *--Cursor = 'A' + (Magnitude & 0xf);
	    Magnitude >>= 4;
	  }
	  Out.write(Cursor, BufferEnd - Cursor);
	  Out << '@';
	}

	// Determine whether ND is a specialization of a function, class or variable
	// template.
	//
	// On success returns the specialized template and stores a pointer to the
	// specialization's template argument list in TemplateArgs. Returns nullptr,
	// leaving TemplateArgs untouched, when ND is none of these.
	//
	// ND is taken by pointer and TemplateArgs by reference-to-pointer: the body
	// dyn_casts ND and assigns pointers to TemplateArgs, and the caller reads
	// the pointer back after the call.
	static const TemplateDecl *
	isTemplate(const NamedDecl *ND, const TemplateArgumentList *&TemplateArgs) {
	  // Check if we have a function template.
	  if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND)) {
	    if (const TemplateDecl *TD = FD->getPrimaryTemplate()) {
	      TemplateArgs = FD->getTemplateSpecializationArgs();
	      return TD;
	    }
	  }

	  // Check if we have a class template.
	  if (const ClassTemplateSpecializationDecl *Spec =
	          dyn_cast<ClassTemplateSpecializationDecl>(ND)) {
	    TemplateArgs = &Spec->getTemplateArgs();
	    return Spec->getSpecializedTemplate();
	  }

	  // Check if we have a variable template.
	  if (const VarTemplateSpecializationDecl *Spec =
	          dyn_cast<VarTemplateSpecializationDecl>(ND)) {
	    TemplateArgs = &Spec->getTemplateArgs();
	    return Spec->getSpecializedTemplate();
	  }

	  return nullptr;
	}

	// Mangle the unqualified (innermost) name of ND, whose declaration name is
	// Name.
	//
	// <unqualified-name> ::= <operator-name>
	//                    ::= <ctor-dtor-name>
	//                    ::= <source-name>
	//                    ::= <template-name>
	//
	// Template specializations are handled first; everything else is dispatched
	// on the kind of Name. Unnamed entities (anonymous namespaces, decomposition
	// declarations, anonymous structs/unions, lambdas, unnamed tags) receive
	// synthesized source names.
	void MicrosoftCXXNameMangler::mangleUnqualifiedName(const NamedDecl *ND,
	                                                    DeclarationName Name) {
	  // Check if we have a template.
	  const TemplateArgumentList *TemplateArgs = nullptr;
	  if (const TemplateDecl *TD = isTemplate(ND, TemplateArgs)) {
	    // Function templates aren't considered for name back referencing. This
	    // makes sense since function templates aren't likely to occur multiple
	    // times in a symbol.
	    if (isa<FunctionTemplateDecl>(TD)) {
	      mangleTemplateInstantiationName(TD, *TemplateArgs);
	      Out << '@';
	      return;
	    }

	    // Here comes the tricky thing: if we need to mangle something like
	    //   void foo(A::X<Y>, B::X<Y>),
	    // the X<Y> part is aliased. However, if you need to mangle
	    //   void foo(A::X<A::Y>, A::X<B::Y>),
	    // the A::X<> part is not aliased.
	    // That said, from the mangler's perspective we have a structure like this:
	    //   namespace[s] -> type[ -> template-parameters]
	    // but from the Clang perspective we have
	    //   type [ -> template-parameters]
	    //      \-> namespace[s]
	    // What we do is we create a new mangler, mangle the same type (without
	    // a namespace suffix) to a string using the extra mangler and then use
	    // the mangled type name as a key to check the mangling of different types
	    // for aliasing.

	    llvm::SmallString<64> TemplateMangling;
	    llvm::raw_svector_ostream Stream(TemplateMangling);
	    MicrosoftCXXNameMangler Extra(Context, Stream);
	    Extra.mangleTemplateInstantiationName(TD, *TemplateArgs);

	    mangleSourceName(TemplateMangling);
	    return;
	  }

	  switch (Name.getNameKind()) {
	  case DeclarationName::Identifier: {
	    if (const IdentifierInfo *II = Name.getAsIdentifierInfo()) {
	      mangleSourceName(II->getName());
	      break;
	    }

	    // Otherwise, an anonymous entity. We must have a declaration.
	    assert(ND && "mangling empty name without declaration");

	    if (const NamespaceDecl *NS = dyn_cast<NamespaceDecl>(ND)) {
	      if (NS->isAnonymousNamespace()) {
	        Out << "?A@";
	        break;
	      }
	    }

	    if (const DecompositionDecl *DD = dyn_cast<DecompositionDecl>(ND)) {
	      // FIXME: Invented mangling for decomposition declarations:
	      //   [X,Y,Z]
	      // where X,Y,Z are the names of the bindings.
	      llvm::SmallString<128> Name("[");
	      for (auto *BD : DD->bindings()) {
	        if (Name.size() > 1)
	          Name += ',';
	        Name += BD->getDeclName().getAsIdentifierInfo()->getName();
	      }
	      Name += ']';
	      mangleSourceName(Name);
	      break;
	    }

	    if (const VarDecl *VD = dyn_cast<VarDecl>(ND)) {
	      // We must have an anonymous union or struct declaration.
	      const CXXRecordDecl *RD = VD->getType()->getAsCXXRecordDecl();
	      assert(RD && "expected variable decl to have a record type");
	      // Anonymous types with no tag or typedef get the name of their
	      // declarator mangled in. If they have no declarator, number them with
	      // a $S prefix.
	      llvm::SmallString<64> Name("$S");
	      // Get a unique id for the anonymous struct.
	      Name += llvm::utostr(Context.getAnonymousStructId(RD) + 1);
	      mangleSourceName(Name.str());
	      break;
	    }

	    // We must have an anonymous struct.
	    const TagDecl *TD = cast<TagDecl>(ND);
	    if (const TypedefNameDecl *D = TD->getTypedefNameForAnonDecl()) {
	      assert(TD->getDeclContext() == D->getDeclContext() &&
	             "Typedef should not be in another decl context!");
	      assert(D->getDeclName().getAsIdentifierInfo() &&
	             "Typedef was not named!");
	      mangleSourceName(D->getDeclName().getAsIdentifierInfo()->getName());
	      break;
	    }

	    if (const CXXRecordDecl *Record = dyn_cast<CXXRecordDecl>(TD)) {
	      if (Record->isLambda()) {
	        // Closure types are named "<lambda_[ArgNo_]Id>".
	        llvm::SmallString<10> Name("<lambda_");

	        Decl *LambdaContextDecl = Record->getLambdaContextDecl();
	        unsigned LambdaManglingNumber = Record->getLambdaManglingNumber();
	        unsigned LambdaId;
	        const ParmVarDecl *Parm =
	            dyn_cast_or_null<ParmVarDecl>(LambdaContextDecl);
	        const FunctionDecl *Func =
	            Parm ? dyn_cast<FunctionDecl>(Parm->getDeclContext()) : nullptr;

	        // A lambda whose context is a function parameter is additionally
	        // tagged with that parameter's position, counted from the end of
	        // the parameter list.
	        if (Func) {
	          unsigned DefaultArgNo =
	              Func->getNumParams() - Parm->getFunctionScopeIndex();
	          Name += llvm::utostr(DefaultArgNo);
	          Name += "_";
	        }

	        // Prefer the lambda's own mangling number; fall back to an id
	        // handed out by the mangle context when that number is zero.
	        if (LambdaManglingNumber)
	          LambdaId = LambdaManglingNumber;
	        else
	          LambdaId = Context.getLambdaId(Record);

	        Name += llvm::utostr(LambdaId);
	        Name += ">";

	        mangleSourceName(Name);

	        // If the context of a closure type is an initializer for a class
	        // member (static or nonstatic), it is encoded in a qualified name.
	        if (LambdaManglingNumber && LambdaContextDecl) {
	          if ((isa<VarDecl>(LambdaContextDecl) ||
	               isa<FieldDecl>(LambdaContextDecl)) &&
	              LambdaContextDecl->getDeclContext()->isRecord()) {
	            mangleUnqualifiedName(cast<NamedDecl>(LambdaContextDecl));
	          }
	        }
	        break;
	      }
	    }

	    llvm::SmallString<64> Name;
	    if (DeclaratorDecl *DD =
	            Context.getASTContext().getDeclaratorForUnnamedTagDecl(TD)) {
	      // Anonymous types without a name for linkage purposes have their
	      // declarator mangled in if they have one.
	      Name += "<unnamed-type-";
	      Name += DD->getName();
	    } else if (TypedefNameDecl *TND =
	                   Context.getASTContext().getTypedefNameForUnnamedTagDecl(
	                       TD)) {
	      // Anonymous types without a name for linkage purposes have their
	      // associate typedef mangled in if they have one.
	      Name += "<unnamed-type-";
	      Name += TND->getName();
	    } else if (auto *ED = dyn_cast<EnumDecl>(TD)) {
	      // Unnamed enums are named after their first enumerator.
	      auto EnumeratorI = ED->enumerator_begin();
	      assert(EnumeratorI != ED->enumerator_end());
	      Name += "<unnamed-enum-";
	      Name += EnumeratorI->getName();
	    } else {
	      // Otherwise, number the types using a $S prefix.
	      Name += "<unnamed-type-$S";
	      Name += llvm::utostr(Context.getAnonymousStructId(TD) + 1);
	    }
	    Name += ">";
	    mangleSourceName(Name.str());
	    break;
	  }

	  case DeclarationName::ObjCZeroArgSelector:
	  case DeclarationName::ObjCOneArgSelector:
	  case DeclarationName::ObjCMultiArgSelector:
	    llvm_unreachable("Can't mangle Objective-C selector names here!");

	  case DeclarationName::CXXConstructorName:
	    // Closure variants only apply to the structor currently being mangled.
	    if (isStructorDecl(ND)) {
	      if (StructorType == Ctor_CopyingClosure) {
	        Out << "?_O";
	        return;
	      }
	      if (StructorType == Ctor_DefaultClosure) {
	        Out << "?_F";
	        return;
	      }
	    }
	    Out << "?0";
	    return;

	  case DeclarationName::CXXDestructorName:
	    if (isStructorDecl(ND))
	      // If the named decl is the C++ destructor we're mangling,
	      // use the type we were given.
	      mangleCXXDtorType(static_cast<CXXDtorType>(StructorType));
	    else
	      // Otherwise, use the base destructor name. This is relevant if a
	      // class with a destructor is declared within a destructor.
	      mangleCXXDtorType(Dtor_Base);
	    break;

	  case DeclarationName::CXXConversionFunctionName:
	    // <operator-name> ::= ?B # (cast)
	    // The target type is encoded as the return type.
	    Out << "?B";
	    break;

	  case DeclarationName::CXXOperatorName:
	    mangleOperatorName(Name.getCXXOverloadedOperator(), ND->getLocation());
	    break;

	  case DeclarationName::CXXLiteralOperatorName: {
	    Out << "?__K";
	    mangleSourceName(Name.getCXXLiteralIdentifier()->getName());
	    break;
	  }

	  case DeclarationName::CXXDeductionGuideName:
	    llvm_unreachable("Can't mangle a deduction guide name!");

	  case DeclarationName::CXXUsingDirective:
	    llvm_unreachable("Can't mangle a using directive name!");
	  }
	}

	// Mangles the scopes enclosing ND, walking outwards from ND's effective
	// declaration context until the translation unit is reached (or until a
	// branch below finishes the name early).
	+// <postfix> ::= <unqualified-name> [<postfix>]
	+// ::= <substitution> [<postfix>]
	void MicrosoftCXXNameMangler::mangleNestedName(const NamedDecl *ND) {
	- // <postfix> ::= <unqualified-name> [<postfix>]
	- // ::= <substitution> [<postfix>]
	const DeclContext *DC = getEffectiveDeclContext(ND);
	-
	while (!DC->isTranslationUnit()) {
	// Tags and variables may carry a discriminator, emitted as ?<number>?
	// when the mangle context reports one.
	if (isa<TagDecl>(ND) \|\| isa<VarDecl>(ND)) {
	unsigned Disc;
	if (Context.getNextDiscriminator(ND, Disc)) {
	Out << '?';
	mangleNumber(Disc);
	Out << '?';
	}
	}

	if (const BlockDecl *BD = dyn_cast<BlockDecl>(DC)) {
	// Builds "<Name>[_<Discriminator>][_<ParameterDiscriminator>]"; a zero
	// discriminator is omitted.
	auto Discriminate =
	[](StringRef Name, const unsigned Discriminator,
	const unsigned ParameterDiscriminator) -> std::string {
	std::string Buffer;
	llvm::raw_string_ostream Stream(Buffer);
	Stream << Name;
	if (Discriminator)
	Stream << '_' << Discriminator;
	if (ParameterDiscriminator)
	Stream << '_' << ParameterDiscriminator;
	return Stream.str();
	};

	// Use the block's own mangling number, falling back to an id from the
	// mangle context when that number is zero.
	unsigned Discriminator = BD->getBlockManglingNumber();
	if (!Discriminator)
	Discriminator = Context.getBlockId(BD, /Local=/false);

	// Mangle the parameter position as a discriminator to deal with unnamed
	// parameters. Rather than mangling the unqualified parameter name,
	// always use the position to give a uniform mangling.
	unsigned ParameterDiscriminator = 0;
	if (const auto *MC = BD->getBlockManglingContextDecl())
	if (const auto *P = dyn_cast<ParmVarDecl>(MC))
	if (const auto *F = dyn_cast<FunctionDecl>(P->getDeclContext()))
	ParameterDiscriminator =
	F->getNumParams() - P->getFunctionScopeIndex();

	// Continue the walk from the block's effective context.
	DC = getEffectiveDeclContext(BD);

	Out << '?';
	mangleSourceName(Discriminate("_block_invoke", Discriminator,
	ParameterDiscriminator));
	// If we have a block mangling context, encode that now. This allows us
	// to discriminate between named static data initializers in the same
	// scope. This is handled differently from parameters, which use
	// positions to discriminate between multiple instances.
	if (const auto *MC = BD->getBlockManglingContextDecl())
	if (!isa<ParmVarDecl>(MC))
	if (const auto *ND = dyn_cast<NamedDecl>(MC))
	mangleUnqualifiedName(ND);
	// MS ABI and Itanium manglings are in inverted scopes. In the case of a
	// RecordDecl, mangle the entire scope hierarchy at this point rather than
	// just the unqualified name to get the ordering correct.
	if (const auto *RD = dyn_cast<RecordDecl>(DC))
	mangleName(RD);
	else
	Out << '@';
	// void __cdecl
	Out << "YAX";
	// struct __block_literal *
	Out << 'P';
	// __ptr64
	if (PointersAre64Bit)
	Out << 'E';
	Out << 'A';
	mangleArtificalTagType(TTK_Struct,
	Discriminate("__block_literal", Discriminator,
	ParameterDiscriminator));
	Out << "@Z";

	// If the effective context was a Record, we have fully mangled the
	// qualified name and do not need to continue.
	if (isa<RecordDecl>(DC))
	break;
	continue;
	} else if (const ObjCMethodDecl *Method = dyn_cast<ObjCMethodDecl>(DC)) {
	mangleObjCMethodName(Method);
	} else if (isa<NamedDecl>(DC)) {
	ND = cast<NamedDecl>(DC);
	// An enclosing function is mangled in full with a "?" prefix, which
	// ends the walk.
	if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND)) {
	mangle(FD, "?");
	break;
	} else {
	mangleUnqualifiedName(ND);
	// Lambdas in default arguments conceptually belong to the function the
	// parameter corresponds to.
	if (const auto *LDADC = getLambdaDefaultArgumentDeclContext(ND)) {
	DC = LDADC;
	continue;
	}
	}
	}
	// Step out to the next enclosing context.
	DC = DC->getParent();
	}
	}

	// Emit the <operator-name> for destructor variant T.
	//
	// Microsoft uses the names in the comments below for these dtor variants.
	// Clang uses the Itanium terminology internally. Everything in this ABI
	// delegates towards the base dtor.
	void MicrosoftCXXNameMangler::mangleCXXDtorType(CXXDtorType T) {
	  const char *Encoding = nullptr;
	  switch (T) {
	  case Dtor_Base:
	    // <operator-name> ::= ?1  # destructor
	    Encoding = "?1";
	    break;
	  case Dtor_Complete:
	    // <operator-name> ::= ?_D # vbase destructor
	    Encoding = "?_D";
	    break;
	  case Dtor_Deleting:
	    // <operator-name> ::= ?_G # scalar deleting destructor
	    Encoding = "?_G";
	    break;
	  // <operator-name> ::= ?_E # vector deleting destructor
	  // FIXME: Add a vector deleting dtor type. It goes in the vtable, so we need
	  // it.
	  case Dtor_Comdat:
	    llvm_unreachable("not expecting a COMDAT");
	  }

	  // Values outside the enumerators handled above are not supported.
	  if (!Encoding)
	    llvm_unreachable("Unsupported dtor type?");
	  Out << Encoding;
	}

	// Writes the <operator-name> code for the overloaded operator OO to Out.
	// For the three-way comparison and conditional operators no code is written;
	// an error diagnostic is reported at Loc instead. The commented-out entries
	// list codes in the same ?x / ?_x space that are not produced by this switch.
	void MicrosoftCXXNameMangler::mangleOperatorName(OverloadedOperatorKind OO,
	SourceLocation Loc) {
	switch (OO) {
	// ?0 # constructor
	// ?1 # destructor
	// <operator-name> ::= ?2 # new
	case OO_New: Out << "?2"; break;
	// <operator-name> ::= ?3 # delete
	case OO_Delete: Out << "?3"; break;
	// <operator-name> ::= ?4 # =
	case OO_Equal: Out << "?4"; break;
	// <operator-name> ::= ?5 # >>
	case OO_GreaterGreater: Out << "?5"; break;
	// <operator-name> ::= ?6 # <<
	case OO_LessLess: Out << "?6"; break;
	// <operator-name> ::= ?7 # !
	case OO_Exclaim: Out << "?7"; break;
	// <operator-name> ::= ?8 # ==
	case OO_EqualEqual: Out << "?8"; break;
	// <operator-name> ::= ?9 # !=
	case OO_ExclaimEqual: Out << "?9"; break;
	// <operator-name> ::= ?A # []
	case OO_Subscript: Out << "?A"; break;
	// ?B # conversion
	// <operator-name> ::= ?C # ->
	case OO_Arrow: Out << "?C"; break;
	// <operator-name> ::= ?D # *
	case OO_Star: Out << "?D"; break;
	// <operator-name> ::= ?E # ++
	case OO_PlusPlus: Out << "?E"; break;
	// <operator-name> ::= ?F # --
	case OO_MinusMinus: Out << "?F"; break;
	// <operator-name> ::= ?G # -
	case OO_Minus: Out << "?G"; break;
	// <operator-name> ::= ?H # +
	case OO_Plus: Out << "?H"; break;
	// <operator-name> ::= ?I # &
	case OO_Amp: Out << "?I"; break;
	// <operator-name> ::= ?J # ->*
	case OO_ArrowStar: Out << "?J"; break;
	// <operator-name> ::= ?K # /
	case OO_Slash: Out << "?K"; break;
	// <operator-name> ::= ?L # %
	case OO_Percent: Out << "?L"; break;
	// <operator-name> ::= ?M # <
	case OO_Less: Out << "?M"; break;
	// <operator-name> ::= ?N # <=
	case OO_LessEqual: Out << "?N"; break;
	// <operator-name> ::= ?O # >
	case OO_Greater: Out << "?O"; break;
	// <operator-name> ::= ?P # >=
	case OO_GreaterEqual: Out << "?P"; break;
	// <operator-name> ::= ?Q # ,
	case OO_Comma: Out << "?Q"; break;
	// <operator-name> ::= ?R # ()
	case OO_Call: Out << "?R"; break;
	// <operator-name> ::= ?S # ~
	case OO_Tilde: Out << "?S"; break;
	// <operator-name> ::= ?T # ^
	case OO_Caret: Out << "?T"; break;
	// <operator-name> ::= ?U # |
	case OO_Pipe: Out << "?U"; break;
	// <operator-name> ::= ?V # &&
	case OO_AmpAmp: Out << "?V"; break;
	// <operator-name> ::= ?W # ||
	case OO_PipePipe: Out << "?W"; break;
	// <operator-name> ::= ?X # *=
	case OO_StarEqual: Out << "?X"; break;
	// <operator-name> ::= ?Y # +=
	case OO_PlusEqual: Out << "?Y"; break;
	// <operator-name> ::= ?Z # -=
	case OO_MinusEqual: Out << "?Z"; break;
	// <operator-name> ::= ?_0 # /=
	case OO_SlashEqual: Out << "?_0"; break;
	// <operator-name> ::= ?_1 # %=
	case OO_PercentEqual: Out << "?_1"; break;
	// <operator-name> ::= ?_2 # >>=
	case OO_GreaterGreaterEqual: Out << "?_2"; break;
	// <operator-name> ::= ?_3 # <<=
	case OO_LessLessEqual: Out << "?_3"; break;
	// <operator-name> ::= ?_4 # &=
	case OO_AmpEqual: Out << "?_4"; break;
	// <operator-name> ::= ?_5 # |=
	case OO_PipeEqual: Out << "?_5"; break;
	// <operator-name> ::= ?_6 # ^=
	case OO_CaretEqual: Out << "?_6"; break;
	// ?_7 # vftable
	// ?_8 # vbtable
	// ?_9 # vcall
	// ?_A # typeof
	// ?_B # local static guard
	// ?_C # string
	// ?_D # vbase destructor
	// ?_E # vector deleting destructor
	// ?_F # default constructor closure
	// ?_G # scalar deleting destructor
	// ?_H # vector constructor iterator
	// ?_I # vector destructor iterator
	// ?_J # vector vbase constructor iterator
	// ?_K # virtual displacement map
	// ?_L # eh vector constructor iterator
	// ?_M # eh vector destructor iterator
	// ?_N # eh vector vbase constructor iterator
	// ?_O # copy constructor closure
	// ?_P<name> # udt returning <name>
	// ?_Q # <unknown>
	// ?_R0 # RTTI Type Descriptor
	// ?_R1 # RTTI Base Class Descriptor at (a,b,c,d)
	// ?_R2 # RTTI Base Class Array
	// ?_R3 # RTTI Class Hierarchy Descriptor
	// ?_R4 # RTTI Complete Object Locator
	// ?_S # local vftable
	// ?_T # local vftable constructor closure
	// <operator-name> ::= ?_U # new[]
	case OO_Array_New: Out << "?_U"; break;
	// <operator-name> ::= ?_V # delete[]
	case OO_Array_Delete: Out << "?_V"; break;
	// <operator-name> ::= ?__L # co_await
	case OO_Coawait: Out << "?__L"; break;

	// No mangling is emitted for operator<=>; report an error at Loc.
	case OO_Spaceship: {
	// FIXME: Once MS picks a mangling, use it.
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this three-way comparison operator yet");
	Diags.Report(Loc, DiagID);
	break;
	}

	// Likewise for the conditional operator: diagnose instead of mangling.
	case OO_Conditional: {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this conditional operator yet");
	Diags.Report(Loc, DiagID);
	break;
	}

	// These enumerators do not denote an operator and must never reach here.
	case OO_None:
	case NUM_OVERLOADED_OPERATORS:
	llvm_unreachable("Not an overloaded operator");
	}
	}

	// Emit Name as a source name, or as a back reference if it was emitted
	// before.
	//
	// <source name> ::= <identifier> @
	//
	// A name already recorded in NameBackReferences is replaced by its index in
	// that table. Only the first 10 distinct names are recorded; later names are
	// always spelled out in full.
	void MicrosoftCXXNameMangler::mangleSourceName(StringRef Name) {
	  auto Hit =
	      std::find(NameBackReferences.begin(), NameBackReferences.end(), Name);

	  // Seen before: emit the back-reference index.
	  if (Hit != NameBackReferences.end()) {
	    Out << (Hit - NameBackReferences.begin());
	    return;
	  }

	  // First occurrence: remember it while there is still a free slot.
	  if (NameBackReferences.size() < 10)
	    NameBackReferences.push_back(Name);
	  Out << Name << '@';
	}

	// Mangles the Objective-C method MD by forwarding to the mangle context,
	// which writes directly to this mangler's output stream.
	void MicrosoftCXXNameMangler::mangleObjCMethodName(const ObjCMethodDecl *MD) {
	Context.mangleObjCMethodName(MD, Out);
	}

	void MicrosoftCXXNameMangler::mangleTemplateInstantiationName(
	const TemplateDecl *TD, const TemplateArgumentList &TemplateArgs) {
	// <template-name> ::= <unscoped-template-name> <template-args>
	// ::= <substitution>
	// Always start with the unqualified name.

	// Templates have their own context for back references.
	ArgBackRefMap OuterArgsContext;
	BackRefVec OuterTemplateContext;
	PassObjectSizeArgsSet OuterPassObjectSizeArgs;
	NameBackReferences.swap(OuterTemplateContext);
	TypeBackReferences.swap(OuterArgsContext);
	PassObjectSizeArgs.swap(OuterPassObjectSizeArgs);

	mangleUnscopedTemplateName(TD);
	mangleTemplateArgs(TD, TemplateArgs);

	// Restore the previous back reference contexts.
	NameBackReferences.swap(OuterTemplateContext);
	TypeBackReferences.swap(OuterArgsContext);
	PassObjectSizeArgs.swap(OuterPassObjectSizeArgs);
	}

	// Writes the "?$" template marker followed by the unqualified name of TD.
	void
	MicrosoftCXXNameMangler::mangleUnscopedTemplateName(const TemplateDecl *TD) {
	// <unscoped-template-name> ::= ?$ <unqualified-name>
	Out << "?$";
	mangleUnqualifiedName(TD);
	}

	// Mangle an integral template argument.
	//
	// <integer-literal> ::= $0 <number>
	//
	// When IsBoolean is set, a true Value is always written as 1. Otherwise the
	// value is sign- or zero-extended according to the signedness of Value.
	void MicrosoftCXXNameMangler::mangleIntegerLiteral(const llvm::APSInt &Value,
	                                                   bool IsBoolean) {
	  Out << "$0";

	  // Make sure booleans are encoded as 0/1.
	  if (IsBoolean && Value.getBoolValue()) {
	    mangleNumber(1);
	    return;
	  }

	  if (Value.isSigned()) {
	    mangleNumber(Value.getSExtValue());
	    return;
	  }

	  mangleNumber(Value.getZExtValue());
	}

	void MicrosoftCXXNameMangler::mangleExpression(const Expr *E) {
	// See if this is a constant expression.
	llvm::APSInt Value;
	if (E->isIntegerConstantExpr(Value, Context.getASTContext())) {
	mangleIntegerLiteral(Value, E->getType()->isBooleanType());
	return;
	}

	// Look through no-op casts like template parameter substitutions.
	E = E->IgnoreParenNoopCasts(Context.getASTContext());

	const CXXUuidofExpr *UE = nullptr;
	if (const UnaryOperator *UO = dyn_cast<UnaryOperator>(E)) {
	if (UO->getOpcode() == UO_AddrOf)
	UE = dyn_cast<CXXUuidofExpr>(UO->getSubExpr());
	} else
	UE = dyn_cast<CXXUuidofExpr>(E);

	if (UE) {
	// If we had to peek through an address-of operator, treat this like we are
	// dealing with a pointer type. Otherwise, treat it like a const reference.
	//
	// N.B. This matches up with the handling of TemplateArgument::Declaration
	// in mangleTemplateArg
	if (UE == E)
	Out << "$E?";
	else
	Out << "$1?";

	// This CXXUuidofExpr is mangled as-if it were actually a VarDecl from
	// const __s_GUID _GUID_{lower case UUID with underscores}
	StringRef Uuid = UE->getUuidStr();
	std::string Name = "_GUID_" + Uuid.lower();
	std::replace(Name.begin(), Name.end(), '-', '_');

	mangleSourceName(Name);
	// Terminate the whole name with an '@'.
	Out << '@';
	// It's a global variable.
	Out << '3';
	// It's a struct called __s_GUID.
	mangleArtificalTagType(TTK_Struct, "__s_GUID");
	// It's const.
	Out << 'B';
	return;
	}

	// As bad as this diagnostic is, it's better than crashing.
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(
	DiagnosticsEngine::Error, "cannot yet mangle expression type %0");
	Diags.Report(E->getExprLoc(), DiagID) << E->getStmtClassName()
	<< E->getSourceRange();
	}

	void MicrosoftCXXNameMangler::mangleTemplateArgs(
	const TemplateDecl *TD, const TemplateArgumentList &TemplateArgs) {
	// <template-args> ::= <template-arg>+
	const TemplateParameterList *TPL = TD->getTemplateParameters();
	assert(TPL->size() == TemplateArgs.size() &&
	"size mismatch between args and parms!");

	unsigned Idx = 0;
	for (const TemplateArgument &TA : TemplateArgs.asArray())
	mangleTemplateArg(TD, TA, TPL->getParam(Idx++));
	}

	void MicrosoftCXXNameMangler::mangleTemplateArg(const TemplateDecl *TD,
	const TemplateArgument &TA,
	const NamedDecl *Parm) {
	// <template-arg> ::= <type>
	// ::= <integer-literal>
	// ::= <member-data-pointer>
	// ::= <member-function-pointer>
	// ::= $E? <name> <type-encoding>
	// ::= $1? <name> <type-encoding>
	// ::= $0A@
	// ::= <template-args>

	switch (TA.getKind()) {
	case TemplateArgument::Null:
	llvm_unreachable("Can't mangle null template arguments!");
	case TemplateArgument::TemplateExpansion:
	llvm_unreachable("Can't mangle template expansion arguments!");
	case TemplateArgument::Type: {
	QualType T = TA.getAsType();
	mangleType(T, SourceRange(), QMM_Escape);
	break;
	}
	case TemplateArgument::Declaration: {
	const NamedDecl *ND = cast<NamedDecl>(TA.getAsDecl());
	if (isa<FieldDecl>(ND) \|\| isa<IndirectFieldDecl>(ND)) {
	mangleMemberDataPointer(
	cast<CXXRecordDecl>(ND->getDeclContext())->getMostRecentDecl(),
	cast<ValueDecl>(ND));
	} else if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND)) {
	const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD);
	if (MD && MD->isInstance()) {
	mangleMemberFunctionPointer(MD->getParent()->getMostRecentDecl(), MD);
	} else {
	Out << "$1?";
	mangleName(FD);
	mangleFunctionEncoding(FD, /ShouldMangle=/true);
	}
	} else {
	mangle(ND, TA.getParamTypeForDecl()->isReferenceType() ? "$E?" : "$1?");
	}
	break;
	}
	case TemplateArgument::Integral:
	mangleIntegerLiteral(TA.getAsIntegral(),
	TA.getIntegralType()->isBooleanType());
	break;
	case TemplateArgument::NullPtr: {
	QualType T = TA.getNullPtrType();
	if (const MemberPointerType *MPT = T->getAs<MemberPointerType>()) {
	const CXXRecordDecl *RD = MPT->getMostRecentCXXRecordDecl();
	if (MPT->isMemberFunctionPointerType() &&
	!isa<FunctionTemplateDecl>(TD)) {
	mangleMemberFunctionPointer(RD, nullptr);
	return;
	}
	if (MPT->isMemberDataPointer()) {
	if (!isa<FunctionTemplateDecl>(TD)) {
	mangleMemberDataPointer(RD, nullptr);
	return;
	}
	// nullptr data pointers are always represented with a single field
	// which is initialized with either 0 or -1. Why -1? Well, we need to
	// distinguish the case where the data member is at offset zero in the
	// record.
	// However, we are free to use 0 if we would use multiple fields for
	// non-nullptr member pointers.
	if (!RD->nullFieldOffsetIsZero()) {
	mangleIntegerLiteral(llvm::APSInt::get(-1), /IsBoolean=/false);
	return;
	}
	}
	}
	mangleIntegerLiteral(llvm::APSInt::getUnsigned(0), /IsBoolean=/false);
	break;
	}
	case TemplateArgument::Expression:
	mangleExpression(TA.getAsExpr());
	break;
	case TemplateArgument::Pack: {
	ArrayRef<TemplateArgument> TemplateArgs = TA.getPackAsArray();
	if (TemplateArgs.empty()) {
	if (isa<TemplateTypeParmDecl>(Parm) \|\|
	isa<TemplateTemplateParmDecl>(Parm))
	// MSVC 2015 changed the mangling for empty expanded template packs,
	// use the old mangling for link compatibility for old versions.
	Out << (Context.getASTContext().getLangOpts().isCompatibleWithMSVC(
	LangOptions::MSVC2015)
	? "$$V"
	: "$$$V");
	else if (isa<NonTypeTemplateParmDecl>(Parm))
	Out << "$S";
	else
	llvm_unreachable("unexpected template parameter decl!");
	} else {
	for (const TemplateArgument &PA : TemplateArgs)
	mangleTemplateArg(TD, PA, Parm);
	}
	break;
	}
	case TemplateArgument::Template: {
	const NamedDecl *ND =
	TA.getAsTemplate().getAsTemplateDecl()->getTemplatedDecl();
	if (const auto *TD = dyn_cast<TagDecl>(ND)) {
	mangleType(TD);
	} else if (isa<TypeAliasDecl>(ND)) {
	Out << "$$Y";
	mangleName(ND);
	} else {
	llvm_unreachable("unexpected template template NamedDecl!");
	}
	break;
	}
	}
	}

	// Emit the single-letter const/volatile code for Quals. Non-members use
	// 'A'..'D' and members use 'Q'..'T'; the other codes in the table below are
	// listed for reference and are not produced here.
	void MicrosoftCXXNameMangler::mangleQualifiers(Qualifiers Quals,
	                                               bool IsMember) {
	  // <cvr-qualifiers> ::= [E] [F] [I] <base-cvr-qualifiers>
	  // 'E' means __ptr64 (32-bit only); 'F' means __unaligned (32/64-bit only);
	  // 'I' means __restrict (32/64-bit).
	  // Note that the MSVC __restrict keyword isn't the same as the C99 restrict
	  // keyword!
	  // <base-cvr-qualifiers> ::= A  # near
	  //                       ::= B  # near const
	  //                       ::= C  # near volatile
	  //                       ::= D  # near const volatile
	  //                       ::= E  # far (16-bit)
	  //                       ::= F  # far const (16-bit)
	  //                       ::= G  # far volatile (16-bit)
	  //                       ::= H  # far const volatile (16-bit)
	  //                       ::= I  # huge (16-bit)
	  //                       ::= J  # huge const (16-bit)
	  //                       ::= K  # huge volatile (16-bit)
	  //                       ::= L  # huge const volatile (16-bit)
	  //                       ::= M <basis> # based
	  //                       ::= N <basis> # based const
	  //                       ::= O <basis> # based volatile
	  //                       ::= P <basis> # based const volatile
	  //                       ::= Q  # near member
	  //                       ::= R  # near const member
	  //                       ::= S  # near volatile member
	  //                       ::= T  # near const volatile member
	  //                       ::= U  # far member (16-bit)
	  //                       ::= V  # far const member (16-bit)
	  //                       ::= W  # far volatile member (16-bit)
	  //                       ::= X  # far const volatile member (16-bit)
	  //                       ::= Y  # huge member (16-bit)
	  //                       ::= Z  # huge const member (16-bit)
	  //                       ::= 0  # huge volatile member (16-bit)
	  //                       ::= 1  # huge const volatile member (16-bit)
	  //                       ::= 2 <basis> # based member
	  //                       ::= 3 <basis> # based const member
	  //                       ::= 4 <basis> # based volatile member
	  //                       ::= 5 <basis> # based const volatile member
	  //                       ::= 6  # near function (pointers only)
	  //                       ::= 7  # far function (pointers only)
	  //                       ::= 8  # near method (pointers only)
	  //                       ::= 9  # far method (pointers only)
	  //                       ::= _A <basis> # based function (pointers only)
	  //                       ::= _B <basis> # based function (far?) (pointers only)
	  //                       ::= _C <basis> # based method (pointers only)
	  //                       ::= _D <basis> # based method (far?) (pointers only)
	  //                       ::= _E # block (Clang)
	  // <basis> ::= 0 # __based(void)
	  //         ::= 1 # __based(segment)?
	  //         ::= 2 <name> # __based(name)
	  //         ::= 3 # ?
	  //         ::= 4 # ?
	  //         ::= 5 # not really based

	  // Index the code tables by qualifier set: bit 0 = const, bit 1 = volatile,
	  // giving the order {none, const, volatile, const volatile}.
	  const unsigned CVIndex =
	      (Quals.hasConst() ? 1u : 0u) | (Quals.hasVolatile() ? 2u : 0u);
	  const char *const Codes = IsMember ? "QRST" : "ABCD";
	  Out << Codes[CVIndex];

	  // FIXME: For now, just drop all extension qualifiers on the floor.
	}

	// Emit the code for a function's ref-qualifier; nothing is written when the
	// function has none.
	//
	// <ref-qualifier> ::= G # lvalue reference
	//                 ::= H # rvalue-reference
	void
	MicrosoftCXXNameMangler::mangleRefQualifier(RefQualifierKind RefQualifier) {
	  if (RefQualifier == RQ_LValue)
	    Out << 'G';
	  else if (RefQualifier == RQ_RValue)
	    Out << 'H';
	  // RQ_None: no output.
	}

	// Emit the extended pointer qualifiers, in the order 'E', 'I', 'F':
	//   'E' - written on 64-bit targets unless PointeeType is a function type
	//         (a null PointeeType counts as "not a function");
	//   'I' - written when Quals has restrict;
	//   'F' - written when Quals, or the local qualifiers of a non-null
	//         PointeeType, has __unaligned.
	void MicrosoftCXXNameMangler::manglePointerExtQualifiers(Qualifiers Quals,
	                                                         QualType PointeeType) {
	  bool HasRestrict = Quals.hasRestrict();
	  if (PointersAre64Bit &&
	      (PointeeType.isNull() || !PointeeType->isFunctionType()))
	    Out << 'E';

	  if (HasRestrict)
	    Out << 'I';

	  if (Quals.hasUnaligned() ||
	      (!PointeeType.isNull() && PointeeType.getLocalQualifiers().hasUnaligned()))
	    Out << 'F';
	}

	void MicrosoftCXXNameMangler::manglePointerCVQualifiers(Qualifiers Quals) {
	// <pointer-cv-qualifiers> ::= P # no qualifiers
	// ::= Q # const
	// ::= R # volatile
	// ::= S # const volatile
	const bool IsConst = Quals.hasConst();
	const bool IsVolatile = Quals.hasVolatile();

	// Select the single code letter for the const/volatile combination.
	char Code = 'P';
	if (IsConst)
	Code = IsVolatile ? 'S' : 'Q';
	else if (IsVolatile)
	Code = 'R';
	Out << Code;
	}

	void MicrosoftCXXNameMangler::mangleArgumentType(QualType T,
	SourceRange Range) {
	// Mangles one argument type. If the type's key is already present in
	// TypeBackReferences, only the recorded slot number is written.
	//
	// MSVC will backreference two canonically equivalent types that have slightly
	// different manglings when mangled alone.

	// Decayed types do not match up with non-decayed versions of the same type.
	//
	// e.g.
	// void (*x)(void) will not form a backreference with void x(void)
	void *Key;
	const auto *Decayed = T->getAs<DecayedType>();
	if (!Decayed) {
	Key = T.getCanonicalType().getAsOpaquePtr();
	} else {
	QualType Original = Decayed->getOriginalType();
	// All decayed ArrayTypes should be treated identically; as-if they were
	// a decayed IncompleteArrayType.
	if (const auto *Array = getASTContext().getAsArrayType(Original))
	Original = getASTContext().getIncompleteArrayType(
	Array->getElementType(), Array->getSizeModifier(),
	Array->getIndexTypeCVRQualifiers());

	Key = Original.getCanonicalType().getAsOpaquePtr();
	// If the original parameter was textually written as an array,
	// instead treat the decayed parameter like it's const.
	//
	// e.g.
	// int [] -> int * const
	if (Original->isArrayType())
	T = T.withConst();
	}

	ArgBackRefMap::iterator Known = TypeBackReferences.find(Key);
	if (Known != TypeBackReferences.end()) {
	// Seen before: write the back-reference slot and stop.
	Out << Known->second;
	return;
	}

	const size_t StartOffset = Out.tell();

	mangleType(T, Range, QMM_Drop);

	// See if it's worth creating a back reference.
	// Only types longer than 1 character are considered
	// and only 10 back references slots are available:
	const bool WorthRecording = (Out.tell() - StartOffset > 1);
	if (WorthRecording && TypeBackReferences.size() < 10) {
	// Read the size before operator[] inserts the new entry.
	size_t Slot = TypeBackReferences.size();
	TypeBackReferences[Key] = Slot;
	}
	}

	void MicrosoftCXXNameMangler::manglePassObjectSizeArg(
	const PassObjectSizeAttr *POSA) {
	// Mangles the synthetic parameter that stands for a pass_object_size
	// attribute: an enum named "__pass_object_size<Type>" in namespace
	// "__clang", or a back reference if that enum was already emitted.
	int Type = POSA->getType();

	// The address of the element stored in PassObjectSizeArgs is the key used
	// for this attribute value in TypeBackReferences.
	auto Iter = PassObjectSizeArgs.insert(Type).first;
	auto *TypePtr = (const void *)&*Iter;
	ArgBackRefMap::iterator Found = TypeBackReferences.find(TypePtr);

	if (Found == TypeBackReferences.end()) {
	mangleArtificalTagType(TTK_Enum, "__pass_object_size" + llvm::utostr(Type),
	{"__clang"});

	// Only 10 back-reference slots exist; record this one if there is room.
	if (TypeBackReferences.size() < 10) {
	size_t Size = TypeBackReferences.size();
	TypeBackReferences[TypePtr] = Size;
	}
	} else {
	Out << Found->second;
	}
	}

	// Mangles T after desugaring it. QMM selects how T's local qualifiers are
	// written before the type itself is dispatched to the per-class overload.
	void MicrosoftCXXNameMangler::mangleType(QualType T, SourceRange Range,
	QualifierMangleMode QMM) {
	// Don't use the canonical types. MSVC includes things like 'const' on
	// pointer arguments to function pointers that canonicalization strips away.
	T = T.getDesugaredType(getASTContext());
	Qualifiers Quals = T.getLocalQualifiers();
	// Array types are handled entirely here and never reach the dispatch
	// switch at the bottom.
	if (const ArrayType *AT = getASTContext().getAsArrayType(T)) {
	// If there were any Quals, getAsArrayType() pushed them onto the array
	// element type.
	if (QMM == QMM_Mangle)
	Out << 'A';
	else if (QMM == QMM_Escape \|\| QMM == QMM_Result)
	Out << "$$B";
	mangleArrayType(AT);
	return;
	}

	bool IsPointer = T->isAnyPointerType() \|\| T->isMemberPointerType() \|\|
	T->isReferenceType() \|\| T->isBlockPointerType();

	switch (QMM) {
	case QMM_Drop:
	// Qualifiers are not written at all.
	break;
	case QMM_Mangle:
	// Function types are written as '6' plus the function-type encoding and
	// are finished here; everything else gets its qualifiers written.
	if (const FunctionType *FT = dyn_cast<FunctionType>(T)) {
	Out << '6';
	mangleFunctionType(FT);
	return;
	}
	mangleQualifiers(Quals, false);
	break;
	case QMM_Escape:
	// Only qualified non-pointer types get the "$$C" escape.
	if (!IsPointer && Quals) {
	Out << "$$C";
	mangleQualifiers(Quals, false);
	}
	break;
	case QMM_Result:
	// Presence of __unaligned qualifier shouldn't affect mangling here.
	Quals.removeUnaligned();
	// Tag types always get the '?' prefix, even when unqualified.
	if ((!IsPointer && Quals) \|\| isa<TagType>(T)) {
	Out << '?';
	mangleQualifiers(Quals, false);
	}
	break;
	}

	const Type *ty = T.getTypePtr();

	// TypeNodes.def expands TYPE into one case per type class that forwards to
	// the matching mangleType overload; NON_CANONICAL_TYPE cases are
	// unreachable.
	switch (ty->getTypeClass()) {
	#define ABSTRACT_TYPE(CLASS, PARENT)
	#define NON_CANONICAL_TYPE(CLASS, PARENT) \
	case Type::CLASS: \
	llvm_unreachable("can't mangle non-canonical type " #CLASS "Type"); \
	return;
	#define TYPE(CLASS, PARENT) \
	case Type::CLASS: \
	mangleType(cast<CLASS##Type>(ty), Quals, Range); \
	break;
	#include "clang/AST/TypeNodes.def"
	#undef ABSTRACT_TYPE
	#undef NON_CANONICAL_TYPE
	#undef TYPE
	}
	}

	// Writes the code for a builtin type. Placeholder and dependent kinds are
	// unreachable; Float16, Float128 and Half report an error diagnostic at the
	// start of Range instead of writing anything.
	void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
	SourceRange Range) {
	// <type> ::= <builtin-type>
	// <builtin-type> ::= X # void
	// ::= C # signed char
	// ::= D # char
	// ::= E # unsigned char
	// ::= F # short
	// ::= G # unsigned short (or wchar_t if it's not a builtin)
	// ::= H # int
	// ::= I # unsigned int
	// ::= J # long
	// ::= K # unsigned long
	// L # <none>
	// ::= M # float
	// ::= N # double
	// ::= O # long double (__float80 is mangled differently)
	// ::= _J # long long, __int64
	// ::= _K # unsigned long long, unsigned __int64
	// ::= _L # __int128
	// ::= _M # unsigned __int128
	// ::= _N # bool
	// _O # <array in parameter>
	// ::= _T # __float80 (Intel)
	// ::= _S # char16_t
	// ::= _U # char32_t
	// ::= _W # wchar_t
	// ::= _Z # __float80 (Digital Mars)
	switch (T->getKind()) {
	case BuiltinType::Void:
	Out << 'X';
	break;
	case BuiltinType::SChar:
	Out << 'C';
	break;
	// Plain char uses the same code whether it is signed or unsigned.
	case BuiltinType::Char_U:
	case BuiltinType::Char_S:
	Out << 'D';
	break;
	case BuiltinType::UChar:
	Out << 'E';
	break;
	case BuiltinType::Short:
	Out << 'F';
	break;
	case BuiltinType::UShort:
	Out << 'G';
	break;
	case BuiltinType::Int:
	Out << 'H';
	break;
	case BuiltinType::UInt:
	Out << 'I';
	break;
	case BuiltinType::Long:
	Out << 'J';
	break;
	case BuiltinType::ULong:
	Out << 'K';
	break;
	case BuiltinType::Float:
	Out << 'M';
	break;
	case BuiltinType::Double:
	Out << 'N';
	break;
	// TODO: Determine size and mangle accordingly
	case BuiltinType::LongDouble:
	Out << 'O';
	break;
	case BuiltinType::LongLong:
	Out << "_J";
	break;
	case BuiltinType::ULongLong:
	Out << "_K";
	break;
	case BuiltinType::Int128:
	Out << "_L";
	break;
	case BuiltinType::UInt128:
	Out << "_M";
	break;
	case BuiltinType::Bool:
	Out << "_N";
	break;
	case BuiltinType::Char16:
	Out << "_S";
	break;
	case BuiltinType::Char32:
	Out << "_U";
	break;
	// wchar_t uses the same code whether it is signed or unsigned.
	case BuiltinType::WChar_S:
	case BuiltinType::WChar_U:
	Out << "_W";
	break;

	// BUILTIN_TYPE expands to nothing and PLACEHOLDER_TYPE to a case label, so
	// the include below yields one label per placeholder type; they all share
	// the llvm_unreachable with Dependent.
	#define BUILTIN_TYPE(Id, SingletonId)
	#define PLACEHOLDER_TYPE(Id, SingletonId) \
	case BuiltinType::Id:
	#include "clang/AST/BuiltinTypes.def"
	case BuiltinType::Dependent:
	llvm_unreachable("placeholder types shouldn't get to name mangling");

	// The Objective-C builtin types are written as "PA" followed by an
	// artificial struct tag.
	case BuiltinType::ObjCId:
	Out << "PA";
	mangleArtificalTagType(TTK_Struct, "objc_object");
	break;
	case BuiltinType::ObjCClass:
	Out << "PA";
	mangleArtificalTagType(TTK_Struct, "objc_class");
	break;
	case BuiltinType::ObjCSel:
	Out << "PA";
	mangleArtificalTagType(TTK_Struct, "objc_selector");
	break;

	// One case per OpenCL image type, each writing a fixed string built from
	// the image type name and suffix.
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	case BuiltinType::Id: \
	Out << "PAUocl_" #ImgType "_" #Suffix "@@"; \
	break;
	#include "clang/Basic/OpenCLImageTypes.def"
	case BuiltinType::OCLSampler:
	Out << "PA";
	mangleArtificalTagType(TTK_Struct, "ocl_sampler");
	break;
	case BuiltinType::OCLEvent:
	Out << "PA";
	mangleArtificalTagType(TTK_Struct, "ocl_event");
	break;
	case BuiltinType::OCLClkEvent:
	Out << "PA";
	mangleArtificalTagType(TTK_Struct, "ocl_clkevent");
	break;
	case BuiltinType::OCLQueue:
	Out << "PA";
	mangleArtificalTagType(TTK_Struct, "ocl_queue");
	break;
	case BuiltinType::OCLReserveID:
	Out << "PA";
	mangleArtificalTagType(TTK_Struct, "ocl_reserveid");
	break;

	case BuiltinType::NullPtr:
	Out << "$$T";
	break;

	// No mangling is implemented for these kinds; report an error that names
	// the type.
	case BuiltinType::Float16:
	case BuiltinType::Float128:
	case BuiltinType::Half: {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(
	DiagnosticsEngine::Error, "cannot mangle this built-in %0 type yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< T->getName(Context.getASTContext().getPrintingPolicy()) << Range;
	break;
	}
	}
	}

	// <type> ::= <function-type>
	void MicrosoftCXXNameMangler::mangleType(const FunctionProtoType *T, Qualifiers,
	SourceRange) {
	// Mangles a prototyped function type that appears as a type (not as a
	// declaration). Types carrying cv- or ref-qualifiers are written with the
	// "$$A8@@" prefix and forced this-qualifiers; all others use "$$A6".
	//
	// Structors only appear in decls, so at this point we know it's not a
	// structor type.
	// FIXME: This may not be lambda-friendly.
	if (T->getTypeQuals() \|\| T->getRefQualifier() != RQ_None) {
	Out << "$$A8@@";
	mangleFunctionType(T, /*D=*/nullptr, /*ForceThisQuals=*/true);
	} else {
	Out << "$$A6";
	mangleFunctionType(T);
	}
	}
	// Mangles a function type without a prototype: the "$$A6" prefix followed
	// by the function-type encoding.
	void MicrosoftCXXNameMangler::mangleType(const FunctionNoProtoType *T,
	Qualifiers, SourceRange) {
	Out << "$$A6";
	mangleFunctionType(T);
	}

	void MicrosoftCXXNameMangler::mangleFunctionType(const FunctionType *T,
	const FunctionDecl *D,
	bool ForceThisQuals) {
	// <function-type> ::= <this-cvr-qualifiers> <calling-convention>
	// <return-type> <argument-list> <throw-spec>
	const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(T);

	SourceRange Range;
	if (D) Range = D->getSourceRange();

	bool IsInLambda = false;
	bool IsStructor = false, HasThisQuals = ForceThisQuals, IsCtorClosure = false;
	CallingConv CC = T->getCallConv();
	if (const CXXMethodDecl *MD = dyn_cast_or_null<CXXMethodDecl>(D)) {
	if (MD->getParent()->isLambda())
	IsInLambda = true;
	if (MD->isInstance())
	HasThisQuals = true;
	if (isa<CXXDestructorDecl>(MD)) {
	IsStructor = true;
	} else if (isa<CXXConstructorDecl>(MD)) {
	IsStructor = true;
	IsCtorClosure = (StructorType == Ctor_CopyingClosure \|\|
	StructorType == Ctor_DefaultClosure) &&
	isStructorDecl(MD);
	if (IsCtorClosure)
	CC = getASTContext().getDefaultCallingConvention(
	/IsVariadic=/false, /IsCXXMethod=/true);
	}
	}

	// If this is a C++ instance method, mangle the CVR qualifiers for the
	// this pointer.
	if (HasThisQuals) {
	Qualifiers Quals = Qualifiers::fromCVRUMask(Proto->getTypeQuals());
	manglePointerExtQualifiers(Quals, /PointeeType=/QualType());
	mangleRefQualifier(Proto->getRefQualifier());
	mangleQualifiers(Quals, /IsMember=/false);
	}

	mangleCallingConvention(CC);

	// <return-type> ::= <type>
	// ::= @ # structors (they have no declared return type)
	if (IsStructor) {
	if (isa<CXXDestructorDecl>(D) && isStructorDecl(D)) {
	// The scalar deleting destructor takes an extra int argument which is not
	// reflected in the AST.
	if (StructorType == Dtor_Deleting) {
	Out << (PointersAre64Bit ? "PEAXI@Z" : "PAXI@Z");
	return;
	}
	// The vbase destructor returns void which is not reflected in the AST.
	if (StructorType == Dtor_Complete) {
	Out << "XXZ";
	return;
	}
	}
	if (IsCtorClosure) {
	// Default constructor closure and copy constructor closure both return
	// void.
	Out << 'X';

	if (StructorType == Ctor_DefaultClosure) {
	// Default constructor closure always has no arguments.
	Out << 'X';
	} else if (StructorType == Ctor_CopyingClosure) {
	// Copy constructor closure always takes an unqualified reference.
	mangleArgumentType(getASTContext().getLValueReferenceType(
	Proto->getParamType(0)
	->getAs<LValueReferenceType>()
	->getPointeeType(),
	/SpelledAsLValue=/true),
	Range);
	Out << '@';
	} else {
	llvm_unreachable("unexpected constructor closure!");
	}
	Out << 'Z';
	return;
	}
	Out << '@';
	} else {
	QualType ResultType = T->getReturnType();
	if (const auto *AT =
	dyn_cast_or_null<AutoType>(ResultType->getContainedAutoType())) {
	Out << '?';
	mangleQualifiers(ResultType.getLocalQualifiers(), /IsMember=/false);
	Out << '?';
	assert(AT->getKeyword() != AutoTypeKeyword::GNUAutoType &&
	"shouldn't need to mangle __auto_type!");
	mangleSourceName(AT->isDecltypeAuto() ? "<decltype-auto>" : "<auto>");
	Out << '@';
	} else if (IsInLambda) {
	Out << '@';
	} else {
	if (ResultType->isVoidType())
	ResultType = ResultType.getUnqualifiedType();
	mangleType(ResultType, Range, QMM_Result);
	}
	}

	// <argument-list> ::= X # void
	// ::= <type>+ @
	// ::= <type>* Z # varargs
	if (!Proto) {
	// Function types without prototypes can arise when mangling a function type
	// within an overloadable function in C. We mangle these as the absence of
	// any parameter types (not even an empty parameter list).
	Out << '@';
	} else if (Proto->getNumParams() == 0 && !Proto->isVariadic()) {
	Out << 'X';
	} else {
	// Happens for function pointer type arguments for example.
	for (unsigned I = 0, E = Proto->getNumParams(); I != E; ++I) {
	mangleArgumentType(Proto->getParamType(I), Range);
	// Mangle each pass_object_size parameter as if it's a parameter of enum
	// type passed directly after the parameter with the pass_object_size
	// attribute. The aforementioned enum's name is __pass_object_size, and we
	// pretend it resides in a top-level namespace called __clang.
	//
	// FIXME: Is there a defined extension notation for the MS ABI, or is it
	// necessary to just cross our fingers and hope this type+namespace
	// combination doesn't conflict with anything?
	if (D)
	if (const auto *P = D->getParamDecl(I)->getAttr<PassObjectSizeAttr>())
	manglePassObjectSizeArg(P);
	}
	// <builtin-type> ::= Z # ellipsis
	if (Proto->isVariadic())
	Out << 'Z';
	else
	Out << '@';
	}

	mangleThrowSpecification(Proto);
	}

	void MicrosoftCXXNameMangler::mangleFunctionClass(const FunctionDecl *FD) {
	// <function-class> ::= <member-function> E? # E designates a 64-bit 'this'
	// # pointer. in 64-bit mode all
	// # 'this' pointers are 64-bit.
	// ::= <global-function>
	// <member-function> ::= A # private: near
	// ::= B # private: far
	// ::= C # private: static near
	// ::= D # private: static far
	// ::= E # private: virtual near
	// ::= F # private: virtual far
	// ::= I # protected: near
	// ::= J # protected: far
	// ::= K # protected: static near
	// ::= L # protected: static far
	// ::= M # protected: virtual near
	// ::= N # protected: virtual far
	// ::= Q # public: near
	// ::= R # public: far
	// ::= S # public: static near
	// ::= T # public: static far
	// ::= U # public: virtual near
	// ::= V # public: virtual far
	// <global-function> ::= Y # global near
	// ::= Z # global far
	if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD)) {
	bool IsVirtual = MD->isVirtual();
	// When mangling vbase destructor variants, ignore whether or not the
	// underlying destructor was defined to be virtual.
	if (isa<CXXDestructorDecl>(MD) && isStructorDecl(MD) &&
	StructorType == Dtor_Complete) {
	IsVirtual = false;
	}
	switch (MD->getAccess()) {
	case AS_none:
	llvm_unreachable("Unsupported access specifier");
	case AS_private:
	if (MD->isStatic())
	Out << 'C';
	else if (IsVirtual)
	Out << 'E';
	else
	Out << 'A';
	break;
	case AS_protected:
	if (MD->isStatic())
	Out << 'K';
	else if (IsVirtual)
	Out << 'M';
	else
	Out << 'I';
	break;
	case AS_public:
	if (MD->isStatic())
	Out << 'S';
	else if (IsVirtual)
	Out << 'U';
	else
	Out << 'Q';
	}
	} else {
	Out << 'Y';
	}
	}
	// Writes the one-letter code for calling convention CC. Conventions with no
	// case in the switch below reach llvm_unreachable.
	void MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC) {
	// <calling-convention> ::= A # __cdecl
	// ::= B # __export __cdecl
	// ::= C # __pascal
	// ::= D # __export __pascal
	// ::= E # __thiscall
	// ::= F # __export __thiscall
	// ::= G # __stdcall
	// ::= H # __export __stdcall
	// ::= I # __fastcall
	// ::= J # __export __fastcall
	// ::= Q # __vectorcall
	// ::= S # Swift calling convention (CC_Swift)
	// ::= w # __regcall
	// The 'export' calling conventions are from a bygone era
	// (*cough*Win16*cough*) when functions were declared for export with
	// that keyword. (It didn't actually export them, it just made them so
	// that they could be in a DLL and somebody from another module could call
	// them.)

	switch (CC) {
	default:
	llvm_unreachable("Unsupported CC for mangling");
	// CC_Win64 and CC_X86_64SysV share the __cdecl code with CC_C.
	case CC_Win64:
	case CC_X86_64SysV:
	case CC_C: Out << 'A'; break;
	case CC_X86Pascal: Out << 'C'; break;
	case CC_X86ThisCall: Out << 'E'; break;
	case CC_X86StdCall: Out << 'G'; break;
	case CC_X86FastCall: Out << 'I'; break;
	case CC_X86VectorCall: Out << 'Q'; break;
	+ case CC_Swift: Out << 'S'; break;
	case CC_X86RegCall: Out << 'w'; break;
	}
	}
	// Convenience overload: mangles the calling convention recorded on T.
	void MicrosoftCXXNameMangler::mangleCallingConvention(const FunctionType *T) {
	mangleCallingConvention(T->getCallConv());
	}
	// Writes the throw specification. FT is not inspected: every function is
	// written as 'Z' (see the note below).
	void MicrosoftCXXNameMangler::mangleThrowSpecification(
	const FunctionProtoType *FT) {
	// <throw-spec> ::= Z # throw(...) (default)
	// ::= @ # throw() or __declspec/__attribute__((nothrow))
	// ::= <type>+
	// NOTE: Since the Microsoft compiler ignores throw specifications, they are
	// all actually mangled as 'Z'. (They're ignored because their associated
	// functionality isn't implemented, and probably never will be.)
	Out << 'Z';
	}

	void MicrosoftCXXNameMangler::mangleType(const UnresolvedUsingType *T,
	Qualifiers, SourceRange Range) {
	// Probably should be mangled as a template instantiation; need to see what
	// VC does first.
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this unresolved dependent type yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	// <type> ::= <union-type> \| <struct-type> \| <class-type> \| <enum-type>
	// <union-type> ::= T <name>
	// <struct-type> ::= U <name>
	// <class-type> ::= V <name>
	// <enum-type> ::= W4 <name>
	void MicrosoftCXXNameMangler::mangleTagTypeKind(TagTypeKind TTK) {
	// Writes the prefix for the tag kind; interfaces share the struct code.
	switch (TTK) {
	case TTK_Union: Out << 'T'; break;
	case TTK_Struct:
	case TTK_Interface: Out << 'U'; break;
	case TTK_Class: Out << 'V'; break;
	case TTK_Enum: Out << "W4"; break;
	}
	}
	// Enum types are mangled through their declaration.
	void MicrosoftCXXNameMangler::mangleType(const EnumType *T, Qualifiers,
	SourceRange) {
	mangleType(cast<TagType>(T)->getDecl());
	}
	// Record types are mangled through their declaration.
	void MicrosoftCXXNameMangler::mangleType(const RecordType *T, Qualifiers,
	SourceRange) {
	mangleType(cast<TagType>(T)->getDecl());
	}
	// Writes the tag-kind prefix of TD followed by its mangled name.
	void MicrosoftCXXNameMangler::mangleType(const TagDecl *TD) {
	mangleTagTypeKind(TD->getTagKind());
	mangleName(TD);
	}
	void MicrosoftCXXNameMangler::mangleArtificalTagType(
	TagTypeKind TK, StringRef UnqualifiedName, ArrayRef<StringRef> NestedNames) {
	// Writes a tag type that has no declaration: the tag-kind prefix, the
	// unqualified name, the enclosing names, then the terminator.
	// <name> ::= <unscoped-name> {[<named-scope>]+ \| [<nested-name>]}? @
	mangleTagTypeKind(TK);

	// The unqualified name always comes first.
	mangleSourceName(UnqualifiedName);

	// Enclosing names follow, walking NestedNames from its last element to
	// its first.
	for (size_t Remaining = NestedNames.size(); Remaining != 0; --Remaining)
	mangleSourceName(NestedNames[Remaining - 1]);

	// An '@' closes the whole name.
	Out << '@';
	}

	// <type> ::= <array-type>
	// <array-type> ::= <pointer-cvr-qualifiers> <cvr-qualifiers>
	// [Y <dimension-count> <dimension>+]
	// <element-type> # as global, E is never required
	// It's supposed to be the other way around, but for some strange reason, it
	// isn't. Today this behavior is retained for the sole purpose of backwards
	// compatibility.
	void MicrosoftCXXNameMangler::mangleDecayedArrayType(const ArrayType *T) {
	// This isn't a recursive mangling, so now we have to do it all in this
	// one call.
	manglePointerCVQualifiers(T->getElementType().getQualifiers());
	mangleType(T->getElementType(), SourceRange());
	}
	// Never expected to run: mangleType(QualType, ...) routes every array type
	// to mangleArrayType before dispatching on the type class.
	void MicrosoftCXXNameMangler::mangleType(const ConstantArrayType *T, Qualifiers,
	SourceRange) {
	llvm_unreachable("Should have been special cased");
	}
	// Never expected to run: mangleType(QualType, ...) routes every array type
	// to mangleArrayType before dispatching on the type class.
	void MicrosoftCXXNameMangler::mangleType(const VariableArrayType *T, Qualifiers,
	SourceRange) {
	llvm_unreachable("Should have been special cased");
	}
	// Never expected to run: mangleType(QualType, ...) routes every array type
	// to mangleArrayType before dispatching on the type class.
	void MicrosoftCXXNameMangler::mangleType(const DependentSizedArrayType *T,
	Qualifiers, SourceRange) {
	llvm_unreachable("Should have been special cased");
	}
	// Never expected to run: mangleType(QualType, ...) routes every array type
	// to mangleArrayType before dispatching on the type class.
	void MicrosoftCXXNameMangler::mangleType(const IncompleteArrayType *T,
	Qualifiers, SourceRange) {
	llvm_unreachable("Should have been special cased");
	}
	void MicrosoftCXXNameMangler::mangleArrayType(const ArrayType *T) {
	QualType ElementTy(T, 0);
	SmallVector<llvm::APInt, 3> Dimensions;
	for (;;) {
	if (ElementTy->isConstantArrayType()) {
	const ConstantArrayType *CAT =
	getASTContext().getAsConstantArrayType(ElementTy);
	Dimensions.push_back(CAT->getSize());
	ElementTy = CAT->getElementType();
	} else if (ElementTy->isIncompleteArrayType()) {
	const IncompleteArrayType *IAT =
	getASTContext().getAsIncompleteArrayType(ElementTy);
	Dimensions.push_back(llvm::APInt(32, 0));
	ElementTy = IAT->getElementType();
	} else if (ElementTy->isVariableArrayType()) {
	const VariableArrayType *VAT =
	getASTContext().getAsVariableArrayType(ElementTy);
	Dimensions.push_back(llvm::APInt(32, 0));
	ElementTy = VAT->getElementType();
	} else if (ElementTy->isDependentSizedArrayType()) {
	// The dependent expression has to be folded into a constant (TODO).
	const DependentSizedArrayType *DSAT =
	getASTContext().getAsDependentSizedArrayType(ElementTy);
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this dependent-length array yet");
	Diags.Report(DSAT->getSizeExpr()->getExprLoc(), DiagID)
	<< DSAT->getBracketsRange();
	return;
	} else {
	break;
	}
	}
	Out << 'Y';
	// <dimension-count> ::= <number> # number of extra dimensions
	mangleNumber(Dimensions.size());
	for (const llvm::APInt &Dimension : Dimensions)
	mangleNumber(Dimension.getLimitedValue());
	mangleType(ElementTy, SourceRange(), QMM_Escape);
	}

	// <type> ::= <pointer-to-member-type>
	// <pointer-to-member-type> ::= <pointer-cvr-qualifiers> <cvr-qualifiers>
	// <class name> <type>
	void MicrosoftCXXNameMangler::mangleType(const MemberPointerType *T, Qualifiers Quals,
	SourceRange Range) {
	QualType PointeeType = T->getPointeeType();
	manglePointerCVQualifiers(Quals);
	manglePointerExtQualifiers(Quals, PointeeType);
	if (const FunctionProtoType *FPT = PointeeType->getAs<FunctionProtoType>()) {
	Out << '8';
	mangleName(T->getClass()->castAs<RecordType>()->getDecl());
	mangleFunctionType(FPT, nullptr, true);
	} else {
	mangleQualifiers(PointeeType.getQualifiers(), true);
	mangleName(T->getClass()->castAs<RecordType>()->getDecl());
	mangleType(PointeeType, Range, QMM_Drop);
	}
	}

	void MicrosoftCXXNameMangler::mangleType(const TemplateTypeParmType *T,
	Qualifiers, SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this template type parameter type yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(const SubstTemplateTypeParmPackType *T,
	Qualifiers, SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this substituted parameter pack yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	// <type> ::= <pointer-type>
	// <pointer-type> ::= E? <pointer-cvr-qualifiers> <cvr-qualifiers> <type>
	// # the E is required for 64-bit non-static pointers
	void MicrosoftCXXNameMangler::mangleType(const PointerType *T, Qualifiers Quals,
	SourceRange Range) {
	QualType PointeeType = T->getPointeeType();
	manglePointerCVQualifiers(Quals);
	manglePointerExtQualifiers(Quals, PointeeType);
	mangleType(PointeeType, Range);
	}

	void MicrosoftCXXNameMangler::mangleType(const ObjCObjectPointerType *T,
	Qualifiers Quals, SourceRange Range) {
	// 'id' and 'Class' are written as their pointee with qualifiers dropped
	// and no pointer codes; every other object pointer is written like an
	// ordinary pointer.
	if (T->isObjCIdType() \|\| T->isObjCClassType()) {
	mangleType(T->getPointeeType(), Range, QMM_Drop);
	return;
	}

	manglePointerCVQualifiers(Quals);
	manglePointerExtQualifiers(Quals, T->getPointeeType());
	mangleType(T->getPointeeType(), Range);
	}

	// <type> ::= <reference-type>
	// <reference-type> ::= A E? <cvr-qualifiers> <type>
	// # the E is required for 64-bit non-static lvalue references
	void MicrosoftCXXNameMangler::mangleType(const LValueReferenceType *T,
	Qualifiers Quals, SourceRange Range) {
	QualType PointeeType = T->getPointeeType();
	assert(!Quals.hasConst() && !Quals.hasVolatile() && "unexpected qualifier!");
	Out << 'A';
	manglePointerExtQualifiers(Quals, PointeeType);
	mangleType(PointeeType, Range);
	}

	// <type> ::= <r-value-reference-type>
	// <r-value-reference-type> ::= $$Q E? <cvr-qualifiers> <type>
	// # the E is required for 64-bit non-static rvalue references
	void MicrosoftCXXNameMangler::mangleType(const RValueReferenceType *T,
	Qualifiers Quals, SourceRange Range) {
	QualType PointeeType = T->getPointeeType();
	assert(!Quals.hasConst() && !Quals.hasVolatile() && "unexpected qualifier!");
	Out << "$$Q";
	manglePointerExtQualifiers(Quals, PointeeType);
	mangleType(PointeeType, Range);
	}

	void MicrosoftCXXNameMangler::mangleType(const ComplexType *T, Qualifiers,
	SourceRange Range) {
	QualType ElementType = T->getElementType();

	llvm::SmallString<64> TemplateMangling;
	llvm::raw_svector_ostream Stream(TemplateMangling);
	MicrosoftCXXNameMangler Extra(Context, Stream);
	Stream << "?$";
	Extra.mangleSourceName("_Complex");
	Extra.mangleType(ElementType, Range, QMM_Escape);

	mangleArtificalTagType(TTK_Struct, TemplateMangling, {"__clang"});
	}

	// Mangles a vector type. On x86/x86-64, vectors matching the intrinsic
	// header typedefs are written as artificial __m64 / __m<Width>[i|d] tags;
	// every other vector is written as an artificial "__vector" template-style
	// union in namespace "__clang".
	void MicrosoftCXXNameMangler::mangleType(const VectorType *T, Qualifiers Quals,
	SourceRange Range) {
	const BuiltinType *ET = T->getElementType()->getAs<BuiltinType>();
	assert(ET && "vectors with non-builtin elements are unsupported");
	uint64_t Width = getASTContext().getTypeSize(T);
	// Pattern match exactly the typedefs in our intrinsic headers. Anything that
	// doesn't match the Intel types uses a custom mangling below.
	size_t OutSizeBefore = Out.tell();
	llvm::Triple::ArchType AT =
	getASTContext().getTargetInfo().getTriple().getArch();
	if (AT == llvm::Triple::x86 \|\| AT == llvm::Triple::x86_64) {
	if (Width == 64 && ET->getKind() == BuiltinType::LongLong) {
	mangleArtificalTagType(TTK_Union, "__m64");
	} else if (Width >= 128) {
	if (ET->getKind() == BuiltinType::Float)
	mangleArtificalTagType(TTK_Union, "__m" + llvm::utostr(Width));
	else if (ET->getKind() == BuiltinType::LongLong)
	mangleArtificalTagType(TTK_Union, "__m" + llvm::utostr(Width) + 'i');
	else if (ET->getKind() == BuiltinType::Double)
	mangleArtificalTagType(TTK_Struct, "__m" + llvm::utostr(Width) + 'd');
	}
	}

	// If nothing was written above, no intrinsic typedef matched.
	bool IsBuiltin = Out.tell() != OutSizeBefore;
	if (!IsBuiltin) {
	// The MS ABI doesn't have a special mangling for vector types, so we define
	// our own mangling to handle uses of __vector_size__ on user-specified
	// types, and for extensions like __v4sf.

	llvm::SmallString<64> TemplateMangling;
	llvm::raw_svector_ostream Stream(TemplateMangling);
	MicrosoftCXXNameMangler Extra(Context, Stream);
	Stream << "?$";
	Extra.mangleSourceName("__vector");
	Extra.mangleType(QualType(ET, 0), Range, QMM_Escape);
	Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(T->getNumElements()),
	/*IsBoolean=*/false);

	mangleArtificalTagType(TTK_Union, TemplateMangling, {"__clang"});
	}
	}

	// Extended vectors are mangled exactly like ordinary vectors; the cast
	// selects the VectorType overload.
	void MicrosoftCXXNameMangler::mangleType(const ExtVectorType *T,
	Qualifiers Quals, SourceRange Range) {
	mangleType(static_cast<const VectorType *>(T), Quals, Range);
	}
	void MicrosoftCXXNameMangler::mangleType(const DependentSizedExtVectorType *T,
	Qualifiers, SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this dependent-sized extended vector type yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(const DependentAddressSpaceType *T,
	Qualifiers, SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(
	DiagnosticsEngine::Error,
	"cannot mangle this dependent address space type yet");
	Diags.Report(Range.getBegin(), DiagID) << Range;
	}

	// Writes an Objective-C interface as a struct tag followed by the name of
	// the interface declaration.
	void MicrosoftCXXNameMangler::mangleType(const ObjCInterfaceType *T, Qualifiers,
	SourceRange) {
	// ObjC interfaces have structs underlying them.
	mangleTagTypeKind(TTK_Struct);
	mangleName(T->getDecl());
	}

	// Writes only the base type of the Objective-C object type, with
	// qualifiers dropped.
	void MicrosoftCXXNameMangler::mangleType(const ObjCObjectType *T, Qualifiers,
	SourceRange Range) {
	// We don't allow overloading by different protocol qualification,
	// so mangling them isn't necessary.
	mangleType(T->getBaseType(), Range, QMM_Drop);
	}

	void MicrosoftCXXNameMangler::mangleType(const BlockPointerType *T,
	Qualifiers Quals, SourceRange Range) {
	QualType PointeeType = T->getPointeeType();
	manglePointerCVQualifiers(Quals);
	manglePointerExtQualifiers(Quals, PointeeType);

	Out << "_E";

	mangleFunctionType(PointeeType->castAs<FunctionProtoType>());
	}

	// Injected class name types are never expected to reach the mangler.
	void MicrosoftCXXNameMangler::mangleType(const InjectedClassNameType *,
	Qualifiers, SourceRange) {
	llvm_unreachable("Cannot mangle injected class name type.");
	}

	void MicrosoftCXXNameMangler::mangleType(const TemplateSpecializationType *T,
	Qualifiers, SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this template specialization type yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(const DependentNameType *T, Qualifiers,
	SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this dependent name type yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(
	const DependentTemplateSpecializationType *T, Qualifiers,
	SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this dependent template specialization type yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(const PackExpansionType *T, Qualifiers,
	SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this pack expansion yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(const TypeOfType *T, Qualifiers,
	SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this typeof(type) yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(const TypeOfExprType *T, Qualifiers,
	SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this typeof(expression) yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(const DecltypeType *T, Qualifiers,
	SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this decltype() yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(const UnaryTransformType *T,
	Qualifiers, SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this unary transform type yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(const AutoType *T, Qualifiers,
	SourceRange Range) {
	assert(T->getDeducedType().isNull() && "expecting a dependent type!");

	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this 'auto' type yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(
	const DeducedTemplateSpecializationType *T, Qualifiers, SourceRange Range) {
	assert(T->getDeducedType().isNull() && "expecting a dependent type!");

	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this deduced class template specialization type yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	void MicrosoftCXXNameMangler::mangleType(const AtomicType *T, Qualifiers,
	SourceRange Range) {
	QualType ValueType = T->getValueType();

	llvm::SmallString<64> TemplateMangling;
	llvm::raw_svector_ostream Stream(TemplateMangling);
	MicrosoftCXXNameMangler Extra(Context, Stream);
	Stream << "?$";
	Extra.mangleSourceName("_Atomic");
	Extra.mangleType(ValueType, Range, QMM_Escape);

	mangleArtificalTagType(TTK_Struct, TemplateMangling, {"__clang"});
	}

	void MicrosoftCXXNameMangler::mangleType(const PipeType *T, Qualifiers,
	SourceRange Range) {
	DiagnosticsEngine &Diags = Context.getDiags();
	unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	"cannot mangle this OpenCL pipe type yet");
	Diags.Report(Range.getBegin(), DiagID)
	<< Range;
	}

	// Mangles the function or variable declaration D and writes the result to
	// Out. Constructors and destructors must not be passed here; both
	// preconditions are enforced by the assertions below.
	void MicrosoftMangleContextImpl::mangleCXXName(const NamedDecl *D,
	                                               raw_ostream &Out) {
	  assert((isa<FunctionDecl>(D) || isa<VarDecl>(D)) &&
	         "Invalid mangleName() call, argument is not a variable or function!");
	  assert(!isa<CXXConstructorDecl>(D) && !isa<CXXDestructorDecl>(D) &&
	         "Invalid mangleName() call on 'structor decl!");

	  // Registers D with the pretty-stack-trace machinery for the duration of
	  // the mangling.
	  PrettyStackTraceDecl CrashInfo(D, SourceLocation(),
	                                 getASTContext().getSourceManager(),
	                                 "Mangling declaration");

	  // The mangler writes through a hashing stream layered on top of Out.
	  msvc_hashing_ostream MHO(Out);
	  MicrosoftCXXNameMangler Mangler(*this, MHO);
	  return Mangler.mangle(D);
	}

	// <this-adjustment> ::= <no-adjustment> | <static-adjustment> |
	// <virtual-adjustment>
	// <no-adjustment> ::= A # private near
	// ::= B # private far
	// ::= I # protected near
	// ::= J # protected far
	// ::= Q # public near
	// ::= R # public far
	// <static-adjustment> ::= G <static-offset> # private near
	// ::= H <static-offset> # private far
	// ::= O <static-offset> # protected near
	// ::= P <static-offset> # protected far
	// ::= W <static-offset> # public near
	// ::= X <static-offset> # public far
	// <virtual-adjustment> ::= $0 <virtual-shift> <static-offset> # private near
	// ::= $1 <virtual-shift> <static-offset> # private far
	// ::= $2 <virtual-shift> <static-offset> # protected near
	// ::= $3 <virtual-shift> <static-offset> # protected far
	// ::= $4 <virtual-shift> <static-offset> # public near
	// ::= $5 <virtual-shift> <static-offset> # public far
	// <virtual-shift> ::= <vtordisp-shift> | <vtordispex-shift>
	// <vtordisp-shift> ::= <offset-to-vtordisp>
	// <vtordispex-shift> ::= <offset-to-vbptr> <vbase-offset-offset>
	// <offset-to-vtordisp>
	// Writes the this-adjustment part of a thunk's mangled name.
	//
	// MD         - method whose access specifier selects the code letter/digit.
	// Adjustment - the this-pointer adjustment being encoded.
	// Mangler    - used to emit the numeric offsets via mangleNumber().
	// Out        - receives the access/adjustment code characters.
	//
	// Three shapes are produced, depending on Adjustment:
	//   * virtual part non-empty: '$' [ 'R' ] <0|2|4> followed by offsets;
	//   * only a non-zero non-virtual offset: <G|O|W> followed by the offset;
	//   * no adjustment at all: <A|I|Q>.
	static void mangleThunkThisAdjustment(const CXXMethodDecl *MD,
	const ThisAdjustment &Adjustment,
	MicrosoftCXXNameMangler &Mangler,
	raw_ostream &Out) {
	if (!Adjustment.Virtual.isEmpty()) {
	Out << '$';
	// Digit encoding the access specifier for the virtual-adjustment forms.
	char AccessSpec;
	switch (MD->getAccess()) {
	case AS_none:
	llvm_unreachable("Unsupported access specifier");
	case AS_private:
	AccessSpec = '0';
	break;
	case AS_protected:
	AccessSpec = '2';
	break;
	case AS_public:
	AccessSpec = '4';
	}
	if (Adjustment.Virtual.Microsoft.VBPtrOffset) {
	// Form with a vbptr offset: 'R', the access digit, then four numbers
	// (vbptr offset, vbase-offset offset, vtordisp offset, non-virtual offset).
	Out << 'R' << AccessSpec;
	Mangler.mangleNumber(
	static_cast<uint32_t>(Adjustment.Virtual.Microsoft.VBPtrOffset));
	Mangler.mangleNumber(
	static_cast<uint32_t>(Adjustment.Virtual.Microsoft.VBOffsetOffset));
	Mangler.mangleNumber(
	static_cast<uint32_t>(Adjustment.Virtual.Microsoft.VtordispOffset));
	Mangler.mangleNumber(static_cast<uint32_t>(Adjustment.NonVirtual));
	} else {
	// Form without a vbptr offset: the access digit, the vtordisp offset, and
	// the negated non-virtual offset (note the sign differs from the branch
	// above).
	Out << AccessSpec;
	Mangler.mangleNumber(
	static_cast<uint32_t>(Adjustment.Virtual.Microsoft.VtordispOffset));
	Mangler.mangleNumber(-static_cast<uint32_t>(Adjustment.NonVirtual));
	}
	} else if (Adjustment.NonVirtual != 0) {
	// Static adjustment only: access letter followed by the negated offset.
	switch (MD->getAccess()) {
	case AS_none:
	llvm_unreachable("Unsupported access specifier");
	case AS_private:
	Out << 'G';
	break;
	case AS_protected:
	Out << 'O';
	break;
	case AS_public:
	Out << 'W';
	}
	Mangler.mangleNumber(-static_cast<uint32_t>(Adjustment.NonVirtual));
	} else {
	// No adjustment: a single access letter.
	switch (MD->getAccess()) {
	case AS_none:
	llvm_unreachable("Unsupported access specifier");
	case AS_private:
	Out << 'A';
	break;
	case AS_protected:
	Out << 'I';
	break;
	case AS_public:
	Out << 'Q';
	}
	}
	}

	void
	MicrosoftMangleContextImpl::mangleVirtualMemPtrThunk(const CXXMethodDecl *MD,
	raw_ostream &Out) {
	MicrosoftVTableContext *VTContext =
	cast<MicrosoftVTableContext>(getASTContext().getVTableContext());
	const MicrosoftVTableContext::MethodVFTableLocation &ML =
	VTContext->getMethodVFTableLocation(GlobalDecl(MD));

	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);
	Mangler.getStream() << "\01?";
	Mangler.mangleVirtualMemPtrThunk(MD, ML);
	}

	void MicrosoftMangleContextImpl::mangleThunk(const CXXMethodDecl *MD,
	const ThunkInfo &Thunk,
	raw_ostream &Out) {
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);
	Mangler.getStream() << "\01?";
	Mangler.mangleName(MD);
	mangleThunkThisAdjustment(MD, Thunk.This, Mangler, MHO);
	if (!Thunk.Return.isEmpty())
	assert(Thunk.Method != nullptr &&
	"Thunk info should hold the overridee decl");

	const CXXMethodDecl *DeclForFPT = Thunk.Method ? Thunk.Method : MD;
	Mangler.mangleFunctionType(
	DeclForFPT->getType()->castAs<FunctionProtoType>(), MD);
	}

	void MicrosoftMangleContextImpl::mangleCXXDtorThunk(
	const CXXDestructorDecl *DD, CXXDtorType Type,
	const ThisAdjustment &Adjustment, raw_ostream &Out) {
	// FIXME: Actually, the dtor thunk should be emitted for vector deleting
	// dtors rather than scalar deleting dtors. Just use the vector deleting dtor
	// mangling manually until we support both deleting dtor types.
	assert(Type == Dtor_Deleting);
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO, DD, Type);
	Mangler.getStream() << "\01??_E";
	Mangler.mangleName(DD->getParent());
	mangleThunkThisAdjustment(DD, Adjustment, Mangler, MHO);
	Mangler.mangleFunctionType(DD->getType()->castAs<FunctionProtoType>(), DD);
	}

	void MicrosoftMangleContextImpl::mangleCXXVFTable(
	const CXXRecordDecl Derived, ArrayRef<const CXXRecordDecl > BasePath,
	raw_ostream &Out) {
	// <mangled-name> ::= ?_7 <class-name> <storage-class>
	// <cvr-qualifiers> [<name>] @
	// NOTE: <cvr-qualifiers> here is always 'B' (const). <storage-class>
	// is always '6' for vftables.
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);
	if (Derived->hasAttr<DLLImportAttr>())
	Mangler.getStream() << "\01??_S";
	else
	Mangler.getStream() << "\01??_7";
	Mangler.mangleName(Derived);
	Mangler.getStream() << "6B"; // '6' for vftable, 'B' for const.
	for (const CXXRecordDecl *RD : BasePath)
	Mangler.mangleName(RD);
	Mangler.getStream() << '@';
	}

	// Writes the mangled name of the vbtable of Derived to Out. BasePath lists
	// the classes whose names are appended after the "7B" marker.
	void MicrosoftMangleContextImpl::mangleCXXVBTable(
	    const CXXRecordDecl *Derived, ArrayRef<const CXXRecordDecl *> BasePath,
	    raw_ostream &Out) {
	  // <mangled-name> ::= ?_8 <class-name> <storage-class>
	  //                    <cvr-qualifiers> [<name>] @
	  // NOTE: <cvr-qualifiers> here is always 'B' (const). <storage-class>
	  // is always '7' for vbtables.
	  msvc_hashing_ostream MHO(Out);
	  MicrosoftCXXNameMangler Mangler(*this, MHO);
	  Mangler.getStream() << "\01??_8";
	  Mangler.mangleName(Derived);
	  Mangler.getStream() << "7B"; // '7' for vbtable, 'B' for const.
	  for (const CXXRecordDecl *RD : BasePath)
	    Mangler.mangleName(RD);
	  Mangler.getStream() << '@';
	}

	void MicrosoftMangleContextImpl::mangleCXXRTTI(QualType T, raw_ostream &Out) {
	  // Layout: "\01??_R0" <type mangled in result mode> "@8".
	  msvc_hashing_ostream HashedOut(Out);
	  MicrosoftCXXNameMangler Mangler(*this, HashedOut);
	  raw_ostream &OS = Mangler.getStream();

	  OS << "\01??_R0";
	  Mangler.mangleType(T, SourceRange(), MicrosoftCXXNameMangler::QMM_Result);
	  OS << "@8";
	}

	void MicrosoftMangleContextImpl::mangleCXXRTTIName(QualType T,
	                                                   raw_ostream &Out) {
	  // Layout: '.' followed by the type mangled in result mode. Unlike most
	  // siblings, this writes to Out directly rather than via a hashing stream.
	  MicrosoftCXXNameMangler Mangler(*this, Out);
	  raw_ostream &OS = Mangler.getStream();
	  OS << '.';
	  Mangler.mangleType(T, SourceRange(), MicrosoftCXXNameMangler::QMM_Result);
	}

	// Writes the mangled name of the virtual displacement map from SrcRD to
	// DstRD: "\01??_K" <SrcRD name> "$C" <DstRD name>.
	void MicrosoftMangleContextImpl::mangleCXXVirtualDisplacementMap(
	    const CXXRecordDecl *SrcRD, const CXXRecordDecl *DstRD, raw_ostream &Out) {
	  msvc_hashing_ostream MHO(Out);
	  MicrosoftCXXNameMangler Mangler(*this, MHO);
	  Mangler.getStream() << "\01??_K";
	  Mangler.mangleName(SrcRD);
	  Mangler.getStream() << "$C";
	  Mangler.mangleName(DstRD);
	}

	void MicrosoftMangleContextImpl::mangleCXXThrowInfo(QualType T, bool IsConst,
	                                                    bool IsVolatile,
	                                                    bool IsUnaligned,
	                                                    uint32_t NumEntries,
	                                                    raw_ostream &Out) {
	  // Layout: "_TI" [C] [V] [U] <NumEntries> <type mangled in result mode>.
	  msvc_hashing_ostream HashedOut(Out);
	  MicrosoftCXXNameMangler Mangler(*this, HashedOut);
	  raw_ostream &OS = Mangler.getStream();

	  OS << "_TI";
	  // One optional flag letter per qualifier, always in C, V, U order.
	  if (IsConst)
	    OS << 'C';
	  if (IsVolatile)
	    OS << 'V';
	  if (IsUnaligned)
	    OS << 'U';
	  OS << NumEntries;
	  Mangler.mangleType(T, SourceRange(), MicrosoftCXXNameMangler::QMM_Result);
	}

	void MicrosoftMangleContextImpl::mangleCXXCatchableTypeArray(
	    QualType T, uint32_t NumEntries, raw_ostream &Out) {
	  // Layout: "_CTA" <NumEntries> <type mangled in result mode>.
	  msvc_hashing_ostream HashedOut(Out);
	  MicrosoftCXXNameMangler Mangler(*this, HashedOut);
	  raw_ostream &OS = Mangler.getStream();

	  OS << "_CTA";
	  OS << NumEntries;
	  Mangler.mangleType(T, SourceRange(), MicrosoftCXXNameMangler::QMM_Result);
	}

	void MicrosoftMangleContextImpl::mangleCXXCatchableType(
	    QualType T, const CXXConstructorDecl *CD, CXXCtorType CT, uint32_t Size,
	    uint32_t NVOffset, int32_t VBPtrOffset, uint32_t VBIndex,
	    raw_ostream &Out) {
	  // Layout: "_CT" <RTTI name minus first char> <copy-ctor name minus first
	  // char, possibly empty> <Size> [<NVOffset> [<VBPtrOffset> <VBIndex>]].
	  MicrosoftCXXNameMangler Mangler(*this, Out);
	  raw_ostream &OS = Mangler.getStream();
	  OS << "_CT";

	  // Mangle the RTTI descriptor name into a buffer. The streams live in
	  // their own scope so they are destroyed before the buffer is read.
	  llvm::SmallString<64> RTTIName;
	  {
	    llvm::raw_svector_ostream BufferOS(RTTIName);
	    msvc_hashing_ostream HashedBuffer(BufferOS);
	    mangleCXXRTTI(T, HashedBuffer);
	  }
	  OS << RTTIName.substr(1);

	  // VS2015 CTP6 omits the copy-constructor in the mangled name. This name is,
	  // in fact, superfluous but I'm not sure the change was made consciously.
	  llvm::SmallString<64> CopyCtorName;
	  if (!getASTContext().getLangOpts().isCompatibleWithMSVC(
	          LangOptions::MSVC2015) &&
	      CD) {
	    llvm::raw_svector_ostream BufferOS(CopyCtorName);
	    msvc_hashing_ostream HashedBuffer(BufferOS);
	    mangleCXXCtor(CD, CT, HashedBuffer);
	  }
	  OS << CopyCtorName.substr(1);

	  OS << Size;

	  // A VBPtrOffset of -1 selects the short form, in which NVOffset is
	  // written only when non-zero and the virtual-base fields are omitted.
	  const bool HasVBPtr = VBPtrOffset != -1;
	  if (HasVBPtr || NVOffset)
	    OS << NVOffset;
	  if (HasVBPtr) {
	    OS << VBPtrOffset;
	    OS << VBIndex;
	  }
	}

	void MicrosoftMangleContextImpl::mangleCXXRTTIBaseClassDescriptor(
	const CXXRecordDecl *Derived, uint32_t NVOffset, int32_t VBPtrOffset,
	uint32_t VBTableOffset, uint32_t Flags, raw_ostream &Out) {
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);
	Mangler.getStream() << "\01??_R1";
	Mangler.mangleNumber(NVOffset);
	Mangler.mangleNumber(VBPtrOffset);
	Mangler.mangleNumber(VBTableOffset);
	Mangler.mangleNumber(Flags);
	Mangler.mangleName(Derived);
	Mangler.getStream() << "8";
	}

	void MicrosoftMangleContextImpl::mangleCXXRTTIBaseClassArray(
	const CXXRecordDecl *Derived, raw_ostream &Out) {
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);
	Mangler.getStream() << "\01??_R2";
	Mangler.mangleName(Derived);
	Mangler.getStream() << "8";
	}

	void MicrosoftMangleContextImpl::mangleCXXRTTIClassHierarchyDescriptor(
	const CXXRecordDecl *Derived, raw_ostream &Out) {
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);
	Mangler.getStream() << "\01??_R3";
	Mangler.mangleName(Derived);
	Mangler.getStream() << "8";
	}

	// Writes the mangled name of the RTTI complete object locator for the
	// vftable identified by Derived and BasePath. The name is derived from the
	// vftable's own mangled name rather than being built from scratch.
	void MicrosoftMangleContextImpl::mangleCXXRTTICompleteObjectLocator(
	    const CXXRecordDecl *Derived, ArrayRef<const CXXRecordDecl *> BasePath,
	    raw_ostream &Out) {
	  // <mangled-name> ::= ?_R4 <class-name> <storage-class>
	  //                    <cvr-qualifiers> [<name>] @
	  // NOTE: <cvr-qualifiers> here is always 'B' (const). <storage-class>
	  // is always '6' for vftables.
	  llvm::SmallString<64> VFTableMangling;
	  llvm::raw_svector_ostream Stream(VFTableMangling);
	  mangleCXXVFTable(Derived, BasePath, Stream);

	  // A vftable name starting with "\01??@" gets "??_R4@" appended instead of
	  // having its prefix rewritten.
	  if (VFTableMangling.startswith("\01??@")) {
	    assert(VFTableMangling.endswith("@"));
	    Out << VFTableMangling << "??_R4@";
	    return;
	  }

	  assert(VFTableMangling.startswith("\01??_7") ||
	         VFTableMangling.startswith("\01??_S"));

	  // Swap the five-character vftable prefix for the locator prefix.
	  Out << "\01??_R4" << StringRef(VFTableMangling).drop_front(5);
	}

	void MicrosoftMangleContextImpl::mangleSEHFilterExpression(
	const NamedDecl *EnclosingDecl, raw_ostream &Out) {
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);
	// The function body is in the same comdat as the function with the handler,
	// so the numbering here doesn't have to be the same across TUs.
	//
	// <mangled-name> ::= ?filt$ <filter-number> @0
	Mangler.getStream() << "\01?filt$" << SEHFilterIds[EnclosingDecl]++ << "@0@";
	Mangler.mangleName(EnclosingDecl);
	}

	void MicrosoftMangleContextImpl::mangleSEHFinallyBlock(
	const NamedDecl *EnclosingDecl, raw_ostream &Out) {
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);
	// The function body is in the same comdat as the function with the handler,
	// so the numbering here doesn't have to be the same across TUs.
	//
	// <mangled-name> ::= ?fin$ <filter-number> @0
	Mangler.getStream() << "\01?fin$" << SEHFinallyIds[EnclosingDecl]++ << "@0@";
	Mangler.mangleName(EnclosingDecl);
	}

	void MicrosoftMangleContextImpl::mangleTypeName(QualType T, raw_ostream &Out) {
	  // This is just a made up unique string for the purposes of tbaa. undname
	  // does not know how to demangle it.
	  //
	  // Layout: '?' followed by the mangled type, written straight to Out.
	  MicrosoftCXXNameMangler Mangler(*this, Out);
	  raw_ostream &OS = Mangler.getStream();
	  OS << '?';
	  Mangler.mangleType(T, SourceRange());
	}

	void MicrosoftMangleContextImpl::mangleCXXCtor(const CXXConstructorDecl *D,
	CXXCtorType Type,
	raw_ostream &Out) {
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler mangler(*this, MHO, D, Type);
	mangler.mangle(D);
	}

	void MicrosoftMangleContextImpl::mangleCXXDtor(const CXXDestructorDecl *D,
	CXXDtorType Type,
	raw_ostream &Out) {
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler mangler(*this, MHO, D, Type);
	mangler.mangle(D);
	}

	void MicrosoftMangleContextImpl::mangleReferenceTemporary(
	const VarDecl *VD, unsigned ManglingNumber, raw_ostream &Out) {
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);

	Mangler.getStream() << "\01?$RT" << ManglingNumber << '@';
	Mangler.mangle(VD, "");
	}

	void MicrosoftMangleContextImpl::mangleThreadSafeStaticGuardVariable(
	const VarDecl *VD, unsigned GuardNum, raw_ostream &Out) {
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);

	Mangler.getStream() << "\01?$TSS" << GuardNum << '@';
	Mangler.mangleNestedName(VD);
	Mangler.getStream() << "@4HA";
	}

	void MicrosoftMangleContextImpl::mangleStaticGuardVariable(const VarDecl *VD,
	raw_ostream &Out) {
	// <guard-name> ::= ?_B <postfix> @5 <scope-depth>
	// ::= ?__J <postfix> @5 <scope-depth>
	// ::= ?$S <guard-num> @ <postfix> @4IA

	// The first mangling is what MSVC uses to guard static locals in inline
	// functions. It uses a different mangling in external functions to support
	// guarding more than 32 variables. MSVC rejects inline functions with more
	// than 32 static locals. We don't fully implement the second mangling
	// because those guards are not externally visible, and instead use LLVM's
	// default renaming when creating a new guard variable.
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);

	bool Visible = VD->isExternallyVisible();
	if (Visible) {
	Mangler.getStream() << (VD->getTLSKind() ? "\01??__J" : "\01??_B");
	} else {
	Mangler.getStream() << "\01?$S1@";
	}
	unsigned ScopeDepth = 0;
	if (Visible && !getNextDiscriminator(VD, ScopeDepth))
	// If we do not have a discriminator and are emitting a guard variable for
	// use at global scope, then mangling the nested name will not be enough to
	// remove ambiguities.
	Mangler.mangle(VD, "");
	else
	Mangler.mangleNestedName(VD);
	Mangler.getStream() << (Visible ? "@5" : "@4IA");
	if (ScopeDepth)
	Mangler.mangleNumber(ScopeDepth);
	}

	void MicrosoftMangleContextImpl::mangleInitFiniStub(const VarDecl *D,
	char CharCode,
	raw_ostream &Out) {
	msvc_hashing_ostream MHO(Out);
	MicrosoftCXXNameMangler Mangler(*this, MHO);
	Mangler.getStream() << "\01??__" << CharCode;
	Mangler.mangleName(D);
	if (D->isStaticDataMember()) {
	Mangler.mangleVariableEncoding(D);
	Mangler.getStream() << '@';
	}
	// This is the function class mangling. These stubs are global, non-variadic,
	// cdecl functions that return void and take no args.
	Mangler.getStream() << "YAXXZ";
	}

	void MicrosoftMangleContextImpl::mangleDynamicInitializer(const VarDecl *D,
	                                                          raw_ostream &Out) {
	  // <initializer-name> ::= ?__E <name> YAXXZ
	  const char InitializerCode = 'E';
	  mangleInitFiniStub(D, InitializerCode, Out);
	}

	void
	MicrosoftMangleContextImpl::mangleDynamicAtExitDestructor(const VarDecl *D,
	                                                          raw_ostream &Out) {
	  // <destructor-name> ::= ?__F <name> YAXXZ
	  const char AtExitDestructorCode = 'F';
	  mangleInitFiniStub(D, AtExitDestructorCode, Out);
	}

	// Writes the mangled name of string literal SL to Out. The name encodes the
	// literal's kind (narrow/wide), its byte length including the terminator, a
	// CRC over all of its bytes, and up to 32 of its leading characters.
	void MicrosoftMangleContextImpl::mangleStringLiteral(const StringLiteral *SL,
	                                                     raw_ostream &Out) {
	  // <char-type> ::= 0   # char
	  //             ::= 1   # wchar_t
	  //             ::= ??? # char16_t/char32_t will need a mangling too...
	  //
	  // <literal-length> ::= <non-negative integer>  # the length of the literal
	  //
	  // <encoded-crc>    ::= <hex digit>+ @          # crc of the literal including
	  //                                              # null-terminator
	  //
	  // <encoded-string> ::= <simple character>           # uninteresting character
	  //                  ::= '?$' <hex digit> <hex digit> # these two nibbles
	  //                                                   # encode the byte for the
	  //                                                   # character
	  //                  ::= '?' [a-z]                    # \xe1 - \xfa
	  //                  ::= '?' [A-Z]                    # \xc1 - \xda
	  //                  ::= '?' [0-9]                    # [,/\:. \n\t'-]
	  //
	  // <literal> ::= '??_C@_' <char-type> <literal-length> <encoded-crc>
	  //               <encoded-string> '@'
	  MicrosoftCXXNameMangler Mangler(*this, Out);
	  Mangler.getStream() << "\01??_C@_";

	  // <char-type>: The "kind" of string literal is encoded into the mangled name.
	  if (SL->isWide())
	    Mangler.getStream() << '1';
	  else
	    Mangler.getStream() << '0';

	  // <literal-length>: The next part of the mangled name consists of the length
	  // of the string.
	  // The StringLiteral does not consider the NUL terminator byte(s) but the
	  // mangling does.
	  // N.B. The length is in terms of bytes, not characters.
	  Mangler.mangleNumber(SL->getByteLength() + SL->getCharByteWidth());

	  // Returns byte number Index of the literal, with the bytes of each code
	  // unit taken least-significant first.
	  auto GetLittleEndianByte = [&SL](unsigned Index) {
	    unsigned CharByteWidth = SL->getCharByteWidth();
	    uint32_t CodeUnit = SL->getCodeUnit(Index / CharByteWidth);
	    unsigned OffsetInCodeUnit = Index % CharByteWidth;
	    return static_cast<char>((CodeUnit >> (8 * OffsetInCodeUnit)) & 0xff);
	  };

	  // Returns byte number Index of the literal, with the bytes of each code
	  // unit taken most-significant first.
	  auto GetBigEndianByte = [&SL](unsigned Index) {
	    unsigned CharByteWidth = SL->getCharByteWidth();
	    uint32_t CodeUnit = SL->getCodeUnit(Index / CharByteWidth);
	    unsigned OffsetInCodeUnit = (CharByteWidth - 1) - (Index % CharByteWidth);
	    return static_cast<char>((CodeUnit >> (8 * OffsetInCodeUnit)) & 0xff);
	  };

	  // CRC all the bytes of the StringLiteral.
	  llvm::JamCRC JC;
	  for (unsigned I = 0, E = SL->getByteLength(); I != E; ++I)
	    JC.update(GetLittleEndianByte(I));

	  // The NUL terminator byte(s) were not present earlier,
	  // we need to manually process those bytes into the CRC.
	  for (unsigned NullTerminator = 0; NullTerminator < SL->getCharByteWidth();
	       ++NullTerminator)
	    JC.update('\x00');

	  // <encoded-crc>: The CRC is encoded utilizing the standard number mangling
	  // scheme.
	  Mangler.mangleNumber(JC.getCRC());

	  // <encoded-string>: The mangled name also contains the first 32 _characters_
	  // (including null-terminator bytes) of the StringLiteral.
	  // Each character is encoded by splitting them into bytes and then encoding
	  // the constituent bytes.
	  auto MangleByte = [&Mangler](char Byte) {
	    // There are five different manglings for characters:
	    // - [a-zA-Z0-9_$]: A one-to-one mapping.
	    // - ?[a-z]: The range from \xe1 to \xfa.
	    // - ?[A-Z]: The range from \xc1 to \xda.
	    // - ?[0-9]: The set of [,/\:. \n\t'-].
	    // - ?$XX: A fallback which maps nibbles.
	    if (isIdentifierBody(Byte, /*AllowDollar=*/true)) {
	      Mangler.getStream() << Byte;
	    } else if (isLetter(Byte & 0x7f)) {
	      Mangler.getStream() << '?' << static_cast<char>(Byte & 0x7f);
	    } else {
	      // The position of Byte in this table is the digit that follows '?'.
	      const char SpecialChars[] = {',', '/',  '\\', ':',  '.',
	                                   ' ', '\n', '\t', '\'', '-'};
	      const char *Pos =
	          std::find(std::begin(SpecialChars), std::end(SpecialChars), Byte);
	      if (Pos != std::end(SpecialChars)) {
	        Mangler.getStream() << '?' << (Pos - std::begin(SpecialChars));
	      } else {
	        // Fallback: one letter per nibble, starting from 'A' for 0.
	        Mangler.getStream() << "?$";
	        Mangler.getStream() << static_cast<char>('A' + ((Byte >> 4) & 0xf));
	        Mangler.getStream() << static_cast<char>('A' + (Byte & 0xf));
	      }
	    }
	  };

	  // Enforce our 32 character max.
	  unsigned NumCharsToMangle = std::min(32U, SL->getLength());
	  // Wide literals are emitted most-significant byte first; everything else
	  // least-significant byte first.
	  for (unsigned I = 0, E = NumCharsToMangle * SL->getCharByteWidth(); I != E;
	       ++I)
	    if (SL->isWide())
	      MangleByte(GetBigEndianByte(I));
	    else
	      MangleByte(GetLittleEndianByte(I));

	  // Encode the NUL terminator if there is room.
	  if (NumCharsToMangle < 32)
	    for (unsigned NullTerminator = 0; NullTerminator < SL->getCharByteWidth();
	         ++NullTerminator)
	      MangleByte(0);

	  Mangler.getStream() << '@';
	}

	MicrosoftMangleContext *
	MicrosoftMangleContext::create(ASTContext &Context, DiagnosticsEngine &Diags) {
	  // Factory: heap-allocates the concrete implementation and hands back the
	  // raw pointer as the abstract interface type.
	  MicrosoftMangleContext *Impl = new MicrosoftMangleContextImpl(Context, Diags);
	  return Impl;
	}
	Index: head/contrib/llvm/tools/clang/lib/CodeGen/CodeGenModule.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/CodeGen/CodeGenModule.cpp (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/CodeGen/CodeGenModule.cpp (revision 329410)
	@@ -1,4697 +1,4681 @@
	//===--- CodeGenModule.cpp - Emit LLVM Code from ASTs for a Module --------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This coordinates the per-module state used while generating code.
	//
	//===----------------------------------------------------------------------===//

	#include "CodeGenModule.h"
	#include "CGBlocks.h"
	#include "CGCUDARuntime.h"
	#include "CGCXXABI.h"
	#include "CGCall.h"
	#include "CGDebugInfo.h"
	#include "CGObjCRuntime.h"
	#include "CGOpenCLRuntime.h"
	#include "CGOpenMPRuntime.h"
	#include "CGOpenMPRuntimeNVPTX.h"
	#include "CodeGenFunction.h"
	#include "CodeGenPGO.h"
	#include "ConstantEmitter.h"
	#include "CoverageMappingGen.h"
	#include "TargetInfo.h"
	#include "clang/AST/ASTContext.h"
	#include "clang/AST/CharUnits.h"
	#include "clang/AST/DeclCXX.h"
	#include "clang/AST/DeclObjC.h"
	#include "clang/AST/DeclTemplate.h"
	#include "clang/AST/Mangle.h"
	#include "clang/AST/RecordLayout.h"
	#include "clang/AST/RecursiveASTVisitor.h"
	#include "clang/Basic/Builtins.h"
	#include "clang/Basic/CharInfo.h"
	#include "clang/Basic/Diagnostic.h"
	#include "clang/Basic/Module.h"
	#include "clang/Basic/SourceManager.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Basic/Version.h"
	#include "clang/CodeGen/ConstantInitBuilder.h"
	#include "clang/Frontend/CodeGenOptions.h"
	#include "clang/Sema/SemaDiagnostic.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Module.h"
	#include "llvm/ProfileData/InstrProfReader.h"
	#include "llvm/Support/ConvertUTF.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MD5.h"

	using namespace clang;
	using namespace CodeGen;

	// Hidden boolean command-line option "limited-coverage-experimental"; it
	// defaults to false and may be given zero or more times.
	static llvm::cl::opt<bool> LimitedCoverage(
	"limited-coverage-experimental", llvm::cl::ZeroOrMore, llvm::cl::Hidden,
	llvm::cl::desc("Emit limited coverage mapping information (experimental)"),
	llvm::cl::init(false));

	// Section name constant; presumably the section used for annotation
	// globals -- its uses are outside this part of the file.
	static const char AnnotationSection[] = "llvm.metadata";

	// Picks the C++ ABI code generator matching the target's C++ ABI kind: the
	// Microsoft implementation for TargetCXXABI::Microsoft, the Itanium
	// implementation for every other kind listed below.
	static CGCXXABI *createCXXABI(CodeGenModule &CGM) {
	  const auto ABIKind = CGM.getTarget().getCXXABI().getKind();
	  switch (ABIKind) {
	  case TargetCXXABI::Microsoft:
	    return CreateMicrosoftCXXABI(CGM);

	  case TargetCXXABI::GenericAArch64:
	  case TargetCXXABI::GenericARM:
	  case TargetCXXABI::iOS:
	  case TargetCXXABI::iOS64:
	  case TargetCXXABI::WatchOS:
	  case TargetCXXABI::GenericMIPS:
	  case TargetCXXABI::GenericItanium:
	  case TargetCXXABI::WebAssembly:
	    return CreateItaniumCXXABI(CGM);
	  }

	  llvm_unreachable("invalid C++ ABI kind");
	}

	// Sets up the per-module code generation state: stores the option and
	// context references, fills the cache of commonly used LLVM types, creates
	// the language runtimes selected by the language options, and creates the
	// optional TBAA, debug-info, PGO-reader and coverage-mapping helpers.
	CodeGenModule::CodeGenModule(ASTContext &C, const HeaderSearchOptions &HSO,
	                             const PreprocessorOptions &PPO,
	                             const CodeGenOptions &CGO, llvm::Module &M,
	                             DiagnosticsEngine &diags,
	                             CoverageSourceInfo *CoverageInfo)
	    : Context(C), LangOpts(C.getLangOpts()), HeaderSearchOpts(HSO),
	      PreprocessorOpts(PPO), CodeGenOpts(CGO), TheModule(M), Diags(diags),
	      Target(C.getTargetInfo()), ABI(createCXXABI(*this)),
	      VMContext(M.getContext()), Types(*this), VTables(*this),
	      SanitizerMD(new SanitizerMetadata(*this)) {

	  // Initialize the type cache.
	  llvm::LLVMContext &LLVMContext = M.getContext();
	  VoidTy = llvm::Type::getVoidTy(LLVMContext);
	  Int8Ty = llvm::Type::getInt8Ty(LLVMContext);
	  Int16Ty = llvm::Type::getInt16Ty(LLVMContext);
	  Int32Ty = llvm::Type::getInt32Ty(LLVMContext);
	  Int64Ty = llvm::Type::getInt64Ty(LLVMContext);
	  HalfTy = llvm::Type::getHalfTy(LLVMContext);
	  FloatTy = llvm::Type::getFloatTy(LLVMContext);
	  DoubleTy = llvm::Type::getDoubleTy(LLVMContext);
	  PointerWidthInBits = C.getTargetInfo().getPointerWidth(0);
	  PointerAlignInBytes =
	    C.toCharUnitsFromBits(C.getTargetInfo().getPointerAlign(0)).getQuantity();
	  SizeSizeInBytes =
	    C.toCharUnitsFromBits(C.getTargetInfo().getMaxPointerWidth()).getQuantity();
	  IntAlignInBytes =
	    C.toCharUnitsFromBits(C.getTargetInfo().getIntAlign()).getQuantity();
	  IntTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getIntWidth());
	  IntPtrTy = llvm::IntegerType::get(LLVMContext,
	    C.getTargetInfo().getMaxPointerWidth());
	  Int8PtrTy = Int8Ty->getPointerTo(0);
	  Int8PtrPtrTy = Int8PtrTy->getPointerTo(0);
	  AllocaInt8PtrTy = Int8Ty->getPointerTo(
	      M.getDataLayout().getAllocaAddrSpace());
	  ASTAllocaAddressSpace = getTargetCodeGenInfo().getASTAllocaAddressSpace();

	  RuntimeCC = getTargetCodeGenInfo().getABIInfo().getRuntimeCC();
	  BuiltinCC = getTargetCodeGenInfo().getABIInfo().getBuiltinCC();

	  // Create one runtime helper per enabled language extension.
	  if (LangOpts.ObjC1)
	    createObjCRuntime();
	  if (LangOpts.OpenCL)
	    createOpenCLRuntime();
	  if (LangOpts.OpenMP)
	    createOpenMPRuntime();
	  if (LangOpts.CUDA)
	    createCUDARuntime();

	  // Enable TBAA unless it's suppressed. ThreadSanitizer needs TBAA even at O0.
	  if (LangOpts.Sanitize.has(SanitizerKind::Thread) ||
	      (!CodeGenOpts.RelaxedAliasing && CodeGenOpts.OptimizationLevel > 0))
	    TBAA.reset(new CodeGenTBAA(Context, TheModule, CodeGenOpts, getLangOpts(),
	                               getCXXABI().getMangleContext()));

	  // If debug info or coverage generation is enabled, create the CGDebugInfo
	  // object.
	  if (CodeGenOpts.getDebugInfo() != codegenoptions::NoDebugInfo ||
	      CodeGenOpts.EmitGcovArcs || CodeGenOpts.EmitGcovNotes)
	    DebugInfo.reset(new CGDebugInfo(*this));

	  Block.GlobalUniqueCount = 0;

	  if (C.getLangOpts().ObjC1)
	    ObjCData.reset(new ObjCEntrypoints());

	  // When a clang profile is to be used, open it now; a failure to read it
	  // is reported as an error diagnostic rather than aborting construction.
	  if (CodeGenOpts.hasProfileClangUse()) {
	    auto ReaderOrErr = llvm::IndexedInstrProfReader::create(
	        CodeGenOpts.ProfileInstrumentUsePath);
	    if (auto E = ReaderOrErr.takeError()) {
	      unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
	                                              "Could not read profile %0: %1");
	      llvm::handleAllErrors(std::move(E), [&](const llvm::ErrorInfoBase &EI) {
	        getDiags().Report(DiagID) << CodeGenOpts.ProfileInstrumentUsePath
	                                  << EI.message();
	      });
	    } else
	      PGOReader = std::move(ReaderOrErr.get());
	  }

	  // If coverage mapping generation is enabled, create the
	  // CoverageMappingModuleGen object.
	  if (CodeGenOpts.CoverageMapping)
	    CoverageMapping.reset(new CoverageMappingModuleGen(*this, *CoverageInfo));
	}

	// Destructor with an empty body: no explicit teardown is performed here.
	CodeGenModule::~CodeGenModule() {}

	void CodeGenModule::createObjCRuntime() {
	  // This is just isGNUFamily(), but we want to force implementors of
	  // new ABIs to decide how best to do this.
	  const auto RuntimeKind = LangOpts.ObjCRuntime.getKind();
	  switch (RuntimeKind) {
	  // GNU-family runtimes.
	  case ObjCRuntime::GNUstep:
	  case ObjCRuntime::GCC:
	  case ObjCRuntime::ObjFW:
	    ObjCRuntime.reset(CreateGNUObjCRuntime(*this));
	    return;

	  // Apple-family runtimes.
	  case ObjCRuntime::FragileMacOSX:
	  case ObjCRuntime::MacOSX:
	  case ObjCRuntime::iOS:
	  case ObjCRuntime::WatchOS:
	    ObjCRuntime.reset(CreateMacObjCRuntime(*this));
	    return;
	  }
	  llvm_unreachable("bad runtime kind");
	}

	void CodeGenModule::createOpenCLRuntime() {
	  // Install a fresh OpenCL runtime helper bound to this module.
	  auto *NewRuntime = new CGOpenCLRuntime(*this);
	  OpenCLRuntime.reset(NewRuntime);
	}

	void CodeGenModule::createOpenMPRuntime() {
	  // Select a specialized code generation class based on the target, if any.
	  // If it does not exist use the default implementation.
	  const auto Arch = getTriple().getArch();
	  const bool IsNVPTX =
	      Arch == llvm::Triple::nvptx || Arch == llvm::Triple::nvptx64;

	  if (IsNVPTX) {
	    assert(getLangOpts().OpenMPIsDevice &&
	           "OpenMP NVPTX is only prepared to deal with device code.");
	    OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
	    return;
	  }

	  // Every other architecture: the SIMD-only runtime when OpenMPSimd is
	  // set, the general runtime otherwise.
	  if (LangOpts.OpenMPSimd)
	    OpenMPRuntime.reset(new CGOpenMPSIMDRuntime(*this));
	  else
	    OpenMPRuntime.reset(new CGOpenMPRuntime(*this));
	}

	void CodeGenModule::createCUDARuntime() {
	  // Install the NVIDIA CUDA runtime helper bound to this module.
	  auto *NewRuntime = CreateNVCUDARuntime(*this);
	  CUDARuntime.reset(NewRuntime);
	}

	void CodeGenModule::addReplacement(StringRef Name, llvm::Constant *C) {
	  // Record (or overwrite) the replacement registered under Name; it is
	  // consumed later by applyReplacements().
	  auto &Slot = Replacements[Name];
	  Slot = C;
	}

	// Applies every replacement recorded in Replacements: for each mangled name
	// that still resolves to a global, all uses of the old function are
	// redirected to the replacement constant and the old function is erased.
	void CodeGenModule::applyReplacements() {
	  for (auto &I : Replacements) {
	    StringRef MangledName = I.first();
	    llvm::Constant *Replacement = I.second;
	    llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
	    // Nothing to replace when no global of that name exists.
	    if (!Entry)
	      continue;
	    auto *OldF = cast<llvm::Function>(Entry);
	    // Find the function behind the replacement, looking through an alias or
	    // a bitcast/GEP constant expression. NewF may remain null.
	    auto *NewF = dyn_cast<llvm::Function>(Replacement);
	    if (!NewF) {
	      if (auto *Alias = dyn_cast<llvm::GlobalAlias>(Replacement)) {
	        NewF = dyn_cast<llvm::Function>(Alias->getAliasee());
	      } else {
	        auto *CE = cast<llvm::ConstantExpr>(Replacement);
	        assert(CE->getOpcode() == llvm::Instruction::BitCast ||
	               CE->getOpcode() == llvm::Instruction::GetElementPtr);
	        NewF = dyn_cast<llvm::Function>(CE->getOperand(0));
	      }
	    }

	    // Replace old with new, but keep the old order.
	    OldF->replaceAllUsesWith(Replacement);
	    if (NewF) {
	      NewF->removeFromParent();
	      OldF->getParent()->getFunctionList().insertAfter(OldF->getIterator(),
	                                                       NewF);
	    }
	    OldF->eraseFromParent();
	  }
	}

	void CodeGenModule::addGlobalValReplacement(llvm::GlobalValue GV, llvm::Constant C) {
	GlobalValReplacements.push_back(std::make_pair(GV, C));
	}

	void CodeGenModule::applyGlobalValReplacements() {
	for (auto &I : GlobalValReplacements) {
	llvm::GlobalValue *GV = I.first;
	llvm::Constant *C = I.second;

	GV->replaceAllUsesWith(C);
	GV->eraseFromParent();
	}
	}

	// This is only used in aliases that we created and we know they have a
	// linear structure.
	//
	// Follows the chain of indirect symbols starting at GIS and returns the
	// global object at its end. Returns null when the chain ends in something
	// that is neither a global object nor an indirect symbol, or when the same
	// indirect symbol is reached twice (a cycle).
	static const llvm::GlobalObject *getAliasedGlobal(
	    const llvm::GlobalIndirectSymbol &GIS) {
	  llvm::SmallPtrSet<const llvm::GlobalIndirectSymbol*, 4> Seen;
	  const llvm::Constant *Current = &GIS;
	  while (true) {
	    Current = Current->stripPointerCasts();
	    if (auto *Object = dyn_cast<llvm::GlobalObject>(Current))
	      return Object;
	    // stripPointerCasts will not walk over weak aliases.
	    auto *Indirect = dyn_cast<llvm::GlobalIndirectSymbol>(Current);
	    // Stop on a non-indirect constant or on a symbol already visited.
	    if (!Indirect || !Seen.insert(Indirect).second)
	      return nullptr;
	    Current = Indirect->getIndirectSymbol();
	  }
	}

	// Validates every alias/ifunc recorded in Aliases and reports diagnostics
	// for malformed ones. If any cyclic or undefined-target error was found,
	// all recorded aliases are afterwards replaced by undef and erased.
	void CodeGenModule::checkAliases() {
	  // Check if the constructed aliases are well formed. It is really unfortunate
	  // that we have to do this in CodeGen, but we only construct mangled names
	  // and aliases during codegen.
	  bool Error = false;
	  DiagnosticsEngine &Diags = getDiags();
	  for (const GlobalDecl &GD : Aliases) {
	    const auto *D = cast<ValueDecl>(GD.getDecl());
	    SourceLocation Location;
	    bool IsIFunc = D->hasAttr<IFuncAttr>();
	    if (const Attr *A = D->getDefiningAttr())
	      Location = A->getLocation();
	    else
	      llvm_unreachable("Not an alias or ifunc?");
	    StringRef MangledName = getMangledName(GD);
	    llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
	    auto *Alias = cast<llvm::GlobalIndirectSymbol>(Entry);
	    // Null means the alias chain is cyclic or does not end in a global
	    // object.
	    const llvm::GlobalValue *GV = getAliasedGlobal(*Alias);
	    if (!GV) {
	      Error = true;
	      Diags.Report(Location, diag::err_cyclic_alias) << IsIFunc;
	    } else if (GV->isDeclaration()) {
	      Error = true;
	      Diags.Report(Location, diag::err_alias_to_undefined)
	          << IsIFunc << IsIFunc;
	    } else if (IsIFunc) {
	      // Check resolver function type.
	      llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(
	          GV->getType()->getPointerElementType());
	      assert(FTy);
	      if (!FTy->getReturnType()->isPointerTy())
	        Diags.Report(Location, diag::err_ifunc_resolver_return);
	      if (FTy->getNumParams())
	        Diags.Report(Location, diag::err_ifunc_resolver_params);
	    }

	    // Find the global the alias points at directly, looking through a
	    // single constant expression.
	    llvm::Constant *Aliasee = Alias->getIndirectSymbol();
	    llvm::GlobalValue *AliaseeGV;
	    if (auto CE = dyn_cast<llvm::ConstantExpr>(Aliasee))
	      AliaseeGV = cast<llvm::GlobalValue>(CE->getOperand(0));
	    else
	      AliaseeGV = cast<llvm::GlobalValue>(Aliasee);

	    // Warn when the declared section differs from the aliasee's section.
	    if (const SectionAttr *SA = D->getAttr<SectionAttr>()) {
	      StringRef AliasSection = SA->getName();
	      if (AliasSection != AliaseeGV->getSection())
	        Diags.Report(SA->getLocation(), diag::warn_alias_with_section)
	            << AliasSection << IsIFunc << IsIFunc;
	    }

	    // We have to handle alias to weak aliases in here. LLVM itself disallows
	    // this since the object semantics would not match the IL one. For
	    // compatibility with gcc we implement it by just pointing the alias
	    // to its aliasee's aliasee. We also warn, since the user is probably
	    // expecting the link to be weak.
	    if (auto GA = dyn_cast<llvm::GlobalIndirectSymbol>(AliaseeGV)) {
	      if (GA->isInterposable()) {
	        Diags.Report(Location, diag::warn_alias_to_weak_alias)
	            << GV->getName() << GA->getName() << IsIFunc;
	        Aliasee = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
	            GA->getIndirectSymbol(), Alias->getType());
	        Alias->setIndirectSymbol(Aliasee);
	      }
	    }
	  }
	  if (!Error)
	    return;

	  // An error was reported: drop every alias, replacing its uses with undef.
	  for (const GlobalDecl &GD : Aliases) {
	    StringRef MangledName = getMangledName(GD);
	    llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
	    // The result is dereferenced unconditionally, so use cast<> (which
	    // asserts) rather than an unchecked dyn_cast<>; this matches the cast
	    // performed on the same entries in the loop above.
	    auto *Alias = cast<llvm::GlobalIndirectSymbol>(Entry);
	    Alias->replaceAllUsesWith(llvm::UndefValue::get(Alias->getType()));
	    Alias->eraseFromParent();
	  }
	}

	/// Drop the pending deferred declarations and, when an OpenMP runtime object
	/// exists, clear its state as well.
	void CodeGenModule::clear() {
	  DeferredDeclsToEmit.clear();
	  if (!OpenMPRuntime)
	    return;
	  OpenMPRuntime->clear();
	}

	/// Emit profile-data warnings through \p Diags based on the collected
	/// counters. \p MainFile names the main file in the "unprofiled" warning; an
	/// empty name is reported as "<stdin>".
	void InstrProfStats::reportDiagnostics(DiagnosticsEngine &Diags,
	                                       StringRef MainFile) {
	  if (!hasDiagnostics())
	    return;

	  // Every function visited in the main file was missing from the profile.
	  const bool MainFileUnprofiled =
	      VisitedInMainFile > 0 && VisitedInMainFile == MissingInMainFile;
	  if (MainFileUnprofiled) {
	    if (MainFile.empty())
	      MainFile = "<stdin>";
	    Diags.Report(diag::warn_profile_data_unprofiled) << MainFile;
	    return;
	  }

	  if (Mismatched > 0)
	    Diags.Report(diag::warn_profile_data_out_of_date) << Visited << Mismatched;

	  if (Missing > 0)
	    Diags.Report(diag::warn_profile_data_missing) << Visited << Missing;
	}

	void CodeGenModule::Release() {
	EmitDeferred();
	EmitVTablesOpportunistically();
	applyGlobalValReplacements();
	applyReplacements();
	checkAliases();
	EmitCXXGlobalInitFunc();
	EmitCXXGlobalDtorFunc();
	EmitCXXThreadLocalInitFunc();
	if (ObjCRuntime)
	if (llvm::Function *ObjCInitFunction = ObjCRuntime->ModuleInitFunction())
	AddGlobalCtor(ObjCInitFunction);
	if (Context.getLangOpts().CUDA && !Context.getLangOpts().CUDAIsDevice &&
	CUDARuntime) {
	if (llvm::Function *CudaCtorFunction = CUDARuntime->makeModuleCtorFunction())
	AddGlobalCtor(CudaCtorFunction);
	if (llvm::Function *CudaDtorFunction = CUDARuntime->makeModuleDtorFunction())
	AddGlobalDtor(CudaDtorFunction);
	}
	if (OpenMPRuntime)
	if (llvm::Function *OpenMPRegistrationFunction =
	OpenMPRuntime->emitRegistrationFunction()) {
	auto ComdatKey = OpenMPRegistrationFunction->hasComdat() ?
	OpenMPRegistrationFunction : nullptr;
	AddGlobalCtor(OpenMPRegistrationFunction, 0, ComdatKey);
	}
	if (PGOReader) {
	getModule().setProfileSummary(PGOReader->getSummary().getMD(VMContext));
	if (PGOStats.hasDiagnostics())
	PGOStats.reportDiagnostics(getDiags(), getCodeGenOpts().MainFileName);
	}
	EmitCtorList(GlobalCtors, "llvm.global_ctors");
	EmitCtorList(GlobalDtors, "llvm.global_dtors");
	EmitGlobalAnnotations();
	EmitStaticExternCAliases();
	EmitDeferredUnusedCoverageMappings();
	if (CoverageMapping)
	CoverageMapping->emit();
	if (CodeGenOpts.SanitizeCfiCrossDso) {
	CodeGenFunction(*this).EmitCfiCheckFail();
	CodeGenFunction(*this).EmitCfiCheckStub();
	}
	emitAtAvailableLinkGuard();
	emitLLVMUsed();
	if (SanStats)
	SanStats->finish();

	if (CodeGenOpts.Autolink &&
	(Context.getLangOpts().Modules \|\| !LinkerOptionsMetadata.empty())) {
	EmitModuleLinkOptions();
	}

	// Record mregparm value now so it is visible through rest of codegen.
	if (Context.getTargetInfo().getTriple().getArch() == llvm::Triple::x86)
	getModule().addModuleFlag(llvm::Module::Error, "NumRegisterParameters",
	CodeGenOpts.NumRegisterParameters);

	if (CodeGenOpts.DwarfVersion) {
	// We actually want the latest version when there are conflicts.
	// We can change from Warning to Latest if such mode is supported.
	getModule().addModuleFlag(llvm::Module::Warning, "Dwarf Version",
	CodeGenOpts.DwarfVersion);
	}
	if (CodeGenOpts.EmitCodeView) {
	// Indicate that we want CodeView in the metadata.
	getModule().addModuleFlag(llvm::Module::Warning, "CodeView", 1);
	}
	if (CodeGenOpts.OptimizationLevel > 0 && CodeGenOpts.StrictVTablePointers) {
	// We don't support LTO with 2 with different StrictVTablePointers
	// FIXME: we could support it by stripping all the information introduced
	// by StrictVTablePointers.

	getModule().addModuleFlag(llvm::Module::Error, "StrictVTablePointers",1);

	llvm::Metadata *Ops[2] = {
	llvm::MDString::get(VMContext, "StrictVTablePointers"),
	llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
	llvm::Type::getInt32Ty(VMContext), 1))};

	getModule().addModuleFlag(llvm::Module::Require,
	"StrictVTablePointersRequirement",
	llvm::MDNode::get(VMContext, Ops));
	}
	if (DebugInfo)
	// We support a single version in the linked module. The LLVM
	// parser will drop debug info with a different version number
	// (and warn about it, too).
	getModule().addModuleFlag(llvm::Module::Warning, "Debug Info Version",
	llvm::DEBUG_METADATA_VERSION);

	// We need to record the widths of enums and wchar_t, so that we can generate
	// the correct build attributes in the ARM backend. wchar_size is also used by
	// TargetLibraryInfo.
	uint64_t WCharWidth =
	Context.getTypeSizeInChars(Context.getWideCharType()).getQuantity();
	getModule().addModuleFlag(llvm::Module::Error, "wchar_size", WCharWidth);

	llvm::Triple::ArchType Arch = Context.getTargetInfo().getTriple().getArch();
	if ( Arch == llvm::Triple::arm
	\|\| Arch == llvm::Triple::armeb
	\|\| Arch == llvm::Triple::thumb
	\|\| Arch == llvm::Triple::thumbeb) {
	// The minimum width of an enum in bytes
	uint64_t EnumWidth = Context.getLangOpts().ShortEnums ? 1 : 4;
	getModule().addModuleFlag(llvm::Module::Error, "min_enum_size", EnumWidth);
	}

	if (CodeGenOpts.SanitizeCfiCrossDso) {
	// Indicate that we want cross-DSO control flow integrity checks.
	getModule().addModuleFlag(llvm::Module::Override, "Cross-DSO CFI", 1);
	}

	if (LangOpts.CUDAIsDevice && getTriple().isNVPTX()) {
	// Indicate whether __nvvm_reflect should be configured to flush denormal
	// floating point values to 0. (This corresponds to its "__CUDA_FTZ"
	// property.)
	getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz",
	LangOpts.CUDADeviceFlushDenormalsToZero ? 1 : 0);
	}

	// Emit OpenCL specific module metadata: OpenCL/SPIR version.
	if (LangOpts.OpenCL) {
	EmitOpenCLMetadata();
	// Emit SPIR version.
	if (getTriple().getArch() == llvm::Triple::spir \|\|
	getTriple().getArch() == llvm::Triple::spir64) {
	// SPIR v2.0 s2.12 - The SPIR version used by the module is stored in the
	// opencl.spir.version named metadata.
	llvm::Metadata *SPIRVerElts[] = {
	llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
	Int32Ty, LangOpts.OpenCLVersion / 100)),
	llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
	Int32Ty, (LangOpts.OpenCLVersion / 100 > 1) ? 0 : 2))};
	llvm::NamedMDNode *SPIRVerMD =
	TheModule.getOrInsertNamedMetadata("opencl.spir.version");
	llvm::LLVMContext &Ctx = TheModule.getContext();
	SPIRVerMD->addOperand(llvm::MDNode::get(Ctx, SPIRVerElts));
	}
	}

	if (uint32_t PLevel = Context.getLangOpts().PICLevel) {
	assert(PLevel < 3 && "Invalid PIC Level");
	getModule().setPICLevel(static_cast<llvm::PICLevel::Level>(PLevel));
	if (Context.getLangOpts().PIE)
	getModule().setPIELevel(static_cast<llvm::PIELevel::Level>(PLevel));
	}

	SimplifyPersonality();

	if (getCodeGenOpts().EmitDeclMetadata)
	EmitDeclMetadata();

	if (getCodeGenOpts().EmitGcovArcs \|\| getCodeGenOpts().EmitGcovNotes)
	EmitCoverageFile();

	if (DebugInfo)
	DebugInfo->finalize();

	EmitVersionIdentMetadata();

	EmitTargetMetadata();
	}

	void CodeGenModule::EmitOpenCLMetadata() {
	// SPIR v2.0 s2.13 - The OpenCL version used by the module is stored in the
	// opencl.ocl.version named metadata node.
	llvm::Metadata *OCLVerElts[] = {
	llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
	Int32Ty, LangOpts.OpenCLVersion / 100)),
	llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
	Int32Ty, (LangOpts.OpenCLVersion % 100) / 10))};
	llvm::NamedMDNode *OCLVerMD =
	TheModule.getOrInsertNamedMetadata("opencl.ocl.version");
	llvm::LLVMContext &Ctx = TheModule.getContext();
	OCLVerMD->addOperand(llvm::MDNode::get(Ctx, OCLVerElts));
	}

	/// Forward the completed tag declaration \p TD to the type converter (Types)
	/// so that this type is translated.
	void CodeGenModule::UpdateCompletedType(const TagDecl *TD) {
	  Types.UpdateCompletedType(TD);
	}

	/// Forward \p RD to the type converter (Types) so that its cached type is
	/// refreshed and the type is translated.
	void CodeGenModule::RefreshTypeCacheForClass(const CXXRecordDecl *RD) {
	  Types.RefreshTypeCacheForClass(RD);
	}

	/// Return the TBAA type info for \p QTy, or nullptr when no TBAA object is
	/// available.
	llvm::MDNode *CodeGenModule::getTBAATypeInfo(QualType QTy) {
	  return TBAA ? TBAA->getTypeInfo(QTy) : nullptr;
	}

	/// Build the TBAA access descriptor for an access of type \p AccessType:
	/// its TBAA type info paired with the type's size in chars. Incomplete types
	/// yield the dedicated "incomplete" descriptor.
	TBAAAccessInfo CodeGenModule::getTBAAAccessInfo(QualType AccessType) {
	  if (!AccessType->isIncompleteType()) {
	    uint64_t SizeInChars =
	        Context.getTypeSizeInChars(AccessType).getQuantity();
	    return TBAAAccessInfo(getTBAATypeInfo(AccessType), SizeInChars);
	  }

	  // Pointee values may have incomplete types, but they shall never be
	  // dereferenced.
	  return TBAAAccessInfo::getIncompleteInfo();
	}

	/// Return the TBAA access info for a vtable pointer of type
	/// \p VTablePtrType, or a default-constructed TBAAAccessInfo when no TBAA
	/// object is available.
	TBAAAccessInfo
	CodeGenModule::getTBAAVTablePtrAccessInfo(llvm::Type *VTablePtrType) {
	  return TBAA ? TBAA->getVTablePtrAccessInfo(VTablePtrType)
	              : TBAAAccessInfo();
	}

	/// Return the TBAA struct info for \p QTy, or nullptr when no TBAA object is
	/// available.
	llvm::MDNode *CodeGenModule::getTBAAStructInfo(QualType QTy) {
	  return TBAA ? TBAA->getTBAAStructInfo(QTy) : nullptr;
	}

	/// Return the TBAA base type info for \p QTy, or nullptr when no TBAA object
	/// is available.
	llvm::MDNode *CodeGenModule::getTBAABaseTypeInfo(QualType QTy) {
	  return TBAA ? TBAA->getBaseTypeInfo(QTy) : nullptr;
	}

	/// Return the TBAA access tag for \p Info, or nullptr when no TBAA object is
	/// available.
	llvm::MDNode *CodeGenModule::getTBAAAccessTagInfo(TBAAAccessInfo Info) {
	  return TBAA ? TBAA->getAccessTagInfo(Info) : nullptr;
	}

	/// Ask the TBAA object to merge \p SourceInfo and \p TargetInfo for a cast.
	/// Returns a default-constructed TBAAAccessInfo when no TBAA object is
	/// available.
	TBAAAccessInfo CodeGenModule::mergeTBAAInfoForCast(TBAAAccessInfo SourceInfo,
	                                                   TBAAAccessInfo TargetInfo) {
	  return TBAA ? TBAA->mergeTBAAInfoForCast(SourceInfo, TargetInfo)
	              : TBAAAccessInfo();
	}

	/// Ask the TBAA object to merge \p InfoA and \p InfoB for a conditional
	/// operator. Returns a default-constructed TBAAAccessInfo when no TBAA object
	/// is available.
	TBAAAccessInfo
	CodeGenModule::mergeTBAAInfoForConditionalOperator(TBAAAccessInfo InfoA,
	                                                   TBAAAccessInfo InfoB) {
	  return TBAA ? TBAA->mergeTBAAInfoForConditionalOperator(InfoA, InfoB)
	              : TBAAAccessInfo();
	}

	/// Attach the TBAA access tag derived from \p TBAAInfo to \p Inst as MD_tbaa
	/// metadata. Does nothing when no tag is produced.
	void CodeGenModule::DecorateInstructionWithTBAA(llvm::Instruction *Inst,
	                                                TBAAAccessInfo TBAAInfo) {
	  llvm::MDNode *AccessTag = getTBAAAccessTagInfo(TBAAInfo);
	  if (!AccessTag)
	    return;
	  Inst->setMetadata(llvm::LLVMContext::MD_tbaa, AccessTag);
	}

	void CodeGenModule::DecorateInstructionWithInvariantGroup(
	llvm::Instruction I, const CXXRecordDecl RD) {
	I->setMetadata(llvm::LLVMContext::MD_invariant_group,
	llvm::MDNode::get(getLLVMContext(), {}));
	}

	/// Report \p message as a custom error diagnostic at \p loc.
	void CodeGenModule::Error(SourceLocation loc, StringRef message) {
	  DiagnosticsEngine &Diags = getDiags();
	  // "%0" makes the diagnostic text exactly the streamed message.
	  unsigned diagID = Diags.getCustomDiagID(DiagnosticsEngine::Error, "%0");
	  Diags.Report(Context.getFullLoc(loc), diagID) << message;
	}

	/// ErrorUnsupported - Print out an error that codegen doesn't support the
	/// specified stmt yet.
	void CodeGenModule::ErrorUnsupported(const Stmt S, const char Type) {
	unsigned DiagID = getDiags().getCustomDiagID(DiagnosticsEngine::Error,
	"cannot compile this %0 yet");
	std::string Msg = Type;
	getDiags().Report(Context.getFullLoc(S->getLocStart()), DiagID)
	<< Msg << S->getSourceRange();
	}

	/// ErrorUnsupported - Print out an error that codegen doesn't support the
	/// specified decl yet.
	void CodeGenModule::ErrorUnsupported(const Decl D, const char Type) {
	unsigned DiagID = getDiags().getCustomDiagID(DiagnosticsEngine::Error,
	"cannot compile this %0 yet");
	std::string Msg = Type;
	getDiags().Report(Context.getFullLoc(D->getLocation()), DiagID) << Msg;
	}

	/// Return \p size as an integer constant of type SizeTy.
	llvm::ConstantInt *CodeGenModule::getSize(CharUnits size) {
	  const auto Quantity = size.getQuantity();
	  return llvm::ConstantInt::get(SizeTy, Quantity);
	}

	/// Set the LLVM visibility of \p GV from the linkage/visibility computed for
	/// \p D.
	///
	/// Globals with local linkage always get default visibility. Otherwise the
	/// declaration's visibility is applied when it is explicit, or when
	/// \p IsForDefinition is set and \p GV is not available_externally.
	void CodeGenModule::setGlobalVisibility(llvm::GlobalValue *GV,
	                                        const NamedDecl *D,
	                                        ForDefinition_t IsForDefinition) const {
	  // Internal definitions always have default visibility.
	  if (GV->hasLocalLinkage()) {
	    GV->setVisibility(llvm::GlobalValue::DefaultVisibility);
	    return;
	  }

	  // Set visibility for definitions.
	  LinkageInfo LV = D->getLinkageAndVisibility();
	  if (LV.isVisibilityExplicit() ||
	      (IsForDefinition && !GV->hasAvailableExternallyLinkage()))
	    GV->setVisibility(GetLLVMVisibility(LV.getVisibility()));
	}

	/// Translate a TLS model name ("global-dynamic", "local-dynamic",
	/// "initial-exec" or "local-exec") into the matching LLVM thread-local mode.
	/// No default case is supplied to the StringSwitch.
	static llvm::GlobalVariable::ThreadLocalMode GetLLVMTLSModel(StringRef S) {
	  using TLSMode = llvm::GlobalVariable::ThreadLocalMode;
	  return llvm::StringSwitch<TLSMode>(S)
	      .Case("global-dynamic", llvm::GlobalVariable::GeneralDynamicTLSModel)
	      .Case("local-dynamic", llvm::GlobalVariable::LocalDynamicTLSModel)
	      .Case("initial-exec", llvm::GlobalVariable::InitialExecTLSModel)
	      .Case("local-exec", llvm::GlobalVariable::LocalExecTLSModel);
	}

	/// Translate a CodeGenOptions TLS model enumerator into the matching LLVM
	/// thread-local mode. Any other value is unreachable.
	static llvm::GlobalVariable::ThreadLocalMode GetLLVMTLSModel(
	    CodeGenOptions::TLSModel M) {
	  switch (M) {
	  case CodeGenOptions::GeneralDynamicTLSModel: {
	    return llvm::GlobalVariable::GeneralDynamicTLSModel;
	  }
	  case CodeGenOptions::LocalDynamicTLSModel: {
	    return llvm::GlobalVariable::LocalDynamicTLSModel;
	  }
	  case CodeGenOptions::InitialExecTLSModel: {
	    return llvm::GlobalVariable::InitialExecTLSModel;
	  }
	  case CodeGenOptions::LocalExecTLSModel: {
	    return llvm::GlobalVariable::LocalExecTLSModel;
	  }
	  }
	  llvm_unreachable("Invalid TLS model!");
	}

	/// Set the thread-local mode of \p GV for the TLS variable \p D: the default
	/// model from the codegen options, unless \p D carries a TLSModelAttr.
	void CodeGenModule::setTLSMode(llvm::GlobalValue *GV, const VarDecl &D) const {
	  assert(D.getTLSKind() && "setting TLS mode on non-TLS var!");

	  llvm::GlobalValue::ThreadLocalMode Mode =
	      GetLLVMTLSModel(CodeGenOpts.getDefaultTLSModel());

	  // Override the TLS model if it is explicitly specified.
	  if (const TLSModelAttr *ModelAttr = D.getAttr<TLSModelAttr>())
	    Mode = GetLLVMTLSModel(ModelAttr->getModel());

	  GV->setThreadLocalMode(Mode);
	}

	/// Return the mangled name for \p GD, computing and caching it on first use.
	///
	/// Results are cached in MangledDeclNames keyed by the canonical decl. The
	/// name string itself is owned by the Manglings map; on a mangling collision
	/// the first inserted entry is kept.
	StringRef CodeGenModule::getMangledName(GlobalDecl GD) {
	  GlobalDecl CanonicalGD = GD.getCanonicalDecl();

	  // Some ABIs don't have constructor variants. Make sure that base and
	  // complete constructors get mangled the same.
	  if (const auto *CD = dyn_cast<CXXConstructorDecl>(CanonicalGD.getDecl())) {
	    if (!getTarget().getCXXABI().hasConstructorVariants()) {
	      CXXCtorType OrigCtorType = GD.getCtorType();
	      assert(OrigCtorType == Ctor_Base || OrigCtorType == Ctor_Complete);
	      if (OrigCtorType == Ctor_Base)
	        CanonicalGD = GlobalDecl(CD, Ctor_Complete);
	    }
	  }

	  auto FoundName = MangledDeclNames.find(CanonicalGD);
	  if (FoundName != MangledDeclNames.end())
	    return FoundName->second;

	  const auto *ND = cast<NamedDecl>(GD.getDecl());
	  SmallString<256> Buffer;
	  StringRef Str;
	  if (getCXXABI().getMangleContext().shouldMangleDeclName(ND)) {
	    llvm::raw_svector_ostream Out(Buffer);
	    if (const auto *D = dyn_cast<CXXConstructorDecl>(ND))
	      getCXXABI().getMangleContext().mangleCXXCtor(D, GD.getCtorType(), Out);
	    else if (const auto *D = dyn_cast<CXXDestructorDecl>(ND))
	      getCXXABI().getMangleContext().mangleCXXDtor(D, GD.getDtorType(), Out);
	    else
	      getCXXABI().getMangleContext().mangleName(ND, Out);
	    Str = Out.str();
	  } else {
	    IdentifierInfo *II = ND->getIdentifier();
	    assert(II && "Attempt to mangle unnamed decl.");
	    const auto *FD = dyn_cast<FunctionDecl>(ND);

	    // Unmangled functions using the X86 regcall convention get a fixed
	    // "__regcall3__" prefix.
	    if (FD &&
	        FD->getType()->castAs<FunctionType>()->getCallConv() == CC_X86RegCall) {
	      llvm::raw_svector_ostream Out(Buffer);
	      Out << "__regcall3__" << II->getName();
	      Str = Out.str();
	    } else {
	      Str = II->getName();
	    }
	  }

	  // Keep the first result in the case of a mangling collision.
	  auto Result = Manglings.insert(std::make_pair(Str, GD));
	  return MangledDeclNames[CanonicalGD] = Result.first->first();
	}

	/// Mangle the block \p BD in the context of \p GD and return the name as
	/// stored in the Manglings map.
	///
	/// With no enclosing decl the block is mangled as a global block (relative to
	/// the global currently being initialized, if it is a VarDecl); constructors
	/// and destructors use their dedicated manglers; any other decl is treated as
	/// the block's DeclContext.
	StringRef CodeGenModule::getBlockMangledName(GlobalDecl GD,
	                                             const BlockDecl *BD) {
	  MangleContext &Mangler = getCXXABI().getMangleContext();

	  SmallString<256> Storage;
	  llvm::raw_svector_ostream OS(Storage);
	  if (const Decl *Parent = GD.getDecl()) {
	    if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(Parent))
	      Mangler.mangleCtorBlock(Ctor, GD.getCtorType(), BD, OS);
	    else if (const auto *Dtor = dyn_cast<CXXDestructorDecl>(Parent))
	      Mangler.mangleDtorBlock(Dtor, GD.getDtorType(), BD, OS);
	    else
	      Mangler.mangleBlock(cast<DeclContext>(Parent), BD, OS);
	  } else {
	    Mangler.mangleGlobalBlock(BD,
	      dyn_cast_or_null<VarDecl>(initializedGlobalDecl.getDecl()), OS);
	  }

	  auto Inserted = Manglings.insert(std::make_pair(OS.str(), BD));
	  return Inserted.first->first();
	}

	/// Look up the global value called \p Name in the module being generated.
	llvm::GlobalValue *CodeGenModule::GetGlobalValue(StringRef Name) {
	  llvm::GlobalValue *Found = getModule().getNamedValue(Name);
	  return Found;
	}

	/// AddGlobalCtor - Add a function to the list that will be called before
	/// main() runs. The entry records \p Priority and \p AssociatedData alongside
	/// \p Ctor.
	void CodeGenModule::AddGlobalCtor(llvm::Function *Ctor, int Priority,
	                                  llvm::Constant *AssociatedData) {
	  // FIXME: Type coercion of void()* types.
	  Structor Entry(Priority, Ctor, AssociatedData);
	  GlobalCtors.push_back(Entry);
	}

	/// AddGlobalDtor - Add a function to the list that will be called
	/// when the module is unloaded. The entry carries \p Priority and no
	/// associated data.
	void CodeGenModule::AddGlobalDtor(llvm::Function *Dtor, int Priority) {
	  // FIXME: Type coercion of void()* types.
	  Structor Entry(Priority, Dtor, nullptr);
	  GlobalDtors.push_back(Entry);
	}

	/// Emit \p Fns as an appending-linkage global array named \p GlobalName and
	/// then clear \p Fns. Each element is a { priority, function, associated
	/// data } struct; a missing associated-data entry becomes a null pointer.
	/// Nothing is emitted when \p Fns is empty.
	void CodeGenModule::EmitCtorList(CtorList &Fns, const char *GlobalName) {
	  if (Fns.empty()) return;

	  // Ctor function type is void()*.
	  llvm::FunctionType* CtorFTy = llvm::FunctionType::get(VoidTy, false);
	  llvm::Type *CtorPFTy = llvm::PointerType::getUnqual(CtorFTy);

	  // Get the type of a ctor entry, { i32, void ()*, i8* }.
	  llvm::StructType *CtorStructTy = llvm::StructType::get(
	      Int32Ty, llvm::PointerType::getUnqual(CtorFTy), VoidPtrTy);

	  // Construct the constructor and destructor arrays.
	  ConstantInitBuilder builder(*this);
	  auto ctors = builder.beginArray(CtorStructTy);
	  for (const auto &I : Fns) {
	    auto ctor = ctors.beginStruct(CtorStructTy);
	    ctor.addInt(Int32Ty, I.Priority);
	    ctor.add(llvm::ConstantExpr::getBitCast(I.Initializer, CtorPFTy));
	    if (I.AssociatedData)
	      ctor.add(llvm::ConstantExpr::getBitCast(I.AssociatedData, VoidPtrTy));
	    else
	      ctor.addNullPointer(VoidPtrTy);
	    ctor.finishAndAddTo(ctors);
	  }

	  auto list =
	    ctors.finishAndCreateGlobal(GlobalName, getPointerAlign(),
	                                /*constant*/ false,
	                                llvm::GlobalValue::AppendingLinkage);

	  // The LTO linker doesn't seem to like it when we set an alignment
	  // on appending variables. Take it off as a workaround.
	  list->setAlignment(0);

	  Fns.clear();
	}

	/// Compute the LLVM linkage for the function denoted by \p GD.
	///
	/// Destructor variants emitted as thunks are internal or linkonce_odr, and
	/// inheriting constructors under the Microsoft C++ ABI are internal; all
	/// other functions defer to getLLVMLinkageForDeclarator.
	llvm::GlobalValue::LinkageTypes
	CodeGenModule::getFunctionLinkage(GlobalDecl GD) {
	  const auto *D = cast<FunctionDecl>(GD.getDecl());

	  GVALinkage Linkage = getContext().GetGVALinkageForFunction(D);

	  if (isa<CXXDestructorDecl>(D) &&
	      getCXXABI().useThunkForDtorVariant(cast<CXXDestructorDecl>(D),
	                                         GD.getDtorType())) {
	    // Destructor variants in the Microsoft C++ ABI are always internal or
	    // linkonce_odr thunks emitted on an as-needed basis.
	    return Linkage == GVA_Internal ? llvm::GlobalValue::InternalLinkage
	                                   : llvm::GlobalValue::LinkOnceODRLinkage;
	  }

	  if (isa<CXXConstructorDecl>(D) &&
	      cast<CXXConstructorDecl>(D)->isInheritingConstructor() &&
	      Context.getTargetInfo().getCXXABI().isMicrosoft()) {
	    // Our approach to inheriting constructors is fundamentally different from
	    // that used by the MS ABI, so keep our inheriting constructor thunks
	    // internal rather than trying to pick an unambiguous mangling for them.
	    return llvm::GlobalValue::InternalLinkage;
	  }

	  return getLLVMLinkageForDeclarator(D, Linkage, /*isConstantVariable=*/false);
	}

	/// Set the DLL storage class of \p F from the dllimport/dllexport attributes
	/// on the function declared by \p GD. Destructor variants emitted as thunks
	/// always get the default storage class.
	void CodeGenModule::setFunctionDLLStorageClass(GlobalDecl GD, llvm::Function *F) {
	  const auto *FD = cast<FunctionDecl>(GD.getDecl());

	  if (const auto *Dtor = dyn_cast_or_null<CXXDestructorDecl>(FD)) {
	    if (getCXXABI().useThunkForDtorVariant(Dtor, GD.getDtorType())) {
	      // Don't dllexport/import destructor thunks.
	      F->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);
	      return;
	    }
	  }

	  // dllimport takes precedence over dllexport; neither means default.
	  auto StorageClass = llvm::GlobalVariable::DefaultStorageClass;
	  if (FD->hasAttr<DLLImportAttr>())
	    StorageClass = llvm::GlobalVariable::DLLImportStorageClass;
	  else if (FD->hasAttr<DLLExportAttr>())
	    StorageClass = llvm::GlobalVariable::DLLExportStorageClass;
	  F->setDLLStorageClass(StorageClass);
	}

	/// Build the cross-DSO CFI type id for \p MD: a 64-bit constant holding the
	/// MD5 hash of the metadata string. Returns nullptr when \p MD is not an
	/// MDString.
	llvm::ConstantInt *CodeGenModule::CreateCrossDsoCfiTypeId(llvm::Metadata *MD) {
	  llvm::MDString *MDS = dyn_cast<llvm::MDString>(MD);
	  if (!MDS) return nullptr;

	  return llvm::ConstantInt::get(Int64Ty, llvm::MD5Hash(MDS->getString()));
	}

	/// Apply the attributes of a function definition: currently this just
	/// forwards \p D and \p F to setNonAliasAttributes.
	void CodeGenModule::setFunctionDefinitionAttributes(const FunctionDecl *D,
	                                                    llvm::Function *F) {
	  setNonAliasAttributes(D, F);
	}

	void CodeGenModule::SetLLVMFunctionAttributes(const Decl *D,
	const CGFunctionInfo &Info,
	llvm::Function *F) {
	unsigned CallingConv;
	llvm::AttributeList PAL;
	ConstructAttributeList(F->getName(), Info, D, PAL, CallingConv, false);
	F->setAttributes(PAL);
	F->setCallingConv(static_cast<llvm::CallingConv::ID>(CallingConv));
	}

	/// Determines whether the language options require us to model
	/// unwind exceptions. We treat -fexceptions as mandating this
	/// except under the fragile ObjC ABI with only ObjC exceptions
	/// enabled. This means, for example, that C with -fexceptions
	/// enables this.
	static bool hasUnwindExceptions(const LangOptions &LangOpts) {
	  // If exceptions are completely disabled, obviously this is false.
	  if (!LangOpts.Exceptions)
	    return false;

	  // With only ObjC exceptions enabled the answer depends on the runtime ABI.
	  // C++ exceptions, or neither kind, always imply unwind exceptions.
	  if (!LangOpts.CXXExceptions && LangOpts.ObjCExceptions)
	    return LangOpts.ObjCRuntime.hasUnwindExceptions();

	  return true;
	}

	void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D,
	llvm::Function *F) {
	llvm::AttrBuilder B;

	if (CodeGenOpts.UnwindTables)
	B.addAttribute(llvm::Attribute::UWTable);

	if (!hasUnwindExceptions(LangOpts))
	B.addAttribute(llvm::Attribute::NoUnwind);

	if (LangOpts.getStackProtector() == LangOptions::SSPOn)
	B.addAttribute(llvm::Attribute::StackProtect);
	else if (LangOpts.getStackProtector() == LangOptions::SSPStrong)
	B.addAttribute(llvm::Attribute::StackProtectStrong);
	else if (LangOpts.getStackProtector() == LangOptions::SSPReq)
	B.addAttribute(llvm::Attribute::StackProtectReq);

	if (!D) {
	// If we don't have a declaration to control inlining, the function isn't
	// explicitly marked as alwaysinline for semantic reasons, and inlining is
	// disabled, mark the function as noinline.
	if (!F->hasFnAttribute(llvm::Attribute::AlwaysInline) &&
	CodeGenOpts.getInlining() == CodeGenOptions::OnlyAlwaysInlining)
	B.addAttribute(llvm::Attribute::NoInline);

	F->addAttributes(llvm::AttributeList::FunctionIndex, B);
	return;
	}

	// Track whether we need to add the optnone LLVM attribute,
	// starting with the default for this optimization level.
	bool ShouldAddOptNone =
	!CodeGenOpts.DisableO0ImplyOptNone && CodeGenOpts.OptimizationLevel == 0;
	// We can't add optnone in the following cases, it won't pass the verifier.
	ShouldAddOptNone &= !D->hasAttr<MinSizeAttr>();
	ShouldAddOptNone &= !F->hasFnAttribute(llvm::Attribute::AlwaysInline);
	ShouldAddOptNone &= !D->hasAttr<AlwaysInlineAttr>();

	if (ShouldAddOptNone \|\| D->hasAttr<OptimizeNoneAttr>()) {
	B.addAttribute(llvm::Attribute::OptimizeNone);

	// OptimizeNone implies noinline; we should not be inlining such functions.
	B.addAttribute(llvm::Attribute::NoInline);
	assert(!F->hasFnAttribute(llvm::Attribute::AlwaysInline) &&
	"OptimizeNone and AlwaysInline on same function!");

	// We still need to handle naked functions even though optnone subsumes
	// much of their semantics.
	if (D->hasAttr<NakedAttr>())
	B.addAttribute(llvm::Attribute::Naked);

	// OptimizeNone wins over OptimizeForSize and MinSize.
	F->removeFnAttr(llvm::Attribute::OptimizeForSize);
	F->removeFnAttr(llvm::Attribute::MinSize);
	} else if (D->hasAttr<NakedAttr>()) {
	// Naked implies noinline: we should not be inlining such functions.
	B.addAttribute(llvm::Attribute::Naked);
	B.addAttribute(llvm::Attribute::NoInline);
	} else if (D->hasAttr<NoDuplicateAttr>()) {
	B.addAttribute(llvm::Attribute::NoDuplicate);
	} else if (D->hasAttr<NoInlineAttr>()) {
	B.addAttribute(llvm::Attribute::NoInline);
	} else if (D->hasAttr<AlwaysInlineAttr>() &&
	!F->hasFnAttribute(llvm::Attribute::NoInline)) {
	// (noinline wins over always_inline, and we can't specify both in IR)
	B.addAttribute(llvm::Attribute::AlwaysInline);
	} else if (CodeGenOpts.getInlining() == CodeGenOptions::OnlyAlwaysInlining) {
	// If we're not inlining, then force everything that isn't always_inline to
	// carry an explicit noinline attribute.
	if (!F->hasFnAttribute(llvm::Attribute::AlwaysInline))
	B.addAttribute(llvm::Attribute::NoInline);
	} else {
	// Otherwise, propagate the inline hint attribute and potentially use its
	// absence to mark things as noinline.
	if (auto *FD = dyn_cast<FunctionDecl>(D)) {
	if (any_of(FD->redecls(), [&](const FunctionDecl *Redecl) {
	return Redecl->isInlineSpecified();
	})) {
	B.addAttribute(llvm::Attribute::InlineHint);
	} else if (CodeGenOpts.getInlining() ==
	CodeGenOptions::OnlyHintInlining &&
	!FD->isInlined() &&
	!F->hasFnAttribute(llvm::Attribute::AlwaysInline)) {
	B.addAttribute(llvm::Attribute::NoInline);
	}
	}
	}

	// Add other optimization related attributes if we are optimizing this
	// function.
	if (!D->hasAttr<OptimizeNoneAttr>()) {
	if (D->hasAttr<ColdAttr>()) {
	if (!ShouldAddOptNone)
	B.addAttribute(llvm::Attribute::OptimizeForSize);
	B.addAttribute(llvm::Attribute::Cold);
	}

	if (D->hasAttr<MinSizeAttr>())
	B.addAttribute(llvm::Attribute::MinSize);
	}

	F->addAttributes(llvm::AttributeList::FunctionIndex, B);

	unsigned alignment = D->getMaxAlignment() / Context.getCharWidth();
	if (alignment)
	F->setAlignment(alignment);

	// Some C++ ABIs require 2-byte alignment for member functions, in order to
	// reserve a bit for differentiating between virtual and non-virtual member
	// functions. If the current target's C++ ABI requires this and this is a
	// member function, set its alignment accordingly.
	if (getTarget().getCXXABI().areMemberFunctionsAligned()) {
	if (F->getAlignment() < 2 && isa<CXXMethodDecl>(D))
	F->setAlignment(2);
	}

	// In the cross-dso CFI mode, we want !type attributes on definitions only.
	if (CodeGenOpts.SanitizeCfiCrossDso)
	if (auto *FD = dyn_cast<FunctionDecl>(D))
	CreateFunctionTypeMetadata(FD, F);
	}

	/// Apply attributes shared by all kinds of globals: visibility (taken from
	/// \p D when it is a NamedDecl, default otherwise) and registration as a
	/// used global when \p D carries UsedAttr. \p D may be null.
	void CodeGenModule::SetCommonAttributes(const Decl *D,
	                                        llvm::GlobalValue *GV) {
	  const auto *Named = dyn_cast_or_null<NamedDecl>(D);
	  if (!Named)
	    GV->setVisibility(llvm::GlobalValue::DefaultVisibility);
	  else
	    setGlobalVisibility(GV, Named, ForDefinition);

	  const bool IsUsed = D && D->hasAttr<UsedAttr>();
	  if (IsUsed)
	    addUsedGlobal(GV);
	}

	/// Apply the common attributes to the alias \p GV and mark it dllexport when
	/// \p D carries DLLExportAttr.
	void CodeGenModule::setAliasAttributes(const Decl *D,
	                                       llvm::GlobalValue *GV) {
	  SetCommonAttributes(D, GV);

	  // Process the dllexport attribute based on whether the original definition
	  // (not necessarily the aliasee) was exported.
	  const bool Exported = D->hasAttr<DLLExportAttr>();
	  if (Exported)
	    GV->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
	}

	void CodeGenModule::setNonAliasAttributes(const Decl *D,
	llvm::GlobalObject *GO) {
	SetCommonAttributes(D, GO);

	if (D) {
	if (auto *GV = dyn_cast<llvm::GlobalVariable>(GO)) {
	if (auto *SA = D->getAttr<PragmaClangBSSSectionAttr>())
	GV->addAttribute("bss-section", SA->getName());
	if (auto *SA = D->getAttr<PragmaClangDataSectionAttr>())
	GV->addAttribute("data-section", SA->getName());
	if (auto *SA = D->getAttr<PragmaClangRodataSectionAttr>())
	GV->addAttribute("rodata-section", SA->getName());
	}

	if (auto *F = dyn_cast<llvm::Function>(GO)) {
	if (auto *SA = D->getAttr<PragmaClangTextSectionAttr>())
	if (!D->getAttr<SectionAttr>())
	F->addFnAttr("implicit-section-name", SA->getName());
	}

	if (const SectionAttr *SA = D->getAttr<SectionAttr>())
	GO->setSection(SA->getName());
	}

	getTargetCodeGenInfo().setTargetAttributes(D, GO, *this, ForDefinition);
	}

	/// Configure \p F as an internal-linkage function: set its LLVM attributes
	/// from \p FI and \p D, add the definition attributes, force internal
	/// linkage, and then apply the non-alias attributes.
	void CodeGenModule::SetInternalFunctionAttributes(const Decl *D,
	                                                  llvm::Function *F,
	                                                  const CGFunctionInfo &FI) {
	  // Attribute setup first, linkage next, non-alias attributes last.
	  SetLLVMFunctionAttributes(D, FI, F);
	  SetLLVMFunctionAttributesForDefinition(D, F);
	  F->setLinkage(llvm::Function::InternalLinkage);
	  setNonAliasAttributes(D, F);
	}

	/// Set the linkage (and, for dllimport, the DLL storage class) of the
	/// declaration \p GV from the attributes on \p ND. Declarations that are not
	/// externally visible are left untouched.
	static void setLinkageForGV(llvm::GlobalValue *GV,
	                            const NamedDecl *ND) {
	  // Set linkage and visibility in case we never see a definition.
	  LinkageInfo LV = ND->getLinkageAndVisibility();
	  if (!isExternallyVisible(LV.getLinkage())) {
	    // Don't set internal linkage on declarations.
	  } else {
	    if (ND->hasAttr<DLLImportAttr>()) {
	      GV->setLinkage(llvm::GlobalValue::ExternalLinkage);
	      GV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
	    } else if (ND->hasAttr<DLLExportAttr>()) {
	      GV->setLinkage(llvm::GlobalValue::ExternalLinkage);
	    } else if (ND->hasAttr<WeakAttr>() || ND->isWeakImported()) {
	      // "extern_weak" is overloaded in LLVM; we probably should have
	      // separate linkage types for this.
	      GV->setLinkage(llvm::GlobalValue::ExternalWeakLinkage);
	    }
	  }
	}

	void CodeGenModule::CreateFunctionTypeMetadata(const FunctionDecl *FD,
	llvm::Function *F) {
	// Only if we are checking indirect calls.
	if (!LangOpts.Sanitize.has(SanitizerKind::CFIICall))
	return;

	// Non-static class methods are handled via vtable pointer checks elsewhere.
	if (isa<CXXMethodDecl>(FD) && !cast<CXXMethodDecl>(FD)->isStatic())
	return;

	// Additionally, if building with cross-DSO support...
	if (CodeGenOpts.SanitizeCfiCrossDso) {
	// Skip available_externally functions. They won't be codegen'ed in the
	// current module anyway.
	if (getContext().GetGVALinkageForFunction(FD) == GVA_AvailableExternally)
	return;
	}

	llvm::Metadata *MD = CreateMetadataIdentifierForType(FD->getType());
	F->addTypeMetadata(0, MD);
	F->addTypeMetadata(0, CreateMetadataIdentifierGeneralized(FD->getType()));

	// Emit a hash-based bit set entry for cross-DSO calls.
	if (CodeGenOpts.SanitizeCfiCrossDso)
	if (auto CrossDsoTypeId = CreateCrossDsoCfiTypeId(MD))
	F->addTypeMetadata(0, llvm::ConstantAsMetadata::get(CrossDsoTypeId));
	}

	/// Set the attributes of the function declaration \p F for \p GD.
	///
	/// Intrinsics simply receive their intrinsic attribute list. Otherwise this
	/// sets the ABI-derived attributes (unless \p IsIncompleteFunction), the
	/// "returned" attribute for this-returning functions (unless \p IsThunk),
	/// linkage, visibility, section, allocation-function attributes,
	/// unnamed_addr, CFI type metadata and OpenMP declare-simd information.
	/// Target attributes are applied here only when \p IsForDefinition is false.
	void CodeGenModule::SetFunctionAttributes(GlobalDecl GD, llvm::Function *F,
	                                          bool IsIncompleteFunction,
	                                          bool IsThunk,
	                                          ForDefinition_t IsForDefinition) {

	  if (llvm::Intrinsic::ID IID = F->getIntrinsicID()) {
	    // If this is an intrinsic function, set the function's attributes
	    // to the intrinsic's attributes.
	    F->setAttributes(llvm::Intrinsic::getAttributes(getLLVMContext(), IID));
	    return;
	  }

	  const auto *FD = cast<FunctionDecl>(GD.getDecl());

	  if (!IsIncompleteFunction) {
	    SetLLVMFunctionAttributes(FD, getTypes().arrangeGlobalDeclaration(GD), F);
	    // Setup target-specific attributes.
	    if (!IsForDefinition)
	      getTargetCodeGenInfo().setTargetAttributes(FD, F, *this,
	                                                 NotForDefinition);
	  }

	  // Add the Returned attribute for "this", except for iOS 5 and earlier
	  // where substantial code, including the libstdc++ dylib, was compiled with
	  // GCC and does not actually return "this".
	  if (!IsThunk && getCXXABI().HasThisReturn(GD) &&
	      !(getTriple().isiOS() && getTriple().isOSVersionLT(6))) {
	    assert(!F->arg_empty() &&
	           F->arg_begin()->getType()
	             ->canLosslesslyBitCastTo(F->getReturnType()) &&
	           "unexpected this return");
	    F->addAttribute(1, llvm::Attribute::Returned);
	  }

	  // Only a few attributes are set on declarations; these may later be
	  // overridden by a definition.

	  setLinkageForGV(F, FD);
	  setGlobalVisibility(F, FD, NotForDefinition);

	  if (FD->getAttr<PragmaClangTextSectionAttr>()) {
	    F->addFnAttr("implicit-section-name");
	  }

	  if (const SectionAttr *SA = FD->getAttr<SectionAttr>())
	    F->setSection(SA->getName());

	  if (FD->isReplaceableGlobalAllocationFunction()) {
	    // A replaceable global allocation function does not act like a builtin by
	    // default, only if it is invoked by a new-expression or delete-expression.
	    F->addAttribute(llvm::AttributeList::FunctionIndex,
	                    llvm::Attribute::NoBuiltin);

	    // A sane operator new returns a non-aliasing pointer.
	    // FIXME: Also add NonNull attribute to the return value
	    // for the non-nothrow forms?
	    auto Kind = FD->getDeclName().getCXXOverloadedOperator();
	    if (getCodeGenOpts().AssumeSaneOperatorNew &&
	        (Kind == OO_New || Kind == OO_Array_New))
	      F->addAttribute(llvm::AttributeList::ReturnIndex,
	                      llvm::Attribute::NoAlias);
	  }

	  if (isa<CXXConstructorDecl>(FD) || isa<CXXDestructorDecl>(FD))
	    F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
	  else if (const auto *MD = dyn_cast<CXXMethodDecl>(FD))
	    if (MD->isVirtual())
	      F->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);

	  // Don't emit entries for function declarations in the cross-DSO mode. This
	  // is handled with better precision by the receiving DSO.
	  if (!CodeGenOpts.SanitizeCfiCrossDso)
	    CreateFunctionTypeMetadata(FD, F);

	  if (getLangOpts().OpenMP && FD->hasAttr<OMPDeclareSimdDeclAttr>())
	    getOpenMPRuntime().emitDeclareSimdFunction(FD, F);
	}

	/// Record \p GV in LLVMUsed, the list that emitLLVMUsed() emits under the name
	/// "llvm.used". \p GV must be a definition, not a declaration (asserted).
	void CodeGenModule::addUsedGlobal(llvm::GlobalValue *GV) {
	assert(!GV->isDeclaration() &&
	"Only globals with definition can force usage.");
	LLVMUsed.emplace_back(GV);
	}

	/// Record \p GV in LLVMCompilerUsed, the list that emitLLVMUsed() emits under
	/// the name "llvm.compiler.used". \p GV must be a definition, not a
	/// declaration (asserted).
	void CodeGenModule::addCompilerUsedGlobal(llvm::GlobalValue *GV) {
	assert(!GV->isDeclaration() &&
	"Only globals with definition can force usage.");
	LLVMCompilerUsed.emplace_back(GV);
	}

	/// Emit an appending-linkage global array called \p Name, placed in the
	/// "llvm.metadata" section, whose elements are the entries of \p List cast to
	/// CGM.Int8PtrTy. Nothing is emitted when \p List is empty.
	///
	/// \param CGM  module being generated; supplies the i8* type and llvm::Module.
	/// \param Name name of the global to create (e.g. "llvm.used").
	/// \param List tracked values to reference from the array.
	static void emitUsed(CodeGenModule &CGM, StringRef Name,
	                     std::vector<llvm::WeakTrackingVH> &List) {
	  // Don't create llvm.used if there is no need.
	  if (List.empty())
	    return;

	  // Convert List to what ConstantArray needs. List is non-empty here, so the
	  // converted array is non-empty as well and needs no second emptiness check.
	  SmallVector<llvm::Constant*, 8> UsedArray;
	  UsedArray.resize(List.size());
	  for (unsigned i = 0, e = List.size(); i != e; ++i) {
	    UsedArray[i] =
	        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
	            cast<llvm::Constant>(&*List[i]), CGM.Int8PtrTy);
	  }

	  llvm::ArrayType *ATy = llvm::ArrayType::get(CGM.Int8PtrTy, UsedArray.size());

	  auto *GV = new llvm::GlobalVariable(
	      CGM.getModule(), ATy, false, llvm::GlobalValue::AppendingLinkage,
	      llvm::ConstantArray::get(ATy, UsedArray), Name);

	  GV->setSection("llvm.metadata");
	}

	/// Emit the two "used" lists collected so far: LLVMUsed as "llvm.used" and
	/// LLVMCompilerUsed as "llvm.compiler.used". Each is skipped by emitUsed()
	/// when its list is empty.
	void CodeGenModule::emitLLVMUsed() {
	emitUsed(*this, "llvm.used", LLVMUsed);
	emitUsed(*this, "llvm.compiler.used", LLVMCompilerUsed);
	}

	/// Append \p Opts, wrapped as a single-string metadata node, to
	/// LinkerOptionsMetadata.
	void CodeGenModule::AppendLinkerOptions(StringRef Opts) {
	  llvm::LLVMContext &Ctx = getLLVMContext();
	  auto *OptString = llvm::MDString::get(Ctx, Opts);
	  auto *OptNode = llvm::MDNode::get(Ctx, OptString);
	  LinkerOptionsMetadata.push_back(OptNode);
	}

	/// Ask the target for its detect-mismatch linker option for the pair
	/// (\p Name, \p Value) and append it to LinkerOptionsMetadata as a
	/// single-string metadata node.
	void CodeGenModule::AddDetectMismatch(StringRef Name, StringRef Value) {
	  llvm::SmallString<32> OptionText;
	  getTargetCodeGenInfo().getDetectMismatchOption(Name, Value, OptionText);

	  llvm::LLVMContext &Ctx = getLLVMContext();
	  auto *OptString = llvm::MDString::get(Ctx, OptionText);
	  auto *OptNode = llvm::MDNode::get(Ctx, OptString);
	  LinkerOptionsMetadata.push_back(OptNode);
	}

	/// Ask the target for its dependent-library linker option for \p Lib and
	/// append it to LinkerOptionsMetadata as a single-string metadata node.
	void CodeGenModule::AddDependentLib(StringRef Lib) {
	  llvm::SmallString<24> OptionText;
	  getTargetCodeGenInfo().getDependentLibraryOption(Lib, OptionText);

	  llvm::LLVMContext &Ctx = getLLVMContext();
	  auto *OptString = llvm::MDString::get(Ctx, OptionText);
	  auto *OptNode = llvm::MDNode::get(Ctx, OptString);
	  LinkerOptionsMetadata.push_back(OptNode);
	}

	/// \brief Append to \p Metadata the link options implied by \p Mod and by the
	/// modules it depends on, walking them in postorder.
	///
	/// The parent module and every import are visited (at most once each, as
	/// tracked by \p Visited) before the libraries of \p Mod itself are added.
	/// Imports and libraries are both walked from last to first.
	static void addLinkOptionsPostorder(CodeGenModule &CGM, Module *Mod,
	                                    SmallVectorImpl<llvm::MDNode *> &Metadata,
	                                    llvm::SmallPtrSet<Module *, 16> &Visited) {
	  // Handle the enclosing module first.
	  Module *Parent = Mod->Parent;
	  if (Parent && Visited.insert(Parent).second)
	    addLinkOptionsPostorder(CGM, Parent, Metadata, Visited);

	  // Then every module this one imports, last import first.
	  for (unsigned Idx = Mod->Imports.size(); Idx != 0; --Idx) {
	    Module *Imported = Mod->Imports[Idx - 1];
	    if (Visited.insert(Imported).second)
	      addLinkOptionsPostorder(CGM, Imported, Metadata, Visited);
	  }

	  // Finally the libraries/frameworks this module itself asks to link against.
	  llvm::LLVMContext &Context = CGM.getLLVMContext();
	  for (unsigned Idx = Mod->LinkLibraries.size(); Idx != 0; --Idx) {
	    const auto &Lib = Mod->LinkLibraries[Idx - 1];

	    if (Lib.IsFramework) {
	      // Frameworks are currently Darwin only, so we don't have to ask
	      // TargetCodeGenInfo for the spelling of the linker option.
	      llvm::Metadata *Args[2] = {
	          llvm::MDString::get(Context, "-framework"),
	          llvm::MDString::get(Context, Lib.Library)};
	      Metadata.push_back(llvm::MDNode::get(Context, Args));
	    } else {
	      // A plain library: the target decides how the option is spelled.
	      llvm::SmallString<24> Opt;
	      CGM.getTargetCodeGenInfo().getDependentLibraryOption(Lib.Library, Opt);
	      auto *OptString = llvm::MDString::get(Context, Opt);
	      Metadata.push_back(llvm::MDNode::get(Context, OptString));
	    }
	  }
	}

	void CodeGenModule::EmitModuleLinkOptions() {
	// Collect the set of all of the modules we want to visit to emit link
	// options, which is essentially the imported modules and all of their
	// non-explicit child modules.
	llvm::SetVector<clang::Module *> LinkModules;
	llvm::SmallPtrSet<clang::Module *, 16> Visited;
	SmallVector<clang::Module *, 16> Stack;

	// Seed the stack with imported modules.
	for (Module *M : ImportedModules) {
	// Do not add any link flags when an implementation TU of a module imports
	// a header of that same module.
	if (M->getTopLevelModuleName() == getLangOpts().CurrentModule &&
	!getLangOpts().isCompilingModule())
	continue;
	if (Visited.insert(M).second)
	Stack.push_back(M);
	}

	// Find all of the modules to import, making a little effort to prune
	// non-leaf modules.
	while (!Stack.empty()) {
	clang::Module *Mod = Stack.pop_back_val();

	bool AnyChildren = false;

	// Visit the submodules of this module.
	for (clang::Module::submodule_iterator Sub = Mod->submodule_begin(),
	SubEnd = Mod->submodule_end();
	Sub != SubEnd; ++Sub) {
	// Skip explicit children; they need to be explicitly imported to be
	// linked against.
	if ((*Sub)->IsExplicit)
	continue;

	if (Visited.insert(*Sub).second) {
	Stack.push_back(*Sub);
	AnyChildren = true;
	}
	}

	// We didn't find any children, so add this module to the list of
	// modules to link against.
	if (!AnyChildren) {
	LinkModules.insert(Mod);
	}
	}

	// Add link options for all of the imported modules in reverse topological
	// order. We don't do anything to try to order import link flags with respect
	// to linker options inserted by things like #pragma comment().
	SmallVector<llvm::MDNode *, 16> MetadataArgs;
	Visited.clear();
	for (Module *M : LinkModules)
	if (Visited.insert(M).second)
	addLinkOptionsPostorder(*this, M, MetadataArgs, Visited);
	std::reverse(MetadataArgs.begin(), MetadataArgs.end());
	LinkerOptionsMetadata.append(MetadataArgs.begin(), MetadataArgs.end());

	// Add the linker options metadata flag.
	auto *NMD = getModule().getOrInsertNamedMetadata("llvm.linker.options");
	for (auto *MD : LinkerOptionsMetadata)
	NMD->addOperand(MD);
	}

	/// Emit every deferred vtable and every declaration queued in
	/// DeferredDeclsToEmit.
	///
	/// Emitting a definition may queue further work; that work is drained by a
	/// recursive call immediately after the definition that caused it, so related
	/// declarations are emitted depth-first.
	void CodeGenModule::EmitDeferred() {
	  // Emit code for any potentially referenced deferred decls. Since a
	  // previously unused static decl may become used during the generation of code
	  // for a static function, iterate until no changes are made.

	  if (!DeferredVTables.empty()) {
	    EmitDeferredVTables();

	    // Emitting a vtable doesn't directly cause more vtables to
	    // become deferred, although it can cause functions to be
	    // emitted that then need those vtables.
	    assert(DeferredVTables.empty());
	  }

	  // Stop if we're out of both deferred vtables and deferred declarations.
	  if (DeferredDeclsToEmit.empty())
	    return;

	  // Grab the list of decls to emit. If EmitGlobalDefinition schedules more
	  // work, it will not interfere with this.
	  std::vector<GlobalDecl> CurDeclsToEmit;
	  CurDeclsToEmit.swap(DeferredDeclsToEmit);

	  for (GlobalDecl &D : CurDeclsToEmit) {
	    // We should call GetAddrOfGlobal with IsForDefinition set to true in order
	    // to get GlobalValue with exactly the type we need, not something that
	    // might have been created for another decl with the same mangled name but
	    // different type.
	    llvm::GlobalValue *GV = dyn_cast<llvm::GlobalValue>(
	        GetAddrOfGlobal(D, ForDefinition));

	    // In case of different address spaces, we may still get a cast, even with
	    // IsForDefinition equal to true. Query mangled names table to get
	    // GlobalValue.
	    if (!GV)
	      GV = GetGlobalValue(getMangledName(D));

	    // Make sure GetGlobalValue returned non-null.
	    assert(GV);

	    // Check to see if we've already emitted this. This is necessary
	    // for a couple of reasons: first, decls can end up in the
	    // deferred-decls queue multiple times, and second, decls can end
	    // up with definitions in unusual ways (e.g. by an extern inline
	    // function acquiring a strong function redefinition). Just
	    // ignore these cases.
	    if (!GV->isDeclaration())
	      continue;

	    // Otherwise, emit the definition and move on to the next one.
	    EmitGlobalDefinition(D, GV);

	    // If we found out that we need to emit more decls, do that recursively.
	    // This has the advantage that the decls are emitted in a DFS and related
	    // ones are close together, which is convenient for testing.
	    if (!DeferredVTables.empty() || !DeferredDeclsToEmit.empty()) {
	      EmitDeferred();
	      assert(DeferredVTables.empty() && DeferredDeclsToEmit.empty());
	    }
	  }
	}

	/// Generate class data for every queued external vtable that the C++ ABI says
	/// can be speculatively emitted, then clear the queue.
	void CodeGenModule::EmitVTablesOpportunistically() {
	  // Try to emit external vtables as available_externally if they have emitted
	  // all inlined virtual functions. It runs after EmitDeferred() and therefore
	  // is not allowed to create new references to things that need to be emitted
	  // lazily. Note that it also relies on the fact that we eagerly emit RTTI.

	  assert((OpportunisticVTables.empty() || shouldOpportunisticallyEmitVTables())
	         && "Only emit opportunistic vtables with optimizations");

	  for (const CXXRecordDecl *RD : OpportunisticVTables) {
	    assert(getVTables().isVTableExternal(RD) &&
	           "This queue should only contain external vtables");
	    if (getCXXABI().canSpeculativelyEmitVTable(RD))
	      VTables.GenerateClassData(RD);
	  }
	  OpportunisticVTables.clear();
	}

	void CodeGenModule::EmitGlobalAnnotations() {
	if (Annotations.empty())
	return;

	// Create a new global variable for the ConstantStruct in the Module.
	llvm::Constant *Array = llvm::ConstantArray::get(llvm::ArrayType::get(
	Annotations[0]->getType(), Annotations.size()), Annotations);
	auto *gv = new llvm::GlobalVariable(getModule(), Array->getType(), false,
	llvm::GlobalValue::AppendingLinkage,
	Array, "llvm.global.annotations");
	gv->setSection(AnnotationSection);
	}

	/// Return the global holding the string \p Str for use in annotations,
	/// creating it on first request and caching it in AnnotationStrings.
	///
	/// A newly created global is a private constant named ".str", placed in
	/// AnnotationSection and marked unnamed_addr.
	llvm::Constant *CodeGenModule::EmitAnnotationString(StringRef Str) {
	  llvm::Constant *&Cached = AnnotationStrings[Str];
	  if (!Cached) {
	    // Not found yet, create a new global.
	    llvm::Constant *Init =
	        llvm::ConstantDataArray::getString(getLLVMContext(), Str);
	    auto *StrGV = new llvm::GlobalVariable(
	        getModule(), Init->getType(), true,
	        llvm::GlobalValue::PrivateLinkage, Init, ".str");
	    StrGV->setSection(AnnotationSection);
	    StrGV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
	    Cached = StrGV;
	  }
	  return Cached;
	}

	/// Return the annotation string global naming the file of \p Loc: the
	/// presumed filename when the presumed location is valid, otherwise the name
	/// of the buffer containing \p Loc.
	llvm::Constant *CodeGenModule::EmitAnnotationUnit(SourceLocation Loc) {
	  SourceManager &SrcMgr = getContext().getSourceManager();
	  PresumedLoc Presumed = SrcMgr.getPresumedLoc(Loc);
	  if (!Presumed.isValid())
	    return EmitAnnotationString(SrcMgr.getBufferName(Loc));
	  return EmitAnnotationString(Presumed.getFilename());
	}

	/// Return an Int32Ty constant holding the line number of \p L: the presumed
	/// line when the presumed location is valid, otherwise the expansion line
	/// number.
	llvm::Constant *CodeGenModule::EmitAnnotationLineNo(SourceLocation L) {
	  SourceManager &SrcMgr = getContext().getSourceManager();
	  PresumedLoc Presumed = SrcMgr.getPresumedLoc(L);

	  unsigned LineNo;
	  if (Presumed.isValid())
	    LineNo = Presumed.getLine();
	  else
	    LineNo = SrcMgr.getExpansionLineNumber(L);

	  return llvm::ConstantInt::get(Int32Ty, LineNo);
	}

	/// Build the anonymous constant struct describing one annotation on \p GV.
	///
	/// The struct has four fields: \p GV, the annotation string, and the file
	/// name (each bitcast to Int8PtrTy), followed by the line number constant.
	///
	/// \param GV the annotated global.
	/// \param AA the annotate attribute supplying the annotation text.
	/// \param L  the location used for the file name and line number fields.
	llvm::Constant *CodeGenModule::EmitAnnotateAttr(llvm::GlobalValue *GV,
	                                                const AnnotateAttr *AA,
	                                                SourceLocation L) {
	  // Get the globals for file name, annotation, and the line number.
	  llvm::Constant *AnnoGV = EmitAnnotationString(AA->getAnnotation()),
	                 *UnitGV = EmitAnnotationUnit(L),
	                 *LineNoCst = EmitAnnotationLineNo(L);

	  // Create the ConstantStruct for the global annotation.
	  llvm::Constant *Fields[4] = {
	    llvm::ConstantExpr::getBitCast(GV, Int8PtrTy),
	    llvm::ConstantExpr::getBitCast(AnnoGV, Int8PtrTy),
	    llvm::ConstantExpr::getBitCast(UnitGV, Int8PtrTy),
	    LineNoCst
	  };
	  return llvm::ConstantStruct::getAnon(Fields);
	}

	/// Append one annotation entry to Annotations for every AnnotateAttr on
	/// \p D, each describing \p GV at the location of \p D. \p D must carry at
	/// least one annotate attribute (asserted).
	void CodeGenModule::AddGlobalAnnotations(const ValueDecl *D,
	                                         llvm::GlobalValue *GV) {
	  assert(D->hasAttr<AnnotateAttr>() && "no annotate attribute");
	  // Get the struct elements for these annotations.
	  for (const auto *Annotate : D->specific_attrs<AnnotateAttr>()) {
	    Annotations.push_back(EmitAnnotateAttr(GV, Annotate, D->getLocation()));
	  }
	}

	/// Return true if the function \p Fn is blacklisted for the sanitizers in
	/// \p Kind.
	///
	/// The function name is checked first. If it is not blacklisted, the result
	/// is decided by \p Loc when that is valid, and otherwise by the main file's
	/// name (false when the main file has no file entry).
	bool CodeGenModule::isInSanitizerBlacklist(SanitizerMask Kind,
	                                           llvm::Function *Fn,
	                                           SourceLocation Loc) const {
	  const auto &Blacklist = getContext().getSanitizerBlacklist();

	  // Blacklist by function name.
	  if (Blacklist.isBlacklistedFunction(Kind, Fn->getName()))
	    return true;

	  // Blacklist by location.
	  if (Loc.isValid())
	    return Blacklist.isBlacklistedLocation(Kind, Loc);

	  // If location is unknown, this may be a compiler-generated function. Assume
	  // it's located in the main file.
	  auto &SrcMgr = Context.getSourceManager();
	  const auto *MainFile = SrcMgr.getFileEntryForID(SrcMgr.getMainFileID());
	  if (!MainFile)
	    return false;
	  return Blacklist.isBlacklistedFile(Kind, MainFile->getName());
	}

	/// Return true if the global \p GV is blacklisted for the enabled
	/// address-sanitizer variants (Address, KernelAddress, HWAddress).
	///
	/// Returns false immediately when none of those sanitizers is enabled.
	/// Otherwise the global is blacklisted if its name, its location \p Loc, or
	/// (for record types, after stripping array layers) its type name matches
	/// the blacklist for \p Category.
	bool CodeGenModule::isInSanitizerBlacklist(llvm::GlobalVariable *GV,
	                                           SourceLocation Loc, QualType Ty,
	                                           StringRef Category) const {
	  // For now globals can be blacklisted only in ASan and KASan.
	  const SanitizerMask EnabledAsanMask = LangOpts.Sanitize.Mask &
	      (SanitizerKind::Address | SanitizerKind::KernelAddress | SanitizerKind::HWAddress);
	  if (!EnabledAsanMask)
	    return false;
	  const auto &SanitizerBL = getContext().getSanitizerBlacklist();
	  if (SanitizerBL.isBlacklistedGlobal(EnabledAsanMask, GV->getName(), Category))
	    return true;
	  if (SanitizerBL.isBlacklistedLocation(EnabledAsanMask, Loc, Category))
	    return true;
	  // Check global type.
	  if (!Ty.isNull()) {
	    // Drill down the array types: if global variable of a fixed type is
	    // blacklisted, we also don't instrument arrays of them.
	    while (auto AT = dyn_cast<ArrayType>(Ty.getTypePtr()))
	      Ty = AT->getElementType();
	    Ty = Ty.getCanonicalType().getUnqualifiedType();
	    // We allow to blacklist only record types (classes, structs etc.)
	    if (Ty->isRecordType()) {
	      std::string TypeStr = Ty.getAsString(getContext().getPrintingPolicy());
	      if (SanitizerBL.isBlacklistedType(EnabledAsanMask, TypeStr, Category))
	        return true;
	    }
	  }
	  return false;
	}

	/// Apply the XRay filter's decision for \p Fn as function attributes.
	///
	/// The filter is consulted by location (when \p Loc is valid) and, if that
	/// yields no decision, by function name.
	///
	/// \returns false when XRay instrumentation is disabled or the filter has no
	/// decision for this function; true when attributes were added.
	bool CodeGenModule::imbueXRayAttrs(llvm::Function *Fn, SourceLocation Loc,
	                                   StringRef Category) const {
	  if (!LangOpts.XRayInstrument)
	    return false;

	  using ImbueAttr = XRayFunctionFilter::ImbueAttribute;
	  const auto &Filter = getContext().getXRayFilter();

	  ImbueAttr Decision = ImbueAttr::NONE;
	  if (Loc.isValid())
	    Decision = Filter.shouldImbueLocation(Loc, Category);
	  if (Decision == ImbueAttr::NONE)
	    Decision = Filter.shouldImbueFunction(Fn->getName());

	  switch (Decision) {
	  case ImbueAttr::NONE:
	    return false;
	  case ImbueAttr::ALWAYS:
	    Fn->addFnAttr("function-instrument", "xray-always");
	    break;
	  case ImbueAttr::ALWAYS_ARG1:
	    Fn->addFnAttr("function-instrument", "xray-always");
	    Fn->addFnAttr("xray-log-args", "1");
	    break;
	  case ImbueAttr::NEVER:
	    Fn->addFnAttr("function-instrument", "xray-never");
	    break;
	  }
	  return true;
	}

	/// Return true if \p Global may not be deferred: either EmitAllDecls is set
	/// (nothing is ever deferred) or the AST context says the declaration must
	/// be emitted.
	bool CodeGenModule::MustBeEmitted(const ValueDecl *Global) {
	  return LangOpts.EmitAllDecls || getContext().DeclMustBeEmitted(Global);
	}

	/// Return false for declarations whose emission has to wait, true otherwise.
	///
	/// Emission is delayed for implicit template instantiations of functions,
	/// for inline variables whose definition kind is WeakUnknown, and for any
	/// variable when OpenMP with TLS-based threadprivates is enabled on a target
	/// that supports TLS.
	bool CodeGenModule::MayBeEmittedEagerly(const ValueDecl *Global) {
	  if (const auto *FD = dyn_cast<FunctionDecl>(Global)) {
	    // Implicit template instantiations may change linkage if they are later
	    // explicitly instantiated, so they should not be emitted eagerly.
	    if (FD->getTemplateSpecializationKind() == TSK_ImplicitInstantiation)
	      return false;
	  }

	  if (const auto *VD = dyn_cast<VarDecl>(Global)) {
	    // A definition of an inline constexpr static data member may change
	    // linkage later if it's redeclared outside the class.
	    if (Context.getInlineVariableDefinitionKind(VD) ==
	        ASTContext::InlineVariableDefinitionKind::WeakUnknown)
	      return false;
	  }

	  // If OpenMP is enabled and threadprivates must be generated like TLS, delay
	  // codegen for global variables, because they may be marked as threadprivate.
	  if (LangOpts.OpenMP && LangOpts.OpenMPUseTLS &&
	      getContext().getTargetInfo().isTLSSupported() && isa<VarDecl>(Global)) {
	    return false;
	  }

	  return true;
	}

	/// Return the address of the global describing the UUID named by \p E,
	/// creating it if the module does not contain it yet.
	///
	/// The global is named "_GUID_" followed by the lower-cased UUID with '-'
	/// replaced by '_'. A newly created global is a linkonce_odr constant and is
	/// put in a COMDAT of its own name when COMDATs are supported. The returned
	/// address uses pointer alignment.
	ConstantAddress CodeGenModule::GetAddrOfUuidDescriptor(
	    const CXXUuidofExpr* E) {
	  // Sema has verified that IIDSource has a __declspec(uuid()), and that it's
	  // well-formed.
	  StringRef Uuid = E->getUuidStr();
	  std::string Name = "_GUID_" + Uuid.lower();
	  std::replace(Name.begin(), Name.end(), '-', '_');

	  // The UUID descriptor should be pointer aligned.
	  CharUnits Alignment = CharUnits::fromQuantity(PointerAlignInBytes);

	  // Look for an existing global.
	  if (llvm::GlobalVariable *GV = getModule().getNamedGlobal(Name))
	    return ConstantAddress(GV, Alignment);

	  llvm::Constant *Init = EmitUuidofInitializer(Uuid);
	  assert(Init && "failed to initialize as constant");

	  auto *GV = new llvm::GlobalVariable(
	      getModule(), Init->getType(),
	      /*isConstant=*/true, llvm::GlobalValue::LinkOnceODRLinkage, Init, Name);
	  if (supportsCOMDAT())
	    GV->setComdat(TheModule.getOrInsertComdat(GV->getName()));
	  return ConstantAddress(GV, Alignment);
	}

	/// Return the address to use for a reference to \p VD, which must carry an
	/// AliasAttr (asserted).
	///
	/// If the module already has a global with the aliasee's name, that global
	/// is returned bitcast to a pointer to \p VD's type. Otherwise a function or
	/// variable with the aliasee's name is created, given extern_weak linkage
	/// and recorded in WeakRefReferences.
	ConstantAddress CodeGenModule::GetWeakRefReference(const ValueDecl *VD) {
	  const AliasAttr *AA = VD->getAttr<AliasAttr>();
	  assert(AA && "No alias?");

	  CharUnits Alignment = getContext().getDeclAlign(VD);
	  llvm::Type *DeclTy = getTypes().ConvertTypeForMem(VD->getType());

	  // See if there is already something with the target's name in the module.
	  llvm::GlobalValue *Entry = GetGlobalValue(AA->getAliasee());
	  if (Entry) {
	    unsigned AS = getContext().getTargetAddressSpace(VD->getType());
	    auto Ptr = llvm::ConstantExpr::getBitCast(Entry, DeclTy->getPointerTo(AS));
	    return ConstantAddress(Ptr, Alignment);
	  }

	  llvm::Constant *Aliasee;
	  if (isa<llvm::FunctionType>(DeclTy))
	    Aliasee = GetOrCreateLLVMFunction(AA->getAliasee(), DeclTy,
	                                      GlobalDecl(cast<FunctionDecl>(VD)),
	                                      /*ForVTable=*/false);
	  else
	    Aliasee = GetOrCreateLLVMGlobal(AA->getAliasee(),
	                                    llvm::PointerType::getUnqual(DeclTy),
	                                    nullptr);

	  // Remember the freshly created declaration as a weak reference.
	  auto *F = cast<llvm::GlobalValue>(Aliasee);
	  F->setLinkage(llvm::Function::ExternalWeakLinkage);
	  WeakRefReferences.insert(F);

	  return ConstantAddress(Aliasee, Alignment);
	}

	/// Decide what to do with the global declaration \p GD: ignore it, emit it
	/// now, or defer it until it is used.
	///
	/// Weak references produce nothing; alias and ifunc definitions are emitted
	/// immediately. CUDA and OpenMP get a chance to filter the declaration.
	/// Declarations without a body/definition are normally skipped. A definition
	/// is emitted eagerly when it must be emitted and may be emitted eagerly;
	/// otherwise it is queued in DeferredDeclsToEmit (already referenced, or
	/// must be emitted) or remembered in DeferredDecls under its mangled name.
	void CodeGenModule::EmitGlobal(GlobalDecl GD) {
	  const auto *Global = cast<ValueDecl>(GD.getDecl());

	  // Weak references don't produce any output by themselves.
	  if (Global->hasAttr<WeakRefAttr>())
	    return;

	  // If this is an alias definition (which otherwise looks like a declaration)
	  // emit it now.
	  if (Global->hasAttr<AliasAttr>())
	    return EmitAliasDefinition(GD);

	  // An IFunc is like an alias whose value is resolved at runtime by calling
	  // a resolver.
	  if (Global->hasAttr<IFuncAttr>())
	    return emitIFuncDefinition(GD);

	  // If this is CUDA, be selective about which declarations we emit.
	  if (LangOpts.CUDA) {
	    if (LangOpts.CUDAIsDevice) {
	      if (!Global->hasAttr<CUDADeviceAttr>() &&
	          !Global->hasAttr<CUDAGlobalAttr>() &&
	          !Global->hasAttr<CUDAConstantAttr>() &&
	          !Global->hasAttr<CUDASharedAttr>())
	        return;
	    } else {
	      // We need to emit host-side 'shadows' for all global
	      // device-side variables because the CUDA runtime needs their
	      // size and host-side address in order to provide access to
	      // their device-side incarnations.

	      // So device-only functions are the only things we skip.
	      if (isa<FunctionDecl>(Global) && !Global->hasAttr<CUDAHostAttr>() &&
	          Global->hasAttr<CUDADeviceAttr>())
	        return;

	      assert((isa<FunctionDecl>(Global) || isa<VarDecl>(Global)) &&
	             "Expected Variable or Function");
	    }
	  }

	  if (LangOpts.OpenMP) {
	    // If this is OpenMP device, check if it is legal to emit this global
	    // normally.
	    if (OpenMPRuntime && OpenMPRuntime->emitTargetGlobal(GD))
	      return;
	    if (auto *DRD = dyn_cast<OMPDeclareReductionDecl>(Global)) {
	      if (MustBeEmitted(Global))
	        EmitOMPDeclareReduction(DRD);
	      return;
	    }
	  }

	  // Ignore declarations, they will be emitted on their first use.
	  if (const auto *FD = dyn_cast<FunctionDecl>(Global)) {
	    // Forward declarations are emitted lazily on first use.
	    if (!FD->doesThisDeclarationHaveABody()) {
	      if (!FD->doesDeclarationForceExternallyVisibleDefinition())
	        return;

	      StringRef MangledName = getMangledName(GD);

	      // Compute the function info and LLVM type.
	      const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
	      llvm::Type *Ty = getTypes().GetFunctionType(FI);

	      GetOrCreateLLVMFunction(MangledName, Ty, GD, /*ForVTable=*/false,
	                              /*DontDefer=*/false);
	      return;
	    }
	  } else {
	    const auto *VD = cast<VarDecl>(Global);
	    assert(VD->isFileVarDecl() && "Cannot emit local var decl as global.");
	    // We need to emit device-side global CUDA variables even if a
	    // variable does not have a definition -- we still need to define
	    // host-side shadow for it.
	    bool MustEmitForCuda = LangOpts.CUDA && !LangOpts.CUDAIsDevice &&
	                           !VD->hasDefinition() &&
	                           (VD->hasAttr<CUDAConstantAttr>() ||
	                            VD->hasAttr<CUDADeviceAttr>());
	    if (!MustEmitForCuda &&
	        VD->isThisDeclarationADefinition() != VarDecl::Definition &&
	        !Context.isMSStaticDataMemberInlineDefinition(VD)) {
	      // If this declaration may have caused an inline variable definition to
	      // change linkage, make sure that it's emitted.
	      if (Context.getInlineVariableDefinitionKind(VD) ==
	          ASTContext::InlineVariableDefinitionKind::Strong)
	        GetAddrOfGlobalVar(VD);
	      return;
	    }
	  }

	  // Defer code generation to first use when possible, e.g. if this is an inline
	  // function. If the global must always be emitted, do it eagerly if possible
	  // to benefit from cache locality.
	  if (MustBeEmitted(Global) && MayBeEmittedEagerly(Global)) {
	    // Emit the definition if it can't be deferred.
	    EmitGlobalDefinition(GD);
	    return;
	  }

	  // If we're deferring emission of a C++ variable with an
	  // initializer, remember the order in which it appeared in the file.
	  if (getLangOpts().CPlusPlus && isa<VarDecl>(Global) &&
	      cast<VarDecl>(Global)->hasInit()) {
	    DelayedCXXInitPosition[Global] = CXXGlobalInits.size();
	    CXXGlobalInits.push_back(nullptr);
	  }

	  StringRef MangledName = getMangledName(GD);
	  if (GetGlobalValue(MangledName) != nullptr) {
	    // The value has already been used and should therefore be emitted.
	    addDeferredDeclToEmit(GD);
	  } else if (MustBeEmitted(Global)) {
	    // The value must be emitted, but cannot be emitted eagerly.
	    assert(!MayBeEmittedEagerly(Global));
	    addDeferredDeclToEmit(GD);
	  } else {
	    // Otherwise, remember that we saw a deferred decl with this name. The
	    // first use of the mangled name will cause it to move into
	    // DeferredDeclsToEmit.
	    DeferredDecls[MangledName] = GD;
	  }
	}

	// Return true if T (or, for arrays, its base element type) is a C++ class
	// type that has a destructor which is not marked dllimport.
	static bool HasNonDllImportDtor(QualType T) {
	  const auto *RT = T->getBaseElementTypeUnsafe()->getAs<RecordType>();
	  if (!RT)
	    return false;

	  CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(RT->getDecl());
	  if (!RD)
	    return false;

	  const auto *Dtor = RD->getDestructor();
	  return Dtor && !Dtor->hasAttr<DLLImportAttr>();
	}

	namespace {
	// AST visitor that looks for a direct call resolving to the symbol Name,
	// either through an asm label on the callee or through a library builtin
	// spelled "__builtin_<Name>". Result is set to true, and the traversal
	// is stopped, as soon as such a call is found.
	struct FunctionIsDirectlyRecursive :
	  public RecursiveASTVisitor<FunctionIsDirectlyRecursive> {
	  const StringRef Name;
	  const Builtin::Context &BI;
	  bool Result;
	  FunctionIsDirectlyRecursive(StringRef N, const Builtin::Context &C) :
	    Name(N), BI(C), Result(false) {
	  }
	  typedef RecursiveASTVisitor<FunctionIsDirectlyRecursive> Base;

	  // Returns false (abort traversal) once a matching call has been seen,
	  // true otherwise.
	  bool TraverseCallExpr(CallExpr *E) {
	    const FunctionDecl *FD = E->getDirectCallee();
	    if (!FD)
	      return true;
	    // A callee whose asm label equals Name refers to the same symbol.
	    AsmLabelAttr *Attr = FD->getAttr<AsmLabelAttr>();
	    if (Attr && Name == Attr->getLabel()) {
	      Result = true;
	      return false;
	    }
	    // Otherwise only library builtins are of interest.
	    unsigned BuiltinID = FD->getBuiltinID();
	    if (!BuiltinID || !BI.isLibFunction(BuiltinID))
	      return true;
	    StringRef BuiltinName = BI.getName(BuiltinID);
	    if (BuiltinName.startswith("__builtin_") &&
	        Name == BuiltinName.slice(strlen("__builtin_"), StringRef::npos)) {
	      Result = true;
	      return false;
	    }
	    return true;
	  }
	};

	// Make sure we're not referencing non-imported vars or functions.
	//
	// Each Visit* method updates SafeToInline and returns it, so the traversal
	// stops at the first construct that makes inlining unsafe.
	struct DLLImportFunctionVisitor
	    : public RecursiveASTVisitor<DLLImportFunctionVisitor> {
	  bool SafeToInline = true;

	  bool shouldVisitImplicitCode() const { return true; }

	  bool VisitVarDecl(VarDecl *VD) {
	    if (VD->getTLSKind()) {
	      // A thread-local variable cannot be imported.
	      SafeToInline = false;
	      return SafeToInline;
	    }

	    // A variable definition might imply a destructor call.
	    if (VD->isThisDeclarationADefinition())
	      SafeToInline = !HasNonDllImportDtor(VD->getType());

	    return SafeToInline;
	  }

	  bool VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *E) {
	    // A temporary with a destructor is only safe if that destructor is
	    // dllimport.
	    if (const auto *D = E->getTemporary()->getDestructor())
	      SafeToInline = D->hasAttr<DLLImportAttr>();
	    return SafeToInline;
	  }

	  bool VisitDeclRefExpr(DeclRefExpr *E) {
	    // Referenced functions must be dllimport; referenced variables must be
	    // either non-global or dllimport.
	    ValueDecl *VD = E->getDecl();
	    if (isa<FunctionDecl>(VD))
	      SafeToInline = VD->hasAttr<DLLImportAttr>();
	    else if (VarDecl *V = dyn_cast<VarDecl>(VD))
	      SafeToInline = !V->hasGlobalStorage() || V->hasAttr<DLLImportAttr>();
	    return SafeToInline;
	  }

	  bool VisitCXXConstructExpr(CXXConstructExpr *E) {
	    SafeToInline = E->getConstructor()->hasAttr<DLLImportAttr>();
	    return SafeToInline;
	  }

	  bool VisitCXXMemberCallExpr(CXXMemberCallExpr *E) {
	    CXXMethodDecl *M = E->getMethodDecl();
	    if (!M) {
	      // Call through a pointer to member function. This is safe to inline.
	      SafeToInline = true;
	    } else {
	      SafeToInline = M->hasAttr<DLLImportAttr>();
	    }
	    return SafeToInline;
	  }

	  bool VisitCXXDeleteExpr(CXXDeleteExpr *E) {
	    SafeToInline = E->getOperatorDelete()->hasAttr<DLLImportAttr>();
	    return SafeToInline;
	  }

	  bool VisitCXXNewExpr(CXXNewExpr *E) {
	    SafeToInline = E->getOperatorNew()->hasAttr<DLLImportAttr>();
	    return SafeToInline;
	  }
	};
	}

	// isTriviallyRecursive - Check if this function calls another
	// decl that, because of the asm attribute or the other decl being a builtin,
	// ends up pointing to itself.
	bool
	CodeGenModule::isTriviallyRecursive(const FunctionDecl *FD) {
	StringRef Name;
	if (getCXXABI().getMangleContext().shouldMangleDeclName(FD)) {
	// asm labels are a special kind of mangling we have to support.
	AsmLabelAttr *Attr = FD->getAttr<AsmLabelAttr>();
	if (!Attr)
	return false;
	Name = Attr->getLabel();
	} else {
	Name = FD->getName();
	}

	FunctionIsDirectlyRecursive Walker(Name, Context.BuiltinInfo);
	Walker.TraverseFunctionDecl(const_cast<FunctionDecl*>(FD));
	return Walker.Result;
	}

	/// Decide whether a body should be generated for the function \p GD.
	///
	/// Anything that is not available_externally is always emitted. An
	/// available_externally function is skipped at -O0 unless it is
	/// always_inline, skipped when it is dllimport but unsafe to inline, and
	/// skipped when it trivially calls itself.
	bool CodeGenModule::shouldEmitFunction(GlobalDecl GD) {
	  if (getFunctionLinkage(GD) != llvm::Function::AvailableExternallyLinkage)
	    return true;

	  const auto *Fn = cast<FunctionDecl>(GD.getDecl());
	  if (CodeGenOpts.OptimizationLevel == 0 && !Fn->hasAttr<AlwaysInlineAttr>())
	    return false;

	  if (Fn->hasAttr<DLLImportAttr>()) {
	    // Check whether it would be safe to inline this dllimport function.
	    DLLImportFunctionVisitor Checker;
	    Checker.TraverseFunctionDecl(const_cast<FunctionDecl*>(Fn));
	    if (!Checker.SafeToInline)
	      return false;

	    if (const CXXDestructorDecl *Dtor = dyn_cast<CXXDestructorDecl>(Fn)) {
	      // Implicit destructor invocations aren't captured in the AST, so the
	      // check above can't see them. Check fields and bases manually here.
	      for (const Decl *Member : Dtor->getParent()->decls()) {
	        if (const auto *Field = dyn_cast<FieldDecl>(Member)) {
	          if (HasNonDllImportDtor(Field->getType()))
	            return false;
	        }
	      }
	      for (const CXXBaseSpecifier &Base : Dtor->getParent()->bases()) {
	        if (HasNonDllImportDtor(Base.getType()))
	          return false;
	      }
	    }
	  }

	  // PR9614. Avoid cases where the source code is lying to us. An available
	  // externally function should have an equivalent function somewhere else,
	  // but a function that calls itself is clearly not equivalent to the real
	  // implementation.
	  // This happens in glibc's btowc and in some configure checks.
	  if (isTriviallyRecursive(Fn))
	    return false;
	  return true;
	}

	/// Return true when the optimization level is above 0; this is the condition
	/// EmitVTablesOpportunistically() asserts whenever its queue is non-empty.
	bool CodeGenModule::shouldOpportunisticallyEmitVTables() {
	return CodeGenOpts.OptimizationLevel > 0;
	}

	/// Emit the definition of \p GD, which must be a function or a variable.
	///
	/// Functions are skipped when shouldEmitFunction() rejects them. C++
	/// constructors and destructors are emitted through the ABI, other functions
	/// through EmitGlobalFunctionDefinition(); virtual methods additionally get
	/// their thunks emitted afterwards. \p GV is forwarded to
	/// EmitGlobalFunctionDefinition().
	void CodeGenModule::EmitGlobalDefinition(GlobalDecl GD, llvm::GlobalValue *GV) {
	  const auto *D = cast<ValueDecl>(GD.getDecl());

	  PrettyStackTraceDecl CrashInfo(const_cast<ValueDecl *>(D), D->getLocation(),
	                                 Context.getSourceManager(),
	                                 "Generating code for declaration");

	  if (!isa<FunctionDecl>(D)) {
	    if (const auto *VD = dyn_cast<VarDecl>(D)) {
	      EmitGlobalVarDefinition(VD, !VD->hasDefinition());
	      return;
	    }
	    llvm_unreachable("Invalid argument to EmitGlobalDefinition()");
	  }

	  // At -O0, don't generate IR for functions with available_externally
	  // linkage.
	  if (!shouldEmitFunction(GD))
	    return;

	  const auto *Method = dyn_cast<CXXMethodDecl>(D);
	  if (!Method) {
	    EmitGlobalFunctionDefinition(GD, GV);
	    return;
	  }

	  // Make sure to emit the definition(s) before we emit the thunks.
	  // This is necessary for the generation of certain thunks.
	  if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(Method)) {
	    ABI->emitCXXStructor(Ctor, getFromCtorType(GD.getCtorType()));
	  } else if (const auto *Dtor = dyn_cast<CXXDestructorDecl>(Method)) {
	    ABI->emitCXXStructor(Dtor, getFromDtorType(GD.getDtorType()));
	  } else {
	    EmitGlobalFunctionDefinition(GD, GV);
	  }

	  if (Method->isVirtual())
	    getVTables().EmitThunks(GD);
	}

	static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old,
	llvm::Function *NewFn);

	/// GetOrCreateLLVMFunction - If the specified mangled name is not in the
	/// module, create and return an llvm Function with the specified type. If there
	/// is something in the module with the specified name, return it potentially
	/// bitcasted to the right type.
	///
	/// If D is non-null, it specifies a decl that corresponds to this. This is used
	/// to set the attributes on the function when it is first created.
	llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction(
	StringRef MangledName, llvm::Type *Ty, GlobalDecl GD, bool ForVTable,
	bool DontDefer, bool IsThunk, llvm::AttributeList ExtraAttrs,
	ForDefinition_t IsForDefinition) {
	const Decl *D = GD.getDecl();

	// Lookup the entry, lazily creating it if necessary.
	llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
	if (Entry) {
	if (WeakRefReferences.erase(Entry)) {
	const FunctionDecl *FD = cast_or_null<FunctionDecl>(D);
	if (FD && !FD->hasAttr<WeakAttr>())
	Entry->setLinkage(llvm::Function::ExternalLinkage);
	}

	// Handle dropped DLL attributes.
	if (D && !D->hasAttr<DLLImportAttr>() && !D->hasAttr<DLLExportAttr>())
	Entry->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);

	// If there are two attempts to define the same mangled name, issue an
	// error.
	if (IsForDefinition && !Entry->isDeclaration()) {
	GlobalDecl OtherGD;
	// Check that GD is not yet in DiagnosedConflictingDefinitions is required
	// to make sure that we issue an error only once.
	if (lookupRepresentativeDecl(MangledName, OtherGD) &&
	(GD.getCanonicalDecl().getDecl() !=
	OtherGD.getCanonicalDecl().getDecl()) &&
	DiagnosedConflictingDefinitions.insert(GD).second) {
	getDiags().Report(D->getLocation(),
	diag::err_duplicate_mangled_name);
	getDiags().Report(OtherGD.getDecl()->getLocation(),
	diag::note_previous_definition);
	}
	}

	if ((isa<llvm::Function>(Entry) \|\| isa<llvm::GlobalAlias>(Entry)) &&
	(Entry->getType()->getElementType() == Ty)) {
	return Entry;
	}

	// Make sure the result is of the correct type.
	// (If function is requested for a definition, we always need to create a new
	// function, not just return a bitcast.)
	if (!IsForDefinition)
	return llvm::ConstantExpr::getBitCast(Entry, Ty->getPointerTo());
	}

	// This function doesn't have a complete type (for example, the return
	// type is an incomplete struct). Use a fake type instead, and make
	// sure not to try to set attributes.
	bool IsIncompleteFunction = false;

	llvm::FunctionType *FTy;
	if (isa<llvm::FunctionType>(Ty)) {
	FTy = cast<llvm::FunctionType>(Ty);
	} else {
	FTy = llvm::FunctionType::get(VoidTy, false);
	IsIncompleteFunction = true;
	}

	llvm::Function *F =
	llvm::Function::Create(FTy, llvm::Function::ExternalLinkage,
	Entry ? StringRef() : MangledName, &getModule());

	// If we already created a function with the same mangled name (but different
	// type) before, take its name and add it to the list of functions to be
	// replaced with F at the end of CodeGen.
	//
	// This happens if there is a prototype for a function (e.g. "int f()") and
	// then a definition of a different type (e.g. "int f(int x)").
	if (Entry) {
	F->takeName(Entry);

	// This might be an implementation of a function without a prototype, in
	// which case, try to do special replacement of calls which match the new
	// prototype. The really key thing here is that we also potentially drop
	// arguments from the call site so as to make a direct call, which makes the
	// inliner happier and suppresses a number of optimizer warnings (!) about
	// dropping arguments.
	if (!Entry->use_empty()) {
	ReplaceUsesOfNonProtoTypeWithRealFunction(Entry, F);
	Entry->removeDeadConstantUsers();
	}

	llvm::Constant *BC = llvm::ConstantExpr::getBitCast(
	F, Entry->getType()->getElementType()->getPointerTo());
	addGlobalValReplacement(Entry, BC);
	}

	assert(F->getName() == MangledName && "name was uniqued!");
	if (D)
	SetFunctionAttributes(GD, F, IsIncompleteFunction, IsThunk,
	IsForDefinition);
	if (ExtraAttrs.hasAttributes(llvm::AttributeList::FunctionIndex)) {
	llvm::AttrBuilder B(ExtraAttrs, llvm::AttributeList::FunctionIndex);
	F->addAttributes(llvm::AttributeList::FunctionIndex, B);
	}

	if (!DontDefer) {
	// All MSVC dtors other than the base dtor are linkonce_odr and delegate to
	// each other bottoming out with the base dtor. Therefore we emit non-base
	// dtors on usage, even if there is no dtor definition in the TU.
	if (D && isa<CXXDestructorDecl>(D) &&
	getCXXABI().useThunkForDtorVariant(cast<CXXDestructorDecl>(D),
	GD.getDtorType()))
	addDeferredDeclToEmit(GD);

	// This is the first use or definition of a mangled name. If there is a
	// deferred decl with this name, remember that we need to emit it at the end
	// of the file.
	auto DDI = DeferredDecls.find(MangledName);
	if (DDI != DeferredDecls.end()) {
	// Move the potentially referenced deferred decl to the
	// DeferredDeclsToEmit list, and remove it from DeferredDecls (since we
	// don't need it anymore).
	addDeferredDeclToEmit(DDI->second);
	DeferredDecls.erase(DDI);

	// Otherwise, there are cases we have to worry about where we're
	// using a declaration for which we must emit a definition but where
	// we might not find a top-level definition:
	// - member functions defined inline in their classes
	// - friend functions defined inline in some class
	// - special member functions with implicit definitions
	// If we ever change our AST traversal to walk into class methods,
	// this will be unnecessary.
	//
	// We also don't emit a definition for a function if it's going to be an
	// entry in a vtable, unless it's already marked as used.
	} else if (getLangOpts().CPlusPlus && D) {
	// Look for a declaration that's lexically in a record.
	for (const auto *FD = cast<FunctionDecl>(D)->getMostRecentDecl(); FD;
	FD = FD->getPreviousDecl()) {
	if (isa<CXXRecordDecl>(FD->getLexicalDeclContext())) {
	if (FD->doesThisDeclarationHaveABody()) {
	addDeferredDeclToEmit(GD.getWithDecl(FD));
	break;
	}
	}
	}
	}
	}

	// Make sure the result is of the requested type.
	if (!IsIncompleteFunction) {
	assert(F->getType()->getElementType() == Ty);
	return F;
	}

	llvm::Type *PTy = llvm::PointerType::getUnqual(Ty);
	return llvm::ConstantExpr::getBitCast(F, PTy);
	}

	/// GetAddrOfFunction - Return the address of the function denoted by \p GD.
	///
	/// If \p Ty is null, the LLVM type is derived from the canonical AST type of
	/// the declaration; otherwise \p Ty is used as the requested type (this occurs
	/// when we see a definition of the function). \p ForVTable, \p DontDefer and
	/// \p IsForDefinition are forwarded unchanged to GetOrCreateLLVMFunction.
	llvm::Constant *
	CodeGenModule::GetAddrOfFunction(GlobalDecl GD, llvm::Type *Ty, bool ForVTable,
	                                 bool DontDefer,
	                                 ForDefinition_t IsForDefinition) {
	  // If there was no specific requested type, just convert it now.
	  if (!Ty) {
	    const auto *FD = cast<FunctionDecl>(GD.getDecl());
	    auto CanonTy = Context.getCanonicalType(FD->getType());
	    Ty = getTypes().ConvertFunctionType(CanonTy, FD);
	  }

	  StringRef MangledName = getMangledName(GD);
	  return GetOrCreateLLVMFunction(MangledName, Ty, GD, ForVTable, DontDefer,
	                                 /*IsThunk=*/false, llvm::AttributeList(),
	                                 IsForDefinition);
	}

	/// Find the FunctionDecl, if any, that declares the runtime function \p Name.
	///
	/// The translation unit scope is searched first. In C++ the namespaces
	/// "__cxxabiv1" and "std" are searched as well, including a namespace found by
	/// looking the namespace name up inside a linkage specification. Returns
	/// nullptr when no function declaration is found.
	static const FunctionDecl *
	GetRuntimeFunctionDecl(ASTContext &C, StringRef Name) {
	  TranslationUnitDecl *TUDecl = C.getTranslationUnitDecl();
	  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);

	  // First try a plain lookup of the name at translation-unit scope.
	  IdentifierInfo &CII = C.Idents.get(Name);
	  for (const auto &Result : DC->lookup(&CII))
	    if (const auto *FD = dyn_cast<FunctionDecl>(Result))
	      return FD;

	  // Outside of C++ there are no namespaces to search.
	  if (!C.getLangOpts().CPlusPlus)
	    return nullptr;

	  // Demangle the premangled name from getTerminateFn()
	  IdentifierInfo &CXXII =
	      (Name == "_ZSt9terminatev" || Name == "\01?terminate@@YAXXZ")
	          ? C.Idents.get("terminate")
	          : C.Idents.get(Name);

	  for (const auto &N : {"__cxxabiv1", "std"}) {
	    IdentifierInfo &NS = C.Idents.get(N);
	    for (const auto &Result : DC->lookup(&NS)) {
	      NamespaceDecl *ND = dyn_cast<NamespaceDecl>(Result);

	      // The lookup may yield a linkage specification instead of the namespace
	      // itself; in that case search inside it for the namespace.
	      if (auto *LSD = dyn_cast<LinkageSpecDecl>(Result)) {
	        for (const auto &Nested : LSD->lookup(&NS)) {
	          if ((ND = dyn_cast<NamespaceDecl>(Nested)))
	            break;
	        }
	      }

	      if (!ND)
	        continue;

	      for (const auto &Member : ND->lookup(&CXXII))
	        if (const auto *FD = dyn_cast<FunctionDecl>(Member))
	          return FD;
	    }
	  }

	  return nullptr;
	}

	/// CreateRuntimeFunction - Create a new runtime function with the specified
	/// type and name.
	///
	/// If the result is an llvm::Function that has no body yet, it is given the
	/// runtime calling convention. In addition, when \p Local is false, the target
	/// uses the COFF object format, LTOVisibilityPublicStd is not set and the
	/// environment is not Windows GNU, the function is marked dllimport with
	/// external linkage if either no declaration of it can be found or the
	/// declaration found carries DLLImportAttr.
	llvm::Constant *
	CodeGenModule::CreateRuntimeFunction(llvm::FunctionType *FTy, StringRef Name,
	                                     llvm::AttributeList ExtraAttrs,
	                                     bool Local) {
	  llvm::Constant *C =
	      GetOrCreateLLVMFunction(Name, FTy, GlobalDecl(), /*ForVTable=*/false,
	                              /*DontDefer=*/false, /*IsThunk=*/false,
	                              ExtraAttrs);

	  // Only touch functions that are still bodiless; the result may also be a
	  // constant expression wrapping an existing function of another type.
	  auto *F = dyn_cast<llvm::Function>(C);
	  if (!F || !F->empty())
	    return C;

	  F->setCallingConv(getRuntimeCC());

	  if (!Local && getTriple().isOSBinFormatCOFF() &&
	      !getCodeGenOpts().LTOVisibilityPublicStd &&
	      !getTriple().isWindowsGNUEnvironment()) {
	    const FunctionDecl *FD = GetRuntimeFunctionDecl(Context, Name);
	    if (!FD || FD->hasAttr<DLLImportAttr>()) {
	      F->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
	      F->setLinkage(llvm::GlobalValue::ExternalLinkage);
	    }
	  }

	  return C;
	}

	/// CreateBuiltinFunction - Create a new builtin function with the specified
	/// type and name.
	///
	/// If the result is an llvm::Function that has no body yet, it is given the
	/// builtin calling convention.
	llvm::Constant *
	CodeGenModule::CreateBuiltinFunction(llvm::FunctionType *FTy, StringRef Name,
	                                     llvm::AttributeList ExtraAttrs) {
	  llvm::Constant *C =
	      GetOrCreateLLVMFunction(Name, FTy, GlobalDecl(), /*ForVTable=*/false,
	                              /*DontDefer=*/false, /*IsThunk=*/false,
	                              ExtraAttrs);
	  if (auto *F = dyn_cast<llvm::Function>(C)) {
	    if (F->empty())
	      F->setCallingConv(getBuiltinCC());
	  }
	  return C;
	}

	/// isTypeConstant - Decide whether an object of type \p Ty may be emitted as
	/// a constant.
	///
	/// When \p ExcludeCtor is true the time during which the object's constructor
	/// runs is left out of consideration; the caller must then verify that the
	/// object is not written to during its construction.
	bool CodeGenModule::isTypeConstant(QualType Ty, bool ExcludeCtor) {
	  // Only constant types and references qualify at all.
	  if (!(Ty.isConstant(Context) || Ty->isReferenceType()))
	    return false;

	  // Outside of C++ there is nothing more to check.
	  if (!Context.getLangOpts().CPlusPlus)
	    return true;

	  const CXXRecordDecl *Record =
	      Context.getBaseElementType(Ty)->getAsCXXRecordDecl();
	  if (!Record)
	    return true;

	  // Class types are constant only if the constructor may be ignored, no field
	  // is mutable and destruction is trivial.
	  if (!ExcludeCtor)
	    return false;
	  if (Record->hasMutableFields())
	    return false;
	  return Record->hasTrivialDestructor();
	}

	/// GetOrCreateLLVMGlobal - If the specified mangled name is not in the module,
	/// create and return an llvm GlobalVariable with the specified type. If there
	/// is something in the module with the specified name, return it potentially
	/// bitcasted to the right type.
	///
	/// If D is non-null, it specifies a decl that corresponds to this. This is
	/// used to set the attributes on the global when it is first created.
	///
	/// If IsForDefinition is true, it is guaranteed that an actual global with
	/// type Ty will be returned, not conversion of a variable with the same
	/// mangled name but some other type.
	llvm::Constant *
	CodeGenModule::GetOrCreateLLVMGlobal(StringRef MangledName,
	                                     llvm::PointerType *Ty,
	                                     const VarDecl *D,
	                                     ForDefinition_t IsForDefinition) {
	  // Lookup the entry, lazily creating it if necessary.
	  llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
	  if (Entry) {
	    if (WeakRefReferences.erase(Entry)) {
	      if (D && !D->hasAttr<WeakAttr>())
	        Entry->setLinkage(llvm::Function::ExternalLinkage);
	    }

	    // Handle dropped DLL attributes.
	    if (D && !D->hasAttr<DLLImportAttr>() && !D->hasAttr<DLLExportAttr>())
	      Entry->setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass);

	    if (Entry->getType() == Ty)
	      return Entry;

	    // If there are two attempts to define the same mangled name, issue an
	    // error.
	    if (IsForDefinition && !Entry->isDeclaration()) {
	      GlobalDecl OtherGD;
	      const VarDecl *OtherD;

	      // Check that D is not yet in DiagnosedConflictingDefinitions is required
	      // to make sure that we issue an error only once.
	      if (D && lookupRepresentativeDecl(MangledName, OtherGD) &&
	          (D->getCanonicalDecl() != OtherGD.getCanonicalDecl().getDecl()) &&
	          (OtherD = dyn_cast<VarDecl>(OtherGD.getDecl())) &&
	          OtherD->hasInit() &&
	          DiagnosedConflictingDefinitions.insert(D).second) {
	        getDiags().Report(D->getLocation(),
	                          diag::err_duplicate_mangled_name);
	        getDiags().Report(OtherGD.getDecl()->getLocation(),
	                          diag::note_previous_definition);
	      }
	    }

	    // Make sure the result is of the correct type.
	    if (Entry->getType()->getAddressSpace() != Ty->getAddressSpace())
	      return llvm::ConstantExpr::getAddrSpaceCast(Entry, Ty);

	    // (If global is requested for a definition, we always need to create a new
	    // global, not just return a bitcast.)
	    if (!IsForDefinition)
	      return llvm::ConstantExpr::getBitCast(Entry, Ty);
	  }

	  auto AddrSpace = GetGlobalVarAddressSpace(D);
	  auto TargetAddrSpace = getContext().getTargetAddressSpace(AddrSpace);

	  auto *GV = new llvm::GlobalVariable(
	      getModule(), Ty->getElementType(), false,
	      llvm::GlobalValue::ExternalLinkage, nullptr, MangledName, nullptr,
	      llvm::GlobalVariable::NotThreadLocal, TargetAddrSpace);

	  // If we already created a global with the same mangled name (but different
	  // type) before, take its name and remove it from its parent.
	  if (Entry) {
	    GV->takeName(Entry);

	    if (!Entry->use_empty()) {
	      llvm::Constant *NewPtrForOldDecl =
	          llvm::ConstantExpr::getBitCast(GV, Entry->getType());
	      Entry->replaceAllUsesWith(NewPtrForOldDecl);
	    }

	    Entry->eraseFromParent();
	  }

	  // This is the first use or definition of a mangled name. If there is a
	  // deferred decl with this name, remember that we need to emit it at the end
	  // of the file.
	  auto DDI = DeferredDecls.find(MangledName);
	  if (DDI != DeferredDecls.end()) {
	    // Move the potentially referenced deferred decl to the DeferredDeclsToEmit
	    // list, and remove it from DeferredDecls (since we don't need it anymore).
	    addDeferredDeclToEmit(DDI->second);
	    DeferredDecls.erase(DDI);
	  }

	  // Handle things which are present even on external declarations.
	  if (D) {
	    // FIXME: This code is overly simple and should be merged with other global
	    // handling.
	    GV->setConstant(isTypeConstant(D->getType(), false));

	    GV->setAlignment(getContext().getDeclAlign(D).getQuantity());

	    setLinkageForGV(GV, D);
	    setGlobalVisibility(GV, D, NotForDefinition);

	    if (D->getTLSKind()) {
	      if (D->getTLSKind() == VarDecl::TLS_Dynamic)
	        CXXThreadLocals.push_back(D);
	      setTLSMode(GV, *D);
	    }

	    // If required by the ABI, treat declarations of static data members with
	    // inline initializers as definitions.
	    if (getContext().isMSStaticDataMemberInlineDefinition(D)) {
	      EmitGlobalVarDefinition(D);
	    }

	    // Emit section information for extern variables.
	    if (D->hasExternalStorage()) {
	      if (const SectionAttr *SA = D->getAttr<SectionAttr>())
	        GV->setSection(SA->getName());
	    }

	    // Handle XCore specific ABI requirements.
	    if (getTriple().getArch() == llvm::Triple::xcore &&
	        D->getLanguageLinkage() == CLanguageLinkage &&
	        D->getType().isConstant(Context) &&
	        isExternallyVisible(D->getLinkageAndVisibility().getLinkage()))
	      GV->setSection(".cp.rodata");

	    // Check if we have a const declaration with an initializer; we may be
	    // able to emit it as available_externally to expose its value to the
	    // optimizer.
	    if (Context.getLangOpts().CPlusPlus && GV->hasExternalLinkage() &&
	        D->getType().isConstQualified() && !GV->hasInitializer() &&
	        !D->hasDefinition() && D->hasInit() && !D->hasAttr<DLLImportAttr>()) {
	      const auto *Record =
	          Context.getBaseElementType(D->getType())->getAsCXXRecordDecl();
	      bool HasMutableFields = Record && Record->hasMutableFields();
	      if (!HasMutableFields) {
	        const VarDecl *InitDecl;
	        const Expr *InitExpr = D->getAnyInitializer(InitDecl);
	        if (InitExpr) {
	          ConstantEmitter emitter(*this);
	          llvm::Constant *Init = emitter.tryEmitForInitializer(*InitDecl);
	          if (Init) {
	            auto *InitType = Init->getType();
	            if (GV->getType()->getElementType() != InitType) {
	              // The type of the initializer does not match the definition.
	              // This happens when an initializer has a different type from
	              // the type of the global (because of padding at the end of a
	              // structure for instance).
	              GV->setName(StringRef());
	              // Make a new global with the correct type, this is now guaranteed
	              // to work.
	              auto *NewGV = cast<llvm::GlobalVariable>(
	                  GetAddrOfGlobalVar(D, InitType, IsForDefinition));

	              // Erase the old global, since it is no longer used.
	              cast<llvm::GlobalValue>(GV)->eraseFromParent();
	              GV = NewGV;
	            } else {
	              GV->setInitializer(Init);
	              GV->setConstant(true);
	              GV->setLinkage(llvm::GlobalValue::AvailableExternallyLinkage);
	            }
	            emitter.finalize(GV);
	          }
	        }
	      }
	    }
	  }

	  // If the address space the variable was created in differs from the one its
	  // declared type (or the language default) calls for, hand back a cast.
	  LangAS ExpectedAS =
	      D ? D->getType().getAddressSpace()
	        : (LangOpts.OpenCL ? LangAS::opencl_global : LangAS::Default);
	  assert(getContext().getTargetAddressSpace(ExpectedAS) ==
	         Ty->getPointerAddressSpace());
	  if (AddrSpace != ExpectedAS)
	    return getTargetCodeGenInfo().performAddrSpaceCast(*this, GV, AddrSpace,
	                                                       ExpectedAS, Ty);

	  return GV;
	}

	/// Return the address of the global entity denoted by \p GD, dispatching on
	/// the kind of declaration: constructors and destructors go through
	/// getAddrOfCXXStructor, other C++ methods and plain functions through
	/// GetAddrOfFunction, and everything else is treated as a variable and goes
	/// through GetAddrOfGlobalVar. \p IsForDefinition is forwarded unchanged.
	llvm::Constant *
	CodeGenModule::GetAddrOfGlobal(GlobalDecl GD,
	                               ForDefinition_t IsForDefinition) {
	  const Decl *D = GD.getDecl();

	  if (isa<CXXConstructorDecl>(D))
	    return getAddrOfCXXStructor(cast<CXXConstructorDecl>(D),
	                                getFromCtorType(GD.getCtorType()),
	                                /*FnInfo=*/nullptr, /*FnType=*/nullptr,
	                                /*DontDefer=*/false, IsForDefinition);

	  if (isa<CXXDestructorDecl>(D))
	    return getAddrOfCXXStructor(cast<CXXDestructorDecl>(D),
	                                getFromDtorType(GD.getDtorType()),
	                                /*FnInfo=*/nullptr, /*FnType=*/nullptr,
	                                /*DontDefer=*/false, IsForDefinition);

	  if (isa<CXXMethodDecl>(D)) {
	    auto FInfo = &getTypes().arrangeCXXMethodDeclaration(
	        cast<CXXMethodDecl>(D));
	    auto Ty = getTypes().GetFunctionType(*FInfo);
	    return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer=*/false,
	                             IsForDefinition);
	  }

	  if (isa<FunctionDecl>(D)) {
	    const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
	    llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);
	    return GetAddrOfFunction(GD, Ty, /*ForVTable=*/false, /*DontDefer=*/false,
	                             IsForDefinition);
	  }

	  return GetAddrOfGlobalVar(cast<VarDecl>(D), /*Ty=*/nullptr,
	                            IsForDefinition);
	}

	/// Return a global variable named \p Name whose value type is \p Ty.
	///
	/// An existing global of the right type is returned as is. Otherwise a new
	/// constant global with linkage \p Linkage is created; an existing global of
	/// a different type (which must be a declaration) has its uses redirected to
	/// the new variable and is erased. The new variable is put in a COMDAT named
	/// after itself when COMDATs are supported and it is weak for the linker but
	/// not available_externally.
	llvm::GlobalVariable *
	CodeGenModule::CreateOrReplaceCXXRuntimeVariable(StringRef Name,
	                                      llvm::Type *Ty,
	                                      llvm::GlobalValue::LinkageTypes Linkage) {
	  llvm::GlobalVariable *GV = getModule().getNamedGlobal(Name);
	  llvm::GlobalVariable *OldGV = nullptr;

	  if (GV) {
	    // Check if the variable has the right type.
	    if (GV->getType()->getElementType() == Ty)
	      return GV;

	    // Because C++ name mangling, the only way we can end up with an already
	    // existing global with the same name is if it has been declared extern "C".
	    assert(GV->isDeclaration() && "Declaration has wrong type!");
	    OldGV = GV;
	  }

	  // Create a new variable.
	  GV = new llvm::GlobalVariable(getModule(), Ty, /*isConstant=*/true,
	                                Linkage, nullptr, Name);

	  if (OldGV) {
	    // Replace occurrences of the old variable if needed.
	    GV->takeName(OldGV);

	    if (!OldGV->use_empty()) {
	      llvm::Constant *NewPtrForOldDecl =
	          llvm::ConstantExpr::getBitCast(GV, OldGV->getType());
	      OldGV->replaceAllUsesWith(NewPtrForOldDecl);
	    }

	    OldGV->eraseFromParent();
	  }

	  if (supportsCOMDAT() && GV->isWeakForLinker() &&
	      !GV->hasAvailableExternallyLinkage())
	    GV->setComdat(TheModule.getOrInsertComdat(GV->getName()));

	  return GV;
	}

	/// GetAddrOfGlobalVar - Return the llvm::Constant for the address of the
	/// given global variable. If Ty is non-null and if the global doesn't exist,
	/// then it will be created with the specified type instead of whatever the
	/// normal requested type would be. If IsForDefinition is true, it is
	/// guaranteed that an actual global with type Ty will be returned, not
	/// conversion of a variable with the same mangled name but some other type.
	llvm::Constant *CodeGenModule::GetAddrOfGlobalVar(const VarDecl *D,
	                                                  llvm::Type *Ty,
	                                           ForDefinition_t IsForDefinition) {
	  assert(D->hasGlobalStorage() && "Not a global variable");
	  QualType ASTTy = D->getType();
	  // Fall back to the in-memory LLVM type of the declared AST type.
	  if (!Ty)
	    Ty = getTypes().ConvertTypeForMem(ASTTy);

	  // The requested pointer lives in the target address space of the AST type.
	  llvm::PointerType *PTy =
	      llvm::PointerType::get(Ty, getContext().getTargetAddressSpace(ASTTy));

	  StringRef MangledName = getMangledName(D);
	  return GetOrCreateLLVMGlobal(MangledName, PTy, D, IsForDefinition);
	}

	/// CreateRuntimeVariable - Get or create the runtime global variable called
	/// \p Name whose value type is \p Ty. No declaration is associated with it.
	llvm::Constant *
	CodeGenModule::CreateRuntimeVariable(llvm::Type *Ty, StringRef Name) {
	  llvm::PointerType *PtrTy = llvm::PointerType::getUnqual(Ty);
	  return GetOrCreateLLVMGlobal(Name, PtrTy, nullptr);
	}

	/// Handle a tentative definition of \p D, which must have no initializer.
	///
	/// Nothing happens if a definition with the same mangled name already exists.
	/// If the variable need not be emitted and nothing with that name exists yet,
	/// it is recorded in DeferredDecls; otherwise its definition is emitted.
	void CodeGenModule::EmitTentativeDefinition(const VarDecl *D) {
	  assert(!D->getInit() && "Cannot emit definite definitions here!");

	  StringRef MangledName = getMangledName(D);
	  llvm::GlobalValue *GV = GetGlobalValue(MangledName);

	  // A real definition (not a mere declaration) with this mangled name is
	  // already present; emitting again is unnecessary and would overwrite it.
	  const bool AlreadyDefined = GV && !GV->isDeclaration();
	  if (AlreadyDefined)
	    return;

	  // Either emission is mandatory or the variable has been referenced
	  // already: the tentative definition is the only definition, so emit it.
	  const bool EmissionRequired = MustBeEmitted(D);
	  if (EmissionRequired || GV) {
	    EmitGlobalVarDefinition(D);
	    return;
	  }

	  // No reference seen so far: park the variable in the deferred declarations
	  // table so that it can be emitted later if needed.
	  DeferredDecls[MangledName] = D;
	}

	/// Return the store size of \p Ty, as reported in bits by the data layout,
	/// converted to CharUnits.
	CharUnits CodeGenModule::GetTargetTypeStoreSize(llvm::Type *Ty) const {
	  auto StoreSizeInBits = getDataLayout().getTypeStoreSizeInBits(Ty);
	  return Context.toCharUnitsFromBits(StoreSizeInBits);
	}

	/// Return the language address space a global variable for \p D (which may be
	/// null) is created in.
	///
	/// For OpenCL this is the address space of the declared type, or
	/// opencl_global without a declaration. For CUDA device compilation it is
	/// chosen from the CUDAConstantAttr / CUDASharedAttr attributes, defaulting to
	/// cuda_device. Otherwise the target's code generation info decides.
	LangAS CodeGenModule::GetGlobalVarAddressSpace(const VarDecl *D) {
	  LangAS AddrSpace = LangAS::Default;
	  if (LangOpts.OpenCL) {
	    AddrSpace = D ? D->getType().getAddressSpace() : LangAS::opencl_global;
	    assert(AddrSpace == LangAS::opencl_global ||
	           AddrSpace == LangAS::opencl_constant ||
	           AddrSpace == LangAS::opencl_local ||
	           AddrSpace >= LangAS::FirstTargetAddressSpace);
	    return AddrSpace;
	  }

	  if (LangOpts.CUDA && LangOpts.CUDAIsDevice) {
	    if (D && D->hasAttr<CUDAConstantAttr>())
	      return LangAS::cuda_constant;
	    else if (D && D->hasAttr<CUDASharedAttr>())
	      return LangAS::cuda_shared;
	    else
	      return LangAS::cuda_device;
	  }

	  return getTargetCodeGenInfo().getGlobalVarAddressSpace(*this, D);
	}

	/// Record \p GV as the value for the identifier of \p D if \p D is a 'used',
	/// internal-linkage entity with an ordinary name declared inside an
	/// extern "C" context (C++ only). If a second entity with the same identifier
	/// is recorded, the stored value is reset to null so that neither gets the
	/// name.
	template<typename SomeDecl>
	void CodeGenModule::MaybeHandleStaticInExternC(const SomeDecl *D,
	                                               llvm::GlobalValue *GV) {
	  if (!getLangOpts().CPlusPlus)
	    return;

	  // Must have 'used' attribute, or else inline assembly can't rely on
	  // the name existing.
	  if (!D->template hasAttr<UsedAttr>())
	    return;

	  // Must have internal linkage and an ordinary name.
	  if (!D->getIdentifier() || D->getFormalLinkage() != InternalLinkage)
	    return;

	  // Must be in an extern "C" context. Entities declared directly within
	  // a record are not extern "C" even if the record is in such a context.
	  const SomeDecl *First = D->getFirstDecl();
	  if (First->getDeclContext()->isRecord() || !First->isInExternCContext())
	    return;

	  // OK, this is an internal linkage entity inside an extern "C" linkage
	  // specification. Make a note of that so we can give it the "expected"
	  // mangled name if nothing else is using that name.
	  std::pair<StaticExternCMap::iterator, bool> R =
	      StaticExternCValues.insert(std::make_pair(D->getIdentifier(), GV));

	  // If we have multiple internal linkage entities with the same name
	  // in extern "C" regions, none of them gets that name.
	  if (!R.second)
	    R.first->second = nullptr;
	}

	/// Tell whether the entity declared by \p D belongs in a COMDAT: never when
	/// COMDATs are unsupported, always when it carries SelectAnyAttr, and
	/// otherwise exactly when its GVA linkage is one of the ODR kinds.
	static bool shouldBeInCOMDAT(CodeGenModule &CGM, const Decl &D) {
	  if (!CGM.supportsCOMDAT())
	    return false;

	  if (D.hasAttr<SelectAnyAttr>())
	    return true;

	  // Anything that is not a variable is expected to be a function.
	  const auto *Var = dyn_cast<VarDecl>(&D);
	  const GVALinkage Linkage =
	      Var ? CGM.getContext().GetGVALinkageForVariable(Var)
	          : CGM.getContext().GetGVALinkageForFunction(
	                cast<FunctionDecl>(&D));

	  switch (Linkage) {
	  case GVA_DiscardableODR:
	  case GVA_StrongODR:
	    return true;
	  case GVA_Internal:
	  case GVA_AvailableExternally:
	  case GVA_StrongExternal:
	    return false;
	  }
	  llvm_unreachable("No such linkage");
	}

	/// Place \p GO in a COMDAT named after itself when shouldBeInCOMDAT says the
	/// entity declared by \p D belongs in one; otherwise do nothing.
	void CodeGenModule::maybeSetTrivialComdat(const Decl &D,
	                                          llvm::GlobalObject &GO) {
	  if (shouldBeInCOMDAT(*this, D))
	    GO.setComdat(TheModule.getOrInsertComdat(GO.getName()));
	}

	/// Pass IsTentative as true if you want to create a tentative definition.
	void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
	bool IsTentative) {
	// OpenCL global variables of sampler type are translated to function calls,
	// therefore no need to be translated.
	QualType ASTTy = D->getType();
	if (getLangOpts().OpenCL && ASTTy->isSamplerT())
	return;

	llvm::Constant *Init = nullptr;
	CXXRecordDecl *RD = ASTTy->getBaseElementTypeUnsafe()->getAsCXXRecordDecl();
	bool NeedsGlobalCtor = false;
	bool NeedsGlobalDtor = RD && !RD->hasTrivialDestructor();

	const VarDecl *InitDecl;
	const Expr *InitExpr = D->getAnyInitializer(InitDecl);

	Optional<ConstantEmitter> emitter;

	// CUDA E.2.4.1 "__shared__ variables cannot have an initialization
	// as part of their declaration." Sema has already checked for
	// error cases, so we just need to set Init to UndefValue.
	if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice &&
	D->hasAttr<CUDASharedAttr>())
	Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy));
	else if (!InitExpr) {
	// This is a tentative definition; tentative definitions are
	// implicitly initialized with { 0 }.
	//
	// Note that tentative definitions are only emitted at the end of
	// a translation unit, so they should never have incomplete
	// type. In addition, EmitTentativeDefinition makes sure that we
	// never attempt to emit a tentative definition if a real one
	// exists. A use may still exists, however, so we still may need
	// to do a RAUW.
	assert(!ASTTy->isIncompleteType() && "Unexpected incomplete type");
	Init = EmitNullConstant(D->getType());
	} else {
	initializedGlobalDecl = GlobalDecl(D);
	emitter.emplace(*this);
	Init = emitter->tryEmitForInitializer(*InitDecl);

	if (!Init) {
	QualType T = InitExpr->getType();
	if (D->getType()->isReferenceType())
	T = D->getType();

	if (getLangOpts().CPlusPlus) {
	Init = EmitNullConstant(T);
	NeedsGlobalCtor = true;
	} else {
	ErrorUnsupported(D, "static initializer");
	Init = llvm::UndefValue::get(getTypes().ConvertType(T));
	}
	} else {
	// We don't need an initializer, so remove the entry for the delayed
	// initializer position (just in case this entry was delayed) if we
	// also don't need to register a destructor.
	if (getLangOpts().CPlusPlus && !NeedsGlobalDtor)
	DelayedCXXInitPosition.erase(D);
	}
	}

	llvm::Type* InitType = Init->getType();
	llvm::Constant *Entry =
	GetAddrOfGlobalVar(D, InitType, ForDefinition_t(!IsTentative));

	// Strip off a bitcast if we got one back.
	if (auto *CE = dyn_cast<llvm::ConstantExpr>(Entry)) {
	assert(CE->getOpcode() == llvm::Instruction::BitCast \|\|
	CE->getOpcode() == llvm::Instruction::AddrSpaceCast \|\|
	// All zero index gep.
	CE->getOpcode() == llvm::Instruction::GetElementPtr);
	Entry = CE->getOperand(0);
	}

	// Entry is now either a Function or GlobalVariable.
	auto *GV = dyn_cast<llvm::GlobalVariable>(Entry);

	// We have a definition after a declaration with the wrong type.
	// We must make a new GlobalVariable* and update everything that used OldGV
	// (a declaration or tentative definition) with the new GlobalVariable*
	// (which will be a definition).
	//
	// This happens if there is a prototype for a global (e.g.
	// "extern int x[];") and then a definition of a different type (e.g.
	// "int x[10];"). This also happens when an initializer has a different type
	// from the type of the global (this happens with unions).
	if (!GV \|\| GV->getType()->getElementType() != InitType \|\|
	GV->getType()->getAddressSpace() !=
	getContext().getTargetAddressSpace(GetGlobalVarAddressSpace(D))) {

	// Move the old entry aside so that we'll create a new one.
	Entry->setName(StringRef());

	// Make a new global with the correct type, this is now guaranteed to work.
	GV = cast<llvm::GlobalVariable>(
	GetAddrOfGlobalVar(D, InitType, ForDefinition_t(!IsTentative)));

	// Replace all uses of the old global with the new global
	llvm::Constant *NewPtrForOldDecl =
	llvm::ConstantExpr::getBitCast(GV, Entry->getType());
	Entry->replaceAllUsesWith(NewPtrForOldDecl);

	// Erase the old global, since it is no longer used.
	cast<llvm::GlobalValue>(Entry)->eraseFromParent();
	}

	MaybeHandleStaticInExternC(D, GV);

	if (D->hasAttr<AnnotateAttr>())
	AddGlobalAnnotations(D, GV);

	// Set the llvm linkage type as appropriate.
	llvm::GlobalValue::LinkageTypes Linkage =
	getLLVMLinkageVarDefinition(D, GV->isConstant());

	// CUDA B.2.1 "The __device__ qualifier declares a variable that resides on
	// the device. [...]"
	// CUDA B.2.2 "The __constant__ qualifier, optionally used together with
	// __device__, declares a variable that: [...]
	// Is accessible from all the threads within the grid and from the host
	// through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize()
	// / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())."
	if (GV && LangOpts.CUDA) {
	if (LangOpts.CUDAIsDevice) {
	if (D->hasAttr<CUDADeviceAttr>() \|\| D->hasAttr<CUDAConstantAttr>())
	GV->setExternallyInitialized(true);
	} else {
	// Host-side shadows of external declarations of device-side
	// global variables become internal definitions. These have to
	// be internal in order to prevent name conflicts with global
	// host variables with the same name in a different TUs.
	if (D->hasAttr<CUDADeviceAttr>() \|\| D->hasAttr<CUDAConstantAttr>()) {
	Linkage = llvm::GlobalValue::InternalLinkage;

	// Shadow variables and their properties must be registered
	// with CUDA runtime.
	unsigned Flags = 0;
	if (!D->hasDefinition())
	Flags \|= CGCUDARuntime::ExternDeviceVar;
	if (D->hasAttr<CUDAConstantAttr>())
	Flags \|= CGCUDARuntime::ConstantDeviceVar;
	getCUDARuntime().registerDeviceVar(*GV, Flags);
	} else if (D->hasAttr<CUDASharedAttr>())
	// __shared__ variables are odd. Shadows do get created, but
	// they are not registered with the CUDA runtime, so they
	// can't really be used to access their device-side
	// counterparts. It's not clear yet whether it's nvcc's bug or
	// a feature, but we've got to do the same for compatibility.
	Linkage = llvm::GlobalValue::InternalLinkage;
	}
	}

	GV->setInitializer(Init);
	if (emitter) emitter->finalize(GV);

	// If it is safe to mark the global 'constant', do so now.
	GV->setConstant(!NeedsGlobalCtor && !NeedsGlobalDtor &&
	isTypeConstant(D->getType(), true));

	// If it is in a read-only section, mark it 'constant'.
	if (const SectionAttr *SA = D->getAttr<SectionAttr>()) {
	const ASTContext::SectionInfo &SI = Context.SectionInfos[SA->getName()];
	if ((SI.SectionFlags & ASTContext::PSF_Write) == 0)
	GV->setConstant(true);
	}

	GV->setAlignment(getContext().getDeclAlign(D).getQuantity());


	// On Darwin, if the normal linkage of a C++ thread_local variable is
	// LinkOnce or Weak, we keep the normal linkage to prevent multiple
	// copies within a linkage unit; otherwise, the backing variable has
	// internal linkage and all accesses should just be calls to the
	// Itanium-specified entry point, which has the normal linkage of the
	// variable. This is to preserve the ability to change the implementation
	// behind the scenes.
	if (!D->isStaticLocal() && D->getTLSKind() == VarDecl::TLS_Dynamic &&
	Context.getTargetInfo().getTriple().isOSDarwin() &&
	!llvm::GlobalVariable::isLinkOnceLinkage(Linkage) &&
	!llvm::GlobalVariable::isWeakLinkage(Linkage))
	Linkage = llvm::GlobalValue::InternalLinkage;

	GV->setLinkage(Linkage);
	if (D->hasAttr<DLLImportAttr>())
	GV->setDLLStorageClass(llvm::GlobalVariable::DLLImportStorageClass);
	else if (D->hasAttr<DLLExportAttr>())
	GV->setDLLStorageClass(llvm::GlobalVariable::DLLExportStorageClass);
	else
	GV->setDLLStorageClass(llvm::GlobalVariable::DefaultStorageClass);

	if (Linkage == llvm::GlobalVariable::CommonLinkage) {
	// common vars aren't constant even if declared const.
	GV->setConstant(false);
	// Tentative definition of global variables may be initialized with
	// non-zero null pointers. In this case they should have weak linkage
	// since common linkage must have zero initializer and must not have
	// explicit section therefore cannot have non-zero initial value.
	if (!GV->getInitializer()->isNullValue())
	GV->setLinkage(llvm::GlobalVariable::WeakAnyLinkage);
	}

	setNonAliasAttributes(D, GV);

	if (D->getTLSKind() && !GV->isThreadLocal()) {
	if (D->getTLSKind() == VarDecl::TLS_Dynamic)
	CXXThreadLocals.push_back(D);
	setTLSMode(GV, *D);
	}

	maybeSetTrivialComdat(D, GV);

	// Emit the initializer function if necessary.
	if (NeedsGlobalCtor \|\| NeedsGlobalDtor)
	EmitCXXGlobalVarDeclInitFunc(D, GV, NeedsGlobalCtor);

	SanitizerMD->reportGlobalToASan(GV, *D, NeedsGlobalCtor);

	// Emit global variable debug information.
	if (CGDebugInfo *DI = getModuleDebugInfo())
	if (getCodeGenOpts().getDebugInfo() >= codegenoptions::LimitedDebugInfo)
	DI->EmitGlobalVariable(GV, D);
	}

	static bool isVarDeclStrongDefinition(const ASTContext &Context,
	CodeGenModule &CGM, const VarDecl *D,
	bool NoCommon) {
	// Don't give variables common linkage if -fno-common was specified unless it
	// was overridden by a NoCommon attribute.
	if ((NoCommon \|\| D->hasAttr<NoCommonAttr>()) && !D->hasAttr<CommonAttr>())
	return true;

	// C11 6.9.2/2:
	// A declaration of an identifier for an object that has file scope without
	// an initializer, and without a storage-class specifier or with the
	// storage-class specifier static, constitutes a tentative definition.
	if (D->getInit() \|\| D->hasExternalStorage())
	return true;

	// A variable cannot be both common and exist in a section.
	if (D->hasAttr<SectionAttr>())
	return true;

	// A variable cannot be both common and exist in a section.
	// We dont try to determine which is the right section in the front-end.
	// If no specialized section name is applicable, it will resort to default.
	if (D->hasAttr<PragmaClangBSSSectionAttr>() \|\|
	D->hasAttr<PragmaClangDataSectionAttr>() \|\|
	D->hasAttr<PragmaClangRodataSectionAttr>())
	return true;

	// Thread local vars aren't considered common linkage.
	if (D->getTLSKind())
	return true;

	// Tentative definitions marked with WeakImportAttr are true definitions.
	if (D->hasAttr<WeakImportAttr>())
	return true;

	// A variable cannot be both common and exist in a comdat.
	if (shouldBeInCOMDAT(CGM, *D))
	return true;

	// Declarations with a required alignment do not have common linkage in MSVC
	// mode.
	if (Context.getTargetInfo().getCXXABI().isMicrosoft()) {
	if (D->hasAttr<AlignedAttr>())
	return true;
	QualType VarType = D->getType();
	if (Context.isAlignmentRequired(VarType))
	return true;

	if (const auto *RT = VarType->getAs<RecordType>()) {
	const RecordDecl *RD = RT->getDecl();
	for (const FieldDecl *FD : RD->fields()) {
	if (FD->isBitField())
	continue;
	if (FD->hasAttr<AlignedAttr>())
	return true;
	if (Context.isAlignmentRequired(FD->getType()))
	return true;
	}
	}
	}

	return false;
	}

	/// Translate the AST-level linkage of a declarator into the LLVM linkage to
	/// use for its definition.
	///
	/// \param D the declarator; consulted for weak / CUDA-global / selectany
	///        attributes and, in C, for tentative-definition (common) handling.
	/// \param Linkage the GVA linkage already computed for \p D.
	/// \param IsConstantVariable picks weak_odr instead of weak for declarations
	///        carrying WeakAttr.
	llvm::GlobalValue::LinkageTypes CodeGenModule::getLLVMLinkageForDeclarator(
	    const DeclaratorDecl *D, GVALinkage Linkage, bool IsConstantVariable) {
	  if (Linkage == GVA_Internal)
	    return llvm::Function::InternalLinkage;

	  if (D->hasAttr<WeakAttr>())
	    return IsConstantVariable ? llvm::GlobalVariable::WeakODRLinkage
	                              : llvm::GlobalVariable::WeakAnyLinkage;

	  // A strong definition is guaranteed to exist elsewhere, so
	  // available_externally is safe here.
	  if (Linkage == GVA_AvailableExternally)
	    return llvm::GlobalValue::AvailableExternallyLinkage;

	  // Apple's kernel linker does not support symbol coalescing, so linkonce
	  // and weak linkages must be avoided there. Normally that means mapping to
	  // internal, but explicit instantiations map to external instead.

	  // In C++, a definition has to be emitted in every translation unit that
	  // references the function. linkonce_odr fits because (a) if every
	  // reference in this TU is optimized away, no code needs to be generated,
	  // (b) if the function persists it must be merged with other definitions,
	  // and (c) the ODR makes the definition dependable.
	  if (Linkage == GVA_DiscardableODR) {
	    if (Context.getLangOpts().AppleKext)
	      return llvm::Function::InternalLinkage;
	    return llvm::Function::LinkOnceODRLinkage;
	  }

	  // An explicit template instantiation is weak because it may occur in
	  // several translation units (all of which must be equivalent), yet it may
	  // not be discarded.
	  //
	  // CUDA device code spread across multiple TUs is not currently supported,
	  // so CUDA templates are either external (kernels) or internal. This lets
	  // llvm perform aggressive inter-procedural optimizations.
	  if (Linkage == GVA_StrongODR) {
	    if (Context.getLangOpts().AppleKext)
	      return llvm::Function::ExternalLinkage;
	    if (Context.getLangOpts().CUDA && Context.getLangOpts().CUDAIsDevice) {
	      if (D->hasAttr<CUDAGlobalAttr>())
	        return llvm::Function::ExternalLinkage;
	      return llvm::Function::InternalLinkage;
	    }
	    return llvm::Function::WeakODRLinkage;
	  }

	  // C++ has no tentative definitions and therefore no common linkage.
	  if (!getLangOpts().CPlusPlus) {
	    if (const auto *Var = dyn_cast<VarDecl>(D)) {
	      if (!isVarDeclStrongDefinition(Context, *this, Var,
	                                     CodeGenOpts.NoCommon))
	        return llvm::GlobalVariable::CommonLinkage;
	    }
	  }

	  // selectany symbols are externally visible, so use weak rather than
	  // linkonce. MSVC optimizes away references to const selectany globals, so
	  // all definitions should be the same and ODR linkage should be used.
	  // http://msdn.microsoft.com/en-us/library/5tkz6s71.aspx
	  if (D->hasAttr<SelectAnyAttr>())
	    return llvm::GlobalVariable::WeakODRLinkage;

	  // Everything else is a strong external definition.
	  assert(Linkage == GVA_StrongExternal);
	  return llvm::GlobalVariable::ExternalLinkage;
	}

	/// Compute the LLVM linkage for the definition of variable \p VD by looking
	/// up its GVA linkage in the AST context and forwarding to
	/// getLLVMLinkageForDeclarator.
	llvm::GlobalValue::LinkageTypes CodeGenModule::getLLVMLinkageVarDefinition(
	    const VarDecl *VD, bool IsConstant) {
	  return getLLVMLinkageForDeclarator(
	      VD, getContext().GetGVALinkageForVariable(VD), IsConstant);
	}

	/// Replace the uses of a function that was declared with a non-proto type.
	/// We want to silently drop extra arguments from call sites
	///
	/// \param old the constant (the old function, or a bitcast of it) whose
	///        uses are scanned.
	/// \param newFn the real function that direct calls are redirected to.
	///
	/// Call sites are left untouched when the return type differs and the
	/// result is used, when too few arguments are passed, or when any argument
	/// type does not match the corresponding parameter of \p newFn.
	static void replaceUsesOfNonProtoConstant(llvm::Constant *old,
	                                          llvm::Function *newFn) {
	  // Fast path.
	  if (old->use_empty()) return;

	  llvm::Type *newRetTy = newFn->getReturnType();
	  SmallVector<llvm::Value*, 4> newArgs;

	  for (llvm::Value::use_iterator ui = old->use_begin(), ue = old->use_end();
	         ui != ue; ) {
	    llvm::Value::use_iterator use = ui++; // Increment before the use is erased.
	    llvm::User *user = use->getUser();

	    // Recognize and replace uses of bitcasts.  Most calls to
	    // unprototyped functions will use bitcasts.
	    if (auto *bitcast = dyn_cast<llvm::ConstantExpr>(user)) {
	      if (bitcast->getOpcode() == llvm::Instruction::BitCast)
	        replaceUsesOfNonProtoConstant(bitcast, newFn);
	      continue;
	    }

	    // Recognize calls to the function.
	    llvm::CallSite callSite(user);
	    if (!callSite) continue;
	    if (!callSite.isCallee(&*use)) continue;

	    // If the return types don't match exactly, then we can't
	    // transform this call unless it's dead.
	    if (callSite->getType() != newRetTy && !callSite->use_empty())
	      continue;

	    // Get the call site's attribute list.
	    SmallVector<llvm::AttributeSet, 8> newArgAttrs;
	    llvm::AttributeList oldAttrs = callSite.getAttributes();

	    // If the function was passed too few arguments, don't transform.
	    unsigned newNumArgs = newFn->arg_size();
	    if (callSite.arg_size() < newNumArgs) continue;

	    // If extra arguments were passed, we silently drop them.
	    // If any of the types mismatch, we don't transform.
	    unsigned argNo = 0;
	    bool dontTransform = false;
	    for (llvm::Argument &A : newFn->args()) {
	      if (callSite.getArgument(argNo)->getType() != A.getType()) {
	        dontTransform = true;
	        break;
	      }

	      // Add any parameter attributes.
	      newArgAttrs.push_back(oldAttrs.getParamAttributes(argNo));
	      argNo++;
	    }
	    if (dontTransform)
	      continue;

	    // Okay, we can transform this.  Create the new call instruction and copy
	    // over the required information.
	    newArgs.append(callSite.arg_begin(), callSite.arg_begin() + argNo);

	    // Copy over any operand bundles. The vector is scoped to this iteration
	    // because getOperandBundlesAsDefs appends to it; a vector shared across
	    // iterations would attach bundles of earlier call sites to later ones.
	    SmallVector<llvm::OperandBundleDef, 1> newBundles;
	    callSite.getOperandBundlesAsDefs(newBundles);

	    llvm::CallSite newCall;
	    if (callSite.isCall()) {
	      newCall = llvm::CallInst::Create(newFn, newArgs, newBundles, "",
	                                       callSite.getInstruction());
	    } else {
	      auto *oldInvoke = cast<llvm::InvokeInst>(callSite.getInstruction());
	      newCall = llvm::InvokeInst::Create(newFn,
	                                         oldInvoke->getNormalDest(),
	                                         oldInvoke->getUnwindDest(),
	                                         newArgs, newBundles, "",
	                                         callSite.getInstruction());
	    }
	    newArgs.clear(); // for the next iteration

	    if (!newCall->getType()->isVoidTy())
	      newCall->takeName(callSite.getInstruction());
	    newCall.setAttributes(llvm::AttributeList::get(
	        newFn->getContext(), oldAttrs.getFnAttributes(),
	        oldAttrs.getRetAttributes(), newArgAttrs));
	    newCall.setCallingConv(callSite.getCallingConv());

	    // Finally, remove the old call, replacing any uses with the new one.
	    if (!callSite->use_empty())
	      callSite->replaceAllUsesWith(newCall.getInstruction());

	    // Copy debug location attached to CI.
	    if (callSite->getDebugLoc())
	      newCall->setDebugLoc(callSite->getDebugLoc());

	    callSite->eraseFromParent();
	  }
	}

	/// ReplaceUsesOfNonProtoTypeWithRealFunction - Invoked when a function that
	/// had no prototype gets implemented, e.g. "int foo() {}". Existing call
	/// uses of the old function in the module are adjusted so that they call
	/// the new function directly.
	///
	/// This is more than a cleanup: the always_inline pass can only inline
	/// direct calls, and a bitcast in the way prevents that. Instcombine
	/// normally deletes these calls, but it isn't run at -O0.
	static void ReplaceUsesOfNonProtoTypeWithRealFunction(llvm::GlobalValue *Old,
	                                                      llvm::Function *NewFn) {
	  // Only an old *function* is rewritten; a global being redefined as a
	  // function is left alone.
	  if (isa<llvm::Function>(Old))
	    replaceUsesOfNonProtoConstant(Old, NewFn);
	}

	/// Emit an instantiated static data member \p VD as a top-level
	/// declaration, unless it is a dllimport'ed definition.
	void CodeGenModule::HandleCXXStaticMemberVarInstantiation(VarDecl *VD) {
	  const bool IsDefinition =
	      VD->isThisDeclarationADefinition() == VarDecl::Definition;
	  if (IsDefinition && VD->hasAttr<DLLImportAttr>())
	    return;

	  const TemplateSpecializationKind Kind = VD->getTemplateSpecializationKind();

	  // With a definition available this may be a deferred decl; an explicit
	  // instantiation must be emitted at the end, so request its address now.
	  if (VD->getDefinition() && Kind == TSK_ExplicitInstantiationDefinition)
	    GetAddrOfGlobalVar(VD);

	  EmitTopLevelDecl(VD);
	}

	/// Emit the body of the function named by \p GD into the module.
	///
	/// \param GD the function declaration to define.
	/// \param GV an existing global for the function, or null; it is replaced
	///        by a freshly obtained one when null or when its element type
	///        differs from the computed LLVM function type.
	///
	/// Does nothing if the global already has a definition.
	void CodeGenModule::EmitGlobalFunctionDefinition(GlobalDecl GD,
	                                                 llvm::GlobalValue *GV) {
	  const auto *D = cast<FunctionDecl>(GD.getDecl());

	  // Compute the function info and LLVM type.
	  const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
	  llvm::FunctionType *Ty = getTypes().GetFunctionType(FI);

	  // Get or create the prototype for the function.
	  if (!GV || (GV->getType()->getElementType() != Ty))
	    GV = cast<llvm::GlobalValue>(GetAddrOfFunction(GD, Ty, /*ForVTable=*/false,
	                                                   /*DontDefer=*/true,
	                                                   ForDefinition));

	  // Already emitted.
	  if (!GV->isDeclaration())
	    return;

	  // We need to set linkage and visibility on the function before
	  // generating code for it because various parts of IR generation
	  // want to propagate this information down (e.g. to local static
	  // declarations).
	  auto *Fn = cast<llvm::Function>(GV);
	  setFunctionLinkage(GD, Fn);
	  setFunctionDLLStorageClass(GD, Fn);

	  // FIXME: this is redundant with part of setFunctionDefinitionAttributes
	  setGlobalVisibility(Fn, D, ForDefinition);

	  MaybeHandleStaticInExternC(D, Fn);

	  maybeSetTrivialComdat(*D, *Fn);

	  CodeGenFunction(*this).GenerateCode(D, Fn, FI);

	  setFunctionDefinitionAttributes(D, Fn);
	  SetLLVMFunctionAttributesForDefinition(D, Fn);

	  if (const ConstructorAttr *CA = D->getAttr<ConstructorAttr>())
	    AddGlobalCtor(Fn, CA->getPriority());
	  if (const DestructorAttr *DA = D->getAttr<DestructorAttr>())
	    AddGlobalDtor(Fn, DA->getPriority());
	  if (D->hasAttr<AnnotateAttr>())
	    AddGlobalAnnotations(D, Fn);
	}

	/// Emit the declaration \p GD, which carries an AliasAttr, as an LLVM global
	/// alias.
	///
	/// Reports err_cyclic_alias when the alias names itself (directly or via an
	/// existing declaration). An existing *definition* with the same mangled
	/// name wins and the alias is silently ignored; an existing *declaration*
	/// is replaced by the alias.
	void CodeGenModule::EmitAliasDefinition(GlobalDecl GD) {
	  const auto *D = cast<ValueDecl>(GD.getDecl());
	  const AliasAttr *AA = D->getAttr<AliasAttr>();
	  assert(AA && "Not an alias?");

	  StringRef MangledName = getMangledName(GD);

	  if (AA->getAliasee() == MangledName) {
	    Diags.Report(AA->getLocation(), diag::err_cyclic_alias) << 0;
	    return;
	  }

	  // If there is a definition in the module, then it wins over the alias.
	  // This is dubious, but allow it to be safe.  Just ignore the alias.
	  llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
	  if (Entry && !Entry->isDeclaration())
	    return;

	  Aliases.push_back(GD);

	  llvm::Type *DeclTy = getTypes().ConvertTypeForMem(D->getType());

	  // Create a reference to the named value.  This ensures that it is emitted
	  // if a deferred decl.
	  llvm::Constant *Aliasee;
	  if (isa<llvm::FunctionType>(DeclTy))
	    Aliasee = GetOrCreateLLVMFunction(AA->getAliasee(), DeclTy, GD,
	                                      /*ForVTable=*/false);
	  else
	    Aliasee = GetOrCreateLLVMGlobal(AA->getAliasee(),
	                                    llvm::PointerType::getUnqual(DeclTy),
	                                    /*D=*/nullptr);

	  // Create the new alias itself, but don't set a name yet.
	  auto *GA = llvm::GlobalAlias::create(
	      DeclTy, 0, llvm::Function::ExternalLinkage, "", Aliasee, &getModule());

	  if (Entry) {
	    if (GA->getAliasee() == Entry) {
	      Diags.Report(AA->getLocation(), diag::err_cyclic_alias) << 0;
	      return;
	    }

	    assert(Entry->isDeclaration());

	    // If there is a declaration in the module, then we had an extern followed
	    // by the alias, as in:
	    //   extern int test6();
	    //   ...
	    //   int test6() __attribute__((alias("test7")));
	    //
	    // Remove it and replace uses of it with the alias.
	    GA->takeName(Entry);

	    Entry->replaceAllUsesWith(llvm::ConstantExpr::getBitCast(GA,
	                                                          Entry->getType()));
	    Entry->eraseFromParent();
	  } else {
	    GA->setName(MangledName);
	  }

	  // Set attributes which are particular to an alias; this is a
	  // specialization of the attributes which may be set on a global
	  // variable/function.
	  if (D->hasAttr<WeakAttr>() || D->hasAttr<WeakRefAttr>() ||
	      D->isWeakImported()) {
	    GA->setLinkage(llvm::Function::WeakAnyLinkage);
	  }

	  if (const auto *VD = dyn_cast<VarDecl>(D))
	    if (VD->getTLSKind())
	      setTLSMode(GA, *VD);

	  setAliasAttributes(D, GA);
	}

	/// Emit the declaration \p GD, which carries an IFuncAttr, as an LLVM
	/// global ifunc whose resolver is the function named by the attribute.
	///
	/// Reports err_cyclic_alias when the ifunc resolves to itself, and
	/// err_duplicate_mangled_name (once per declaration) when a definition with
	/// the same mangled name already exists. An existing declaration is
	/// replaced by the ifunc.
	void CodeGenModule::emitIFuncDefinition(GlobalDecl GD) {
	  const auto *D = cast<ValueDecl>(GD.getDecl());
	  const IFuncAttr *IFA = D->getAttr<IFuncAttr>();
	  assert(IFA && "Not an ifunc?");

	  StringRef MangledName = getMangledName(GD);

	  if (IFA->getResolver() == MangledName) {
	    Diags.Report(IFA->getLocation(), diag::err_cyclic_alias) << 1;
	    return;
	  }

	  // Report an error if some definition overrides ifunc.
	  llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
	  if (Entry && !Entry->isDeclaration()) {
	    GlobalDecl OtherGD;
	    if (lookupRepresentativeDecl(MangledName, OtherGD) &&
	        DiagnosedConflictingDefinitions.insert(GD).second) {
	      Diags.Report(D->getLocation(), diag::err_duplicate_mangled_name);
	      Diags.Report(OtherGD.getDecl()->getLocation(),
	                   diag::note_previous_definition);
	    }
	    return;
	  }

	  Aliases.push_back(GD);

	  llvm::Type *DeclTy = getTypes().ConvertTypeForMem(D->getType());
	  llvm::Constant *Resolver =
	      GetOrCreateLLVMFunction(IFA->getResolver(), DeclTy, GD,
	                              /*ForVTable=*/false);
	  llvm::GlobalIFunc *GIF =
	      llvm::GlobalIFunc::create(DeclTy, 0, llvm::Function::ExternalLinkage,
	                                "", Resolver, &getModule());
	  if (Entry) {
	    if (GIF->getResolver() == Entry) {
	      Diags.Report(IFA->getLocation(), diag::err_cyclic_alias) << 1;
	      return;
	    }
	    assert(Entry->isDeclaration());

	    // If there is a declaration in the module, then we had an extern followed
	    // by the ifunc, as in:
	    //   extern int test();
	    //   ...
	    //   int test() __attribute__((ifunc("resolver")));
	    //
	    // Remove it and replace uses of it with the ifunc.
	    GIF->takeName(Entry);

	    Entry->replaceAllUsesWith(llvm::ConstantExpr::getBitCast(GIF,
	                                                          Entry->getType()));
	    Entry->eraseFromParent();
	  } else
	    GIF->setName(MangledName);

	  SetCommonAttributes(D, GIF);
	}

	/// Return the declaration of intrinsic \p IID in this module, specialized
	/// on the overload types \p Tys.
	llvm::Function *CodeGenModule::getIntrinsic(unsigned IID,
	                                            ArrayRef<llvm::Type*> Tys) {
	  const auto ID = static_cast<llvm::Intrinsic::ID>(IID);
	  return llvm::Intrinsic::getDeclaration(&getModule(), ID, Tys);
	}

	/// Look up (inserting if necessary) the map entry for a CFString literal.
	///
	/// \param Map the string map to search / populate; new entries get a null
	///        value.
	/// \param Literal the string literal to look up.
	/// \param TargetIsLSB not read by this function.
	/// \param IsUTF16 [out] set to true when the literal contains non-ASCII or
	///        NUL characters and is therefore keyed by its UTF-16 form; left
	///        untouched otherwise.
	/// \param StringLength [out] length of the string: bytes for the simple
	///        case, UTF-16 code units otherwise.
	static llvm::StringMapEntry<llvm::GlobalVariable *> &
	GetConstantCFStringEntry(llvm::StringMap<llvm::GlobalVariable *> &Map,
	                         const StringLiteral *Literal, bool TargetIsLSB,
	                         bool &IsUTF16, unsigned &StringLength) {
	  StringRef String = Literal->getString();
	  unsigned NumBytes = String.size();

	  // Check for simple case.
	  if (!Literal->containsNonAsciiOrNull()) {
	    StringLength = NumBytes;
	    return *Map.insert(std::make_pair(String, nullptr)).first;
	  }

	  // Otherwise, convert the UTF8 literals into a string of shorts.
	  IsUTF16 = true;

	  SmallVector<llvm::UTF16, 128> ToBuf(NumBytes + 1); // +1 for ending nulls.
	  const llvm::UTF8 *FromPtr = (const llvm::UTF8 *)String.data();
	  llvm::UTF16 *ToPtr = &ToBuf[0];

	  (void)llvm::ConvertUTF8toUTF16(&FromPtr, FromPtr + NumBytes, &ToPtr,
	                                 ToPtr + NumBytes, llvm::strictConversion);

	  // ConvertUTF8toUTF16 returns the length in ToPtr.
	  StringLength = ToPtr - &ToBuf[0];

	  // Add an explicit null.
	  *ToPtr = 0;
	  return *Map.insert(std::make_pair(
	                         StringRef(reinterpret_cast<const char *>(ToBuf.data()),
	                                   (StringLength + 1) * 2),
	                         nullptr)).first;
	}

	/// Return the address of a constant CFString object for \p Literal,
	/// creating (and caching in CFConstantStringMap) the backing string global
	/// and the "_unnamed_cfstring_" struct on first use.
	ConstantAddress
	CodeGenModule::GetAddrOfConstantCFString(const StringLiteral *Literal) {
	  unsigned StringLength = 0;
	  bool isUTF16 = false;
	  llvm::StringMapEntry<llvm::GlobalVariable *> &Entry =
	      GetConstantCFStringEntry(CFConstantStringMap, Literal,
	                               getDataLayout().isLittleEndian(), isUTF16,
	                               StringLength);

	  // Reuse the previously created global, if any.
	  if (auto *C = Entry.second)
	    return ConstantAddress(C, CharUnits::fromQuantity(C->getAlignment()));

	  llvm::Constant *Zero = llvm::Constant::getNullValue(Int32Ty);
	  llvm::Constant *Zeros[] = { Zero, Zero };

	  // If we don't already have it, get __CFConstantStringClassReference.
	  if (!CFConstantStringClassRef) {
	    llvm::Type *Ty = getTypes().ConvertType(getContext().IntTy);
	    Ty = llvm::ArrayType::get(Ty, 0);
	    llvm::Constant *GV =
	        CreateRuntimeVariable(Ty, "__CFConstantStringClassReference");

	    if (getTriple().isOSBinFormatCOFF()) {
	      IdentifierInfo &II = getContext().Idents.get(GV->getName());
	      TranslationUnitDecl *TUDecl = getContext().getTranslationUnitDecl();
	      DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
	      llvm::GlobalValue *CGV = cast<llvm::GlobalValue>(GV);

	      // Find a user-provided variable declaration of the class reference.
	      const VarDecl *VD = nullptr;
	      for (const auto &Result : DC->lookup(&II))
	        if ((VD = dyn_cast<VarDecl>(Result)))
	          break;

	      // dllimport unless the user declared it dllexport.
	      if (!VD || !VD->hasAttr<DLLExportAttr>()) {
	        CGV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
	        CGV->setLinkage(llvm::GlobalValue::ExternalLinkage);
	      } else {
	        CGV->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
	        CGV->setLinkage(llvm::GlobalValue::ExternalLinkage);
	      }
	    }

	    // Decay array -> ptr
	    CFConstantStringClassRef =
	        llvm::ConstantExpr::getGetElementPtr(Ty, GV, Zeros);
	  }

	  QualType CFTy = getContext().getCFConstantStringType();

	  auto *STy = cast<llvm::StructType>(getTypes().ConvertType(CFTy));

	  ConstantInitBuilder Builder(*this);
	  auto Fields = Builder.beginStruct(STy);

	  // Class pointer.
	  Fields.add(cast<llvm::ConstantExpr>(CFConstantStringClassRef));

	  // Flags.
	  Fields.addInt(IntTy, isUTF16 ? 0x07d0 : 0x07C8);

	  // String pointer.
	  llvm::Constant *C = nullptr;
	  if (isUTF16) {
	    // The map key holds the UTF-16 code units as raw bytes.
	    auto Arr = llvm::makeArrayRef(
	        reinterpret_cast<uint16_t *>(const_cast<char *>(Entry.first().data())),
	        Entry.first().size() / 2);
	    C = llvm::ConstantDataArray::get(VMContext, Arr);
	  } else {
	    C = llvm::ConstantDataArray::getString(VMContext, Entry.first());
	  }

	  // Note: -fwritable-strings doesn't make the backing store strings of
	  // CFStrings writable. (See <rdar://problem/10657500>)
	  auto *GV =
	      new llvm::GlobalVariable(getModule(), C->getType(), /*isConstant=*/true,
	                               llvm::GlobalValue::PrivateLinkage, C, ".str");
	  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
	  // Don't enforce the target's minimum global alignment, since the only use
	  // of the string is via this class initializer.
	  CharUnits Align = isUTF16
	                        ? getContext().getTypeAlignInChars(getContext().ShortTy)
	                        : getContext().getTypeAlignInChars(getContext().CharTy);
	  GV->setAlignment(Align.getQuantity());

	  // FIXME: We set the section explicitly to avoid a bug in ld64 224.1.
	  // Without it LLVM can merge the string with a non unnamed_addr one during
	  // LTO.  Doing that changes the section it ends in, which surprises ld64.
	  if (getTriple().isOSBinFormatMachO())
	    GV->setSection(isUTF16 ? "__TEXT,__ustring"
	                           : "__TEXT,__cstring,cstring_literals");

	  // String.
	  llvm::Constant *Str =
	      llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);

	  if (isUTF16)
	    // Cast the UTF16 string to the correct type.
	    Str = llvm::ConstantExpr::getBitCast(Str, Int8PtrTy);
	  Fields.add(Str);

	  // String length.
	  auto Ty = getTypes().ConvertType(getContext().LongTy);
	  Fields.addInt(cast<llvm::IntegerType>(Ty), StringLength);

	  CharUnits Alignment = getPointerAlign();

	  // The struct.
	  GV = Fields.finishAndCreateGlobal("_unnamed_cfstring_", Alignment,
	                                    /*isConstant=*/false,
	                                    llvm::GlobalVariable::PrivateLinkage);
	  switch (getTriple().getObjectFormat()) {
	  case llvm::Triple::UnknownObjectFormat:
	    llvm_unreachable("unknown file format");
	  case llvm::Triple::COFF:
	  case llvm::Triple::ELF:
	  case llvm::Triple::Wasm:
	    GV->setSection("cfstring");
	    break;
	  case llvm::Triple::MachO:
	    GV->setSection("__DATA,__cfstring");
	    break;
	  }
	  Entry.second = GV;

	  return ConstantAddress(GV, Alignment);
	}

	/// Returns true unless CodeGenOpts.EmitCodeView is set while
	/// CodeGenOpts.DebugColumnInfo is not.
	bool CodeGenModule::getExpressionLocationsEnabled() const {
	  return !CodeGenOpts.EmitCodeView || CodeGenOpts.DebugColumnInfo;
	}

	/// Return the type of the implicit record "__objcFastEnumerationState",
	/// building and caching it on first use. The record has four public fields:
	/// an unsigned long, a pointer to id, a pointer to unsigned long, and an
	/// array of five unsigned longs.
	QualType CodeGenModule::getObjCFastEnumerationStateType() {
	  if (ObjCFastEnumerationStateType.isNull()) {
	    RecordDecl *D = Context.buildImplicitRecord("__objcFastEnumerationState");
	    D->startDefinition();

	    QualType FieldTypes[] = {
	      Context.UnsignedLongTy,
	      Context.getPointerType(Context.getObjCIdType()),
	      Context.getPointerType(Context.UnsignedLongTy),
	      Context.getConstantArrayType(Context.UnsignedLongTy,
	                           llvm::APInt(32, 5), ArrayType::Normal, 0)
	    };

	    // Iterate over the array itself so the field count cannot drift from
	    // the list above.
	    for (QualType FieldTy : FieldTypes) {
	      FieldDecl *Field = FieldDecl::Create(Context,
	                                           D,
	                                           SourceLocation(),
	                                           SourceLocation(), nullptr,
	                                           FieldTy, /*TInfo=*/nullptr,
	                                           /*BitWidth=*/nullptr,
	                                           /*Mutable=*/false,
	                                           ICIS_NoInit);
	      Field->setAccess(AS_public);
	      D->addDecl(Field);
	    }

	    D->completeDefinition();
	    ObjCFastEnumerationStateType = Context.getTagDeclType(D);
	  }

	  return ObjCFastEnumerationStateType;
	}

	/// Build a constant data array holding the contents of string literal \p E,
	/// sized to the literal's array type (extra elements are zero-filled by the
	/// resize). Handles 1-, 2- and 4-byte character widths.
	llvm::Constant *
	CodeGenModule::GetConstantArrayFromStringLiteral(const StringLiteral *E) {
	  assert(!E->getType()->isPointerType() && "Strings are always arrays");

	  // The string data itself is emitted as an inline array, not as the
	  // address of the string.
	  if (E->getCharByteWidth() == 1) {
	    SmallString<64> Bytes(E->getString());

	    // The array type of the literal dictates the final size.
	    const ConstantArrayType *ArrayTy =
	        Context.getAsConstantArrayType(E->getType());
	    Bytes.resize(ArrayTy->getSize().getZExtValue());
	    return llvm::ConstantDataArray::getString(VMContext, Bytes, false);
	  }

	  auto *LLVMArrayTy =
	      cast<llvm::ArrayType>(getTypes().ConvertType(E->getType()));
	  llvm::Type *ElemTy = LLVMArrayTy->getElementType();
	  unsigned NumElements = LLVMArrayTy->getNumElements();

	  // Wide strings have either 2-byte or 4-byte elements.
	  if (ElemTy->getPrimitiveSizeInBits() == 16) {
	    SmallVector<uint16_t, 32> Units;
	    Units.reserve(NumElements);

	    const unsigned Length = E->getLength();
	    for (unsigned Idx = 0; Idx != Length; ++Idx)
	      Units.push_back(E->getCodeUnit(Idx));
	    Units.resize(NumElements);
	    return llvm::ConstantDataArray::get(VMContext, Units);
	  }

	  assert(ElemTy->getPrimitiveSizeInBits() == 32);
	  SmallVector<uint32_t, 32> Units;
	  Units.reserve(NumElements);

	  const unsigned Length = E->getLength();
	  for (unsigned Idx = 0; Idx != Length; ++Idx)
	    Units.push_back(E->getCodeUnit(Idx));
	  Units.resize(NumElements);
	  return llvm::ConstantDataArray::get(VMContext, Units);
	}

	/// Create the global variable that backs a string literal.
	///
	/// \param C the initializer.
	/// \param LT linkage for the new global.
	/// \param CGM module being generated.
	/// \param GlobalName name for the new global.
	/// \param Alignment alignment to set on the new global.
	///
	/// The global is marked unnamed_addr, is constant unless writable strings
	/// are enabled, and is put in a comdat when its linkage is weak for the
	/// linker.
	static llvm::GlobalVariable *
	GenerateStringLiteral(llvm::Constant *C, llvm::GlobalValue::LinkageTypes LT,
	                      CodeGenModule &CGM, StringRef GlobalName,
	                      CharUnits Alignment) {
	  // OpenCL v1.2 s6.5.3: a string literal is in the constant address space.
	  const unsigned AddrSpace =
	      CGM.getLangOpts().OpenCL
	          ? CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant)
	          : 0;

	  llvm::Module &Mod = CGM.getModule();
	  const bool IsConstant = !CGM.getLangOpts().WritableStrings;

	  auto *Global = new llvm::GlobalVariable(
	      Mod, C->getType(), IsConstant, LT, C, GlobalName,
	      nullptr, llvm::GlobalVariable::NotThreadLocal, AddrSpace);
	  Global->setAlignment(Alignment.getQuantity());
	  Global->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);

	  if (Global->isWeakForLinker()) {
	    assert(CGM.supportsCOMDAT() && "Only COFF uses weak string literals");
	    Global->setComdat(Mod.getOrInsertComdat(Global->getName()));
	  }

	  return Global;
	}

	/// GetAddrOfConstantStringFromLiteral - Return the address of a constant
	/// array holding string literal \p S. Unless writable strings are enabled,
	/// identical literals share one global through ConstantStringMap; \p Name
	/// is used for the global when the literal is not mangled.
	ConstantAddress
	CodeGenModule::GetAddrOfConstantStringFromLiteral(const StringLiteral *S,
	                                                  StringRef Name) {
	  CharUnits Alignment = getContext().getAlignOfGlobalVarInChars(S->getType());
	  llvm::Constant *Init = GetConstantArrayFromStringLiteral(S);

	  // Constant strings are uniqued; the slot stays null for writable strings.
	  llvm::GlobalVariable **Slot = nullptr;
	  if (!LangOpts.WritableStrings) {
	    Slot = &ConstantStringMap[Init];
	    if (llvm::GlobalVariable *Cached = *Slot) {
	      // Raise the cached global's alignment if this use needs more.
	      if (Alignment.getQuantity() > Cached->getAlignment())
	        Cached->setAlignment(Alignment.getQuantity());
	      return ConstantAddress(Cached, Alignment);
	    }
	  }

	  SmallString<256> MangledNameBuffer;
	  StringRef GlobalVariableName = Name;
	  llvm::GlobalValue::LinkageTypes LT = llvm::GlobalValue::PrivateLinkage;

	  // Mangle the string literal if the ABI allows for it.  However, we cannot
	  // do this if we are compiling with ASan or -fwritable-strings because they
	  // rely on strings having normal linkage.
	  if (!LangOpts.WritableStrings &&
	      !LangOpts.Sanitize.has(SanitizerKind::Address) &&
	      getCXXABI().getMangleContext().shouldMangleStringLiteral(S)) {
	    llvm::raw_svector_ostream Out(MangledNameBuffer);
	    getCXXABI().getMangleContext().mangleStringLiteral(S, Out);

	    LT = llvm::GlobalValue::LinkOnceODRLinkage;
	    GlobalVariableName = MangledNameBuffer;
	  }

	  llvm::GlobalVariable *Global =
	      GenerateStringLiteral(Init, LT, *this, GlobalVariableName, Alignment);
	  if (Slot)
	    *Slot = Global;

	  SanitizerMD->reportGlobalToASan(Global, S->getStrTokenLoc(0),
	                                  "<string literal>", QualType());
	  return ConstantAddress(Global, Alignment);
	}

	/// GetAddrOfConstantStringFromObjCEncode - Return the address of a constant
	/// C string holding the Objective-C type encoding of the type named by the
	/// ObjCEncodeExpr node \p E.
	ConstantAddress
	CodeGenModule::GetAddrOfConstantStringFromObjCEncode(const ObjCEncodeExpr *E) {
	  std::string Encoding;
	  getContext().getObjCEncodingForType(E->getEncodedType(), Encoding);
	  return GetAddrOfConstantCString(Encoding);
	}

	/// GetAddrOfConstantCString - Returns a pointer to a character array
	/// containing \p Str and a terminating '\0' character.
	/// The result has pointer to array type. When \p GlobalName is null the
	/// global is named ".str".
	ConstantAddress CodeGenModule::GetAddrOfConstantCString(
	    const std::string &Str, const char *GlobalName) {
	  // Include the terminating NUL in the emitted data.
	  StringRef StrWithNull(Str.c_str(), Str.size() + 1);
	  CharUnits Alignment =
	      getContext().getAlignOfGlobalVarInChars(getContext().CharTy);

	  llvm::Constant *Init =
	      llvm::ConstantDataArray::getString(getLLVMContext(), StrWithNull, false);

	  // String literals are only shared when strings are constant.
	  llvm::GlobalVariable **Slot = nullptr;
	  if (!LangOpts.WritableStrings) {
	    Slot = &ConstantStringMap[Init];
	    if (llvm::GlobalVariable *Cached = *Slot) {
	      if (Alignment.getQuantity() > Cached->getAlignment())
	        Cached->setAlignment(Alignment.getQuantity());
	      return ConstantAddress(Cached, Alignment);
	    }
	  }

	  // Fall back to the default prefix when no name was given.
	  if (!GlobalName)
	    GlobalName = ".str";

	  // Create a global variable for this.
	  llvm::GlobalVariable *Global = GenerateStringLiteral(
	      Init, llvm::GlobalValue::PrivateLinkage, *this, GlobalName, Alignment);
	  if (Slot)
	    *Slot = Global;
	  return ConstantAddress(Global, Alignment);
	}

	/// Return the address of the global variable backing a lifetime-extended
	/// temporary with static or thread storage duration, creating and caching
	/// it in MaterializedGlobalTemporaryMap on first use.
	///
	/// \param E the MaterializeTemporaryExpr being materialized.
	/// \param Init the initializer expression; its type is used for the global
	///        unless it is E's own temporary expression, in which case E's
	///        type (with its cv-qualifiers) is used.
	ConstantAddress CodeGenModule::GetAddrOfGlobalTemporary(
	    const MaterializeTemporaryExpr *E, const Expr *Init) {
	  assert((E->getStorageDuration() == SD_Static ||
	          E->getStorageDuration() == SD_Thread) && "not a global temporary");
	  const auto *VD = cast<VarDecl>(E->getExtendingDecl());

	  // If we're not materializing a subobject of the temporary, keep the
	  // cv-qualifiers from the type of the MaterializeTemporaryExpr.
	  QualType MaterializedType = Init->getType();
	  if (Init == E->GetTemporaryExpr())
	    MaterializedType = E->getType();

	  CharUnits Align = getContext().getTypeAlignInChars(MaterializedType);

	  if (llvm::Constant *Slot = MaterializedGlobalTemporaryMap[E])
	    return ConstantAddress(Slot, Align);

	  // FIXME: If an externally-visible declaration extends multiple temporaries,
	  // we need to give each temporary the same name in every translation unit (and
	  // we also need to make the temporaries externally-visible).
	  SmallString<256> Name;
	  llvm::raw_svector_ostream Out(Name);
	  getCXXABI().getMangleContext().mangleReferenceTemporary(
	      VD, E->getManglingNumber(), Out);

	  APValue *Value = nullptr;
	  if (E->getStorageDuration() == SD_Static) {
	    // We might have a cached constant initializer for this temporary. Note
	    // that this might have a different value from the value computed by
	    // evaluating the initializer if the surrounding constant expression
	    // modifies the temporary.
	    Value = getContext().getMaterializedTemporaryValue(E, false);
	    if (Value && Value->isUninit())
	      Value = nullptr;
	  }

	  // Try evaluating it now, it might have a constant initializer.
	  Expr::EvalResult EvalResult;
	  if (!Value && Init->EvaluateAsRValue(EvalResult, getContext()) &&
	      !EvalResult.hasSideEffects())
	    Value = &EvalResult.Val;

	  LangAS AddrSpace =
	      VD ? GetGlobalVarAddressSpace(VD) : MaterializedType.getAddressSpace();

	  Optional<ConstantEmitter> emitter;
	  llvm::Constant *InitialValue = nullptr;
	  bool Constant = false;
	  llvm::Type *Type;
	  if (Value) {
	    // The temporary has a constant initializer, use it.
	    emitter.emplace(*this);
	    InitialValue = emitter->emitForInitializer(*Value, AddrSpace,
	                                               MaterializedType);
	    Constant = isTypeConstant(MaterializedType, /*ExcludeCtor*/Value);
	    Type = InitialValue->getType();
	  } else {
	    // No initializer, the initialization will be provided when we
	    // initialize the declaration which performed lifetime extension.
	    Type = getTypes().ConvertTypeForMem(MaterializedType);
	  }

	  // Create a global variable for this lifetime-extended temporary.
	  llvm::GlobalValue::LinkageTypes Linkage =
	      getLLVMLinkageVarDefinition(VD, Constant);
	  if (Linkage == llvm::GlobalVariable::ExternalLinkage) {
	    const VarDecl *InitVD;
	    if (VD->isStaticDataMember() && VD->getAnyInitializer(InitVD) &&
	        isa<CXXRecordDecl>(InitVD->getLexicalDeclContext())) {
	      // Temporaries defined inside a class get linkonce_odr linkage because the
	      // class can be defined in multiple translation units.
	      Linkage = llvm::GlobalVariable::LinkOnceODRLinkage;
	    } else {
	      // There is no need for this temporary to have external linkage if the
	      // VarDecl has external linkage.
	      Linkage = llvm::GlobalVariable::InternalLinkage;
	    }
	  }
	  auto TargetAS = getContext().getTargetAddressSpace(AddrSpace);
	  auto *GV = new llvm::GlobalVariable(
	      getModule(), Type, Constant, Linkage, InitialValue, Name.c_str(),
	      /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal, TargetAS);
	  if (emitter) emitter->finalize(GV);
	  setGlobalVisibility(GV, VD, ForDefinition);
	  GV->setAlignment(Align.getQuantity());
	  if (supportsCOMDAT() && GV->isWeakForLinker())
	    GV->setComdat(TheModule.getOrInsertComdat(GV->getName()));
	  if (VD->getTLSKind())
	    setTLSMode(GV, *VD);
	  // Hand back a pointer in the default address space.
	  llvm::Constant *CV = GV;
	  if (AddrSpace != LangAS::Default)
	    CV = getTargetCodeGenInfo().performAddrSpaceCast(
	        *this, GV, AddrSpace, LangAS::Default,
	        Type->getPointerTo(
	            getContext().getTargetAddressSpace(LangAS::Default)));
	  MaterializedGlobalTemporaryMap[E] = CV;
	  return ConstantAddress(CV, Align);
	}

	/// EmitObjCPropertyImplementations - Emit information for synthesized
	/// properties for an implementation.
	void CodeGenModule::EmitObjCPropertyImplementations(const
	ObjCImplementationDecl *D) {
	for (const auto *PID : D->property_impls()) {
	// Dynamic is just for type-checking.
	if (PID->getPropertyImplementation() == ObjCPropertyImplDecl::Synthesize) {
	ObjCPropertyDecl *PD = PID->getPropertyDecl();

	// Determine which methods need to be implemented, some may have
	// been overridden. Note that ::isPropertyAccessor is not the method
	// we want, that just indicates if the decl came from a
	// property. What we want to know is if the method is defined in
	// this implementation.
	if (!D->getInstanceMethod(PD->getGetterName()))
	CodeGenFunction(*this).GenerateObjCGetter(
	const_cast<ObjCImplementationDecl *>(D), PID);
	if (!PD->isReadOnly() &&
	!D->getInstanceMethod(PD->getSetterName()))
	CodeGenFunction(*this).GenerateObjCSetter(
	const_cast<ObjCImplementationDecl *>(D), PID);
	}
	}
	}

	/// Returns true if any declared ivar of the class interface of \p impl has
	/// a type that is a destructed type.
	static bool needsDestructMethod(ObjCImplementationDecl *impl) {
	  const ObjCInterfaceDecl *iface = impl->getClassInterface();
	  const ObjCIvarDecl *ivar = iface->all_declared_ivar_begin();
	  while (ivar) {
	    if (ivar->getType().isDestructedType())
	      return true;
	    ivar = ivar->getNextIvar();
	  }
	  return false;
	}

	/// Returns true if every ivar initializer of \p D is considered trivial by
	/// CodeGenFunction::isTrivialInitializer (vacuously true when there are
	/// none).
	static bool AllTrivialInitializers(CodeGenModule &CGM,
	                                   ObjCImplementationDecl *D) {
	  CodeGenFunction CGF(CGM);
	  for (ObjCImplementationDecl::init_iterator B = D->init_begin(),
	       E = D->init_end(); B != E; ++B) {
	    CXXCtorInitializer *CtorInitExp = *B;
	    Expr *Init = CtorInitExp->getInit();
	    if (!CGF.isTrivialInitializer(Init))
	      return false;
	  }
	  return true;
	}

	/// EmitObjCIvarInitializations - Emit information for ivar initialization
	/// for an implementation.
	///
	/// Adds and generates an implicit ".cxx_destruct" instance method when any
	/// ivar needs destruction, and an implicit ".cxx_construct" method when the
	/// implementation has at least one non-trivial ivar initializer.
	void CodeGenModule::EmitObjCIvarInitializations(ObjCImplementationDecl *D) {
	  // We might need a .cxx_destruct even if we don't have any ivar initializers.
	  if (needsDestructMethod(D)) {
	    IdentifierInfo *II = &getContext().Idents.get(".cxx_destruct");
	    Selector cxxSelector = getContext().Selectors.getSelector(0, &II);
	    ObjCMethodDecl *DTORMethod =
	      ObjCMethodDecl::Create(getContext(), D->getLocation(), D->getLocation(),
	                             cxxSelector, getContext().VoidTy, nullptr, D,
	                             /*isInstance=*/true, /*isVariadic=*/false,
	                          /*isPropertyAccessor=*/true, /*isImplicitlyDeclared=*/true,
	                             /*isDefined=*/false, ObjCMethodDecl::Required);
	    D->addInstanceMethod(DTORMethod);
	    CodeGenFunction(*this).GenerateObjCCtorDtorMethod(D, DTORMethod, false);
	    D->setHasDestructors(true);
	  }

	  // If the implementation doesn't have any ivar initializers, we don't need
	  // a .cxx_construct.
	  if (D->getNumIvarInitializers() == 0 ||
	      AllTrivialInitializers(*this, D))
	    return;

	  IdentifierInfo *II = &getContext().Idents.get(".cxx_construct");
	  Selector cxxSelector = getContext().Selectors.getSelector(0, &II);
	  // The constructor returns 'self'.
	  ObjCMethodDecl *CTORMethod = ObjCMethodDecl::Create(getContext(),
	                                                D->getLocation(),
	                                                D->getLocation(),
	                                                cxxSelector,
	                                                getContext().getObjCIdType(),
	                                                nullptr, D, /*isInstance=*/true,
	                                                /*isVariadic=*/false,
	                                                /*isPropertyAccessor=*/true,
	                                                /*isImplicitlyDeclared=*/true,
	                                                /*isDefined=*/false,
	                                                ObjCMethodDecl::Required);
	  D->addInstanceMethod(CTORMethod);
	  CodeGenFunction(*this).GenerateObjCCtorDtorMethod(D, CTORMethod, true);
	  D->setHasNonZeroConstructors(true);
	}

	// EmitLinkageSpec - Emit every declaration contained in a linkage spec.
	// Only the C and C++ language specs are handled; anything else is reported
	// through ErrorUnsupported and nothing is emitted.
	void CodeGenModule::EmitLinkageSpec(const LinkageSpecDecl *LSD) {
	  const bool IsC = LSD->getLanguage() == LinkageSpecDecl::lang_c;
	  const bool IsSupported =
	      IsC || LSD->getLanguage() == LinkageSpecDecl::lang_cxx;

	  if (IsSupported) {
	    EmitDeclContext(LSD);
	    return;
	  }

	  ErrorUnsupported(LSD, "linkage spec");
	}

	/// Emits each declaration held by \p DC through EmitTopLevelDecl.
	void CodeGenModule::EmitDeclContext(const DeclContext *DC) {
	  for (auto *Child : DC->decls()) {
	    // At TU scope the members of an ObjCImplDecl count as "top-level"
	    // themselves, so EmitTopLevelDecl on the ObjCImplDecl does not recurse
	    // into them. When the ObjCImplDecl sits inside another construct
	    // (LinkageSpecDecl / ExportDecl) they are no longer considered
	    // "top-level", so its methods are visited here explicitly.
	    auto *Impl = dyn_cast<ObjCImplDecl>(Child);
	    if (Impl)
	      for (auto *Method : Impl->methods())
	        EmitTopLevelDecl(Method);

	    EmitTopLevelDecl(Child);
	  }
	}

	/// EmitTopLevelDecl - Emit code for a single top level declaration.
	///
	/// Dispatches on D->getKind(). Kinds that need no IR simply leave the switch;
	/// any kind not listed is asserted to be a TypeDecl in the default case.
	// NOTE(review): lines below that start with '-' or '+' are unified-diff
	// markers from the surrounding patch, not C++ operators. The '-' lines are the
	// per-kind template checks being removed; the single '+' line is the
	// isTemplated() test that appears to supersede them.
	void CodeGenModule::EmitTopLevelDecl(Decl *D) {
	// Ignore dependent declarations.
	- if (D->getDeclContext() && D->getDeclContext()->isDependentContext())
	+ if (D->isTemplated())
	return;

	switch (D->getKind()) {
	case Decl::CXXConversion:
	case Decl::CXXMethod:
	case Decl::Function:
	- // Skip function templates
	- if (cast<FunctionDecl>(D)->getDescribedFunctionTemplate() \|\|
	- cast<FunctionDecl>(D)->isLateTemplateParsed())
	- return;
	-
	EmitGlobal(cast<FunctionDecl>(D));
	// Always provide some coverage mapping
	// even for the functions that aren't emitted.
	AddDeferredUnusedCoverageMapping(D);
	break;

	case Decl::CXXDeductionGuide:
	// Function-like, but does not result in code emission.
	break;

	case Decl::Var:
	case Decl::Decomposition:
	- // Skip variable templates
	- if (cast<VarDecl>(D)->getDescribedVarTemplate())
	- return;
	- LLVM_FALLTHROUGH;
	case Decl::VarTemplateSpecialization:
	EmitGlobal(cast<VarDecl>(D));
	// For a decomposition, also emit the holding variable of each binding that
	// has one.
	if (auto *DD = dyn_cast<DecompositionDecl>(D))
	for (auto *B : DD->bindings())
	if (auto *HD = B->getHoldingVar())
	EmitGlobal(HD);
	break;

	// Indirect fields from global anonymous structs and unions can be
	// ignored; only the actual variable requires IR gen support.
	case Decl::IndirectField:
	break;

	// C++ Decls
	case Decl::Namespace:
	EmitDeclContext(cast<NamespaceDecl>(D));
	break;
	case Decl::ClassTemplateSpecialization: {
	const auto *Spec = cast<ClassTemplateSpecializationDecl>(D);
	if (DebugInfo &&
	Spec->getSpecializationKind() == TSK_ExplicitInstantiationDefinition &&
	Spec->hasDefinition())
	DebugInfo->completeTemplateDefinition(*Spec);
	} LLVM_FALLTHROUGH;
	case Decl::CXXRecord:
	if (DebugInfo) {
	if (auto *ES = D->getASTContext().getExternalSource())
	if (ES->hasExternalDefinitions(D) == ExternalASTSource::EK_Never)
	DebugInfo->completeUnusedClass(cast<CXXRecordDecl>(*D));
	}
	// Emit any static data members, they may be definitions.
	for (auto *I : cast<CXXRecordDecl>(D)->decls())
	if (isa<VarDecl>(I) \|\| isa<CXXRecordDecl>(I))
	EmitTopLevelDecl(I);
	break;
	// No code generation needed.
	case Decl::UsingShadow:
	case Decl::ClassTemplate:
	case Decl::VarTemplate:
	case Decl::VarTemplatePartialSpecialization:
	case Decl::FunctionTemplate:
	case Decl::TypeAliasTemplate:
	case Decl::Block:
	case Decl::Empty:
	break;
	case Decl::Using: // using X; [C++]
	if (CGDebugInfo *DI = getModuleDebugInfo())
	DI->EmitUsingDecl(cast<UsingDecl>(*D));
	return;
	case Decl::NamespaceAlias:
	if (CGDebugInfo *DI = getModuleDebugInfo())
	DI->EmitNamespaceAlias(cast<NamespaceAliasDecl>(*D));
	return;
	case Decl::UsingDirective: // using namespace X; [C++]
	if (CGDebugInfo *DI = getModuleDebugInfo())
	DI->EmitUsingDirective(cast<UsingDirectiveDecl>(*D));
	return;
	case Decl::CXXConstructor:
	- // Skip function templates
	- if (cast<FunctionDecl>(D)->getDescribedFunctionTemplate() \|\|
	- cast<FunctionDecl>(D)->isLateTemplateParsed())
	- return;
	-
	getCXXABI().EmitCXXConstructors(cast<CXXConstructorDecl>(D));
	break;
	case Decl::CXXDestructor:
	- if (cast<FunctionDecl>(D)->isLateTemplateParsed())
	- return;
	getCXXABI().EmitCXXDestructors(cast<CXXDestructorDecl>(D));
	break;

	case Decl::StaticAssert:
	// Nothing to do.
	break;

	// Objective-C Decls

	// Forward declarations, no (immediate) code generation.
	case Decl::ObjCInterface:
	case Decl::ObjCCategory:
	break;

	case Decl::ObjCProtocol: {
	auto *Proto = cast<ObjCProtocolDecl>(D);
	if (Proto->isThisDeclarationADefinition())
	ObjCRuntime->GenerateProtocol(Proto);
	break;
	}

	case Decl::ObjCCategoryImpl:
	// Categories have properties but don't support synthesize so we
	// can ignore them here.
	ObjCRuntime->GenerateCategory(cast<ObjCCategoryImplDecl>(D));
	break;

	case Decl::ObjCImplementation: {
	auto *OMD = cast<ObjCImplementationDecl>(D);
	EmitObjCPropertyImplementations(OMD);
	EmitObjCIvarInitializations(OMD);
	ObjCRuntime->GenerateClass(OMD);
	// Emit global variable debug information.
	if (CGDebugInfo *DI = getModuleDebugInfo())
	if (getCodeGenOpts().getDebugInfo() >= codegenoptions::LimitedDebugInfo)
	DI->getOrCreateInterfaceType(getContext().getObjCInterfaceType(
	OMD->getClassInterface()), OMD->getLocation());
	break;
	}
	case Decl::ObjCMethod: {
	auto *OMD = cast<ObjCMethodDecl>(D);
	// If this is not a prototype, emit the body.
	if (OMD->getBody())
	CodeGenFunction(*this).GenerateObjCMethod(OMD);
	break;
	}
	case Decl::ObjCCompatibleAlias:
	ObjCRuntime->RegisterAlias(cast<ObjCCompatibleAliasDecl>(D));
	break;

	case Decl::PragmaComment: {
	const auto *PCD = cast<PragmaCommentDecl>(D);
	switch (PCD->getCommentKind()) {
	case PCK_Unknown:
	llvm_unreachable("unexpected pragma comment kind");
	case PCK_Linker:
	AppendLinkerOptions(PCD->getArg());
	break;
	case PCK_Lib:
	AddDependentLib(PCD->getArg());
	break;
	case PCK_Compiler:
	case PCK_ExeStr:
	case PCK_User:
	break; // We ignore all of these.
	}
	break;
	}

	case Decl::PragmaDetectMismatch: {
	const auto *PDMD = cast<PragmaDetectMismatchDecl>(D);
	AddDetectMismatch(PDMD->getName(), PDMD->getValue());
	break;
	}

	case Decl::LinkageSpec:
	EmitLinkageSpec(cast<LinkageSpecDecl>(D));
	break;

	case Decl::FileScopeAsm: {
	// File-scope asm is ignored during device-side CUDA compilation.
	if (LangOpts.CUDA && LangOpts.CUDAIsDevice)
	break;
	// File-scope asm is ignored during device-side OpenMP compilation.
	if (LangOpts.OpenMPIsDevice)
	break;
	auto *AD = cast<FileScopeAsmDecl>(D);
	getModule().appendModuleInlineAsm(AD->getAsmString()->getString());
	break;
	}

	case Decl::Import: {
	auto *Import = cast<ImportDecl>(D);

	// If we've already imported this module, we're done.
	if (!ImportedModules.insert(Import->getImportedModule()))
	break;

	// Emit debug information for direct imports.
	if (!Import->getImportedOwningModule()) {
	if (CGDebugInfo *DI = getModuleDebugInfo())
	DI->EmitImportDecl(*Import);
	}

	// Find all of the submodules and emit the module initializers.
	// Visited guards what gets pushed on Stack; EmittedModuleInitializers guards
	// against emitting a module's initializers more than once.
	llvm::SmallPtrSet<clang::Module *, 16> Visited;
	SmallVector<clang::Module *, 16> Stack;
	Visited.insert(Import->getImportedModule());
	Stack.push_back(Import->getImportedModule());

	while (!Stack.empty()) {
	clang::Module *Mod = Stack.pop_back_val();
	if (!EmittedModuleInitializers.insert(Mod).second)
	continue;

	for (auto *D : Context.getModuleInitializers(Mod))
	EmitTopLevelDecl(D);

	// Visit the submodules of this module.
	for (clang::Module::submodule_iterator Sub = Mod->submodule_begin(),
	SubEnd = Mod->submodule_end();
	Sub != SubEnd; ++Sub) {
	// Skip explicit children; they need to be explicitly imported to emit
	// the initializers.
	if ((*Sub)->IsExplicit)
	continue;

	if (Visited.insert(*Sub).second)
	Stack.push_back(*Sub);
	}
	}
	break;
	}

	case Decl::Export:
	EmitDeclContext(cast<ExportDecl>(D));
	break;

	case Decl::OMPThreadPrivate:
	EmitOMPThreadPrivateDecl(cast<OMPThreadPrivateDecl>(D));
	break;

	case Decl::OMPDeclareReduction:
	EmitOMPDeclareReduction(cast<OMPDeclareReductionDecl>(D));
	break;

	default:
	// Make sure we handled everything we should, every other kind is a
	// non-top-level decl. FIXME: Would be nice to have an isTopLevelDeclKind
	// function. Need to recode Decl::Kind to do that easily.
	assert(isa<TypeDecl>(D) && "Unsupported decl kind");
	break;
	}
	}

	/// Records \p D as a candidate for an empty coverage mapping, unless it is
	/// already tracked. Does nothing when coverage mapping is disabled, when the
	/// declaration kind is not function-like, when the declaration has no body,
	/// or when LimitedCoverage is set and \p D starts outside the main file.
	void CodeGenModule::AddDeferredUnusedCoverageMapping(Decl *D) {
	  // Do we need to generate coverage mapping?
	  if (!CodeGenOpts.CoverageMapping)
	    return;

	  switch (D->getKind()) {
	  default:
	    return;
	  case Decl::CXXConversion:
	  case Decl::CXXMethod:
	  case Decl::Function:
	  case Decl::ObjCMethod:
	  case Decl::CXXConstructor:
	  case Decl::CXXDestructor:
	    break;
	  }

	  if (!cast<FunctionDecl>(D)->doesThisDeclarationHaveABody())
	    return;

	  SourceManager &SrcMgr = getContext().getSourceManager();
	  if (LimitedCoverage &&
	      SrcMgr.getMainFileID() != SrcMgr.getFileID(D->getLocStart()))
	    return;

	  // Only add a fresh entry; an existing one keeps whatever flag it has.
	  auto Pos = DeferredEmptyCoverageMappingDecls.find(D);
	  if (Pos != DeferredEmptyCoverageMappingDecls.end())
	    return;
	  DeferredEmptyCoverageMappingDecls[D] = true;
	}

	/// Marks \p D (and, for a function template instantiation, its instantiation
	/// pattern as well) as not needing an empty coverage mapping. The entry is
	/// created with a false flag when it does not exist yet.
	void CodeGenModule::ClearUnusedCoverageMapping(const Decl *D) {
	  // Do we need to generate coverage mapping?
	  if (!CodeGenOpts.CoverageMapping)
	    return;

	  const auto *Fn = dyn_cast<FunctionDecl>(D);
	  if (Fn && Fn->isTemplateInstantiation())
	    ClearUnusedCoverageMapping(Fn->getTemplateInstantiationPattern());

	  // operator[] inserts a missing key, so this covers both the "new entry"
	  // and the "existing entry" case with a single store.
	  DeferredEmptyCoverageMappingDecls[D] = false;
	}

	void CodeGenModule::EmitDeferredUnusedCoverageMappings() {
	// We call takeVector() here to avoid use-after-free.
	// FIXME: DeferredEmptyCoverageMappingDecls is getting mutated because
	// we deserialize function bodies to emit coverage info for them, and that
	// deserializes more declarations. How should we handle that case?
	for (const auto &Entry : DeferredEmptyCoverageMappingDecls.takeVector()) {
	if (!Entry.second)
	continue;
	const Decl *D = Entry.first;
	switch (D->getKind()) {
	case Decl::CXXConversion:
	case Decl::CXXMethod:
	case Decl::Function:
	case Decl::ObjCMethod: {
	CodeGenPGO PGO(*this);
	GlobalDecl GD(cast<FunctionDecl>(D));
	PGO.emitEmptyCounterMapping(D, getMangledName(GD),
	getFunctionLinkage(GD));
	break;
	}
	case Decl::CXXConstructor: {
	CodeGenPGO PGO(*this);
	GlobalDecl GD(cast<CXXConstructorDecl>(D), Ctor_Base);
	PGO.emitEmptyCounterMapping(D, getMangledName(GD),
	getFunctionLinkage(GD));
	break;
	}
	case Decl::CXXDestructor: {
	CodeGenPGO PGO(*this);
	GlobalDecl GD(cast<CXXDestructorDecl>(D), Dtor_Base);
	PGO.emitEmptyCounterMapping(D, getMangledName(GD),
	getFunctionLinkage(GD));
	break;
	}
	default:
	break;
	};
	}
	}

	/// Encodes the numeric value of \p Ptr as a 64-bit integer constant.
	static llvm::Constant *GetPointerConstant(llvm::LLVMContext &Context,
	                                          const void *Ptr) {
	  llvm::Type *Int64 = llvm::Type::getInt64Ty(Context);
	  const uintptr_t Bits = reinterpret_cast<uintptr_t>(Ptr);
	  return llvm::ConstantInt::get(Int64, Bits);
	}

	/// Appends an (address, decl-pointer) pair to the 'clang.global.decl.ptrs'
	/// named metadata, creating that node through \p GlobalMetadata on first use.
	static void EmitGlobalDeclMetadata(CodeGenModule &CGM,
	                                   llvm::NamedMDNode *&GlobalMetadata,
	                                   GlobalDecl D,
	                                   llvm::GlobalValue *Addr) {
	  if (GlobalMetadata == nullptr) {
	    GlobalMetadata =
	        CGM.getModule().getOrInsertNamedMetadata("clang.global.decl.ptrs");
	  }

	  // TODO: should we report variant information for ctors/dtors?
	  llvm::Metadata *Pair[] = {
	      llvm::ConstantAsMetadata::get(Addr),
	      llvm::ConstantAsMetadata::get(
	          GetPointerConstant(CGM.getLLVMContext(), D.getDecl()))};
	  llvm::MDNode *Node = llvm::MDNode::get(CGM.getLLVMContext(), Pair);
	  GlobalMetadata->addOperand(Node);
	}

	/// For each function which is declared within an extern "C" region and marked
	/// as 'used', but has internal linkage, create an alias from the unmangled
	/// name to the mangled name if possible. People expect to be able to refer
	/// to such functions with an unmangled name from inline assembly within the
	/// same translation unit.
	void CodeGenModule::EmitStaticExternCAliases() {
	// Don't do anything if we're generating CUDA device code -- the NVPTX
	// assembly target doesn't support aliases.
	if (Context.getTargetInfo().getTriple().isNVPTX())
	return;
	for (auto &I : StaticExternCValues) {
	IdentifierInfo *Name = I.first;
	llvm::GlobalValue *Val = I.second;
	if (Val && !getModule().getNamedValue(Name->getName()))
	addUsedGlobal(llvm::GlobalAlias::create(Name->getName(), Val));
	}
	}

	/// Looks \p MangledName up in Manglings. On a hit, stores the mapped
	/// GlobalDecl in \p Result and returns true; on a miss, returns false and
	/// leaves \p Result untouched.
	bool CodeGenModule::lookupRepresentativeDecl(StringRef MangledName,
	                                             GlobalDecl &Result) const {
	  auto Hit = Manglings.find(MangledName);
	  const bool Found = Hit != Manglings.end();
	  if (Found)
	    Result = Hit->getValue();
	  return Found;
	}

	/// Emits metadata nodes associating all the global values in the
	/// current module with the Decls they came from. This is useful for
	/// projects using IR gen as a subroutine.
	///
	/// Since there's currently no way to associate an MDNode directly
	/// with an llvm::GlobalValue, we create a global named metadata
	/// with the name 'clang.global.decl.ptrs'.
	void CodeGenModule::EmitDeclMetadata() {
	llvm::NamedMDNode *GlobalMetadata = nullptr;

	for (auto &I : MangledDeclNames) {
	llvm::GlobalValue *Addr = getModule().getNamedValue(I.second);
	// Some mangled names don't necessarily have an associated GlobalValue
	// in this module, e.g. if we mangled it for DebugInfo.
	if (Addr)
	EmitGlobalDeclMetadata(*this, GlobalMetadata, I.first, Addr);
	}
	}

	/// Emits metadata nodes for all the local variables in the current
	/// function.
	void CodeGenFunction::EmitDeclMetadata() {
	if (LocalDeclMap.empty()) return;

	llvm::LLVMContext &Context = getLLVMContext();

	// Find the unique metadata ID for this name.
	unsigned DeclPtrKind = Context.getMDKindID("clang.decl.ptr");

	llvm::NamedMDNode *GlobalMetadata = nullptr;

	for (auto &I : LocalDeclMap) {
	const Decl *D = I.first;
	llvm::Value *Addr = I.second.getPointer();
	if (auto *Alloca = dyn_cast<llvm::AllocaInst>(Addr)) {
	llvm::Value *DAddr = GetPointerConstant(getLLVMContext(), D);
	Alloca->setMetadata(
	DeclPtrKind, llvm::MDNode::get(
	Context, llvm::ValueAsMetadata::getConstant(DAddr)));
	} else if (auto *GV = dyn_cast<llvm::GlobalValue>(Addr)) {
	GlobalDecl GD = GlobalDecl(cast<VarDecl>(D));
	EmitGlobalDeclMetadata(CGM, GlobalMetadata, GD, GV);
	}
	}
	}

	void CodeGenModule::EmitVersionIdentMetadata() {
	llvm::NamedMDNode *IdentMetadata =
	TheModule.getOrInsertNamedMetadata("llvm.ident");
	std::string Version = getClangFullVersion();
	llvm::LLVMContext &Ctx = TheModule.getContext();

	llvm::Metadata *IdentNode[] = {llvm::MDString::get(Ctx, Version)};
	IdentMetadata->addOperand(llvm::MDNode::get(Ctx, IdentNode));
	}

	void CodeGenModule::EmitTargetMetadata() {
	// Warning, new MangledDeclNames may be appended within this loop.
	// We rely on MapVector insertions adding new elements to the end
	// of the container.
	// FIXME: Move this loop into the one target that needs it, and only
	// loop over those declarations for which we couldn't emit the target
	// metadata when we emitted the declaration.
	for (unsigned I = 0; I != MangledDeclNames.size(); ++I) {
	auto Val = *(MangledDeclNames.begin() + I);
	const Decl *D = Val.first.getDecl()->getMostRecentDecl();
	llvm::GlobalValue *GV = GetGlobalValue(Val.second);
	getTargetCodeGenInfo().emitTargetMD(D, GV, *this);
	}
	}

	void CodeGenModule::EmitCoverageFile() {
	if (getCodeGenOpts().CoverageDataFile.empty() &&
	getCodeGenOpts().CoverageNotesFile.empty())
	return;

	llvm::NamedMDNode *CUNode = TheModule.getNamedMetadata("llvm.dbg.cu");
	if (!CUNode)
	return;

	llvm::NamedMDNode *GCov = TheModule.getOrInsertNamedMetadata("llvm.gcov");
	llvm::LLVMContext &Ctx = TheModule.getContext();
	auto *CoverageDataFile =
	llvm::MDString::get(Ctx, getCodeGenOpts().CoverageDataFile);
	auto *CoverageNotesFile =
	llvm::MDString::get(Ctx, getCodeGenOpts().CoverageNotesFile);
	for (int i = 0, e = CUNode->getNumOperands(); i != e; ++i) {
	llvm::MDNode *CU = CUNode->getOperand(i);
	llvm::Metadata *Elts[] = {CoverageNotesFile, CoverageDataFile, CU};
	GCov->addOperand(llvm::MDNode::get(Ctx, Elts));
	}
	}

	/// Builds an anonymous constant struct from a textual UUID: a 32-bit field,
	/// two 16-bit fields, and an array of eight 8-bit values, each parsed as
	/// hexadecimal from the corresponding slice of \p Uuid.
	llvm::Constant *CodeGenModule::EmitUuidofInitializer(StringRef Uuid) {
	  // Sema has checked that all uuid strings are of the form
	  // "12345678-1234-1234-1234-1234567890ab".
	  assert(Uuid.size() == 36);
	  for (unsigned i = 0; i < 36; ++i) {
	    if (i == 8 || i == 13 || i == 18 || i == 23) assert(Uuid[i] == '-');
	    else assert(isHexDigit(Uuid[i]));
	  }

	  // The starts of all bytes of Field3 in Uuid. Field 3 is "1234-1234567890ab".
	  // Offset 23 is the dash, so the byte starts jump from 21 to 24.
	  const unsigned Field3ValueOffsets[8] = { 19, 21, 24, 26, 28, 30, 32, 34 };

	  llvm::Constant *Field3[8];
	  for (unsigned Idx = 0; Idx < 8; ++Idx)
	    Field3[Idx] = llvm::ConstantInt::get(
	        Int8Ty, Uuid.substr(Field3ValueOffsets[Idx], 2), 16);

	  llvm::Constant *Fields[4] = {
	    llvm::ConstantInt::get(Int32Ty, Uuid.substr(0, 8), 16),
	    llvm::ConstantInt::get(Int16Ty, Uuid.substr(9, 4), 16),
	    llvm::ConstantInt::get(Int16Ty, Uuid.substr(14, 4), 16),
	    llvm::ConstantArray::get(llvm::ArrayType::get(Int8Ty, 8), Field3)
	  };

	  return llvm::ConstantStruct::getAnon(Fields);
	}

	/// Returns the RTTI descriptor address for \p Ty. When the request is for EH
	/// on an Objective-C object pointer under a GNU-family runtime, the ObjC
	/// runtime supplies it; otherwise the C++ ABI does.
	llvm::Constant *CodeGenModule::GetAddrOfRTTIDescriptor(QualType Ty,
	                                                       bool ForEH) {
	  if (ForEH) {
	    const bool UseObjCEHType = Ty->isObjCObjectPointerType() &&
	                               LangOpts.ObjCRuntime.isGNUFamily();
	    if (UseObjCEHType)
	      return ObjCRuntime->GetEHType(Ty);
	  } else if (!getLangOpts().RTTI) {
	    // Return a bogus pointer if RTTI is disabled, unless it's for EH.
	    // FIXME: should we even be calling this method if RTTI is disabled
	    // and it's not for EH?
	    return llvm::Constant::getNullValue(Int8PtrTy);
	  }

	  return getCXXABI().getAddrOfRTTIDescriptor(Ty);
	}

	/// Emits the definition of each variable listed in a threadprivate directive
	/// and queues any init function the OpenMP runtime returns onto
	/// CXXGlobalInits.
	void CodeGenModule::EmitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D) {
	  // Do not emit threadprivates in simd-only mode.
	  if (LangOpts.OpenMP && LangOpts.OpenMPSimd)
	    return;
	  for (auto RefExpr : D->varlists()) {
	    auto *VD = cast<VarDecl>(cast<DeclRefExpr>(RefExpr)->getDecl());
	    // Initialization is only requested when an initializer exists and it is
	    // not a constant initializer.
	    bool PerformInit =
	        VD->getAnyInitializer() &&
	        !VD->getAnyInitializer()->isConstantInitializer(getContext(),
	                                                        /*ForRef=*/false);

	    Address Addr(GetAddrOfGlobalVar(VD), getContext().getDeclAlign(VD));
	    if (auto InitFunction = getOpenMPRuntime().emitThreadPrivateVarDefinition(
	            VD, Addr, RefExpr->getLocStart(), PerformInit))
	      CXXGlobalInits.push_back(InitFunction);
	  }
	}

	/// Returns the metadata identifier for the canonical form of \p T, creating
	/// and caching it in MetadataIdMap on first request. Externally visible types
	/// get an MDString holding the mangled type name; all others get a distinct,
	/// operand-less MDNode.
	llvm::Metadata *CodeGenModule::CreateMetadataIdentifierForType(QualType T) {
	  // Bound by reference so the new identifier lands directly in the cache.
	  llvm::Metadata *&Slot = MetadataIdMap[T.getCanonicalType()];
	  if (!Slot) {
	    if (!isExternallyVisible(T->getLinkage())) {
	      Slot = llvm::MDNode::getDistinct(getLLVMContext(),
	                                       llvm::ArrayRef<llvm::Metadata *>());
	    } else {
	      std::string Mangled;
	      llvm::raw_string_ostream OS(Mangled);
	      getCXXABI().getMangleContext().mangleTypeName(T, OS);

	      Slot = llvm::MDString::get(getLLVMContext(), OS.str());
	    }
	  }

	  return Slot;
	}

	// Generalize pointer types to a void pointer with the qualifiers of the
	// originally pointed-to type, e.g. 'const char *' and 'char * const *'
	// generalize to 'const void *' while 'char *' and 'const char **' generalize to
	// 'void *'.
	// Non-pointer types are returned unchanged. Only the CVR qualifiers of the
	// immediate pointee are carried over to the resulting void pointer.
	static QualType GeneralizeType(ASTContext &Ctx, QualType Ty) {
	if (!Ty->isPointerType())
	return Ty;

	return Ctx.getPointerType(
	QualType(Ctx.VoidTy).withCVRQualifiers(
	Ty->getPointeeType().getCVRQualifiers()));
	}

	// Run GeneralizeType over the return type and, for prototyped functions, over
	// every parameter type of a FunctionType. Any other type is unreachable here.
	static QualType GeneralizeFunctionType(ASTContext &Ctx, QualType Ty) {
	  if (auto *Proto = Ty->getAs<FunctionProtoType>()) {
	    SmallVector<QualType, 8> NewParams;
	    for (auto &ParamTy : Proto->param_types())
	      NewParams.push_back(GeneralizeType(Ctx, ParamTy));

	    QualType NewReturn = GeneralizeType(Ctx, Proto->getReturnType());
	    return Ctx.getFunctionType(NewReturn, NewParams,
	                               Proto->getExtProtoInfo());
	  }

	  if (auto *NoProto = Ty->getAs<FunctionNoProtoType>()) {
	    QualType NewReturn = GeneralizeType(Ctx, NoProto->getReturnType());
	    return Ctx.getFunctionNoProtoType(NewReturn);
	  }

	  llvm_unreachable("Encountered unknown FunctionType");
	}

	/// Like CreateMetadataIdentifierForType, but \p T is first passed through
	/// GeneralizeFunctionType, the cache is GeneralizedMetadataIdMap, and the
	/// mangled name carries a ".generalized" suffix.
	llvm::Metadata *CodeGenModule::CreateMetadataIdentifierGeneralized(QualType T) {
	  T = GeneralizeFunctionType(getContext(), T);

	  // Bound by reference so the new identifier lands directly in the cache.
	  llvm::Metadata *&Slot = GeneralizedMetadataIdMap[T.getCanonicalType()];
	  if (!Slot) {
	    if (!isExternallyVisible(T->getLinkage())) {
	      Slot = llvm::MDNode::getDistinct(getLLVMContext(),
	                                       llvm::ArrayRef<llvm::Metadata *>());
	    } else {
	      std::string Mangled;
	      llvm::raw_string_ostream OS(Mangled);
	      getCXXABI().getMangleContext().mangleTypeName(T, OS);
	      OS << ".generalized";

	      Slot = llvm::MDString::get(getLLVMContext(), OS.str());
	    }
	  }

	  return Slot;
	}

	/// Returns whether this module needs the "all-vtables" type identifier.
	bool CodeGenModule::NeedAllVtablesTypeId() const {
	  // Returns true if at least one of vtable-based CFI checkers is enabled and
	  // is not in the trapping mode.
	  return ((LangOpts.Sanitize.has(SanitizerKind::CFIVCall) &&
	           !CodeGenOpts.SanitizeTrap.has(SanitizerKind::CFIVCall)) ||
	          (LangOpts.Sanitize.has(SanitizerKind::CFINVCall) &&
	           !CodeGenOpts.SanitizeTrap.has(SanitizerKind::CFINVCall)) ||
	          (LangOpts.Sanitize.has(SanitizerKind::CFIDerivedCast) &&
	           !CodeGenOpts.SanitizeTrap.has(SanitizerKind::CFIDerivedCast)) ||
	          (LangOpts.Sanitize.has(SanitizerKind::CFIUnrelatedCast) &&
	           !CodeGenOpts.SanitizeTrap.has(SanitizerKind::CFIUnrelatedCast)));
	}

	/// Attaches type metadata for \p RD to \p VTable at \p Offset: always the
	/// per-type identifier, additionally the cross-DSO CFI type id when that mode
	/// is on and an id is produced, and the "all-vtables" identifier when
	/// NeedAllVtablesTypeId() holds.
	void CodeGenModule::AddVTableTypeMetadata(llvm::GlobalVariable *VTable,
	                                          CharUnits Offset,
	                                          const CXXRecordDecl *RD) {
	  llvm::Metadata *TypeId =
	      CreateMetadataIdentifierForType(QualType(RD->getTypeForDecl(), 0));
	  VTable->addTypeMetadata(Offset.getQuantity(), TypeId);

	  if (CodeGenOpts.SanitizeCfiCrossDso) {
	    if (auto CrossDsoTypeId = CreateCrossDsoCfiTypeId(TypeId)) {
	      VTable->addTypeMetadata(Offset.getQuantity(),
	                              llvm::ConstantAsMetadata::get(CrossDsoTypeId));
	    }
	  }

	  if (!NeedAllVtablesTypeId())
	    return;

	  llvm::Metadata *AllVTables =
	      llvm::MDString::get(getLLVMContext(), "all-vtables");
	  VTable->addTypeMetadata(Offset.getQuantity(), AllVTables);
	}

	// Populates the supplied string map with the target features that apply to
	// the given function.
	void CodeGenModule::getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
	                                          const FunctionDecl *FD) {
	  StringRef TargetCPU = Target.getTargetOpts().CPU;

	  const auto *TD = FD->getAttr<TargetAttr>();
	  if (!TD) {
	    // No target attribute: use the CPU and features from the target options.
	    Target.initFeatureMap(FeatureMap, getDiags(), TargetCPU,
	                          Target.getTargetOpts().Features);
	    return;
	  }

	  // If we have a TargetAttr build up the feature map based on that.
	  TargetAttr::ParsedTargetAttr ParsedAttr = TD->parse();

	  // Drop attribute features whose name (the text after the first character)
	  // the target does not accept.
	  auto IsUnknownFeature = [&](const std::string &Feat) {
	    return !Target.isValidFeatureName(StringRef{Feat}.substr(1));
	  };
	  ParsedAttr.Features.erase(
	      llvm::remove_if(ParsedAttr.Features, IsUnknownFeature),
	      ParsedAttr.Features.end());

	  // Make a copy of the features as passed on the command line into the
	  // beginning of the additional features from the function to override.
	  ParsedAttr.Features.insert(ParsedAttr.Features.begin(),
	                             Target.getTargetOpts().FeaturesAsWritten.begin(),
	                             Target.getTargetOpts().FeaturesAsWritten.end());

	  if (ParsedAttr.Architecture != "" &&
	      Target.isValidCPUName(ParsedAttr.Architecture))
	    TargetCPU = ParsedAttr.Architecture;

	  // Now populate the feature map, first with the TargetCPU which is either
	  // the default or a new one from the target attribute string. Then we'll use
	  // the passed in features (FeaturesAsWritten) along with the new ones from
	  // the attribute.
	  Target.initFeatureMap(FeatureMap, getDiags(), TargetCPU,
	                        ParsedAttr.Features);
	}

	/// Returns the module's sanitizer statistics report, creating it on the
	/// first call.
	llvm::SanitizerStatReport &CodeGenModule::getSanStats() {
	  if (SanStats)
	    return *SanStats;

	  SanStats = llvm::make_unique<llvm::SanitizerStatReport>(&getModule());
	  return *SanStats;
	}
	/// Emits a call to the runtime function '__translate_sampler_initializer',
	/// passing the abstract constant emitted for \p E, and returns the call.
	llvm::Value *
	CodeGenModule::createOpenCLIntToSamplerConversion(const Expr *E,
	                                                  CodeGenFunction &CGF) {
	  llvm::Constant *Init = ConstantEmitter(CGF).emitAbstract(E, E->getType());
	  auto SamplerTy = getOpenCLRuntime().getSamplerType(E->getType().getTypePtr());
	  auto FnTy = llvm::FunctionType::get(SamplerTy, {Init->getType()}, false);
	  auto *Translate =
	      CreateRuntimeFunction(FnTy, "__translate_sampler_initializer");
	  return CGF.Builder.CreateCall(Translate, {Init});
	}
	Index: head/contrib/llvm/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/CodeGen/ItaniumCXXABI.cpp (revision 329410)
	@@ -1,4053 +1,4058 @@
	//===------- ItaniumCXXABI.cpp - Emit LLVM Code from ASTs for a Module ----===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This provides C++ code generation targeting the Itanium C++ ABI. The class
	// in this file generates structures that follow the Itanium C++ ABI, which is
	// documented at:
	// http://www.codesourcery.com/public/cxx-abi/abi.html
	// http://www.codesourcery.com/public/cxx-abi/abi-eh.html
	//
	// It also supports the closely-related ARM ABI, documented at:
	// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0041c/IHI0041C_cppabi.pdf
	//
	//===----------------------------------------------------------------------===//

	#include "CGCXXABI.h"
	#include "CGCleanup.h"
	#include "CGRecordLayout.h"
	#include "CGVTables.h"
	#include "CodeGenFunction.h"
	#include "CodeGenModule.h"
	#include "TargetInfo.h"
	#include "clang/CodeGen/ConstantInitBuilder.h"
	#include "clang/AST/Mangle.h"
	#include "clang/AST/Type.h"
	#include "clang/AST/StmtCXX.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/Value.h"

	using namespace clang;
	using namespace CodeGen;

	namespace {
	class ItaniumCXXABI : public CodeGen::CGCXXABI {
	/// VTables - All the vtables which have been defined.
	llvm::DenseMap<const CXXRecordDecl , llvm::GlobalVariable > VTables;

	protected:
	bool UseARMMethodPtrABI;
	bool UseARMGuardVarABI;
	bool Use32BitVTableOffsetABI;

	ItaniumMangleContext &getMangleContext() {
	return cast<ItaniumMangleContext>(CodeGen::CGCXXABI::getMangleContext());
	}

	public:
	ItaniumCXXABI(CodeGen::CodeGenModule &CGM,
	bool UseARMMethodPtrABI = false,
	bool UseARMGuardVarABI = false) :
	CGCXXABI(CGM), UseARMMethodPtrABI(UseARMMethodPtrABI),
	UseARMGuardVarABI(UseARMGuardVarABI),
	Use32BitVTableOffsetABI(false) { }

	bool classifyReturnType(CGFunctionInfo &FI) const override;

	bool passClassIndirect(const CXXRecordDecl *RD) const {
	// Clang <= 4 used the pre-C++11 rule, which ignores move operations.
	// The PS4 platform ABI follows the behavior of Clang 3.2.
	if (CGM.getCodeGenOpts().getClangABICompat() <=
	CodeGenOptions::ClangABI::Ver4 \|\|
	CGM.getTriple().getOS() == llvm::Triple::PS4)
	return RD->hasNonTrivialDestructor() \|\|
	RD->hasNonTrivialCopyConstructor();
	return !canCopyArgument(RD);
	}

	RecordArgABI getRecordArgABI(const CXXRecordDecl *RD) const override {
	// If C++ prohibits us from making a copy, pass by address.
	if (passClassIndirect(RD))
	return RAA_Indirect;
	return RAA_Default;
	}

	bool isThisCompleteObject(GlobalDecl GD) const override {
	// The Itanium ABI has separate complete-object vs. base-object
	// variants of both constructors and destructors.
	if (isa<CXXDestructorDecl>(GD.getDecl())) {
	switch (GD.getDtorType()) {
	case Dtor_Complete:
	case Dtor_Deleting:
	return true;

	case Dtor_Base:
	return false;

	case Dtor_Comdat:
	llvm_unreachable("emitting dtor comdat as function?");
	}
	llvm_unreachable("bad dtor kind");
	}
	if (isa<CXXConstructorDecl>(GD.getDecl())) {
	switch (GD.getCtorType()) {
	case Ctor_Complete:
	return true;

	case Ctor_Base:
	return false;

	case Ctor_CopyingClosure:
	case Ctor_DefaultClosure:
	llvm_unreachable("closure ctors in Itanium ABI?");

	case Ctor_Comdat:
	llvm_unreachable("emitting ctor comdat as function?");
	}
	llvm_unreachable("bad dtor kind");
	}

	// No other kinds.
	return false;
	}

	bool isZeroInitializable(const MemberPointerType *MPT) override;

	llvm::Type ConvertMemberPointerType(const MemberPointerType MPT) override;

	CGCallee
	EmitLoadOfMemberFunctionPointer(CodeGenFunction &CGF,
	const Expr *E,
	Address This,
	llvm::Value *&ThisPtrForCall,
	llvm::Value *MemFnPtr,
	const MemberPointerType *MPT) override;

	llvm::Value *
	EmitMemberDataPointerAddress(CodeGenFunction &CGF, const Expr *E,
	Address Base,
	llvm::Value *MemPtr,
	const MemberPointerType *MPT) override;

	llvm::Value *EmitMemberPointerConversion(CodeGenFunction &CGF,
	const CastExpr *E,
	llvm::Value *Src) override;
	llvm::Constant EmitMemberPointerConversion(const CastExpr E,
	llvm::Constant *Src) override;

	llvm::Constant EmitNullMemberPointer(const MemberPointerType MPT) override;

	llvm::Constant EmitMemberFunctionPointer(const CXXMethodDecl MD) override;
	llvm::Constant EmitMemberDataPointer(const MemberPointerType MPT,
	CharUnits offset) override;
	llvm::Constant *EmitMemberPointer(const APValue &MP, QualType MPT) override;
	llvm::Constant BuildMemberPointer(const CXXMethodDecl MD,
	CharUnits ThisAdjustment);

	llvm::Value *EmitMemberPointerComparison(CodeGenFunction &CGF,
	llvm::Value L, llvm::Value R,
	const MemberPointerType *MPT,
	bool Inequality) override;

	llvm::Value *EmitMemberPointerIsNotNull(CodeGenFunction &CGF,
	llvm::Value *Addr,
	const MemberPointerType *MPT) override;

	void emitVirtualObjectDelete(CodeGenFunction &CGF, const CXXDeleteExpr *DE,
	Address Ptr, QualType ElementType,
	const CXXDestructorDecl *Dtor) override;

	/// Itanium says that an _Unwind_Exception has to be "double-word"
	/// aligned (and thus the end of it is also so-aligned), meaning 16
	/// bytes. Of course, that was written for the actual Itanium,
	/// which is a 64-bit platform. Classically, the ABI doesn't really
	/// specify the alignment on other platforms, but in practice
	/// libUnwind declares the struct with __attribute__((aligned)), so
	/// we assume that alignment here. (It's generally 16 bytes, but
	/// some targets overwrite it.)
	CharUnits getAlignmentOfExnObject() {
	auto align = CGM.getContext().getTargetDefaultAlignForAttributeAligned();
	return CGM.getContext().toCharUnitsFromBits(align);
	}

	void emitRethrow(CodeGenFunction &CGF, bool isNoReturn) override;
	void emitThrow(CodeGenFunction &CGF, const CXXThrowExpr *E) override;

	void emitBeginCatch(CodeGenFunction &CGF, const CXXCatchStmt *C) override;

	llvm::CallInst *
	emitTerminateForUnexpectedException(CodeGenFunction &CGF,
	llvm::Value *Exn) override;

	void EmitFundamentalRTTIDescriptor(QualType Type, bool DLLExport);
	void EmitFundamentalRTTIDescriptors(bool DLLExport);
	llvm::Constant *getAddrOfRTTIDescriptor(QualType Ty) override;
	CatchTypeInfo
	getAddrOfCXXCatchHandlerType(QualType Ty,
	QualType CatchHandlerType) override {
	return CatchTypeInfo{getAddrOfRTTIDescriptor(Ty), 0};
	}

	// typeid support (overrides; declarations only, bodies are not part of
	// the class definition).
	bool shouldTypeidBeNullChecked(bool IsDeref, QualType SrcRecordTy) override;
	void EmitBadTypeidCall(CodeGenFunction &CGF) override;
	llvm::Value *EmitTypeid(CodeGenFunction &CGF, QualType SrcRecordTy,
	Address ThisPtr,
	llvm::Type *StdTypeInfoPtrTy) override;

	// dynamic_cast support (overrides; declarations only).
	bool shouldDynamicCastCallBeNullChecked(bool SrcIsPtr,
	QualType SrcRecordTy) override;

	llvm::Value *EmitDynamicCastCall(CodeGenFunction &CGF, Address Value,
	QualType SrcRecordTy, QualType DestTy,
	QualType DestRecordTy,
	llvm::BasicBlock *CastEnd) override;

	llvm::Value *EmitDynamicCastToVoid(CodeGenFunction &CGF, Address Value,
	QualType SrcRecordTy,
	QualType DestTy) override;

	bool EmitBadCastCall(CodeGenFunction &CGF) override;

	// Virtual base offset lookup (override; declaration only).
	llvm::Value *
	GetVirtualBaseClassOffset(CodeGenFunction &CGF, Address This,
	const CXXRecordDecl *ClassDecl,
	const CXXRecordDecl *BaseClassDecl) override;

	// Constructor emission and constructor/destructor signature building
	// (overrides; declarations only).
	void EmitCXXConstructors(const CXXConstructorDecl *D) override;

	AddedStructorArgs
	buildStructorSignature(const CXXMethodDecl *MD, StructorType T,
	SmallVectorImpl<CanQualType> &ArgTys) override;

	bool useThunkForDtorVariant(const CXXDestructorDecl *Dtor,
	                            CXXDtorType DT) const override {
	  // Under Itanium no destructor variant is emitted as an inline thunk.
	  // One variant may delegate to another as an optimization, but every
	  // variant is emitted either with external linkage or, if it is inline
	  // and used, as linkonce.  Hence the answer never depends on Dtor or DT.
	  const bool EmitAsThunk = false;
	  return EmitAsThunk;
	}

	// Destructor emission and implicit constructor/destructor parameter
	// handling (overrides; declarations only).
	void EmitCXXDestructors(const CXXDestructorDecl *D) override;

	void addImplicitStructorParams(CodeGenFunction &CGF, QualType &ResTy,
	FunctionArgList &Params) override;

	void EmitInstanceFunctionProlog(CodeGenFunction &CGF) override;

	AddedStructorArgs
	addImplicitConstructorArgs(CodeGenFunction &CGF, const CXXConstructorDecl *D,
	CXXCtorType Type, bool ForVirtualBase,
	bool Delegating, CallArgList &Args) override;

	void EmitDestructorCall(CodeGenFunction &CGF, const CXXDestructorDecl *DD,
	CXXDtorType Type, bool ForVirtualBase,
	bool Delegating, Address This) override;

	// VTable definition emission and vptr-field queries (overrides;
	// declarations only).
	void emitVTableDefinitions(CodeGenVTables &CGVT,
	const CXXRecordDecl *RD) override;

	bool isVirtualOffsetNeededForVTableField(CodeGenFunction &CGF,
	CodeGenFunction::VPtr Vptr) override;

	bool doStructorsInitializeVPtrs(const CXXRecordDecl *VTableClass) override {
	  // Always true for this ABI; VTableClass does not influence the answer.
	  const bool StructorsInitializeVPtrs = true;
	  return StructorsInitializeVPtrs;
	}

	// VTable address-point queries (declarations only). Three of the four
	// are overrides; getVTableAddressPointInStructorWithVTT is not marked
	// override and takes the same parameters as
	// getVTableAddressPointInStructor.
	llvm::Constant *
	getVTableAddressPoint(BaseSubobject Base,
	const CXXRecordDecl *VTableClass) override;

	llvm::Value *getVTableAddressPointInStructor(
	CodeGenFunction &CGF, const CXXRecordDecl *VTableClass,
	BaseSubobject Base, const CXXRecordDecl *NearestVBase) override;

	llvm::Value *getVTableAddressPointInStructorWithVTT(
	CodeGenFunction &CGF, const CXXRecordDecl *VTableClass,
	BaseSubobject Base, const CXXRecordDecl *NearestVBase);

	llvm::Constant *
	getVTableAddressPointForConstExpr(BaseSubobject Base,
	const CXXRecordDecl *VTableClass) override;

	/// Declaration only (override). Takes the record whose vtable is wanted
	/// and the vptr offset as CharUnits; yields a pointer to the
	/// llvm::GlobalVariable.  Like every other declaration in this class,
	/// both the record and the LLVM object are handled by pointer.
	llvm::GlobalVariable *getAddrOfVTable(const CXXRecordDecl *RD,
	                                      CharUnits VPtrOffset) override;

	// Virtual call support (overrides; declarations only).
	CGCallee getVirtualFunctionPointer(CodeGenFunction &CGF, GlobalDecl GD,
	Address This, llvm::Type *Ty,
	SourceLocation Loc) override;

	// Note: emitVirtualObjectDelete in this file calls this with CE == nullptr,
	// so implementations must tolerate a null call expression.
	llvm::Value *EmitVirtualDestructorCall(CodeGenFunction &CGF,
	const CXXDestructorDecl *Dtor,
	CXXDtorType DtorType,
	Address This,
	const CXXMemberCallExpr *CE) override;

	void emitVirtualInheritanceTables(const CXXRecordDecl *RD) override;

	bool canSpeculativelyEmitVTable(const CXXRecordDecl *RD) const override;

	void setThunkLinkage(llvm::Function *Thunk, bool ForVTable, GlobalDecl GD,
	                     bool ReturnAdjustment) override {
	  // Thunks emitted together with a vtable are given available_externally
	  // linkage so they can be inlined; thunks that already have local
	  // linkage are left untouched.
	  if (ForVTable && !Thunk->hasLocalLinkage()) {
	    Thunk->setLinkage(llvm::GlobalValue::AvailableExternallyLinkage);
	  }

	  // Carry dllexport over from the method to its thunk so the linker can
	  // produce import thunks when required (for instance, a parent class has
	  // a key function, a child class does not, and the parent-in-child
	  // construction vtable has to refer to the parent's thunks).
	  const auto *Method = cast<CXXMethodDecl>(GD.getDecl());
	  const bool IsDLLExported = Method->hasAttr<DLLExportAttr>();
	  if (IsDLLExported) {
	    Thunk->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
	  }
	}

	// 'this' and return-value pointer adjustments (overrides; declarations
	// only). Each takes the address to adjust and a description of the
	// adjustment, and yields an llvm::Value.
	llvm::Value *performThisAdjustment(CodeGenFunction &CGF, Address This,
	const ThisAdjustment &TA) override;

	llvm::Value *performReturnAdjustment(CodeGenFunction &CGF, Address Ret,
	const ReturnAdjustment &RA) override;

	size_t getSrcArgforCopyCtor(const CXXConstructorDecl *,
	                            FunctionArgList &Args) const override {
	  // The source object is the final entry of the argument list, so the
	  // list must contain at least one element.
	  assert(!Args.empty() && "expected the arglist to not be empty!");
	  const size_t LastIndex = Args.size() - 1;
	  return LastIndex;
	}

	StringRef GetPureVirtualCallName() override {
	  // Name reported for pure virtual calls.
	  return "__cxa_pure_virtual";
	}
	StringRef GetDeletedVirtualCallName() override {
	  // Name reported for deleted virtual calls.
	  return "__cxa_deleted_virtual";
	}

	// Array cookie handling (overrides; declarations only). ARMCXXABI below
	// overrides the same three members again.
	CharUnits getArrayCookieSizeImpl(QualType elementType) override;
	Address InitializeArrayCookie(CodeGenFunction &CGF,
	Address NewPtr,
	llvm::Value *NumElements,
	const CXXNewExpr *expr,
	QualType ElementType) override;
	llvm::Value *readArrayCookieImpl(CodeGenFunction &CGF,
	Address allocPtr,
	CharUnits cookieSize) override;

	// Guarded (once-only) initialization of a variable (override;
	// declaration only).
	void EmitGuardedInit(CodeGenFunction &CGF, const VarDecl &D,
	llvm::GlobalVariable *DeclPtr,
	bool PerformInit) override;
	void registerGlobalDtor(CodeGenFunction &CGF, const VarDecl &D,
	llvm::Constant dtor, llvm::Constant addr) override;

	/// Declaration only (not an override). Takes the thread-local variable
	/// and an llvm::Value, and yields a pointer to an llvm::Function.
	llvm::Function *getOrCreateThreadLocalWrapper(const VarDecl *VD,
	                                              llvm::Value *Val);
	// Thread-local variable support (overrides; EmitThreadLocalInitFuncs and
	// EmitThreadLocalVarDeclLValue are declarations only).
	void EmitThreadLocalInitFuncs(
	CodeGenModule &CGM,
	ArrayRef<const VarDecl *> CXXThreadLocals,
	ArrayRef<llvm::Function *> CXXThreadLocalInits,
	ArrayRef<const VarDecl *> CXXThreadLocalInitVars) override;

	// This ABI always answers true here.
	bool usesThreadWrapperFunction() const override { return true; }
	LValue EmitThreadLocalVarDeclLValue(CodeGenFunction &CGF, const VarDecl *VD,
	QualType LValType) override;

	// Declaration only; presumably reports whether the given structor takes
	// a VTT parameter -- confirm against the out-of-line definition.
	bool NeedsVTTParameter(GlobalDecl GD) override;

	/************************** RTTI Uniqueness ****************************/

	protected:
	/// Whether this ABI demands that RTTI type_info objects be unique across
	/// the whole program.  The base implementation says yes; iOS64CXXABI
	/// overrides it to say no.
	virtual bool shouldRTTIBeUnique() const {
	  return true;
	}

	public:
	/// The unique-RTTI policy to apply to a type's RTTI name string. This is
	/// the result type of classifyRTTIUniqueness.
	enum RTTIUniquenessKind {
	/// The RTTI string is (or must be made) unique.
	RUK_Unique,

	/// Uniqueness of the RTTI string is not guaranteed: string comparisons
	/// must be used, and the string may be demoted to hidden visibility.
	RUK_NonUniqueHidden,

	/// Uniqueness of the RTTI string is not guaranteed: string comparisons
	/// must be used, and the string must still be emitted with non-hidden
	/// visibility.
	RUK_NonUniqueVisible
	};

	/// Return the required visibility status for the given type and linkage in
	/// the current ABI. Declaration only; the result is one of the
	/// RTTIUniquenessKind enumerators.
	RTTIUniquenessKind
	classifyRTTIUniqueness(QualType CanTy,
	llvm::GlobalValue::LinkageTypes Linkage) const;
	// ItaniumRTTIBuilder is granted access to this class's non-public members.
	friend class ItaniumRTTIBuilder;

	// Declaration only (override); takes the method and the structor variant.
	void emitCXXStructor(const CXXMethodDecl *MD, StructorType Type) override;

	/// Declaration only (override). Given the object address and its record
	/// declaration, yields a pair of pointers: an llvm::Value and a
	/// CXXRecordDecl.
	std::pair<llvm::Value *, const CXXRecordDecl *>
	LoadVTablePtr(CodeGenFunction &CGF, Address This,
	              const CXXRecordDecl *RD) override;

	private:
	/// Returns true if the vtable layout of RD contains a used function
	/// pointer slot whose method is inline and has no emitted definition in
	/// the module (no global value of that mangled name exists, or it is only
	/// a declaration).
	bool hasAnyUnusedVirtualInlineFunction(const CXXRecordDecl *RD) const {
	  const auto &VtableLayout =
	      CGM.getItaniumVTableContext().getVTableLayout(RD);

	  for (const auto &VtableComponent : VtableLayout.vtable_components()) {
	    // Skip empty slot.
	    if (!VtableComponent.isUsedFunctionPointerKind())
	      continue;

	    // Only inline methods are of interest.
	    const CXXMethodDecl *Method = VtableComponent.getFunctionDecl();
	    if (!Method->getCanonicalDecl()->isInlined())
	      continue;

	    StringRef Name = CGM.getMangledName(VtableComponent.getGlobalDecl());
	    auto *Entry = CGM.GetGlobalValue(Name);
	    // This checks if virtual inline function has already been emitted.
	    // Note that it is possible that this inline function would be emitted
	    // after trying to emit vtable speculatively. Because of this we do
	    // an extra pass after emitting all deferred vtables to find and emit
	    // these vtables opportunistically.
	    if (!Entry || Entry->isDeclaration())
	      return true;
	  }
	  return false;
	}

	bool isVTableHidden(const CXXRecordDecl *RD) const {
	const auto &VtableLayout =
	CGM.getItaniumVTableContext().getVTableLayout(RD);

	for (const auto &VtableComponent : VtableLayout.vtable_components()) {
	if (VtableComponent.isRTTIKind()) {
	const CXXRecordDecl *RTTIDecl = VtableComponent.getRTTIDecl();
	if (RTTIDecl->getVisibility() == Visibility::HiddenVisibility)
	return true;
	} else if (VtableComponent.isUsedFunctionPointerKind()) {
	const CXXMethodDecl *Method = VtableComponent.getFunctionDecl();
	if (Method->getVisibility() == Visibility::HiddenVisibility &&
	!Method->isDefined())
	return true;
	}
	}
	return false;
	}
	};

	/// ItaniumCXXABI variant that enables both the ARM method-pointer ABI and
	/// the ARM guard-variable ABI, and overrides 'this'-return, thunk return
	/// and array cookie handling.
	class ARMCXXABI : public ItaniumCXXABI {
	public:
	  ARMCXXABI(CodeGen::CodeGenModule &CGM) :
	    ItaniumCXXABI(CGM, /* UseARMMethodPtrABI = */ true,
	                  /* UseARMGuardVarABI = */ true) {}

	  /// True for every constructor, and for every destructor variant except
	  /// the deleting destructor.
	  bool HasThisReturn(GlobalDecl GD) const override {
	    return (isa<CXXConstructorDecl>(GD.getDecl()) || (
	              isa<CXXDestructorDecl>(GD.getDecl()) &&
	              GD.getDtorType() != Dtor_Deleting));
	  }

	  // Declaration only; defined out of line.
	  void EmitReturnFromThunk(CodeGenFunction &CGF, RValue RV,
	                           QualType ResTy) override;

	  // Array cookie overrides (declarations only).
	  CharUnits getArrayCookieSizeImpl(QualType elementType) override;
	  Address InitializeArrayCookie(CodeGenFunction &CGF,
	                                Address NewPtr,
	                                llvm::Value *NumElements,
	                                const CXXNewExpr *expr,
	                                QualType ElementType) override;
	  llvm::Value *readArrayCookieImpl(CodeGenFunction &CGF, Address allocPtr,
	                                   CharUnits cookieSize) override;
	};

	/// ARMCXXABI variant that turns on the 32-bit vtable offset ABI and does
	/// not require unique RTTI.
	class iOS64CXXABI : public ARMCXXABI {
	public:
	  iOS64CXXABI(CodeGen::CodeGenModule &CGM)
	      : ARMCXXABI(CGM) {
	    // Only the low 32 bits of a virtual member function pointer's offset
	    // are honoured (see EmitLoadOfMemberFunctionPointer).
	    Use32BitVTableOffsetABI = true;
	  }

	  // ARM64 libraries can cope with RTTI that is not unique.
	  bool shouldRTTIBeUnique() const override {
	    return false;
	  }
	};

	/// ItaniumCXXABI variant for WebAssembly: uses the ARM method-pointer and
	/// guard-variable ABIs, returns 'this' from constructors and non-deleting
	/// destructors, and refuses calls through mismatched function types.
	class WebAssemblyCXXABI final : public ItaniumCXXABI {
	public:
	  explicit WebAssemblyCXXABI(CodeGen::CodeGenModule &CGM)
	      : ItaniumCXXABI(CGM, /*UseARMMethodPtrABI=*/true,
	                      /*UseARMGuardVarABI=*/true) {}

	private:
	  /// True for every constructor, and for every destructor variant except
	  /// the deleting destructor.
	  bool HasThisReturn(GlobalDecl GD) const override {
	    return isa<CXXConstructorDecl>(GD.getDecl()) ||
	           (isa<CXXDestructorDecl>(GD.getDecl()) &&
	            GD.getDtorType() != Dtor_Deleting);
	  }
	  bool canCallMismatchedFunctionType() const override { return false; }
	};
	}

	/// Factory: picks and heap-allocates the Itanium-family CGCXXABI
	/// implementation that matches the target's C++ ABI kind.  The caller
	/// receives a raw pointer to the new object.
	CodeGen::CGCXXABI *CodeGen::CreateItaniumCXXABI(CodeGenModule &CGM) {
	  const auto ABIKind = CGM.getTarget().getCXXABI().getKind();

	  switch (ABIKind) {
	  // ARM and the 32-bit iOS flavours do not differ in any way that matters
	  // for IR generation, so they share one implementation.
	  case TargetCXXABI::GenericARM:
	  case TargetCXXABI::iOS:
	  case TargetCXXABI::WatchOS:
	    return new ARMCXXABI(CGM);

	  case TargetCXXABI::iOS64:
	    return new iOS64CXXABI(CGM);

	  // AArch64 stays on the generic ItaniumCXXABI class: it has none of the
	  // other 32-bit ARM quirks (constructor/destructor return values, array
	  // cookies), only the ARM method-pointer and guard-variable rules.
	  case TargetCXXABI::GenericAArch64:
	    return new ItaniumCXXABI(CGM, /* UseARMMethodPtrABI = */ true,
	                             /* UseARMGuardVarABI = */ true);

	  case TargetCXXABI::GenericMIPS:
	    return new ItaniumCXXABI(CGM, /* UseARMMethodPtrABI = */ true);

	  case TargetCXXABI::WebAssembly:
	    return new WebAssemblyCXXABI(CGM);

	  case TargetCXXABI::GenericItanium: {
	    const bool IsLE32 =
	        CGM.getContext().getTargetInfo().getTriple().getArch() ==
	        llvm::Triple::le32;
	    if (!IsLE32)
	      return new ItaniumCXXABI(CGM);

	    // PNaCl gets ARM-style method pointers so that PNaCl code makes no
	    // assumption about how function pointers are aligned.
	    return new ItaniumCXXABI(CGM, /* UseARMMethodPtrABI = */ true,
	                             /* UseARMGuardVarABI = */ false);
	  }

	  case TargetCXXABI::Microsoft:
	    llvm_unreachable("Microsoft ABI is not Itanium-based");
	  }
	  llvm_unreachable("bad ABI kind");
	}

	/// Maps a member pointer type to its LLVM representation: a single
	/// ptrdiff_t for data members, otherwise a struct of two ptrdiff_t
	/// fields.
	llvm::Type *
	ItaniumCXXABI::ConvertMemberPointerType(const MemberPointerType *MPT) {
	  auto *PtrDiff = CGM.PtrDiffTy;
	  if (!MPT->isMemberDataPointer()) {
	    // Member function pointer: { ptr, adj }.
	    return llvm::StructType::get(PtrDiff, PtrDiff);
	  }
	  return PtrDiff;
	}

	/// In the Itanium and ARM ABIs, method pointers have the form:
	/// struct { ptrdiff_t ptr; ptrdiff_t adj; } memptr;
	///
	/// In the Itanium ABI:
	/// - method pointers are virtual if (memptr.ptr & 1) is nonzero
	/// - the this-adjustment is (memptr.adj)
	/// - the virtual offset is (memptr.ptr - 1)
	///
	/// In the ARM ABI:
	/// - method pointers are virtual if (memptr.adj & 1) is nonzero
	/// - the this-adjustment is (memptr.adj >> 1)
	/// - the virtual offset is (memptr.ptr)
	/// ARM uses 'adj' for the virtual flag because Thumb functions
	/// may be only single-byte aligned.
	///
	/// If the member is virtual, the adjusted 'this' pointer points
	/// to a vtable pointer from which the virtual offset is applied.
	///
	/// If the member is non-virtual, memptr.ptr is the address of
	/// the function to call.
	CGCallee ItaniumCXXABI::EmitLoadOfMemberFunctionPointer(
	    CodeGenFunction &CGF, const Expr *E, Address ThisAddr,
	    llvm::Value *&ThisPtrForCall,
	    llvm::Value *MemFnPtr, const MemberPointerType *MPT) {
	  // Inputs:  ThisAddr is the object the member pointer is applied to and
	  //          MemFnPtr is the { ptr, adj } member function pointer value.
	  // Outputs: ThisPtrForCall receives the adjusted 'this' pointer; the
	  //          returned CGCallee wraps a PHI that merges the virtual and
	  //          non-virtual function pointers.
	  CGBuilderTy &Builder = CGF.Builder;

	  const FunctionProtoType *FPT =
	      MPT->getPointeeType()->getAs<FunctionProtoType>();
	  const CXXRecordDecl *RD =
	      cast<CXXRecordDecl>(MPT->getClass()->getAs<RecordType>()->getDecl());

	  llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(
	      CGM.getTypes().arrangeCXXMethodType(RD, FPT, /*FD=*/nullptr));

	  llvm::Constant *ptrdiff_1 = llvm::ConstantInt::get(CGM.PtrDiffTy, 1);

	  llvm::BasicBlock *FnVirtual = CGF.createBasicBlock("memptr.virtual");
	  llvm::BasicBlock *FnNonVirtual = CGF.createBasicBlock("memptr.nonvirtual");
	  llvm::BasicBlock *FnEnd = CGF.createBasicBlock("memptr.end");

	  // Extract memptr.adj, which is in the second field.
	  llvm::Value *RawAdj = Builder.CreateExtractValue(MemFnPtr, 1, "memptr.adj");

	  // Compute the true adjustment. Under the ARM ABI the low bit of adj is
	  // the virtual flag, so the adjustment proper is adj >> 1.
	  llvm::Value *Adj = RawAdj;
	  if (UseARMMethodPtrABI)
	    Adj = Builder.CreateAShr(Adj, ptrdiff_1, "memptr.adj.shifted");

	  // Apply the adjustment and cast back to the original struct type
	  // for consistency.
	  llvm::Value *This = ThisAddr.getPointer();
	  llvm::Value *Ptr = Builder.CreateBitCast(This, Builder.getInt8PtrTy());
	  Ptr = Builder.CreateInBoundsGEP(Ptr, Adj);
	  This = Builder.CreateBitCast(Ptr, This->getType(), "this.adjusted");
	  ThisPtrForCall = This;

	  // Load the function pointer.
	  llvm::Value *FnAsInt = Builder.CreateExtractValue(MemFnPtr, 0, "memptr.ptr");

	  // The virtual flag is the low bit of 'ptr' (Itanium) or of 'adj' (ARM);
	  // when it is set, the member pointer designates a virtual function.
	  llvm::Value *IsVirtual;
	  if (UseARMMethodPtrABI)
	    IsVirtual = Builder.CreateAnd(RawAdj, ptrdiff_1);
	  else
	    IsVirtual = Builder.CreateAnd(FnAsInt, ptrdiff_1);
	  IsVirtual = Builder.CreateIsNotNull(IsVirtual, "memptr.isvirtual");
	  Builder.CreateCondBr(IsVirtual, FnVirtual, FnNonVirtual);

	  // In the virtual path, the adjustment left 'This' pointing to the
	  // vtable of the correct base subobject.  The "function pointer" is an
	  // offset within the vtable (+1 for the virtual flag on non-ARM).
	  CGF.EmitBlock(FnVirtual);

	  // Cast the adjusted this to a pointer to vtable pointer and load.
	  llvm::Type *VTableTy = Builder.getInt8PtrTy();
	  CharUnits VTablePtrAlign =
	      CGF.CGM.getDynamicOffsetAlignment(ThisAddr.getAlignment(), RD,
	                                        CGF.getPointerAlign());
	  llvm::Value *VTable =
	      CGF.GetVTablePtr(Address(This, VTablePtrAlign), VTableTy, RD);

	  // Apply the offset.
	  // On ARM64, to reserve extra space in virtual member function pointers,
	  // we only pay attention to the low 32 bits of the offset.
	  llvm::Value *VTableOffset = FnAsInt;
	  if (!UseARMMethodPtrABI)
	    VTableOffset = Builder.CreateSub(VTableOffset, ptrdiff_1);
	  if (Use32BitVTableOffsetABI) {
	    VTableOffset = Builder.CreateTrunc(VTableOffset, CGF.Int32Ty);
	    VTableOffset = Builder.CreateZExt(VTableOffset, CGM.PtrDiffTy);
	  }
	  VTable = Builder.CreateGEP(VTable, VTableOffset);

	  // Load the virtual function to call.
	  VTable = Builder.CreateBitCast(VTable, FTy->getPointerTo()->getPointerTo());
	  llvm::Value *VirtualFn =
	      Builder.CreateAlignedLoad(VTable, CGF.getPointerAlign(),
	                                "memptr.virtualfn");
	  CGF.EmitBranch(FnEnd);

	  // In the non-virtual path, the function pointer is actually a
	  // function pointer.
	  CGF.EmitBlock(FnNonVirtual);
	  llvm::Value *NonVirtualFn =
	      Builder.CreateIntToPtr(FnAsInt, FTy->getPointerTo(), "memptr.nonvirtualfn");

	  // We're done: merge the two candidate callees.
	  CGF.EmitBlock(FnEnd);
	  llvm::PHINode *CalleePtr = Builder.CreatePHI(FTy->getPointerTo(), 2);
	  CalleePtr->addIncoming(VirtualFn, FnVirtual);
	  CalleePtr->addIncoming(NonVirtualFn, FnNonVirtual);

	  CGCallee Callee(FPT, CalleePtr);
	  return Callee;
	}

	/// Compute an l-value by applying the given pointer-to-member to a
	/// base object.
	llvm::Value *ItaniumCXXABI::EmitMemberDataPointerAddress(
	    CodeGenFunction &CGF, const Expr *E, Address Base, llvm::Value *MemPtr,
	    const MemberPointerType *MPT) {
	  // MemPtr is the data member pointer value (a ptrdiff_t byte offset);
	  // the result is Base + MemPtr, typed as a pointer to the member's
	  // in-memory type.
	  assert(MemPtr->getType() == CGM.PtrDiffTy);

	  CGBuilderTy &Builder = CGF.Builder;

	  // Cast to char*.
	  Base = Builder.CreateElementBitCast(Base, CGF.Int8Ty);

	  // Apply the offset, which we assume is non-null.
	  llvm::Value *Addr =
	      Builder.CreateInBoundsGEP(Base.getPointer(), MemPtr, "memptr.offset");

	  // Cast the address to the appropriate pointer type, adopting the
	  // address space of the base pointer.
	  llvm::Type *PType = CGF.ConvertTypeForMem(MPT->getPointeeType())
	                          ->getPointerTo(Base.getAddressSpace());
	  return Builder.CreateBitCast(Addr, PType);
	}

	/// Perform a bitcast, derived-to-base, or base-to-derived member pointer
	/// conversion.
	///
	/// Bitcast conversions are always a no-op under Itanium.
	///
	/// Obligatory offset/adjustment diagram:
	///         <-- offset -->          <-- adjustment -->
	///   |--------------------------|----------------------|--------------------|
	///   ^Derived address point     ^Base address point    ^Member address point
	///
	/// So when converting a base member pointer to a derived member pointer,
	/// we add the offset to the adjustment because the address point has
	/// decreased; and conversely, when converting a derived MP to a base MP
	/// we subtract the offset from the adjustment because the address point
	/// has increased.
	///
	/// The standard forbids (at compile time) conversion to and from
	/// virtual bases, which is why we don't have to consider them here.
	///
	/// The standard forbids (at run time) casting a derived MP to a base
	/// MP when the derived MP does not point to a member of the base.
	/// This is why -1 is a reasonable choice for null data member
	/// pointers.
	llvm::Value *
	ItaniumCXXABI::EmitMemberPointerConversion(CodeGenFunction &CGF,
	                                           const CastExpr *E,
	                                           llvm::Value *src) {
	  // Runtime (non-constant) variant: emits IR that converts 'src' as the
	  // cast E requires and returns the converted value.  Constant sources
	  // are forwarded to the llvm::Constant overload.
	  assert(E->getCastKind() == CK_DerivedToBaseMemberPointer ||
	         E->getCastKind() == CK_BaseToDerivedMemberPointer ||
	         E->getCastKind() == CK_ReinterpretMemberPointer);

	  // Under Itanium, reinterprets don't require any additional processing.
	  if (E->getCastKind() == CK_ReinterpretMemberPointer) return src;

	  // Use constant emission if we can.
	  if (isa<llvm::Constant>(src))
	    return EmitMemberPointerConversion(E, cast<llvm::Constant>(src));

	  // A null adjustment means the conversion is trivial.
	  llvm::Constant *adj = getMemberPointerAdjustment(E);
	  if (!adj) return src;

	  CGBuilderTy &Builder = CGF.Builder;
	  bool isDerivedToBase = (E->getCastKind() == CK_DerivedToBaseMemberPointer);

	  const MemberPointerType *destTy =
	      E->getType()->castAs<MemberPointerType>();

	  // For member data pointers, this is just a matter of adding the
	  // offset if the source is non-null.
	  if (destTy->isMemberDataPointer()) {
	    llvm::Value *dst;
	    if (isDerivedToBase)
	      dst = Builder.CreateNSWSub(src, adj, "adj");
	    else
	      dst = Builder.CreateNSWAdd(src, adj, "adj");

	    // Null check: the all-ones value is the null data member pointer and
	    // must be passed through unchanged.
	    llvm::Value *null = llvm::Constant::getAllOnesValue(src->getType());
	    llvm::Value *isNull = Builder.CreateICmpEQ(src, null, "memptr.isnull");
	    return Builder.CreateSelect(isNull, src, dst);
	  }

	  // The this-adjustment is left-shifted by 1 on ARM.
	  if (UseARMMethodPtrABI) {
	    uint64_t offset = cast<llvm::ConstantInt>(adj)->getZExtValue();
	    offset <<= 1;
	    adj = llvm::ConstantInt::get(adj->getType(), offset);
	  }

	  // Member function pointer: only the 'adj' field (index 1) changes.
	  llvm::Value *srcAdj = Builder.CreateExtractValue(src, 1, "src.adj");
	  llvm::Value *dstAdj;
	  if (isDerivedToBase)
	    dstAdj = Builder.CreateNSWSub(srcAdj, adj, "adj");
	  else
	    dstAdj = Builder.CreateNSWAdd(srcAdj, adj, "adj");

	  return Builder.CreateInsertValue(src, dstAdj, 1);
	}

	/// Constant-folding counterpart of the runtime member pointer conversion:
	/// converts the constant 'src' as the cast E requires and returns the
	/// resulting constant.
	llvm::Constant *
	ItaniumCXXABI::EmitMemberPointerConversion(const CastExpr *E,
	                                           llvm::Constant *src) {
	  assert(E->getCastKind() == CK_DerivedToBaseMemberPointer ||
	         E->getCastKind() == CK_BaseToDerivedMemberPointer ||
	         E->getCastKind() == CK_ReinterpretMemberPointer);

	  // Under Itanium, reinterprets don't require any additional processing.
	  if (E->getCastKind() == CK_ReinterpretMemberPointer) return src;

	  // If the adjustment is trivial, we don't need to do anything.
	  llvm::Constant *adj = getMemberPointerAdjustment(E);
	  if (!adj) return src;

	  bool isDerivedToBase = (E->getCastKind() == CK_DerivedToBaseMemberPointer);

	  const MemberPointerType *destTy =
	      E->getType()->castAs<MemberPointerType>();

	  // For member data pointers, this is just a matter of adding the
	  // offset if the source is non-null.
	  if (destTy->isMemberDataPointer()) {
	    // null maps to null.
	    if (src->isAllOnesValue()) return src;

	    if (isDerivedToBase)
	      return llvm::ConstantExpr::getNSWSub(src, adj);
	    else
	      return llvm::ConstantExpr::getNSWAdd(src, adj);
	  }

	  // The this-adjustment is left-shifted by 1 on ARM.
	  if (UseARMMethodPtrABI) {
	    uint64_t offset = cast<llvm::ConstantInt>(adj)->getZExtValue();
	    offset <<= 1;
	    adj = llvm::ConstantInt::get(adj->getType(), offset);
	  }

	  // Member function pointer: only the 'adj' field (index 1) changes.
	  llvm::Constant *srcAdj = llvm::ConstantExpr::getExtractValue(src, 1);
	  llvm::Constant *dstAdj;
	  if (isDerivedToBase)
	    dstAdj = llvm::ConstantExpr::getNSWSub(srcAdj, adj);
	  else
	    dstAdj = llvm::ConstantExpr::getNSWAdd(srcAdj, adj);

	  return llvm::ConstantExpr::getInsertValue(src, dstAdj, 1);
	}

	/// Builds the null constant for a member pointer type: -1 for a data
	/// member pointer, and the anonymous struct { 0, 0 } for a member
	/// function pointer.
	llvm::Constant *
	ItaniumCXXABI::EmitNullMemberPointer(const MemberPointerType *MPT) {
	  // Itanium C++ ABI 2.3:
	  //   A NULL pointer is represented as -1.
	  if (MPT->isMemberDataPointer())
	    return llvm::ConstantInt::get(CGM.PtrDiffTy, -1ULL, /*isSigned=*/true);

	  llvm::Constant *Zero = llvm::ConstantInt::get(CGM.PtrDiffTy, 0);
	  llvm::Constant *Values[2] = { Zero, Zero };
	  return llvm::ConstantStruct::getAnon(Values);
	}

	/// Builds the constant for a pointer to data member located 'offset' from
	/// the start of its class object.  MPT is not consulted.
	llvm::Constant *
	ItaniumCXXABI::EmitMemberDataPointer(const MemberPointerType *MPT,
	                                     CharUnits offset) {
	  // Per Itanium C++ ABI 2.3, a pointer to data member is the member's
	  // offset from the base address of the containing class object, stored
	  // as a ptrdiff_t.
	  const auto ByteOffset = offset.getQuantity();
	  return llvm::ConstantInt::get(CGM.PtrDiffTy, ByteOffset);
	}

	/// Builds the member function pointer constant for MD with a zero
	/// this-adjustment.
	llvm::Constant *
	ItaniumCXXABI::EmitMemberFunctionPointer(const CXXMethodDecl *MD) {
	  const CharUnits NoAdjustment = CharUnits::Zero();
	  return BuildMemberPointer(MD, NoAdjustment);
	}

	/// Builds the { ptr, adj } constant for a pointer to the instance method
	/// MD with the given this-adjustment, using the ARM or Itanium field
	/// encoding depending on UseARMMethodPtrABI.
	llvm::Constant *ItaniumCXXABI::BuildMemberPointer(const CXXMethodDecl *MD,
	                                                  CharUnits ThisAdjustment) {
	  assert(MD->isInstance() && "Member function must not be static!");
	  MD = MD->getCanonicalDecl();

	  CodeGenTypes &Types = CGM.getTypes();

	  // Get the function pointer (or index if this is a virtual function).
	  llvm::Constant *MemPtr[2];
	  if (MD->isVirtual()) {
	    uint64_t Index = CGM.getItaniumVTableContext().getMethodVTableIndex(MD);

	    // Convert the slot index into a byte offset within the vtable.
	    const ASTContext &Context = getContext();
	    CharUnits PointerWidth =
	        Context.toCharUnitsFromBits(Context.getTargetInfo().getPointerWidth(0));
	    uint64_t VTableOffset = (Index * PointerWidth.getQuantity());

	    if (UseARMMethodPtrABI) {
	      // ARM C++ ABI 3.2.1:
	      //   This ABI specifies that adj contains twice the this
	      //   adjustment, plus 1 if the member function is virtual. The
	      //   least significant bit of adj then makes exactly the same
	      //   discrimination as the least significant bit of ptr does for
	      //   Itanium.
	      MemPtr[0] = llvm::ConstantInt::get(CGM.PtrDiffTy, VTableOffset);
	      MemPtr[1] = llvm::ConstantInt::get(CGM.PtrDiffTy,
	                                         2 * ThisAdjustment.getQuantity() + 1);
	    } else {
	      // Itanium C++ ABI 2.3:
	      //   For a virtual function, [the pointer field] is 1 plus the
	      //   virtual table offset (in bytes) of the function,
	      //   represented as a ptrdiff_t.
	      MemPtr[0] = llvm::ConstantInt::get(CGM.PtrDiffTy, VTableOffset + 1);
	      MemPtr[1] = llvm::ConstantInt::get(CGM.PtrDiffTy,
	                                         ThisAdjustment.getQuantity());
	    }
	  } else {
	    const FunctionProtoType *FPT = MD->getType()->castAs<FunctionProtoType>();
	    llvm::Type *Ty;
	    // Check whether the function has a computable LLVM signature.
	    if (Types.isFuncTypeConvertible(FPT)) {
	      // The function has a computable LLVM signature; use the correct type.
	      Ty = Types.GetFunctionType(Types.arrangeCXXMethodDeclaration(MD));
	    } else {
	      // Use an arbitrary non-function type to tell GetAddrOfFunction that the
	      // function type is incomplete.
	      Ty = CGM.PtrDiffTy;
	    }
	    llvm::Constant *addr = CGM.GetAddrOfFunction(MD, Ty);

	    // Non-virtual: ptr is the function address; adj is the adjustment,
	    // doubled under the ARM ABI (whose low bit is the virtual flag).
	    MemPtr[0] = llvm::ConstantExpr::getPtrToInt(addr, CGM.PtrDiffTy);
	    MemPtr[1] = llvm::ConstantInt::get(CGM.PtrDiffTy,
	                                       (UseARMMethodPtrABI ? 2 : 1) *
	                                       ThisAdjustment.getQuantity());
	  }

	  return llvm::ConstantStruct::getAnon(MemPtr);
	}

	/// Builds the constant for an evaluated member pointer value: the null
	/// constant when MP names no declaration, a member function pointer when
	/// it names a method, and otherwise a data member offset.
	llvm::Constant *ItaniumCXXABI::EmitMemberPointer(const APValue &MP,
	                                                 QualType MPType) {
	  const auto *MPT = MPType->castAs<MemberPointerType>();
	  const ValueDecl *Member = MP.getMemberPointerDecl();

	  // No declaration: this is the null member pointer.
	  if (Member == nullptr)
	    return EmitNullMemberPointer(MPT);

	  CharUnits PathAdjustment = getMemberPointerPathAdjustment(MP);

	  // Methods go through the shared { ptr, adj } builder.
	  if (const auto *Method = dyn_cast<CXXMethodDecl>(Member))
	    return BuildMemberPointer(Method, PathAdjustment);

	  // Data member: path adjustment plus the field's own offset (the AST
	  // reports field offsets in bits).
	  const auto FieldOffsetInBits = getContext().getFieldOffset(Member);
	  CharUnits FieldOffset =
	      getContext().toCharUnitsFromBits(FieldOffsetInBits);
	  return EmitMemberDataPointer(MPT, PathAdjustment + FieldOffset);
	}

	/// The comparison algorithm is pretty easy: the member pointers are
	/// the same if they're either bitwise identical or both null.
	///
	/// ARM is different here only because null-ness is more complicated.
	llvm::Value *
	ItaniumCXXABI::EmitMemberPointerComparison(CodeGenFunction &CGF,
	                                           llvm::Value *L,
	                                           llvm::Value *R,
	                                           const MemberPointerType *MPT,
	                                           bool Inequality) {
	  CGBuilderTy &Builder = CGF.Builder;

	  // The '!=' form is the De Morgan dual of the '==' form, so pick the
	  // predicate and the two connectives once, swapped for inequality.
	  const llvm::ICmpInst::Predicate Cmp =
	      Inequality ? llvm::ICmpInst::ICMP_NE : llvm::ICmpInst::ICMP_EQ;
	  const llvm::Instruction::BinaryOps Conj =
	      Inequality ? llvm::Instruction::Or : llvm::Instruction::And;
	  const llvm::Instruction::BinaryOps Disj =
	      Inequality ? llvm::Instruction::And : llvm::Instruction::Or;

	  // Data member pointers have a single null representation, so a plain
	  // bitwise comparison is enough.
	  if (MPT->isMemberDataPointer())
	    return Builder.CreateICmp(Cmp, L, R);

	  // Member function pointers need more care.
	  // Itanium:
	  //   (L == R) <==> (L.ptr == R.ptr && (L.ptr == 0 || L.adj == R.adj))
	  // ARM:
	  //   (L == R) <==> (L.ptr == R.ptr &&
	  //                  (L.adj == R.adj ||
	  //                   (L.ptr == 0 && ((L.adj|R.adj) & 1) == 0)))
	  // Inequality has the same shape with De Morgan's laws applied.

	  llvm::Value *LhsPtr = Builder.CreateExtractValue(L, 0, "lhs.memptr.ptr");
	  llvm::Value *RhsPtr = Builder.CreateExtractValue(R, 0, "rhs.memptr.ptr");

	  // L.ptr == R.ptr: required for equality in every case.
	  llvm::Value *SamePtr = Builder.CreateICmp(Cmp, LhsPtr, RhsPtr, "cmp.ptr");

	  // Given L.ptr == R.ptr, L.ptr == 0 means both are null (ARM adds a
	  // further condition below).
	  llvm::Value *NullVal = llvm::Constant::getNullValue(LhsPtr->getType());
	  llvm::Value *BothNull =
	      Builder.CreateICmp(Cmp, LhsPtr, NullVal, "cmp.ptr.null");

	  // L.adj == R.adj: if this fails, the pointers can only be equal when
	  // both are null.
	  llvm::Value *LhsAdj = Builder.CreateExtractValue(L, 1, "lhs.memptr.adj");
	  llvm::Value *RhsAdj = Builder.CreateExtractValue(R, 1, "rhs.memptr.adj");
	  llvm::Value *SameAdj = Builder.CreateICmp(Cmp, LhsAdj, RhsAdj, "cmp.adj");

	  // On ARM a null member function pointer also has the low bit of adj
	  // clear, so the null test must check that neither low bit is set.
	  if (UseARMMethodPtrABI) {
	    llvm::Value *LowBitMask = llvm::ConstantInt::get(LhsPtr->getType(), 1);

	    // (l.adj | r.adj) & 1, compared against zero.
	    llvm::Value *EitherAdj = Builder.CreateOr(LhsAdj, RhsAdj, "or.adj");
	    llvm::Value *EitherLowBit = Builder.CreateAnd(EitherAdj, LowBitMask);
	    llvm::Value *LowBitsClear =
	        Builder.CreateICmp(Cmp, EitherLowBit, NullVal, "cmp.or.adj");
	    BothNull = Builder.CreateBinOp(Conj, BothNull, LowBitsClear);
	  }

	  // Combine the pieces.
	  llvm::Value *NullOrSameAdj = Builder.CreateBinOp(Disj, BothNull, SameAdj);
	  return Builder.CreateBinOp(Conj, SamePtr, NullOrSameAdj,
	                             Inequality ? "memptr.ne" : "memptr.eq");
	}

	/// Emits a boolean that is true when the member pointer MemPtr is not
	/// null.
	llvm::Value *
	ItaniumCXXABI::EmitMemberPointerIsNotNull(CodeGenFunction &CGF,
	                                          llvm::Value *MemPtr,
	                                          const MemberPointerType *MPT) {
	  CGBuilderTy &Builder = CGF.Builder;

	  // Data member pointers are null exactly when they equal -1.
	  if (MPT->isMemberDataPointer()) {
	    assert(MemPtr->getType() == CGM.PtrDiffTy);
	    llvm::Value *AllOnes =
	        llvm::Constant::getAllOnesValue(MemPtr->getType());
	    return Builder.CreateICmpNE(MemPtr, AllOnes, "memptr.tobool");
	  }

	  // Itanium: a member function pointer is non-null when its 'ptr' field
	  // is non-zero.
	  llvm::Value *PtrField = Builder.CreateExtractValue(MemPtr, 0, "memptr.ptr");
	  llvm::Constant *NullVal = llvm::ConstantInt::get(PtrField->getType(), 0);
	  llvm::Value *PtrNonNull =
	      Builder.CreateICmpNE(PtrField, NullVal, "memptr.tobool");

	  if (!UseARMMethodPtrABI)
	    return PtrNonNull;

	  // ARM: the pointer also counts as non-null when the low bit of 'adj'
	  // (the virtual bit) is set.
	  llvm::Constant *LowBitMask = llvm::ConstantInt::get(PtrField->getType(), 1);
	  llvm::Value *AdjField = Builder.CreateExtractValue(MemPtr, 1, "memptr.adj");
	  llvm::Value *LowBit =
	      Builder.CreateAnd(AdjField, LowBitMask, "memptr.virtualbit");
	  llvm::Value *VirtualFlag =
	      Builder.CreateICmpNE(LowBit, NullVal, "memptr.isvirtual");
	  return Builder.CreateOr(PtrNonNull, VirtualFlag);
	}

	/// Decides whether the C++ ABI dictates how FI's return value is passed.
	/// Returns true, after setting an indirect (non-byval) return, when the
	/// return type is a C++ record that passClassIndirect() says must go
	/// indirectly; returns false and leaves FI untouched otherwise.
	bool ItaniumCXXABI::classifyReturnType(CGFunctionInfo &FI) const {
	  const CXXRecordDecl *RD = FI.getReturnType()->getAsCXXRecordDecl();
	  if (!RD)
	    return false;

	  // If C++ prohibits us from making a copy, return by address.
	  if (passClassIndirect(RD)) {
	    auto Align = CGM.getContext().getTypeAlignInChars(FI.getReturnType());
	    FI.getReturnInfo() = ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
	    return true;
	  }
	  return false;
	}

	/// The Itanium ABI requires non-zero initialization only for data
	/// member pointers, for which '0' is a valid offset.
	bool ItaniumCXXABI::isZeroInitializable(const MemberPointerType *MPT) {
	  // Only member function pointers can be zero-initialized.
	  const bool IsFunctionPointer = MPT->isMemberFunctionPointer();
	  return IsFunctionPointer;
	}

	/// The Itanium ABI always places an offset to the complete object
	/// at entry -2 in the vtable.
	void ItaniumCXXABI::emitVirtualObjectDelete(CodeGenFunction &CGF,
	                                            const CXXDeleteExpr *DE,
	                                            Address Ptr,
	                                            QualType ElementType,
	                                            const CXXDestructorDecl *Dtor) {
	  // For '::delete' the complete destructor is called and the global
	  // operator delete is applied to the complete-object pointer through a
	  // cleanup; otherwise the deleting destructor does the whole job.
	  bool UseGlobalDelete = DE->isGlobalDelete();
	  if (UseGlobalDelete) {
	    // Derive the complete-object pointer, which is what we need
	    // to pass to the deallocation function.

	    // Grab the vtable pointer as an intptr_t*.
	    auto *ClassDecl =
	        cast<CXXRecordDecl>(ElementType->getAs<RecordType>()->getDecl());
	    llvm::Value *VTable =
	        CGF.GetVTablePtr(Ptr, CGF.IntPtrTy->getPointerTo(), ClassDecl);

	    // Track back to entry -2 and pull out the offset there.
	    llvm::Value *OffsetPtr = CGF.Builder.CreateConstInBoundsGEP1_64(
	        VTable, -2, "complete-offset.ptr");
	    llvm::Value *Offset =
	        CGF.Builder.CreateAlignedLoad(OffsetPtr, CGF.getPointerAlign());

	    // Apply the offset.
	    llvm::Value *CompletePtr =
	        CGF.Builder.CreateBitCast(Ptr.getPointer(), CGF.Int8PtrTy);
	    CompletePtr = CGF.Builder.CreateInBoundsGEP(CompletePtr, Offset);

	    // If we're supposed to call the global delete, make sure we do so
	    // even if the destructor throws.
	    CGF.pushCallObjectDeleteCleanup(DE->getOperatorDelete(), CompletePtr,
	                                    ElementType);
	  }

	  // FIXME: Provide a source location here even though there's no
	  // CXXMemberCallExpr for dtor call.
	  CXXDtorType DtorType = UseGlobalDelete ? Dtor_Complete : Dtor_Deleting;
	  EmitVirtualDestructorCall(CGF, Dtor, DtorType, Ptr, /*CE=*/nullptr);

	  // Pop the cleanup pushed above for the global-delete case.
	  if (UseGlobalDelete)
	    CGF.PopCleanupBlock();
	}

	/// Emits a call (or invoke) of the runtime function __cxa_rethrow.  When
	/// isNoReturn is set the noreturn flavour of the emission helper is used.
	void ItaniumCXXABI::emitRethrow(CodeGenFunction &CGF, bool isNoReturn) {
	  // void __cxa_rethrow();

	  llvm::FunctionType *FTy =
	      llvm::FunctionType::get(CGM.VoidTy, /*IsVarArgs=*/false);

	  llvm::Constant *Fn = CGM.CreateRuntimeFunction(FTy, "__cxa_rethrow");

	  if (isNoReturn)
	    CGF.EmitNoreturnRuntimeCallOrInvoke(Fn, None);
	  else
	    CGF.EmitRuntimeCallOrInvoke(Fn);
	}

	/// Returns the runtime function __cxa_allocate_exception, typed as taking
	/// one size_t and returning an i8*.
	static llvm::Constant *getAllocateExceptionFn(CodeGenModule &CGM) {
	  // void *__cxa_allocate_exception(size_t thrown_size);

	  llvm::FunctionType *FTy =
	      llvm::FunctionType::get(CGM.Int8PtrTy, CGM.SizeTy, /*IsVarArgs=*/false);

	  return CGM.CreateRuntimeFunction(FTy, "__cxa_allocate_exception");
	}

	/// Returns the runtime function __cxa_throw, typed as taking three i8*
	/// arguments and returning void.
	static llvm::Constant *getThrowFn(CodeGenModule &CGM) {
	  // void __cxa_throw(void *thrown_exception, std::type_info *tinfo,
	  //                  void (*dest) (void *));

	  llvm::Type *Args[3] = { CGM.Int8PtrTy, CGM.Int8PtrTy, CGM.Int8PtrTy };
	  llvm::FunctionType *FTy =
	      llvm::FunctionType::get(CGM.VoidTy, Args, /*IsVarArgs=*/false);

	  return CGM.CreateRuntimeFunction(FTy, "__cxa_throw");
	}

	/// Emits a throw expression: allocates the exception object with
	/// __cxa_allocate_exception, evaluates the operand into it, then calls
	/// __cxa_throw with the object, its RTTI descriptor and its destructor
	/// (null when there is nothing to run).
	void ItaniumCXXABI::emitThrow(CodeGenFunction &CGF, const CXXThrowExpr *E) {
	  QualType ThrowType = E->getSubExpr()->getType();
	  // Now allocate the exception object.
	  llvm::Type *SizeTy = CGF.ConvertType(getContext().getSizeType());
	  uint64_t TypeSize = getContext().getTypeSizeInChars(ThrowType).getQuantity();

	  llvm::Constant *AllocExceptionFn = getAllocateExceptionFn(CGM);
	  llvm::CallInst *ExceptionPtr = CGF.EmitNounwindRuntimeCall(
	      AllocExceptionFn, llvm::ConstantInt::get(SizeTy, TypeSize), "exception");

	  // Construct the thrown value in the freshly allocated storage.
	  CharUnits ExnAlign = getAlignmentOfExnObject();
	  CGF.EmitAnyExprToExn(E->getSubExpr(), Address(ExceptionPtr, ExnAlign));

	  // Now throw the exception.
	  llvm::Constant *TypeInfo = CGM.GetAddrOfRTTIDescriptor(ThrowType,
	                                                         /*ForEH=*/true);

	  // The address of the destructor.  If the exception type has a
	  // trivial destructor (or isn't a record), we just pass null.
	  llvm::Constant *Dtor = nullptr;
	  if (const RecordType *RecordTy = ThrowType->getAs<RecordType>()) {
	    CXXRecordDecl *Record = cast<CXXRecordDecl>(RecordTy->getDecl());
	    if (!Record->hasTrivialDestructor()) {
	      CXXDestructorDecl *DtorD = Record->getDestructor();
	      Dtor = CGM.getAddrOfCXXStructor(DtorD, StructorType::Complete);
	      Dtor = llvm::ConstantExpr::getBitCast(Dtor, CGM.Int8PtrTy);
	    }
	  }
	  if (!Dtor) Dtor = llvm::Constant::getNullValue(CGM.Int8PtrTy);

	  llvm::Value *args[] = { ExceptionPtr, TypeInfo, Dtor };
	  CGF.EmitNoreturnRuntimeCallOrInvoke(getThrowFn(CGM), args);
	}

	/// Return the declaration of the runtime function `__dynamic_cast`.
	///
	/// The function type built here takes three `Int8PtrTy` arguments plus one
	/// argument of the target's pointer-difference type and returns `Int8PtrTy`.
	/// The declaration is created with the NoUnwind and ReadOnly function
	/// attributes.
	static llvm::Constant *getItaniumDynamicCastFn(CodeGenFunction &CGF) {
	// void *__dynamic_cast(const void *sub,
	// const abi::__class_type_info *src,
	// const abi::__class_type_info *dst,
	// std::ptrdiff_t src2dst_offset);

	llvm::Type *Int8PtrTy = CGF.Int8PtrTy;
	llvm::Type *PtrDiffTy =
	CGF.ConvertType(CGF.getContext().getPointerDiffType());

	llvm::Type *Args[4] = { Int8PtrTy, Int8PtrTy, Int8PtrTy, PtrDiffTy };

	// The trailing 'false' is the is-variadic flag.
	llvm::FunctionType *FTy = llvm::FunctionType::get(Int8PtrTy, Args, false);

	// Mark the function as nounwind readonly.
	llvm::Attribute::AttrKind FuncAttrs[] = { llvm::Attribute::NoUnwind,
	llvm::Attribute::ReadOnly };
	llvm::AttributeList Attrs = llvm::AttributeList::get(
	CGF.getLLVMContext(), llvm::AttributeList::FunctionIndex, FuncAttrs);

	return CGF.CGM.CreateRuntimeFunction(FTy, "__dynamic_cast", Attrs);
	}

	/// Return the declaration of the runtime function `__cxa_bad_cast`, a
	/// non-variadic function with no parameters that returns void.
	static llvm::Constant *getBadCastFn(CodeGenFunction &CGF) {
	  // Runtime prototype: void __cxa_bad_cast();
	  llvm::FunctionType *BadCastTy =
	      llvm::FunctionType::get(CGF.VoidTy, false);
	  llvm::Constant *BadCastFn =
	      CGF.CGM.CreateRuntimeFunction(BadCastTy, "__cxa_bad_cast");
	  return BadCastFn;
	}

	/// \brief Compute the src2dst_offset hint as described in the
	/// Itanium C++ ABI [2.9.7]
	///
	/// \param Context AST context used to obtain record layouts.
	/// \param Src     the static (source) class of the cast operand.
	/// \param Dst     the destination class of the cast.
	/// \returns the accumulated base-class offset along the path when Src is a
	///          unique public non-virtual base of Dst; otherwise one of the
	///          sentinel values
	///          -1 (a public path contains a virtual base: no hint),
	///          -2 (Src is not a public base of Dst), or
	///          -3 (Src is reachable through more than one public path).
	static CharUnits computeOffsetHint(ASTContext &Context,
	                                   const CXXRecordDecl *Src,
	                                   const CXXRecordDecl *Dst) {
	  CXXBasePaths Paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true,
	                     /*DetectVirtual=*/false);

	  // If Dst is not derived from Src we can skip the whole computation below and
	  // return that Src is not a public base of Dst.  Record all inheritance paths.
	  if (!Dst->isDerivedFrom(Src, Paths))
	    return CharUnits::fromQuantity(-2ULL);

	  unsigned NumPublicPaths = 0;
	  CharUnits Offset;

	  // Now walk all possible inheritance paths.
	  for (const CXXBasePath &Path : Paths) {
	    if (Path.Access != AS_public)  // Ignore non-public inheritance.
	      continue;

	    ++NumPublicPaths;

	    for (const CXXBasePathElement &PathElement : Path) {
	      // If the path contains a virtual base class we can't give any hint.
	      // -1: no hint.
	      if (PathElement.Base->isVirtual())
	        return CharUnits::fromQuantity(-1ULL);

	      if (NumPublicPaths > 1) // Won't use offsets, skip computation.
	        continue;

	      // Accumulate the base class offsets.
	      const ASTRecordLayout &L = Context.getASTRecordLayout(PathElement.Class);
	      Offset += L.getBaseClassOffset(
	          PathElement.Base->getType()->getAsCXXRecordDecl());
	    }
	  }

	  // -2: Src is not a public base of Dst.
	  if (NumPublicPaths == 0)
	    return CharUnits::fromQuantity(-2ULL);

	  // -3: Src is a multiple public base type but never a virtual base type.
	  if (NumPublicPaths > 1)
	    return CharUnits::fromQuantity(-3ULL);

	  // Otherwise, the Src type is a unique public nonvirtual base type of Dst.
	  // Return the offset of Src from the origin of Dst.
	  return Offset;
	}

	/// Return the declaration of the runtime function `__cxa_bad_typeid`, a
	/// non-variadic function with no parameters that returns void.
	static llvm::Constant *getBadTypeidFn(CodeGenFunction &CGF) {
	  // Runtime prototype: void __cxa_bad_typeid();
	  llvm::FunctionType *BadTypeidTy =
	      llvm::FunctionType::get(CGF.VoidTy, false);
	  llvm::Constant *BadTypeidFn =
	      CGF.CGM.CreateRuntimeFunction(BadTypeidTy, "__cxa_bad_typeid");
	  return BadTypeidFn;
	}

	/// Report whether a typeid operand needs a null check.  The answer is
	/// exactly \p IsDeref; \p SrcRecordTy is not consulted.
	bool ItaniumCXXABI::shouldTypeidBeNullChecked(bool IsDeref,
	QualType SrcRecordTy) {
	return IsDeref;
	}

	/// Emit a call (or invoke) to `__cxa_bad_typeid`, mark the call site as
	/// not returning, and terminate the current block with `unreachable`.
	void ItaniumCXXABI::EmitBadTypeidCall(CodeGenFunction &CGF) {
	  llvm::Value *BadTypeidFn = getBadTypeidFn(CGF);

	  // The runtime routine never returns to the caller.
	  CGF.EmitRuntimeCallOrInvoke(BadTypeidFn).setDoesNotReturn();
	  CGF.Builder.CreateUnreachable();
	}

	/// Emit code that fetches the type_info pointer of a polymorphic object.
	///
	/// \param CGF              the function currently being emitted.
	/// \param SrcRecordTy      record type of the object; must be a RecordType
	///                         whose declaration is a CXXRecordDecl (both are
	///                         dereferenced/cast unconditionally).
	/// \param ThisPtr          address of the object.
	/// \param StdTypeInfoPtrTy LLVM type used for the loaded value; the vtable
	///                         pointer is fetched as a pointer to this type.
	/// \returns the value loaded from the slot at index -1 relative to the
	///          object's vtable pointer.
	llvm::Value *ItaniumCXXABI::EmitTypeid(CodeGenFunction &CGF,
	QualType SrcRecordTy,
	Address ThisPtr,
	llvm::Type *StdTypeInfoPtrTy) {
	auto *ClassDecl =
	cast<CXXRecordDecl>(SrcRecordTy->getAs<RecordType>()->getDecl());
	llvm::Value *Value =
	CGF.GetVTablePtr(ThisPtr, StdTypeInfoPtrTy->getPointerTo(), ClassDecl);

	// Load the type info.
	// Step back one slot from the vtable pointer (-1ULL is index -1).
	Value = CGF.Builder.CreateConstInBoundsGEP1_64(Value, -1ULL);
	return CGF.Builder.CreateAlignedLoad(Value, CGF.getPointerAlign());
	}

	/// Report whether a dynamic_cast operand needs a null check.  The answer
	/// is exactly \p SrcIsPtr; \p SrcRecordTy is not consulted.
	bool ItaniumCXXABI::shouldDynamicCastCallBeNullChecked(bool SrcIsPtr,
	QualType SrcRecordTy) {
	return SrcIsPtr;
	}

	/// Emit a call to the `__dynamic_cast` runtime function.
	///
	/// \param CGF          the function currently being emitted.
	/// \param ThisAddr     address of the object being cast.
	/// \param SrcRecordTy  record type of the source object.
	/// \param DestTy       the destination type of the cast expression; the
	///                     result is bitcast to its LLVM type.
	/// \param DestRecordTy record type the cast targets.
	/// \param CastEnd      block to branch to on success when \p DestTy is a
	///                     reference type.
	/// \returns the runtime call's result, bitcast to the LLVM type of
	///          \p DestTy.
	llvm::Value *ItaniumCXXABI::EmitDynamicCastCall(
	CodeGenFunction &CGF, Address ThisAddr, QualType SrcRecordTy,
	QualType DestTy, QualType DestRecordTy, llvm::BasicBlock *CastEnd) {
	llvm::Type *PtrDiffLTy =
	CGF.ConvertType(CGF.getContext().getPointerDiffType());
	llvm::Type *DestLTy = CGF.ConvertType(DestTy);

	// RTTI descriptors are looked up for the cv-unqualified record types.
	llvm::Value *SrcRTTI =
	CGF.CGM.GetAddrOfRTTIDescriptor(SrcRecordTy.getUnqualifiedType());
	llvm::Value *DestRTTI =
	CGF.CGM.GetAddrOfRTTIDescriptor(DestRecordTy.getUnqualifiedType());

	// Compute the offset hint.
	const CXXRecordDecl *SrcDecl = SrcRecordTy->getAsCXXRecordDecl();
	const CXXRecordDecl *DestDecl = DestRecordTy->getAsCXXRecordDecl();
	llvm::Value *OffsetHint = llvm::ConstantInt::get(
	PtrDiffLTy,
	computeOffsetHint(CGF.getContext(), SrcDecl, DestDecl).getQuantity());

	// Emit the call to __dynamic_cast.
	llvm::Value *Value = ThisAddr.getPointer();
	Value = CGF.EmitCastToVoidPtr(Value);

	llvm::Value *args[] = {Value, SrcRTTI, DestRTTI, OffsetHint};
	Value = CGF.EmitNounwindRuntimeCall(getItaniumDynamicCastFn(CGF), args);
	Value = CGF.Builder.CreateBitCast(Value, DestLTy);

	/// C++ [expr.dynamic.cast]p9:
	/// A failed cast to reference type throws std::bad_cast
	if (DestTy->isReferenceType()) {
	llvm::BasicBlock *BadCastBlock =
	CGF.createBasicBlock("dynamic_cast.bad_cast");

	// A null result branches to the bad-cast block; otherwise to CastEnd.
	llvm::Value *IsNull = CGF.Builder.CreateIsNull(Value);
	CGF.Builder.CreateCondBr(IsNull, BadCastBlock, CastEnd);

	CGF.EmitBlock(BadCastBlock);
	EmitBadCastCall(CGF);
	}

	return Value;
	}

	/// Emit the pointer adjustment for a dynamic_cast to void pointer without
	/// calling the runtime: the value stored at index -2 relative to the vtable
	/// pointer is loaded and added to the object pointer.
	///
	/// \param CGF         the function currently being emitted.
	/// \param ThisAddr    address of the object being cast.
	/// \param SrcRecordTy record type of the object; must be a RecordType whose
	///                    declaration is a CXXRecordDecl.
	/// \param DestTy      destination type; the result is bitcast to its LLVM
	///                    type.
	/// \returns the adjusted pointer, bitcast to the LLVM type of \p DestTy.
	llvm::Value *ItaniumCXXABI::EmitDynamicCastToVoid(CodeGenFunction &CGF,
	Address ThisAddr,
	QualType SrcRecordTy,
	QualType DestTy) {
	llvm::Type *PtrDiffLTy =
	CGF.ConvertType(CGF.getContext().getPointerDiffType());
	llvm::Type *DestLTy = CGF.ConvertType(DestTy);

	auto *ClassDecl =
	cast<CXXRecordDecl>(SrcRecordTy->getAs<RecordType>()->getDecl());
	// Get the vtable pointer.
	// It is fetched as a pointer to ptrdiff-typed slots so the GEP below
	// indexes in units of that type.
	llvm::Value *VTable = CGF.GetVTablePtr(ThisAddr, PtrDiffLTy->getPointerTo(),
	ClassDecl);

	// Get the offset-to-top from the vtable.
	llvm::Value *OffsetToTop =
	CGF.Builder.CreateConstInBoundsGEP1_64(VTable, -2ULL);
	OffsetToTop =
	CGF.Builder.CreateAlignedLoad(OffsetToTop, CGF.getPointerAlign(),
	"offset.to.top");

	// Finally, add the offset to the pointer.
	llvm::Value *Value = ThisAddr.getPointer();
	Value = CGF.EmitCastToVoidPtr(Value);
	Value = CGF.Builder.CreateInBoundsGEP(Value, OffsetToTop);

	return CGF.Builder.CreateBitCast(Value, DestLTy);
	}

	/// Emit a call (or invoke) to `__cxa_bad_cast`, mark the call site as not
	/// returning, and terminate the current block with `unreachable`.
	///
	/// \returns always true.
	bool ItaniumCXXABI::EmitBadCastCall(CodeGenFunction &CGF) {
	  llvm::Value *BadCastFn = getBadCastFn(CGF);

	  // The runtime routine never returns to the caller.
	  CGF.EmitRuntimeCallOrInvoke(BadCastFn).setDoesNotReturn();
	  CGF.Builder.CreateUnreachable();

	  return true;
	}

	/// Emit code that loads, from the vtable of the object at \p This, the
	/// offset of virtual base \p BaseClassDecl within \p ClassDecl.
	///
	/// \returns the loaded pointer-difference-typed value, named "vbase.offset".
	llvm::Value *
	ItaniumCXXABI::GetVirtualBaseClassOffset(CodeGenFunction &CGF,
	                                         Address This,
	                                         const CXXRecordDecl *ClassDecl,
	                                         const CXXRecordDecl *BaseClassDecl) {
	  // The vtable pointer is fetched as i8* so the GEP below is byte-granular.
	  llvm::Value *VPtr = CGF.GetVTablePtr(This, CGM.Int8PtrTy, ClassDecl);

	  // Where, relative to the vtable pointer, the vbase offset is stored.
	  CharUnits SlotOffset =
	      CGM.getItaniumVTableContext().getVirtualBaseOffsetOffset(ClassDecl,
	                                                               BaseClassDecl);

	  llvm::Value *SlotPtr = CGF.Builder.CreateConstGEP1_64(
	      VPtr, SlotOffset.getQuantity(), "vbase.offset.ptr");
	  SlotPtr =
	      CGF.Builder.CreateBitCast(SlotPtr, CGM.PtrDiffTy->getPointerTo());

	  return CGF.Builder.CreateAlignedLoad(SlotPtr, CGF.getPointerAlign(),
	                                       "vbase.offset");
	}

	/// Request emission of the constructor variants for \p D: always the
	/// base-object variant, plus the complete-object variant unless the parent
	/// class is abstract.
	void ItaniumCXXABI::EmitCXXConstructors(const CXXConstructorDecl *D) {
	  // Sanity check that TargetCXXABI agrees constructors come in variants.
	  assert(CGM.getTarget().getCXXABI().hasConstructorVariants());

	  // Base-object constructor: constructs this as a base class and ignores
	  // virtual bases.
	  CGM.EmitGlobal(GlobalDecl(D, Ctor_Base));

	  // An abstract class does not need a complete-object constructor.
	  if (D->getParent()->isAbstract())
	    return;

	  // Complete-object constructor: constructs the virtual bases, then calls
	  // the base constructor.
	  CGM.EmitGlobal(GlobalDecl(D, Ctor_Complete));
	}

	/// Add the implicit VTT parameter type to a structor signature when one
	/// is required.
	///
	/// A VTT parameter (type `void **`) is inserted at index 1 of \p ArgTys
	/// when \p T is the base variant and the parent class has virtual bases.
	///
	/// \returns `AddedStructorArgs::prefix(1)` when the parameter was added,
	///          otherwise an empty `AddedStructorArgs`.
	CGCXXABI::AddedStructorArgs
	ItaniumCXXABI::buildStructorSignature(const CXXMethodDecl *MD, StructorType T,
	                                      SmallVectorImpl<CanQualType> &ArgTys) {
	  ASTContext &Ctx = getContext();

	  // Everything except the VTT is already in place; the VTT goes right after
	  // 'this'.  These are Clang types, so sret is not a concern yet.
	  const bool WantsVTT =
	      T == StructorType::Base && MD->getParent()->getNumVBases() != 0;
	  if (!WantsVTT)
	    return AddedStructorArgs{};

	  ArgTys.insert(ArgTys.begin() + 1, Ctx.getPointerType(Ctx.VoidPtrTy));
	  return AddedStructorArgs::prefix(1);
	}

	/// Request emission of the destructor variants for \p D: the base and
	/// complete variants always, and the deleting variant only when the
	/// destructor is virtual.
	void ItaniumCXXABI::EmitCXXDestructors(const CXXDestructorDecl *D) {
	// The destructor used for destructing this as a base class; ignores
	// virtual bases.
	CGM.EmitGlobal(GlobalDecl(D, Dtor_Base));

	// The destructor used for destructing this as a most-derived class;
	// call the base destructor and then destructs any virtual bases.
	CGM.EmitGlobal(GlobalDecl(D, Dtor_Complete));

	// The destructor in a virtual table is always a 'deleting'
	// destructor, which calls the complete destructor and then uses the
	// appropriate operator delete.
	if (D->isVirtual())
	CGM.EmitGlobal(GlobalDecl(D, Dtor_Deleting));
	}

	/// Add the implicit "vtt" parameter declaration to the structor being
	/// emitted, when that structor needs a VTT.
	///
	/// \param CGF    the function being emitted; its current declaration must
	///               be a constructor or destructor.
	/// \param ResTy  not used by this implementation.
	/// \param Params parameter list; the VTT declaration is inserted at index 1.
	void ItaniumCXXABI::addImplicitStructorParams(CodeGenFunction &CGF,
	                                              QualType &ResTy,
	                                              FunctionArgList &Params) {
	  const CXXMethodDecl *MD = cast<CXXMethodDecl>(CGF.CurGD.getDecl());
	  assert(isa<CXXConstructorDecl>(MD) || isa<CXXDestructorDecl>(MD));

	  // Check if we need a VTT parameter as well.
	  if (NeedsVTTParameter(CGF.CurGD)) {
	    ASTContext &Context = getContext();

	    // FIXME: avoid the fake decl
	    QualType T = Context.getPointerType(Context.VoidPtrTy);
	    auto *VTTDecl = ImplicitParamDecl::Create(
	        Context, /*DC=*/nullptr, MD->getLocation(), &Context.Idents.get("vtt"),
	        T, ImplicitParamDecl::CXXVTT);
	    Params.insert(Params.begin() + 1, VTTDecl);
	    // Remember the declaration so the prolog can load its value.
	    getStructorImplicitParamDecl(CGF) = VTTDecl;
	  }
	}

	/// Emit the ABI-specific prolog of an instance function: set up the 'this'
	/// value, load the implicit 'vtt' parameter if one was declared, and
	/// pre-store 'this' into the return slot for functions that return 'this'.
	/// Nothing is emitted for functions carrying the naked attribute.
	void ItaniumCXXABI::EmitInstanceFunctionProlog(CodeGenFunction &CGF) {
	// Naked functions have no prolog.
	if (CGF.CurFuncDecl && CGF.CurFuncDecl->hasAttr<NakedAttr>())
	return;

	/// Initialize the 'this' slot. In the Itanium C++ ABI, no prologue
	/// adjustments are required, because they are all handled by thunks.
	setCXXABIThisValue(CGF, loadIncomingCXXThis(CGF));

	/// Initialize the 'vtt' slot if needed.
	if (getStructorImplicitParamDecl(CGF)) {
	getStructorImplicitParamValue(CGF) = CGF.Builder.CreateLoad(
	CGF.GetAddrOfLocalVar(getStructorImplicitParamDecl(CGF)), "vtt");
	}

	/// If this is a function that the ABI specifies returns 'this', initialize
	/// the return slot to 'this' at the start of the function.
	///
	/// Unlike the setting of return types, this is done within the ABI
	/// implementation instead of by clients of CGCXXABI because:
	/// 1) getThisValue is currently protected
	/// 2) in theory, an ABI could implement 'this' returns some other way;
	/// HasThisReturn only specifies a contract, not the implementation
	if (HasThisReturn(CGF.CurGD))
	CGF.Builder.CreateStore(getThisValue(CGF), CGF.ReturnValue);
	}

	/// Add the implicit VTT argument to a constructor call when the callee
	/// variant needs one.
	///
	/// \param CGF            the function currently being emitted.
	/// \param D              the constructor being called.
	/// \param Type           which constructor variant is being called.
	/// \param ForVirtualBase forwarded to `GetVTTParameter`.
	/// \param Delegating     forwarded to `GetVTTParameter`.
	/// \param Args           call arguments; the VTT is inserted at index 1.
	/// \returns `AddedStructorArgs::prefix(1)` when an argument was added,
	///          otherwise an empty `AddedStructorArgs`.
	CGCXXABI::AddedStructorArgs ItaniumCXXABI::addImplicitConstructorArgs(
	    CodeGenFunction &CGF, const CXXConstructorDecl *D, CXXCtorType Type,
	    bool ForVirtualBase, bool Delegating, CallArgList &Args) {
	  if (!NeedsVTTParameter(GlobalDecl(D, Type)))
	    return AddedStructorArgs{};

	  // Insert the implicit 'vtt' argument as the second argument.
	  llvm::Value *VTT =
	      CGF.GetVTTParameter(GlobalDecl(D, Type), ForVirtualBase, Delegating);
	  QualType VTTTy = getContext().getPointerType(getContext().VoidPtrTy);
	  Args.insert(Args.begin() + 1,
	              CallArg(RValue::get(VTT), VTTTy, /*needscopy=*/false));
	  return AddedStructorArgs::prefix(1);  // Added one arg.
	}

	/// Emit a call to variant \p Type of destructor \p DD on the object at
	/// \p This, passing the VTT obtained from `GetVTTParameter` as the
	/// implicit parameter.
	///
	/// In AppleKext mode a virtual destructor (other than the base variant) is
	/// called through `BuildAppleKextVirtualDestructorCall`; in every other
	/// case the structor is called directly.
	void ItaniumCXXABI::EmitDestructorCall(CodeGenFunction &CGF,
	                                       const CXXDestructorDecl *DD,
	                                       CXXDtorType Type, bool ForVirtualBase,
	                                       bool Delegating, Address This) {
	  GlobalDecl DtorGD(DD, Type);
	  llvm::Value *VTTArg = CGF.GetVTTParameter(DtorGD, ForVirtualBase, Delegating);
	  QualType VTTArgTy = getContext().getPointerType(getContext().VoidPtrTy);

	  const bool ViaKextVirtualCall = getContext().getLangOpts().AppleKext &&
	                                  Type != Dtor_Base && DD->isVirtual();

	  CGCallee Target;
	  if (ViaKextVirtualCall) {
	    Target = CGF.BuildAppleKextVirtualDestructorCall(DD, Type, DD->getParent());
	  } else {
	    Target = CGCallee::forDirect(
	        CGM.getAddrOfCXXStructor(DD, getFromDtorType(Type)), DD);
	  }

	  CGF.EmitCXXMemberOrOperatorCall(DD, Target, ReturnValueSlot(),
	                                  This.getPointer(), VTTArg, VTTArgTy,
	                                  nullptr, nullptr);
	}

	/// Emit the definition (initializer, linkage, COMDAT, visibility and
	/// alignment) of the vtable for \p RD.  Does nothing if the vtable global
	/// already has an initializer.
	///
	/// \param CGVT used to build the vtable initializer.
	/// \param RD   the class whose vtable is defined.
	void ItaniumCXXABI::emitVTableDefinitions(CodeGenVTables &CGVT,
	const CXXRecordDecl *RD) {
	llvm::GlobalVariable *VTable = getAddrOfVTable(RD, CharUnits());
	// Already defined: nothing to do.
	if (VTable->hasInitializer())
	return;

	ItaniumVTableContext &VTContext = CGM.getItaniumVTableContext();
	const VTableLayout &VTLayout = VTContext.getVTableLayout(RD);
	llvm::GlobalVariable::LinkageTypes Linkage = CGM.getVTableLinkage(RD);
	llvm::Constant *RTTI =
	CGM.GetAddrOfRTTIDescriptor(CGM.getContext().getTagDeclType(RD));

	// Create and set the initializer.
	ConstantInitBuilder Builder(CGM);
	auto Components = Builder.beginStruct();
	CGVT.createVTableInitializer(Components, VTLayout, RTTI);
	Components.finishAndSetAsInitializer(VTable);

	// Set the correct linkage.
	VTable->setLinkage(Linkage);

	// Weak-for-linker vtables go into a COMDAT named after the vtable when the
	// target supports COMDATs.
	if (CGM.supportsCOMDAT() && VTable->isWeakForLinker())
	VTable->setComdat(CGM.getModule().getOrInsertComdat(VTable->getName()));

	// Set the right visibility.
	CGM.setGlobalVisibility(VTable, RD, ForDefinition);

	// Use pointer alignment for the vtable. Otherwise we would align them based
	// on the size of the initializer which doesn't make sense as only single
	// values are read.
	unsigned PAlign = CGM.getTarget().getPointerAlign(0);
	VTable->setAlignment(getContext().toCharUnitsFromBits(PAlign).getQuantity());

	// If this is the magic class __cxxabiv1::__fundamental_type_info,
	// we will emit the typeinfo for the fundamental types. This is the
	// same behaviour as GCC.
	const DeclContext *DC = RD->getDeclContext();
	if (RD->getIdentifier() &&
	RD->getIdentifier()->isStr("__fundamental_type_info") &&
	isa<NamespaceDecl>(DC) && cast<NamespaceDecl>(DC)->getIdentifier() &&
	cast<NamespaceDecl>(DC)->getIdentifier()->isStr("__cxxabiv1") &&
	DC->getParent()->isTranslationUnit())
	EmitFundamentalRTTIDescriptors(RD->hasAttr<DLLExportAttr>());

	// Type metadata is only attached when the vtable is not a mere declaration
	// from the linker's point of view.
	if (!VTable->isDeclarationForLinker())
	CGM.EmitVTableTypeMetadata(VTable, VTLayout);
	}

	/// Return true only when \p Vptr has a nearest virtual base and the
	/// structor currently being emitted needs a VTT parameter.
	bool ItaniumCXXABI::isVirtualOffsetNeededForVTableField(
	    CodeGenFunction &CGF, CodeGenFunction::VPtr Vptr) {
	  // Short-circuit keeps the VTT query from running without a virtual base.
	  return Vptr.NearestVBase != nullptr && NeedsVTTParameter(CGF.CurGD);
	}

	/// Return the vtable address point to install for \p Base while emitting
	/// a constructor or destructor.
	///
	/// When the base has virtual bases (or a nearest virtual base is given) and
	/// the current structor takes a VTT, the address point is loaded from the
	/// VTT; otherwise the constant address point in \p VTableClass's vtable is
	/// returned.
	llvm::Value *ItaniumCXXABI::getVTableAddressPointInStructor(
	    CodeGenFunction &CGF, const CXXRecordDecl *VTableClass, BaseSubobject Base,
	    const CXXRecordDecl *NearestVBase) {

	  if ((Base.getBase()->getNumVBases() || NearestVBase != nullptr) &&
	      NeedsVTTParameter(CGF.CurGD)) {
	    return getVTableAddressPointInStructorWithVTT(CGF, VTableClass, Base,
	                                                  NearestVBase);
	  }
	  return getVTableAddressPoint(Base, VTableClass);
	}

	/// Build the constant address point for \p Base inside the vtable group
	/// of \p VTableClass.
	///
	/// \returns an inbounds constant GEP into the vtable global with indices
	///          {0, VTableIndex, AddressPointIndex}, with the in-range marker on
	///          index 1.
	llvm::Constant *
	ItaniumCXXABI::getVTableAddressPoint(BaseSubobject Base,
	                                     const CXXRecordDecl *VTableClass) {
	  llvm::GlobalValue *VTable = getAddrOfVTable(VTableClass, CharUnits());

	  // Find the appropriate vtable within the vtable group, and the address point
	  // within that vtable.
	  VTableLayout::AddressPointLocation AddressPoint =
	      CGM.getItaniumVTableContext()
	          .getVTableLayout(VTableClass)
	          .getAddressPoint(Base);
	  llvm::Value *Indices[] = {
	    llvm::ConstantInt::get(CGM.Int32Ty, 0),
	    llvm::ConstantInt::get(CGM.Int32Ty, AddressPoint.VTableIndex),
	    llvm::ConstantInt::get(CGM.Int32Ty, AddressPoint.AddressPointIndex),
	  };

	  return llvm::ConstantExpr::getGetElementPtr(VTable->getValueType(), VTable,
	                                              Indices, /*InBounds=*/true,
	                                              /*InRangeIndex=*/1);
	}

	/// Load the vtable address point for \p Base from the VTT of the structor
	/// currently being emitted.
	///
	/// Precondition (asserted): the base has virtual bases or \p NearestVBase
	/// is non-null, and the current structor takes a VTT parameter.
	llvm::Value *ItaniumCXXABI::getVTableAddressPointInStructorWithVTT(
	    CodeGenFunction &CGF, const CXXRecordDecl *VTableClass, BaseSubobject Base,
	    const CXXRecordDecl *NearestVBase) {
	  assert((Base.getBase()->getNumVBases() || NearestVBase != nullptr) &&
	         NeedsVTTParameter(CGF.CurGD) && "This class doesn't have VTT");

	  // Get the secondary vpointer index.
	  uint64_t VirtualPointerIndex =
	      CGM.getVTables().getSecondaryVirtualPointerIndex(VTableClass, Base);

	  /// Load the VTT.
	  llvm::Value *VTT = CGF.LoadCXXVTT();
	  // Index 0 needs no GEP: the VTT pointer already addresses that entry.
	  if (VirtualPointerIndex)
	    VTT = CGF.Builder.CreateConstInBoundsGEP1_64(VTT, VirtualPointerIndex);

	  // And load the address point from the VTT.
	  return CGF.Builder.CreateAlignedLoad(VTT, CGF.getPointerAlign());
	}

	/// Constant-expression variant of the address-point query; it simply
	/// forwards to getVTableAddressPoint with the same arguments.
	llvm::Constant *ItaniumCXXABI::getVTableAddressPointForConstExpr(
	BaseSubobject Base, const CXXRecordDecl *VTableClass) {
	return getVTableAddressPoint(Base, VTableClass);
	}

	/// Return the global variable holding the vtable of \p RD, creating its
	/// declaration on first use and caching it in `VTables`.
	///
	/// \param RD         the class whose vtable is requested.
	/// \param VPtrOffset must be zero (asserted).
	/// \returns the cached or newly created vtable global.
	llvm::GlobalVariable *ItaniumCXXABI::getAddrOfVTable(const CXXRecordDecl *RD,
	                                                     CharUnits VPtrOffset) {
	  assert(VPtrOffset.isZero() && "Itanium ABI only supports zero vptr offsets");

	  // Reference into the cache, so the assignment below fills the entry.
	  llvm::GlobalVariable *&VTable = VTables[RD];
	  if (VTable)
	    return VTable;

	  // Queue up this vtable for possible deferred emission.
	  CGM.addDeferredVTable(RD);

	  SmallString<256> Name;
	  llvm::raw_svector_ostream Out(Name);
	  getMangleContext().mangleCXXVTable(RD, Out);

	  const VTableLayout &VTLayout =
	      CGM.getItaniumVTableContext().getVTableLayout(RD);
	  llvm::Type *VTableType = CGM.getVTables().getVTableType(VTLayout);

	  VTable = CGM.CreateOrReplaceCXXRuntimeVariable(
	      Name, VTableType, llvm::GlobalValue::ExternalLinkage);
	  VTable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
	  CGM.setGlobalVisibility(VTable, RD, NotForDefinition);

	  // Mirror the class's dllimport/dllexport attribute on the vtable.
	  if (RD->hasAttr<DLLImportAttr>())
	    VTable->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
	  else if (RD->hasAttr<DLLExportAttr>())
	    VTable->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);

	  return VTable;
	}

	/// Emit the load of a virtual function pointer from an object's vtable and
	/// wrap it in a CGCallee.
	///
	/// \param CGF  the function currently being emitted.
	/// \param GD   the virtual method to call; canonicalized before use.
	/// \param This address of the object whose vtable is consulted.
	/// \param Ty   LLVM type of the function; the vtable is viewed as an array
	///             of pointers to this type.
	/// \param Loc  source location forwarded to the type-metadata check.
	/// \returns a callee built from the method declaration and the loaded
	///          function pointer.
	CGCallee ItaniumCXXABI::getVirtualFunctionPointer(CodeGenFunction &CGF,
	GlobalDecl GD,
	Address This,
	llvm::Type *Ty,
	SourceLocation Loc) {
	GD = GD.getCanonicalDecl();
	Ty = Ty->getPointerTo()->getPointerTo();
	auto *MethodDecl = cast<CXXMethodDecl>(GD.getDecl());
	llvm::Value *VTable = CGF.GetVTablePtr(This, Ty, MethodDecl->getParent());

	uint64_t VTableIndex = CGM.getItaniumVTableContext().getMethodVTableIndex(GD);
	llvm::Value *VFunc;
	if (CGF.ShouldEmitVTableTypeCheckedLoad(MethodDecl->getParent())) {
	// The checked load takes a byte offset: slot index times pointer width in
	// bits, divided by 8.
	VFunc = CGF.EmitVTableTypeCheckedLoad(
	MethodDecl->getParent(), VTable,
	VTableIndex * CGM.getContext().getTargetInfo().getPointerWidth(0) / 8);
	} else {
	CGF.EmitTypeMetadataCodeForVCall(MethodDecl->getParent(), VTable, Loc);

	llvm::Value *VFuncPtr =
	CGF.Builder.CreateConstInBoundsGEP1_64(VTable, VTableIndex, "vfn");
	auto *VFuncLoad =
	CGF.Builder.CreateAlignedLoad(VFuncPtr, CGF.getPointerAlign());

	// Add !invariant.load md to virtual function load to indicate that
	// function didn't change inside vtable.
	// It's safe to add it without -fstrict-vtable-pointers, but it would not
	// help in devirtualization because it will only matter if we will have 2
	// the same virtual function loads from the same vtable load, which won't
	// happen without enabled devirtualization with -fstrict-vtable-pointers.
	if (CGM.getCodeGenOpts().OptimizationLevel > 0 &&
	CGM.getCodeGenOpts().StrictVTablePointers)
	VFuncLoad->setMetadata(
	llvm::LLVMContext::MD_invariant_load,
	llvm::MDNode::get(CGM.getLLVMContext(),
	llvm::ArrayRef<llvm::Metadata *>()));
	VFunc = VFuncLoad;
	}

	CGCallee Callee(MethodDecl, VFunc);
	return Callee;
	}

	/// Emit a virtual call to a destructor through the object's vtable.
	///
	/// \param CGF      the function currently being emitted.
	/// \param Dtor     the destructor to call.
	/// \param DtorType must be Dtor_Deleting or Dtor_Complete (asserted).
	/// \param This     address of the object being destroyed.
	/// \param CE       the originating member call expression, or null; when
	///                 non-null it must have no arguments (asserted) and
	///                 supplies the source location.
	/// \returns always nullptr.
	llvm::Value *ItaniumCXXABI::EmitVirtualDestructorCall(
	    CodeGenFunction &CGF, const CXXDestructorDecl *Dtor, CXXDtorType DtorType,
	    Address This, const CXXMemberCallExpr *CE) {
	  assert(CE == nullptr || CE->arg_begin() == CE->arg_end());
	  assert(DtorType == Dtor_Deleting || DtorType == Dtor_Complete);

	  const CGFunctionInfo *FInfo = &CGM.getTypes().arrangeCXXStructorDeclaration(
	      Dtor, getFromDtorType(DtorType));
	  llvm::Type *Ty = CGF.CGM.getTypes().GetFunctionType(*FInfo);
	  CGCallee Callee =
	      getVirtualFunctionPointer(CGF, GlobalDecl(Dtor, DtorType), This, Ty,
	                                CE ? CE->getLocStart() : SourceLocation());

	  CGF.EmitCXXMemberOrOperatorCall(Dtor, Callee, ReturnValueSlot(),
	                                  This.getPointer(), /*ImplicitParam=*/nullptr,
	                                  QualType(), CE, nullptr);
	  return nullptr;
	}

	void ItaniumCXXABI::emitVirtualInheritanceTables(const CXXRecordDecl *RD) {
	CodeGenVTables &VTables = CGM.getVTables();
	llvm::GlobalVariable *VTT = VTables.GetAddrOfVTT(RD);
	VTables.EmitVTTDefinition(VTT, CGM.getVTableLinkage(RD), RD);
	}

	/// Decide whether an available_externally copy of \p RD's vtable may be
	/// emitted.
	///
	/// \returns false in -fapple-kext mode, false when the class has an unused
	///          inline virtual function, and otherwise whether the vtable is
	///          not hidden.
	bool ItaniumCXXABI::canSpeculativelyEmitVTable(const CXXRecordDecl *RD) const {
	  // Kext mode does not permit devirtualization, so no speculative vtables.
	  if (CGM.getLangOpts().AppleKext)
	    return false;

	  // An inline virtual function that was not emitted rules the copy out.
	  // FIXME we can still emit a copy of the vtable if we
	  // can emit definition of the inline functions.
	  if (hasAnyUnusedVirtualInlineFunction(RD))
	    return false;

	  // Otherwise the copy is safe as long as the vtable is not hidden.
	  return !isVTableHidden(RD);
	}
	/// Emit the pointer adjustment shared by 'this' and return-value
	/// adjustments.
	///
	/// \param CGF                  the function currently being emitted.
	/// \param InitialPtr           the pointer to adjust.
	/// \param NonVirtualAdjustment constant byte offset to add (0 for none).
	/// \param VirtualAdjustment    byte offset, relative to the vtable pointer,
	///                             of the slot holding a further offset to add
	///                             (0 for none).
	/// \param IsReturnAdjustment   selects the ordering: when false the
	///                             non-virtual part is applied before the
	///                             virtual part, when true it is applied after.
	/// \returns the adjusted pointer, cast back to the type of \p InitialPtr;
	///          \p InitialPtr's pointer unchanged when both adjustments are 0.
	static llvm::Value *performTypeAdjustment(CodeGenFunction &CGF,
	Address InitialPtr,
	int64_t NonVirtualAdjustment,
	int64_t VirtualAdjustment,
	bool IsReturnAdjustment) {
	if (!NonVirtualAdjustment && !VirtualAdjustment)
	return InitialPtr.getPointer();

	// Work on an i8 view so the offsets below are in bytes.
	Address V = CGF.Builder.CreateElementBitCast(InitialPtr, CGF.Int8Ty);

	// In a base-to-derived cast, the non-virtual adjustment is applied first.
	if (NonVirtualAdjustment && !IsReturnAdjustment) {
	V = CGF.Builder.CreateConstInBoundsByteGEP(V,
	CharUnits::fromQuantity(NonVirtualAdjustment));
	}

	// Perform the virtual adjustment if we have one.
	llvm::Value *ResultPtr;
	if (VirtualAdjustment) {
	llvm::Type *PtrDiffTy =
	CGF.ConvertType(CGF.getContext().getPointerDiffType());

	// Read the vtable pointer stored at the (possibly adjusted) address.
	Address VTablePtrPtr = CGF.Builder.CreateElementBitCast(V, CGF.Int8PtrTy);
	llvm::Value *VTablePtr = CGF.Builder.CreateLoad(VTablePtrPtr);

	llvm::Value *OffsetPtr =
	CGF.Builder.CreateConstInBoundsGEP1_64(VTablePtr, VirtualAdjustment);

	OffsetPtr = CGF.Builder.CreateBitCast(OffsetPtr, PtrDiffTy->getPointerTo());

	// Load the adjustment offset from the vtable.
	llvm::Value *Offset =
	CGF.Builder.CreateAlignedLoad(OffsetPtr, CGF.getPointerAlign());

	// Adjust our pointer.
	ResultPtr = CGF.Builder.CreateInBoundsGEP(V.getPointer(), Offset);
	} else {
	ResultPtr = V.getPointer();
	}

	// In a derived-to-base conversion, the non-virtual adjustment is
	// applied second.
	if (NonVirtualAdjustment && IsReturnAdjustment) {
	ResultPtr = CGF.Builder.CreateConstInBoundsGEP1_64(ResultPtr,
	NonVirtualAdjustment);
	}

	// Cast back to the original type.
	return CGF.Builder.CreateBitCast(ResultPtr, InitialPtr.getType());
	}

	/// Apply a 'this' adjustment: forwards the non-virtual offset and the
	/// Itanium vcall-offset offset from \p TA to performTypeAdjustment, with
	/// the non-virtual part applied first.
	llvm::Value *ItaniumCXXABI::performThisAdjustment(CodeGenFunction &CGF,
	                                                  Address This,
	                                                  const ThisAdjustment &TA) {
	  return performTypeAdjustment(CGF, This, TA.NonVirtual,
	                               TA.Virtual.Itanium.VCallOffsetOffset,
	                               /*IsReturnAdjustment=*/false);
	}

	/// Apply a return-value adjustment: forwards the non-virtual offset and
	/// the Itanium vbase-offset offset from \p RA to performTypeAdjustment,
	/// with the non-virtual part applied last.
	llvm::Value *
	ItaniumCXXABI::performReturnAdjustment(CodeGenFunction &CGF, Address Ret,
	                                       const ReturnAdjustment &RA) {
	  return performTypeAdjustment(CGF, Ret, RA.NonVirtual,
	                               RA.Virtual.Itanium.VBaseOffsetOffset,
	                               /*IsReturnAdjustment=*/true);
	}

	/// Emit the return of a thunk.  For destructor thunks the returned value
	/// is replaced by an undef of the return slot's element type; every other
	/// thunk is handled by the Itanium implementation unchanged.
	void ARMCXXABI::EmitReturnFromThunk(CodeGenFunction &CGF,
	                                    RValue RV, QualType ResultType) {
	  if (isa<CXXDestructorDecl>(CGF.CurGD.getDecl())) {
	    // Destructor thunks in the ARM ABI have indeterminate results.
	    llvm::Type *RetSlotTy = CGF.ReturnValue.getElementType();
	    RValue UndefRV = RValue::get(llvm::UndefValue::get(RetSlotTy));
	    return ItaniumCXXABI::EmitReturnFromThunk(CGF, UndefRV, ResultType);
	  }

	  return ItaniumCXXABI::EmitReturnFromThunk(CGF, RV, ResultType);
	}

	/************************ Array allocation cookies ************************/

	/// Size of the array cookie for arrays of \p elementType: the larger of
	/// the size of size_t and the element type's alignment.
	CharUnits ItaniumCXXABI::getArrayCookieSizeImpl(QualType elementType) {
	  // The cookie holds a size_t, padded up to the element alignment; the
	  // value itself sits right-justified in that space.
	  CharUnits SizeTBytes = CharUnits::fromQuantity(CGM.SizeSizeInBytes);
	  CharUnits ElemAlign = CGM.getContext().getTypeAlignInChars(elementType);
	  return std::max(SizeTBytes, ElemAlign);
	}

	/// Write the array cookie (the element count) at the start of a newly
	/// allocated array and return the address of the array data after it.
	///
	/// \param CGF         the function currently being emitted.
	/// \param NewPtr      start of the allocation.
	/// \param NumElements element count to record in the cookie.
	/// \param expr        the new-expression; must require a cookie (asserted).
	/// \param ElementType element type, used to size the cookie.
	/// \returns \p NewPtr advanced by the cookie size.
	Address ItaniumCXXABI::InitializeArrayCookie(CodeGenFunction &CGF,
	Address NewPtr,
	llvm::Value *NumElements,
	const CXXNewExpr *expr,
	QualType ElementType) {
	assert(requiresArrayCookie(expr));

	unsigned AS = NewPtr.getAddressSpace();

	ASTContext &Ctx = getContext();
	CharUnits SizeSize = CGF.getSizeSize();

	// The size of the cookie.
	CharUnits CookieSize =
	std::max(SizeSize, Ctx.getTypeAlignInChars(ElementType));
	assert(CookieSize == getArrayCookieSizeImpl(ElementType));

	// Compute an offset to the cookie.
	// The count occupies the last SizeSize bytes of the cookie area.
	Address CookiePtr = NewPtr;
	CharUnits CookieOffset = CookieSize - SizeSize;
	if (!CookieOffset.isZero())
	CookiePtr = CGF.Builder.CreateConstInBoundsByteGEP(CookiePtr, CookieOffset);

	// Write the number of elements into the appropriate slot.
	Address NumElementsPtr =
	CGF.Builder.CreateElementBitCast(CookiePtr, CGF.SizeTy);
	llvm::Instruction *SI = CGF.Builder.CreateStore(NumElements, NumElementsPtr);

	// Handle the array cookie specially in ASan.
	// Only done in address space 0 and when the allocation function is a
	// replaceable global operator new.
	if (CGM.getLangOpts().Sanitize.has(SanitizerKind::Address) && AS == 0 &&
	expr->getOperatorNew()->isReplaceableGlobalAllocationFunction()) {
	// The store to the CookiePtr does not need to be instrumented.
	CGM.getSanitizerMetadata()->disableSanitizerForInstruction(SI);
	llvm::FunctionType *FTy =
	llvm::FunctionType::get(CGM.VoidTy, NumElementsPtr.getType(), false);
	llvm::Constant *F =
	CGM.CreateRuntimeFunction(FTy, "__asan_poison_cxx_array_cookie");
	CGF.Builder.CreateCall(F, NumElementsPtr.getPointer());
	}

	// Finally, compute a pointer to the actual data buffer by skipping
	// over the cookie completely.
	return CGF.Builder.CreateConstInBoundsByteGEP(NewPtr, CookieSize);
	}

	/// Emit code that reads the element count out of an array cookie.
	///
	/// \param CGF        the function currently being emitted.
	/// \param allocPtr   start of the allocation (where the cookie begins).
	/// \param cookieSize total size of the cookie.
	/// \returns a plain load of the count, or, when AddressSanitizer is on
	///          and the pointer is in address space 0, the result of calling
	///          `__asan_load_cxx_array_cookie`.
	llvm::Value *ItaniumCXXABI::readArrayCookieImpl(CodeGenFunction &CGF,
	                                                Address allocPtr,
	                                                CharUnits cookieSize) {
	  // The element size is right-justified in the cookie.
	  Address numElementsPtr = allocPtr;
	  CharUnits numElementsOffset = cookieSize - CGF.getSizeSize();
	  if (!numElementsOffset.isZero())
	    numElementsPtr =
	      CGF.Builder.CreateConstInBoundsByteGEP(numElementsPtr, numElementsOffset);

	  unsigned AS = allocPtr.getAddressSpace();
	  numElementsPtr = CGF.Builder.CreateElementBitCast(numElementsPtr, CGF.SizeTy);
	  if (!CGM.getLangOpts().Sanitize.has(SanitizerKind::Address) || AS != 0)
	    return CGF.Builder.CreateLoad(numElementsPtr);
	  // In asan mode emit a function call instead of a regular load and let the
	  // run-time deal with it: if the shadow is properly poisoned return the
	  // cookie, otherwise return 0 to avoid an infinite loop calling DTORs.
	  // We can't simply ignore this load using nosanitize metadata because
	  // the metadata may be lost.
	  llvm::FunctionType *FTy =
	      llvm::FunctionType::get(CGF.SizeTy, CGF.SizeTy->getPointerTo(0), false);
	  llvm::Constant *F =
	      CGM.CreateRuntimeFunction(FTy, "__asan_load_cxx_array_cookie");
	  return CGF.Builder.CreateCall(F, numElementsPtr.getPointer());
	}

	/// Size of the ARM array cookie for arrays of \p elementType: the larger
	/// of two size_t widths and the element type's alignment.
	CharUnits ARMCXXABI::getArrayCookieSizeImpl(QualType elementType) {
	  // ARM says that the cookie is always:
	  //   struct array_cookie {
	  //     std::size_t element_size; // element_size != 0
	  //     std::size_t element_count;
	  //   };
	  // But the base ABI doesn't give anything an alignment greater than
	  // 8, so we can dismiss this as typical ABI-author blindness to
	  // actual language complexity and round up to the element alignment.
	  CharUnits TwoSizeTs = CharUnits::fromQuantity(2 * CGM.SizeSizeInBytes);
	  CharUnits ElemAlign = CGM.getContext().getTypeAlignInChars(elementType);
	  return std::max(TwoSizeTs, ElemAlign);
	}

	/// Write the ARM array cookie (element size followed by element count) at
	/// the start of a newly allocated array and return the address of the
	/// array data after it.
	///
	/// \param CGF         the function currently being emitted.
	/// \param newPtr      start of the allocation.
	/// \param numElements element count to record.
	/// \param expr        the new-expression; must require a cookie (asserted).
	/// \param elementType element type; its size is stored in the cookie.
	/// \returns \p newPtr advanced by the ARM cookie size.
	Address ARMCXXABI::InitializeArrayCookie(CodeGenFunction &CGF,
	Address newPtr,
	llvm::Value *numElements,
	const CXXNewExpr *expr,
	QualType elementType) {
	assert(requiresArrayCookie(expr));

	// The cookie is always at the start of the buffer.
	Address cookie = newPtr;

	// The first element is the element size.
	cookie = CGF.Builder.CreateElementBitCast(cookie, CGF.SizeTy);
	llvm::Value *elementSize = llvm::ConstantInt::get(CGF.SizeTy,
	getContext().getTypeSizeInChars(elementType).getQuantity());
	CGF.Builder.CreateStore(elementSize, cookie);

	// The second element is the element count.
	cookie = CGF.Builder.CreateConstInBoundsGEP(cookie, 1, CGF.getSizeSize());
	CGF.Builder.CreateStore(numElements, cookie);

	// Finally, compute a pointer to the actual data buffer by skipping
	// over the cookie completely.
	CharUnits cookieSize = ARMCXXABI::getArrayCookieSizeImpl(elementType);
	return CGF.Builder.CreateConstInBoundsByteGEP(newPtr, cookieSize);
	}

	/// Emit a load of the element count from an ARM array cookie.
	///
	/// \param CGF        the function currently being emitted.
	/// \param allocPtr   start of the allocation (where the cookie begins).
	/// \param cookieSize not used by this implementation.
	/// \returns the loaded size_t-typed element count.
	llvm::Value *ARMCXXABI::readArrayCookieImpl(CodeGenFunction &CGF,
	                                            Address allocPtr,
	                                            CharUnits cookieSize) {
	  // The count lives one size_t past the start of the allocation.
	  Address CountAddr =
	      CGF.Builder.CreateConstInBoundsByteGEP(allocPtr, CGF.getSizeSize());

	  // View that location as a size_t and load it.
	  CountAddr = CGF.Builder.CreateElementBitCast(CountAddr, CGF.SizeTy);
	  return CGF.Builder.CreateLoad(CountAddr);
	}

	/********************* Static local initialization ************************/

	/// Return the declaration of the runtime function `__cxa_guard_acquire`,
	/// typed as taking a \p GuardPtrTy and returning the target's `int`, and
	/// marked nounwind.
	static llvm::Constant *getGuardAcquireFn(CodeGenModule &CGM,
	                                         llvm::PointerType *GuardPtrTy) {
	  // int __cxa_guard_acquire(__guard *guard_object);
	  llvm::FunctionType *FTy =
	      llvm::FunctionType::get(CGM.getTypes().ConvertType(CGM.getContext().IntTy),
	                              GuardPtrTy, /*isVarArg=*/false);
	  return CGM.CreateRuntimeFunction(
	      FTy, "__cxa_guard_acquire",
	      llvm::AttributeList::get(CGM.getLLVMContext(),
	                               llvm::AttributeList::FunctionIndex,
	                               llvm::Attribute::NoUnwind));
	}

	/// Return the declaration of the runtime function `__cxa_guard_release`,
	/// typed as a void function taking a \p GuardPtrTy, and marked nounwind.
	static llvm::Constant *getGuardReleaseFn(CodeGenModule &CGM,
	                                         llvm::PointerType *GuardPtrTy) {
	  // void __cxa_guard_release(__guard *guard_object);
	  llvm::FunctionType *FTy =
	      llvm::FunctionType::get(CGM.VoidTy, GuardPtrTy, /*isVarArg=*/false);
	  return CGM.CreateRuntimeFunction(
	      FTy, "__cxa_guard_release",
	      llvm::AttributeList::get(CGM.getLLVMContext(),
	                               llvm::AttributeList::FunctionIndex,
	                               llvm::Attribute::NoUnwind));
	}

	/// Return the declaration of the runtime function `__cxa_guard_abort`,
	/// typed as a void function taking a \p GuardPtrTy, and marked nounwind.
	static llvm::Constant *getGuardAbortFn(CodeGenModule &CGM,
	                                       llvm::PointerType *GuardPtrTy) {
	  // void __cxa_guard_abort(__guard *guard_object);
	  llvm::FunctionType *FTy =
	      llvm::FunctionType::get(CGM.VoidTy, GuardPtrTy, /*isVarArg=*/false);
	  return CGM.CreateRuntimeFunction(
	      FTy, "__cxa_guard_abort",
	      llvm::AttributeList::get(CGM.getLLVMContext(),
	                               llvm::AttributeList::FunctionIndex,
	                               llvm::Attribute::NoUnwind));
	}

	namespace {
	/// Cleanup that, when emitted, calls `__cxa_guard_abort` on the guard
	/// variable it was constructed with.
	struct CallGuardAbort final : EHScopeStack::Cleanup {
	// The guard variable passed to the runtime call.
	llvm::GlobalVariable *Guard;
	CallGuardAbort(llvm::GlobalVariable *Guard) : Guard(Guard) {}

	// Emits a nounwind runtime call; the guard's own pointer type is used as
	// the parameter type of the declared function.  'flags' is not consulted.
	void Emit(CodeGenFunction &CGF, Flags flags) override {
	CGF.EmitNounwindRuntimeCall(getGuardAbortFn(CGF.CGM, Guard->getType()),
	Guard);
	}
	};
	}

	/// The ARM code here follows the Itanium code closely enough that we
	/// just special-case it at particular places.
	///
	/// Emits a guarded initialization of \p var: loads the first byte of the
	/// guard variable, branches to the initializer when it is zero, and marks
	/// the guard afterwards (via __cxa_guard_release when thread-safe statics are
	/// in use, or a plain store of 1 otherwise).
	///
	/// \param CGF               function being emitted into.
	/// \param D                 declaration of the guarded variable.
	/// \param var               the LLVM global holding the guarded variable.
	/// \param shouldPerformInit forwarded to EmitCXXGlobalVarDeclInit.
	void ItaniumCXXABI::EmitGuardedInit(CodeGenFunction &CGF,
	                                    const VarDecl &D,
	                                    llvm::GlobalVariable *var,
	                                    bool shouldPerformInit) {
	  CGBuilderTy &Builder = CGF.Builder;

	  // Inline variables that weren't instantiated from variable templates have
	  // partially-ordered initialization within their translation unit.
	  bool NonTemplateInline =
	      D.isInline() &&
	      !isTemplateInstantiation(D.getTemplateSpecializationKind());

	  // We only need to use thread-safe statics for local non-TLS variables and
	  // inline variables; other global initialization is always single-threaded
	  // or (through lazy dynamic loading in multiple threads) unsequenced.
	  bool threadsafe = getContext().getLangOpts().ThreadsafeStatics &&
	                    (D.isLocalVarDecl() || NonTemplateInline) &&
	                    !D.getTLSKind();

	  // If we have a global variable with internal linkage and thread-safe statics
	  // are disabled, we can just let the guard variable be of type i8.
	  bool useInt8GuardVariable = !threadsafe && var->hasInternalLinkage();

	  llvm::IntegerType *guardTy;
	  CharUnits guardAlignment;
	  if (useInt8GuardVariable) {
	    guardTy = CGF.Int8Ty;
	    guardAlignment = CharUnits::One();
	  } else {
	    // Guard variables are 64 bits in the generic ABI and size width on ARM
	    // (i.e. 32-bit on AArch32, 64-bit on AArch64).
	    if (UseARMGuardVarABI) {
	      guardTy = CGF.SizeTy;
	      guardAlignment = CGF.getSizeAlign();
	    } else {
	      guardTy = CGF.Int64Ty;
	      guardAlignment = CharUnits::fromQuantity(
	          CGM.getDataLayout().getABITypeAlignment(guardTy));
	    }
	  }
	  llvm::PointerType *guardPtrTy = guardTy->getPointerTo();

	  // Create the guard variable if we don't already have it (as we
	  // might if we're double-emitting this function body).
	  llvm::GlobalVariable *guard = CGM.getStaticLocalDeclGuardAddress(&D);
	  if (!guard) {
	    // Mangle the name for the guard.
	    SmallString<256> guardName;
	    {
	      llvm::raw_svector_ostream out(guardName);
	      getMangleContext().mangleStaticGuardVariable(&D, out);
	    }

	    // Create the guard variable with a zero-initializer.
	    // Just absorb linkage and visibility from the guarded variable.
	    guard = new llvm::GlobalVariable(CGM.getModule(), guardTy,
	                                     false, var->getLinkage(),
	                                     llvm::ConstantInt::get(guardTy, 0),
	                                     guardName.str());
	    guard->setVisibility(var->getVisibility());
	    // If the variable is thread-local, so is its guard variable.
	    guard->setThreadLocalMode(var->getThreadLocalMode());
	    guard->setAlignment(guardAlignment.getQuantity());

	    // The ABI says: "It is suggested that it be emitted in the same COMDAT
	    // group as the associated data object." In practice, this doesn't work for
	    // non-ELF and non-Wasm object formats, so only do it for ELF and Wasm.
	    llvm::Comdat *C = var->getComdat();
	    if (!D.isLocalVarDecl() && C &&
	        (CGM.getTarget().getTriple().isOSBinFormatELF() ||
	         CGM.getTarget().getTriple().isOSBinFormatWasm())) {
	      guard->setComdat(C);
	      // An inline variable's guard function is run from the per-TU
	      // initialization function, not via a dedicated global ctor function, so
	      // we can't put it in a comdat.
	      if (!NonTemplateInline)
	        CGF.CurFn->setComdat(C);
	    } else if (CGM.supportsCOMDAT() && guard->isWeakForLinker()) {
	      guard->setComdat(CGM.getModule().getOrInsertComdat(guard->getName()));
	    }

	    CGM.setStaticLocalDeclGuardAddress(&D, guard);
	  }

	  Address guardAddr = Address(guard, guardAlignment);

	  // Test whether the variable has completed initialization.
	  //
	  // Itanium C++ ABI 3.3.2:
	  //   The following is pseudo-code showing how these functions can be used:
	  //     if (obj_guard.first_byte == 0) {
	  //       if ( __cxa_guard_acquire (&obj_guard) ) {
	  //         try {
	  //           ... initialize the object ...;
	  //         } catch (...) {
	  //            __cxa_guard_abort (&obj_guard);
	  //            throw;
	  //         }
	  //         ... queue object destructor with __cxa_atexit() ...;
	  //         __cxa_guard_release (&obj_guard);
	  //       }
	  //     }

	  // Load the first byte of the guard variable.
	  llvm::LoadInst *LI =
	      Builder.CreateLoad(Builder.CreateElementBitCast(guardAddr, CGM.Int8Ty));

	  // Itanium ABI:
	  //   An implementation supporting thread-safety on multiprocessor
	  //   systems must also guarantee that references to the initialized
	  //   object do not occur before the load of the initialization flag.
	  //
	  // In LLVM, we do this by marking the load Acquire.
	  if (threadsafe)
	    LI->setAtomic(llvm::AtomicOrdering::Acquire);

	  // For ARM, we should only check the first bit, rather than the entire byte:
	  //
	  // ARM C++ ABI 3.2.3.1:
	  //   To support the potential use of initialization guard variables
	  //   as semaphores that are the target of ARM SWP and LDREX/STREX
	  //   synchronizing instructions we define a static initialization
	  //   guard variable to be a 4-byte aligned, 4-byte word with the
	  //   following inline access protocol.
	  //     #define INITIALIZED 1
	  //     if ((obj_guard & INITIALIZED) != INITIALIZED) {
	  //       if (__cxa_guard_acquire(&obj_guard))
	  //         ...
	  //     }
	  //
	  // and similarly for ARM64:
	  //
	  // ARM64 C++ ABI 3.2.2:
	  //   This ABI instead only specifies the value bit 0 of the static guard
	  //   variable; all other bits are platform defined. Bit 0 shall be 0 when the
	  //   variable is not initialized and 1 when it is.
	  llvm::Value *V =
	      (UseARMGuardVarABI && !useInt8GuardVariable)
	          ? Builder.CreateAnd(LI, llvm::ConstantInt::get(CGM.Int8Ty, 1))
	          : LI;
	  llvm::Value *NeedsInit = Builder.CreateIsNull(V, "guard.uninitialized");

	  llvm::BasicBlock *InitCheckBlock = CGF.createBasicBlock("init.check");
	  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("init.end");

	  // Check if the first byte of the guard variable is zero.
	  CGF.EmitCXXGuardedInitBranch(NeedsInit, InitCheckBlock, EndBlock,
	                               CodeGenFunction::GuardKind::VariableGuard, &D);

	  CGF.EmitBlock(InitCheckBlock);

	  // Variables used when coping with thread-safe statics and exceptions.
	  if (threadsafe) {
	    // Call __cxa_guard_acquire.
	    llvm::Value *V
	      = CGF.EmitNounwindRuntimeCall(getGuardAcquireFn(CGM, guardPtrTy), guard);

	    llvm::BasicBlock *InitBlock = CGF.createBasicBlock("init");

	    // A zero result from __cxa_guard_acquire skips the initializer.
	    Builder.CreateCondBr(Builder.CreateIsNotNull(V, "tobool"),
	                         InitBlock, EndBlock);

	    // Call __cxa_guard_abort along the exceptional edge.
	    CGF.EHStack.pushCleanup<CallGuardAbort>(EHCleanup, guard);

	    CGF.EmitBlock(InitBlock);
	  }

	  // Emit the initializer and add a global destructor if appropriate.
	  CGF.EmitCXXGlobalVarDeclInit(D, var, shouldPerformInit);

	  if (threadsafe) {
	    // Pop the guard-abort cleanup if we pushed one.
	    CGF.PopCleanupBlock();

	    // Call __cxa_guard_release.  This cannot throw.
	    CGF.EmitNounwindRuntimeCall(getGuardReleaseFn(CGM, guardPtrTy),
	                                guardAddr.getPointer());
	  } else {
	    // Without thread-safe statics the guard is simply set to 1.
	    Builder.CreateStore(llvm::ConstantInt::get(guardTy, 1), guardAddr);
	  }

	  CGF.EmitBlock(EndBlock);
	}

	/// Emit a nounwind call that registers \p dtor for \p addr with the atexit
	/// runtime entry point: "__cxa_atexit" normally, or, when \p TLS is set,
	/// "_tlv_atexit" on Darwin and "__cxa_thread_atexit" elsewhere.
	static void emitGlobalDtorWithCXAAtExit(CodeGenFunction &CGF,
	                                        llvm::Constant *dtor,
	                                        llvm::Constant *addr,
	                                        bool TLS) {
	  // Select the registration function by TLS-ness and target OS.
	  const char *RegisterFnName;
	  if (!TLS)
	    RegisterFnName = "__cxa_atexit";
	  else if (CGF.getTarget().getTriple().isOSDarwin())
	    RegisterFnName = "_tlv_atexit";
	  else
	    RegisterFnName = "__cxa_thread_atexit";

	  // We're assuming that the destructor function is something we can
	  // reasonably call with the default CC.  Go ahead and cast it to the
	  // right prototype: void (*)(void *).
	  llvm::FunctionType *DtorFnTy =
	      llvm::FunctionType::get(CGF.VoidTy, CGF.Int8PtrTy, false);
	  llvm::Type *DtorPtrTy = DtorFnTy->getPointerTo();

	  // extern "C" int __cxa_atexit(void (*f)(void *), void *p, void *d);
	  llvm::Type *RegisterParamTys[] = { DtorPtrTy, CGF.Int8PtrTy, CGF.Int8PtrTy };
	  llvm::FunctionType *RegisterFnTy =
	      llvm::FunctionType::get(CGF.IntTy, RegisterParamTys, false);

	  // Fetch the actual function and mark it nothrow when it is a plain
	  // llvm::Function.
	  llvm::Constant *RegisterFn =
	      CGF.CGM.CreateRuntimeFunction(RegisterFnTy, RegisterFnName);
	  if (llvm::Function *AsFunction = dyn_cast<llvm::Function>(RegisterFn))
	    AsFunction->setDoesNotThrow();

	  // Create a variable that binds the atexit to this shared object.
	  llvm::Constant *DsoHandle =
	      CGF.CGM.CreateRuntimeVariable(CGF.Int8Ty, "__dso_handle");
	  auto *DsoHandleGV = cast<llvm::GlobalValue>(DsoHandle->stripPointerCasts());
	  DsoHandleGV->setVisibility(llvm::GlobalValue::HiddenVisibility);

	  llvm::Value *CallArgs[] = {
	    llvm::ConstantExpr::getBitCast(dtor, DtorPtrTy),
	    llvm::ConstantExpr::getBitCast(addr, CGF.Int8PtrTy),
	    DsoHandle
	  };
	  CGF.EmitNounwindRuntimeCall(RegisterFn, CallArgs);
	}

	/// Register the destructor \p dtor for the object at \p addr using the best
	/// mechanism available: __cxa_atexit when enabled, a global destructor entry
	/// for Apple kexts, and plain atexit registration otherwise.
	void ItaniumCXXABI::registerGlobalDtor(CodeGenFunction &CGF,
	                                       const VarDecl &D,
	                                       llvm::Constant *dtor,
	                                       llvm::Constant *addr) {
	  // Preferred path: __cxa_atexit (or its thread-local counterpart).
	  if (CGM.getCodeGenOpts().CXAAtExit) {
	    emitGlobalDtorWithCXAAtExit(CGF, dtor, addr, D.getTLSKind());
	    return;
	  }

	  // The remaining mechanisms cannot destroy thread-local objects.
	  if (D.getTLSKind())
	    CGM.ErrorUnsupported(&D, "non-trivial TLS destruction");

	  // In Apple kexts, we want to add a global destructor entry.
	  // FIXME: shouldn't this be guarded by some variable?
	  if (!CGM.getLangOpts().AppleKext) {
	    CGF.registerGlobalDtorWithAtExit(D, dtor, addr);
	    return;
	  }

	  // Generate a global destructor entry.
	  CGM.AddCXXDtorEntry(dtor, addr);
	}

	/// Returns true when \p VD is a dynamically-initialized thread-local variable
	/// (VarDecl::TLS_Dynamic) and the target OS is Darwin. Must not be called for
	/// static local variables.
	static bool isThreadWrapperReplaceable(const VarDecl *VD,
	                                       CodeGen::CodeGenModule &CGM) {
	  assert(!VD->isStaticLocal() && "static local VarDecls don't need wrappers!");

	  if (VD->getTLSKind() != VarDecl::TLS_Dynamic)
	    return false;

	  // Darwin prefers to have references to thread local variables to go through
	  // the thread wrapper instead of directly referencing the backing variable.
	  return CGM.getTarget().getTriple().isOSDarwin();
	}

	/// Get the appropriate linkage for the wrapper function. This is essentially
	/// the weak form of the variable's linkage; every translation unit which needs
	/// the wrapper emits a copy, and we want the linker to merge them.
	///
	/// \returns the variable's own linkage when it is local, or when the wrapper
	///          is replaceable and the variable is neither linkonce nor weak_odr;
	///          WeakODRLinkage in every other case.
	static llvm::GlobalValue::LinkageTypes
	getThreadLocalWrapperLinkage(const VarDecl *VD, CodeGen::CodeGenModule &CGM) {
	  llvm::GlobalValue::LinkageTypes VarLinkage =
	      CGM.getLLVMLinkageVarDefinition(VD, /*isConstant=*/false);

	  // For internal linkage variables, we don't need an external or weak wrapper.
	  if (llvm::GlobalValue::isLocalLinkage(VarLinkage))
	    return VarLinkage;

	  // If the thread wrapper is replaceable, give it appropriate linkage.
	  if (isThreadWrapperReplaceable(VD, CGM)) {
	    if (!llvm::GlobalVariable::isLinkOnceLinkage(VarLinkage) &&
	        !llvm::GlobalVariable::isWeakODRLinkage(VarLinkage))
	      return VarLinkage;
	  }
	  return llvm::GlobalValue::WeakODRLinkage;
	}

	/// Return the thread_local wrapper function for \p VD, creating it in the
	/// module when no value with the mangled wrapper name exists yet. The wrapper
	/// takes no arguments and returns a pointer to the variable's type with any
	/// reference stripped. \p Val is not used by this function.
	llvm::Function *
	ItaniumCXXABI::getOrCreateThreadLocalWrapper(const VarDecl *VD,
	                                             llvm::Value *Val) {
	  // Mangle the name for the thread_local wrapper function.
	  SmallString<256> MangledWrapperName;
	  {
	    llvm::raw_svector_ostream NameStream(MangledWrapperName);
	    getMangleContext().mangleItaniumThreadLocalWrapper(VD, NameStream);
	  }

	  // FIXME: If VD is a definition, we should regenerate the function attributes
	  // before returning.
	  if (llvm::Value *Existing = CGM.getModule().getNamedValue(MangledWrapperName))
	    return cast<llvm::Function>(Existing);

	  // The wrapper returns a pointer to the (non-reference) variable type.
	  QualType DeclTy = VD->getType();
	  QualType PointeeTy =
	      DeclTy->isReferenceType() ? DeclTy.getNonReferenceType() : DeclTy;

	  const CGFunctionInfo &WrapperInfo =
	      CGM.getTypes().arrangeBuiltinFunctionDeclaration(
	          getContext().getPointerType(PointeeTy), FunctionArgList());

	  llvm::FunctionType *WrapperTy = CGM.getTypes().GetFunctionType(WrapperInfo);
	  llvm::Function *WrapperFn =
	      llvm::Function::Create(WrapperTy, getThreadLocalWrapperLinkage(VD, CGM),
	                             MangledWrapperName.str(), &CGM.getModule());

	  CGM.SetLLVMFunctionAttributes(nullptr, WrapperInfo, WrapperFn);

	  if (VD->hasDefinition())
	    CGM.SetLLVMFunctionAttributesForDefinition(nullptr, WrapperFn);

	  const bool Replaceable = isThreadWrapperReplaceable(VD, CGM);

	  // Always resolve references to the wrapper at link time, except for a
	  // replaceable wrapper whose linkage is neither linkonce nor weak_odr.
	  const llvm::GlobalValue::LinkageTypes WrapperLinkage = WrapperFn->getLinkage();
	  const bool KeepsVisibility =
	      Replaceable &&
	      !llvm::GlobalVariable::isLinkOnceLinkage(WrapperLinkage) &&
	      !llvm::GlobalVariable::isWeakODRLinkage(WrapperLinkage);
	  if (!WrapperFn->hasLocalLinkage() && !KeepsVisibility)
	    WrapperFn->setVisibility(llvm::GlobalValue::HiddenVisibility);

	  if (Replaceable) {
	    WrapperFn->setCallingConv(llvm::CallingConv::CXX_FAST_TLS);
	    WrapperFn->addFnAttr(llvm::Attribute::NoUnwind);
	  }
	  return WrapperFn;
	}

	/// Emit the per-translation-unit thread_local initialization machinery.
	///
	/// Initializers of non-template-instantiation variables are ordered and are
	/// gathered into a single guarded "__tls_init" function; initializers of
	/// template instantiations are kept per-variable. A thread wrapper body is
	/// then emitted for every variable in \p CXXThreadLocals, calling the
	/// matching init function (if any) and returning the variable's address.
	///
	/// \param CGM                    module being emitted.
	/// \param CXXThreadLocals        variables that need a thread wrapper.
	/// \param CXXThreadLocalInits    initializer functions, indexed in parallel
	///                               with \p CXXThreadLocalInitVars.
	/// \param CXXThreadLocalInitVars variables initialized by the entries of
	///                               \p CXXThreadLocalInits.
	void ItaniumCXXABI::EmitThreadLocalInitFuncs(
	    CodeGenModule &CGM, ArrayRef<const VarDecl *> CXXThreadLocals,
	    ArrayRef<llvm::Function *> CXXThreadLocalInits,
	    ArrayRef<const VarDecl *> CXXThreadLocalInitVars) {
	  llvm::Function *InitFunc = nullptr;

	  // Separate initializers into those with ordered (or partially-ordered)
	  // initialization and those with unordered initialization.
	  llvm::SmallVector<llvm::Function *, 8> OrderedInits;
	  llvm::SmallDenseMap<const VarDecl *, llvm::Function *> UnorderedInits;
	  for (unsigned I = 0; I != CXXThreadLocalInits.size(); ++I) {
	    if (isTemplateInstantiation(
	            CXXThreadLocalInitVars[I]->getTemplateSpecializationKind()))
	      UnorderedInits[CXXThreadLocalInitVars[I]->getCanonicalDecl()] =
	          CXXThreadLocalInits[I];
	    else
	      OrderedInits.push_back(CXXThreadLocalInits[I]);
	  }

	  if (!OrderedInits.empty()) {
	    // Generate a guarded initialization function.
	    llvm::FunctionType *FTy =
	        llvm::FunctionType::get(CGM.VoidTy, /*isVarArg=*/false);
	    const CGFunctionInfo &FI = CGM.getTypes().arrangeNullaryFunction();
	    InitFunc = CGM.CreateGlobalInitOrDestructFunction(FTy, "__tls_init", FI,
	                                                      SourceLocation(),
	                                                      /*TLS=*/true);
	    // The guard is an internal, thread-local i8 initialized to zero.
	    llvm::GlobalVariable *Guard = new llvm::GlobalVariable(
	        CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
	        llvm::GlobalVariable::InternalLinkage,
	        llvm::ConstantInt::get(CGM.Int8Ty, 0), "__tls_guard");
	    Guard->setThreadLocal(true);

	    CharUnits GuardAlign = CharUnits::One();
	    Guard->setAlignment(GuardAlign.getQuantity());

	    CodeGenFunction(CGM).GenerateCXXGlobalInitFunc(InitFunc, OrderedInits,
	                                                   Address(Guard, GuardAlign));
	    // On Darwin platforms, use CXX_FAST_TLS calling convention.
	    if (CGM.getTarget().getTriple().isOSDarwin()) {
	      InitFunc->setCallingConv(llvm::CallingConv::CXX_FAST_TLS);
	      InitFunc->addFnAttr(llvm::Attribute::NoUnwind);
	    }
	  }

	  // Emit thread wrappers.
	  for (const VarDecl *VD : CXXThreadLocals) {
	    llvm::GlobalVariable *Var =
	        cast<llvm::GlobalVariable>(CGM.GetGlobalValue(CGM.getMangledName(VD)));
	    llvm::Function *Wrapper = getOrCreateThreadLocalWrapper(VD, Var);

	    // Some targets require that all access to thread local variables go through
	    // the thread wrapper.  This means that we cannot attempt to create a thread
	    // wrapper or a thread helper.
	    if (isThreadWrapperReplaceable(VD, CGM) && !VD->hasDefinition()) {
	      Wrapper->setLinkage(llvm::Function::ExternalLinkage);
	      continue;
	    }

	    // Mangle the name for the thread_local initialization function.
	    SmallString<256> InitFnName;
	    {
	      llvm::raw_svector_ostream Out(InitFnName);
	      getMangleContext().mangleItaniumThreadLocalInit(VD, Out);
	    }

	    // If we have a definition for the variable, emit the initialization
	    // function as an alias to the global Init function (if any). Otherwise,
	    // produce a declaration of the initialization function.
	    llvm::GlobalValue *Init = nullptr;
	    bool InitIsInitFunc = false;
	    if (VD->hasDefinition()) {
	      InitIsInitFunc = true;
	      llvm::Function *InitFuncToUse = InitFunc;
	      // Template instantiations use their own (unordered) initializer.
	      if (isTemplateInstantiation(VD->getTemplateSpecializationKind()))
	        InitFuncToUse = UnorderedInits.lookup(VD->getCanonicalDecl());
	      if (InitFuncToUse)
	        Init = llvm::GlobalAlias::create(Var->getLinkage(), InitFnName.str(),
	                                         InitFuncToUse);
	    } else {
	      // Emit a weak global function referring to the initialization function.
	      // This function will not exist if the TU defining the thread_local
	      // variable in question does not need any dynamic initialization for
	      // its thread_local variables.
	      llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, false);
	      Init = llvm::Function::Create(FnTy,
	                                    llvm::GlobalVariable::ExternalWeakLinkage,
	                                    InitFnName.str(), &CGM.getModule());
	      const CGFunctionInfo &FI = CGM.getTypes().arrangeNullaryFunction();
	      CGM.SetLLVMFunctionAttributes(nullptr, FI, cast<llvm::Function>(Init));
	    }

	    if (Init)
	      Init->setVisibility(Var->getVisibility());

	    llvm::LLVMContext &Context = CGM.getModule().getContext();
	    llvm::BasicBlock *Entry = llvm::BasicBlock::Create(Context, "", Wrapper);
	    CGBuilderTy Builder(CGM, Entry);
	    if (InitIsInitFunc) {
	      // The variable is defined here: call the init function directly when
	      // one was produced.
	      if (Init) {
	        llvm::CallInst *CallVal = Builder.CreateCall(Init);
	        if (isThreadWrapperReplaceable(VD, CGM))
	          CallVal->setCallingConv(llvm::CallingConv::CXX_FAST_TLS);
	      }
	    } else {
	      // Don't know whether we have an init function. Call it if it exists.
	      llvm::Value *Have = Builder.CreateIsNotNull(Init);
	      llvm::BasicBlock *InitBB = llvm::BasicBlock::Create(Context, "", Wrapper);
	      llvm::BasicBlock *ExitBB = llvm::BasicBlock::Create(Context, "", Wrapper);
	      Builder.CreateCondBr(Have, InitBB, ExitBB);

	      Builder.SetInsertPoint(InitBB);
	      Builder.CreateCall(Init);
	      Builder.CreateBr(ExitBB);

	      Builder.SetInsertPoint(ExitBB);
	    }

	    // For a reference, the result of the wrapper function is a pointer to
	    // the referenced object.
	    llvm::Value *Val = Var;
	    if (VD->getType()->isReferenceType()) {
	      CharUnits Align = CGM.getContext().getDeclAlign(VD);
	      Val = Builder.CreateAlignedLoad(Val, Align);
	    }
	    if (Val->getType() != Wrapper->getReturnType())
	      Val = Builder.CreatePointerBitCastOrAddrSpaceCast(
	          Val, Wrapper->getReturnType(), "");
	    Builder.CreateRet(Val);
	  }
	}

	/// Produce an lvalue for the thread_local variable \p VD by calling its
	/// thread wrapper and using the call result as the address.
	LValue ItaniumCXXABI::EmitThreadLocalVarDeclLValue(CodeGenFunction &CGF,
	                                                   const VarDecl *VD,
	                                                   QualType LValType) {
	  llvm::Value *VarAddr = CGF.CGM.GetAddrOfGlobalVar(VD);
	  llvm::Function *WrapperFn = getOrCreateThreadLocalWrapper(VD, VarAddr);

	  // The call must use the wrapper's own calling convention.
	  llvm::CallInst *WrapperCall = CGF.Builder.CreateCall(WrapperFn);
	  WrapperCall->setCallingConv(WrapperFn->getCallingConv());

	  // FIXME: need setObjCGCLValueClass?
	  if (VD->getType()->isReferenceType())
	    return CGF.MakeNaturalAlignAddrLValue(WrapperCall, LValType);

	  return CGF.MakeAddrLValue(WrapperCall, LValType,
	                            CGF.getContext().getDeclAlign(VD));
	}

	/// Return whether the given global decl needs a VTT parameter, which it does
	/// if it's a base constructor or destructor with virtual bases.
	bool ItaniumCXXABI::NeedsVTTParameter(GlobalDecl GD) {
	  const auto *Method = cast<CXXMethodDecl>(GD.getDecl());

	  // Without virtual bases there is never a VTT.
	  if (Method->getParent()->getNumVBases() == 0)
	    return false;

	  // Only the base-object variants of constructors and destructors take one.
	  const bool IsBaseCtor =
	      isa<CXXConstructorDecl>(Method) && GD.getCtorType() == Ctor_Base;
	  if (IsBaseCtor)
	    return true;

	  return isa<CXXDestructorDecl>(Method) && GD.getDtorType() == Dtor_Base;
	}

	namespace {
	/// ItaniumRTTIBuilder - Helper that accumulates the fields of one RTTI
	/// descriptor in `Fields` and exposes BuildTypeInfo as its public entry
	/// point. It also defines the flag constants used in pointer, VMI class and
	/// base-class type info records.
	class ItaniumRTTIBuilder {
	CodeGenModule &CGM; // Per-module state.
	llvm::LLVMContext &VMContext;
	const ItaniumCXXABI &CXXABI; // Per-module state.

	/// Fields - The fields of the RTTI descriptor currently being built.
	SmallVector<llvm::Constant *, 16> Fields;

	/// GetAddrOfTypeName - Returns the mangled type name of the given type.
	llvm::GlobalVariable *
	GetAddrOfTypeName(QualType Ty, llvm::GlobalVariable::LinkageTypes Linkage);

	/// GetAddrOfExternalRTTIDescriptor - Returns the constant for the RTTI
	/// descriptor of the given type.
	llvm::Constant *GetAddrOfExternalRTTIDescriptor(QualType Ty);

	/// BuildVTablePointer - Build the vtable pointer for the given type.
	void BuildVTablePointer(const Type *Ty);

	/// BuildSIClassTypeInfo - Build an abi::__si_class_type_info, used for single
	/// inheritance, according to the Itanium C++ ABI, 2.9.5p6b.
	void BuildSIClassTypeInfo(const CXXRecordDecl *RD);

	/// BuildVMIClassTypeInfo - Build an abi::__vmi_class_type_info, used for
	/// classes with bases that do not satisfy the abi::__si_class_type_info
	/// constraints, according to the Itanium C++ ABI, 2.9.5p5c.
	void BuildVMIClassTypeInfo(const CXXRecordDecl *RD);

	/// BuildPointerTypeInfo - Build an abi::__pointer_type_info struct, used
	/// for pointer types.
	void BuildPointerTypeInfo(QualType PointeeTy);

	/// BuildObjCObjectTypeInfo - Build the appropriate kind of
	/// type_info for an object type.
	void BuildObjCObjectTypeInfo(const ObjCObjectType *Ty);

	/// BuildPointerToMemberTypeInfo - Build an abi::__pointer_to_member_type_info
	/// struct, used for member pointer types.
	void BuildPointerToMemberTypeInfo(const MemberPointerType *Ty);

	public:
	/// Binds the builder to \p ABI; CGM and VMContext are taken from it.
	/// VMContext is initialized from CGM, so the member declaration order above
	/// matters.
	ItaniumRTTIBuilder(const ItaniumCXXABI &ABI)
	: CGM(ABI.CGM), VMContext(CGM.getModule().getContext()), CXXABI(ABI) {}

	// Pointer type info flags.
	enum {
	/// PTI_Const - Type has const qualifier.
	PTI_Const = 0x1,

	/// PTI_Volatile - Type has volatile qualifier.
	PTI_Volatile = 0x2,

	/// PTI_Restrict - Type has restrict qualifier.
	PTI_Restrict = 0x4,

	/// PTI_Incomplete - Type is incomplete.
	PTI_Incomplete = 0x8,

	/// PTI_ContainingClassIncomplete - Containing class is incomplete.
	/// (in pointer to member).
	PTI_ContainingClassIncomplete = 0x10,

	/// PTI_TransactionSafe - Pointee is transaction_safe function (C++ TM TS).
	/// The enumerator is intentionally left commented out; 0x20 stays unused.
	//PTI_TransactionSafe = 0x20,

	/// PTI_Noexcept - Pointee is noexcept function (C++1z).
	PTI_Noexcept = 0x40,
	};

	// VMI type info flags.
	enum {
	/// VMI_NonDiamondRepeat - Class has non-diamond repeated inheritance.
	VMI_NonDiamondRepeat = 0x1,

	/// VMI_DiamondShaped - Class is diamond shaped.
	VMI_DiamondShaped = 0x2
	};

	// Base class type info flags.
	enum {
	/// BCTI_Virtual - Base class is virtual.
	BCTI_Virtual = 0x1,

	/// BCTI_Public - Base class is public.
	BCTI_Public = 0x2
	};

	/// BuildTypeInfo - Build the RTTI type info struct for the given type.
	///
	/// \param Force - true to force the creation of this RTTI value
	/// \param DLLExport - true to mark the RTTI value as DLLExport
	llvm::Constant *BuildTypeInfo(QualType Ty, bool Force = false,
	bool DLLExport = false);
	};
	}

	/// Create (or replace) the global holding the mangled type-name string for
	/// \p Ty, with the requested \p Linkage, and return it.
	llvm::GlobalVariable *ItaniumRTTIBuilder::GetAddrOfTypeName(
	    QualType Ty, llvm::GlobalVariable::LinkageTypes Linkage) {
	  SmallString<256> SymbolName;
	  llvm::raw_svector_ostream SymbolStream(SymbolName);
	  CGM.getCXXABI().getMangleContext().mangleCXXRTTIName(Ty, SymbolStream);

	  // We know that the mangled name of the type starts at index 4 of the
	  // mangled name of the typename, so we can just index into it in order to
	  // get the mangled name of the type.
	  llvm::StringRef MangledTypeName = SymbolName.substr(4);
	  llvm::Constant *NameInit =
	      llvm::ConstantDataArray::getString(VMContext, MangledTypeName);

	  // The global is named after the full symbol and typed after its contents.
	  llvm::GlobalVariable *NameGV = CGM.CreateOrReplaceCXXRuntimeVariable(
	      SymbolName, NameInit->getType(), Linkage);
	  NameGV->setInitializer(NameInit);
	  return NameGV;
	}

	/// Return an i8* constant referring to the RTTI descriptor of \p Ty without
	/// defining it: an existing global with the mangled RTTI name is reused,
	/// otherwise an external constant declaration is created. For a record type
	/// whose class carries DLLImportAttr the new declaration is marked dllimport.
	llvm::Constant *
	ItaniumRTTIBuilder::GetAddrOfExternalRTTIDescriptor(QualType Ty) {
	  // Mangle the RTTI name.
	  SmallString<256> Name;
	  llvm::raw_svector_ostream Out(Name);
	  CGM.getCXXABI().getMangleContext().mangleCXXRTTI(Ty, Out);

	  // Look for an existing global.
	  llvm::GlobalVariable *GV = CGM.getModule().getNamedGlobal(Name);

	  if (!GV) {
	    // Create a new global variable.
	    // Note for the future: If we would ever like to do deferred emission of
	    // RTTI, check if emitting vtables opportunistically need any adjustment.

	    // No initializer (nullptr): this is a declaration only.
	    GV = new llvm::GlobalVariable(CGM.getModule(), CGM.Int8PtrTy,
	                                  /*Constant=*/true,
	                                  llvm::GlobalValue::ExternalLinkage, nullptr,
	                                  Name);
	    if (const RecordType *RecordTy = dyn_cast<RecordType>(Ty)) {
	      const CXXRecordDecl *RD = cast<CXXRecordDecl>(RecordTy->getDecl());
	      if (RD->hasAttr<DLLImportAttr>())
	        GV->setDLLStorageClass(llvm::GlobalVariable::DLLImportStorageClass);
	    }
	  }

	  return llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy);
	}

	/// TypeInfoIsInStandardLibrary - Given a builtin type, returns whether the type
	/// info for that type is defined in the standard library.
	///
	/// Returns true for the fundamental kinds listed in the first case group and
	/// false for the OpenCL image/sampler/event/queue/reserve-id kinds. Dependent,
	/// placeholder and Objective-C builtin kinds are treated as unreachable.
	static bool TypeInfoIsInStandardLibrary(const BuiltinType *Ty) {
	// Itanium C++ ABI 2.9.2:
	// Basic type information (e.g. for "int", "bool", etc.) will be kept in
	// the run-time support library. Specifically, the run-time support
	// library should contain type_info objects for the types X, X* and
	// X const*, for every X in: void, std::nullptr_t, bool, wchar_t, char,
	// unsigned char, signed char, short, unsigned short, int, unsigned int,
	// long, unsigned long, long long, unsigned long long, float, double,
	// long double, char16_t, char32_t, and the IEEE 754r decimal and
	// half-precision floating point types.
	//
	// GCC also emits RTTI for __int128.
	// FIXME: We do not emit RTTI information for decimal types here.

	// Types added here must also be added to EmitFundamentalRTTIDescriptors.
	switch (Ty->getKind()) {
	case BuiltinType::Void:
	case BuiltinType::NullPtr:
	case BuiltinType::Bool:
	case BuiltinType::WChar_S:
	case BuiltinType::WChar_U:
	case BuiltinType::Char_U:
	case BuiltinType::Char_S:
	case BuiltinType::UChar:
	case BuiltinType::SChar:
	case BuiltinType::Short:
	case BuiltinType::UShort:
	case BuiltinType::Int:
	case BuiltinType::UInt:
	case BuiltinType::Long:
	case BuiltinType::ULong:
	case BuiltinType::LongLong:
	case BuiltinType::ULongLong:
	case BuiltinType::Half:
	case BuiltinType::Float:
	case BuiltinType::Double:
	case BuiltinType::LongDouble:
	case BuiltinType::Float16:
	case BuiltinType::Float128:
	case BuiltinType::Char16:
	case BuiltinType::Char32:
	case BuiltinType::Int128:
	case BuiltinType::UInt128:
	return true;

	// Expands to one case label per OpenCL image type from the .def file.
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	case BuiltinType::Id:
	#include "clang/Basic/OpenCLImageTypes.def"
	case BuiltinType::OCLSampler:
	case BuiltinType::OCLEvent:
	case BuiltinType::OCLClkEvent:
	case BuiltinType::OCLQueue:
	case BuiltinType::OCLReserveID:
	return false;

	// BUILTIN_TYPE expands to nothing here, so only placeholder kinds get a
	// case label from the .def file.
	case BuiltinType::Dependent:
	#define BUILTIN_TYPE(Id, SingletonId)
	#define PLACEHOLDER_TYPE(Id, SingletonId) \
	case BuiltinType::Id:
	#include "clang/AST/BuiltinTypes.def"
	// NOTE(review): "RRTI" in the message below looks like a typo for "RTTI".
	llvm_unreachable("asking for RRTI for a placeholder type!");

	case BuiltinType::ObjCId:
	case BuiltinType::ObjCClass:
	case BuiltinType::ObjCSel:
	llvm_unreachable("FIXME: Objective-C types are unsupported!");
	}

	// Every enumerator is handled above; reaching here means a bad kind value.
	llvm_unreachable("Invalid BuiltinType Kind!");
	}

	static bool TypeInfoIsInStandardLibrary(const PointerType *PointerTy) {
	QualType PointeeTy = PointerTy->getPointeeType();
	const BuiltinType *BuiltinTy = dyn_cast<BuiltinType>(PointeeTy);
	if (!BuiltinTy)
	return false;

	// Check the qualifiers.
	Qualifiers Quals = PointeeTy.getQualifiers();
	Quals.removeConst();

	if (!Quals.empty())
	return false;

	return TypeInfoIsInStandardLibrary(BuiltinTy);
	}

	/// IsStandardLibraryRTTIDescriptor - Returns whether the type
	/// information for the given type exists in the standard library.
	static bool IsStandardLibraryRTTIDescriptor(QualType Ty) {
	  // Type info for builtin types is defined in the standard library.
	  if (const auto *AsBuiltin = dyn_cast<BuiltinType>(Ty))
	    return TypeInfoIsInStandardLibrary(AsBuiltin);

	  // Type info for some pointer types to builtin types is defined in the
	  // standard library.
	  const auto *AsPointer = dyn_cast<PointerType>(Ty);
	  return AsPointer ? TypeInfoIsInStandardLibrary(AsPointer) : false;
	}

	/// ShouldUseExternalRTTIDescriptor - Returns whether the type information for
	/// the given type exists somewhere else, and that we should not emit the type
	/// information in this translation unit. Assumes that it is not a
	/// standard-library type.
	///
	/// Only record types can yield true; every other type, and any query made
	/// with RTTI disabled, returns false.
	static bool ShouldUseExternalRTTIDescriptor(CodeGenModule &CGM,
	QualType Ty) {
	ASTContext &Context = CGM.getContext();

	// If RTTI is disabled, assume it might be disabled in the
	// translation unit that defines any potential key function, too.
	if (!Context.getLangOpts().RTTI) return false;

	if (const RecordType *RecordTy = dyn_cast<RecordType>(Ty)) {
	const CXXRecordDecl *RD = cast<CXXRecordDecl>(RecordTy->getDecl());
	// Classes without a definition, and non-dynamic classes, are never
	// treated as external here.
	if (!RD->hasDefinition())
	return false;

	if (!RD->isDynamicClass())
	return false;

	// FIXME: this may need to be reconsidered if the key function
	// changes.
	// N.B. We must always emit the RTTI data ourselves if there exists a key
	// function.
	bool IsDLLImport = RD->hasAttr<DLLImportAttr>();
	+
	+ // Don't import the RTTI but emit it locally.
	+ if (CGM.getTriple().isWindowsGNUEnvironment() && IsDLLImport)
	+ return false;
	+
	// With an external vtable the descriptor is external too, unless the class
	// is dllimport on a target that is not the Windows Itanium environment.
	if (CGM.getVTables().isVTableExternal(RD))
	return IsDLLImport && !CGM.getTriple().isWindowsItaniumEnvironment()
	? false
	: true;

	if (IsDLLImport)
	return true;
	}

	return false;
	}

	/// IsIncompleteClassType - Returns whether the given record type is incomplete,
	/// i.e. its declaration is not a complete definition.
	static bool IsIncompleteClassType(const RecordType *RecordTy) {
	  const auto *Decl = RecordTy->getDecl();
	  const bool IsComplete = Decl->isCompleteDefinition();
	  return !IsComplete;
	}

	/// ContainsIncompleteClassType - Returns whether the given type contains an
	/// incomplete class type. This is true if
	///
	///   * The given type is an incomplete class type.
	///   * The given type is a pointer type whose pointee type contains an
	///     incomplete class type.
	///   * The given type is a member pointer type whose class is an incomplete
	///     class type.
	///   * The given type is a member pointer type whose pointee type contains an
	///     incomplete class type.
	///
	/// In other words, the type is, or is an indirect or direct pointer to, an
	/// incomplete class type.
	static bool ContainsIncompleteClassType(QualType Ty) {
	  // Peel pointer and member-pointer layers one at a time instead of recursing.
	  for (;;) {
	    if (const auto *Record = dyn_cast<RecordType>(Ty)) {
	      if (IsIncompleteClassType(Record))
	        return true;
	    }

	    if (const auto *Pointer = dyn_cast<PointerType>(Ty)) {
	      Ty = Pointer->getPointeeType();
	      continue;
	    }

	    if (const auto *MemberPointer = dyn_cast<MemberPointerType>(Ty)) {
	      // Check if the class type is incomplete.
	      const auto *Class = cast<RecordType>(MemberPointer->getClass());
	      if (IsIncompleteClassType(Class))
	        return true;

	      Ty = MemberPointer->getPointeeType();
	      continue;
	    }

	    return false;
	  }
	}

	// CanUseSingleInheritance - Return whether the given record decl has a "single,
	// public, non-virtual base at offset zero (i.e. the derived class is dynamic
	// iff the base is)", according to Itanium C++ ABI, 2.95p6b.
	static bool CanUseSingleInheritance(const CXXRecordDecl *RD) {
	// Check the number of bases.
	if (RD->getNumBases() != 1)
	return false;

	// Get the base.
	CXXRecordDecl::base_class_const_iterator Base = RD->bases_begin();

	// Check that the base is not virtual.
	if (Base->isVirtual())
	return false;

	// Check that the base is public.
	if (Base->getAccessSpecifier() != AS_public)
	return false;

	// Check that the class is dynamic iff the base is.
	const CXXRecordDecl *BaseDecl =
	cast<CXXRecordDecl>(Base->getType()->getAs<RecordType>()->getDecl());
	if (!BaseDecl->isEmpty() &&
	BaseDecl->isDynamicClass() != RD->isDynamicClass())
	return false;

	return true;
	}

	/// BuildVTablePointer - Append to Fields the vtable pointer for the type_info
	/// object describing \p Ty.
	///
	/// The type class of \p Ty selects the mangled name of the matching
	/// abi::__*_type_info vtable; the pushed field is that global advanced by two
	/// pointer-sized slots (the vtable address point) and cast to i8*.
	/// Non-canonical, dependent, reference, undeduced and pipe types must not
	/// reach this function.
	void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty) {
	  // abi::__class_type_info.
	  static const char * const ClassTypeInfo =
	    "_ZTVN10__cxxabiv117__class_type_infoE";
	  // abi::__si_class_type_info.
	  static const char * const SIClassTypeInfo =
	    "_ZTVN10__cxxabiv120__si_class_type_infoE";
	  // abi::__vmi_class_type_info.
	  static const char * const VMIClassTypeInfo =
	    "_ZTVN10__cxxabiv121__vmi_class_type_infoE";

	  const char *VTableName = nullptr;

	  switch (Ty->getTypeClass()) {
	  // Emit case labels only for non-canonical and dependent type classes.
	#define TYPE(Class, Base)
	#define ABSTRACT_TYPE(Class, Base)
	#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(Class, Base) case Type::Class:
	#define NON_CANONICAL_TYPE(Class, Base) case Type::Class:
	#define DEPENDENT_TYPE(Class, Base) case Type::Class:
	#include "clang/AST/TypeNodes.def"
	    llvm_unreachable("Non-canonical and dependent types shouldn't get here");

	  case Type::LValueReference:
	  case Type::RValueReference:
	    llvm_unreachable("References shouldn't get here");

	  case Type::Auto:
	  case Type::DeducedTemplateSpecialization:
	    llvm_unreachable("Undeduced type shouldn't get here");

	  case Type::Pipe:
	    llvm_unreachable("Pipe types shouldn't get here");

	  case Type::Builtin:
	  // GCC treats vector and complex types as fundamental types.
	  case Type::Vector:
	  case Type::ExtVector:
	  case Type::Complex:
	  case Type::Atomic:
	  // FIXME: GCC treats block pointers as fundamental types?!
	  case Type::BlockPointer:
	    // abi::__fundamental_type_info.
	    VTableName = "_ZTVN10__cxxabiv123__fundamental_type_infoE";
	    break;

	  case Type::ConstantArray:
	  case Type::IncompleteArray:
	  case Type::VariableArray:
	    // abi::__array_type_info.
	    VTableName = "_ZTVN10__cxxabiv117__array_type_infoE";
	    break;

	  case Type::FunctionNoProto:
	  case Type::FunctionProto:
	    // abi::__function_type_info.
	    VTableName = "_ZTVN10__cxxabiv120__function_type_infoE";
	    break;

	  case Type::Enum:
	    // abi::__enum_type_info.
	    VTableName = "_ZTVN10__cxxabiv116__enum_type_infoE";
	    break;

	  case Type::Record: {
	    const CXXRecordDecl *RD =
	      cast<CXXRecordDecl>(cast<RecordType>(Ty)->getDecl());

	    // No definition or no bases: plain class info. One suitable base: SI
	    // class info. Anything else: VMI class info.
	    if (!RD->hasDefinition() || !RD->getNumBases()) {
	      VTableName = ClassTypeInfo;
	    } else if (CanUseSingleInheritance(RD)) {
	      VTableName = SIClassTypeInfo;
	    } else {
	      VTableName = VMIClassTypeInfo;
	    }

	    break;
	  }

	  case Type::ObjCObject:
	    // Ignore protocol qualifiers.
	    Ty = cast<ObjCObjectType>(Ty)->getBaseType().getTypePtr();

	    // Handle id and Class.
	    if (isa<BuiltinType>(Ty)) {
	      VTableName = ClassTypeInfo;
	      break;
	    }

	    assert(isa<ObjCInterfaceType>(Ty));
	    // Fall through.

	  case Type::ObjCInterface:
	    // An interface with a superclass is modelled as single inheritance.
	    if (cast<ObjCInterfaceType>(Ty)->getDecl()->getSuperClass()) {
	      VTableName = SIClassTypeInfo;
	    } else {
	      VTableName = ClassTypeInfo;
	    }
	    break;

	  case Type::ObjCObjectPointer:
	  case Type::Pointer:
	    // abi::__pointer_type_info.
	    VTableName = "_ZTVN10__cxxabiv119__pointer_type_infoE";
	    break;

	  case Type::MemberPointer:
	    // abi::__pointer_to_member_type_info.
	    VTableName = "_ZTVN10__cxxabiv129__pointer_to_member_type_infoE";
	    break;
	  }

	  llvm::Constant *VTable =
	    CGM.getModule().getOrInsertGlobal(VTableName, CGM.Int8PtrTy);

	  llvm::Type *PtrDiffTy =
	    CGM.getTypes().ConvertType(CGM.getContext().getPointerDiffType());

	  // The vtable address point is 2.
	  llvm::Constant *Two = llvm::ConstantInt::get(PtrDiffTy, 2);
	  VTable =
	      llvm::ConstantExpr::getInBoundsGetElementPtr(CGM.Int8PtrTy, VTable, Two);
	  VTable = llvm::ConstantExpr::getBitCast(VTable, CGM.Int8PtrTy);

	  Fields.push_back(VTable);
	}

	/// Pick the LLVM linkage shared by the type_info object and the type_info
	/// name string emitted for \p Ty.
	///
	/// \param CGM the module being emitted; consulted for language options, the
	///            target triple and vtable linkage.
	/// \param Ty  the type whose RTTI is being emitted.
	/// \returns the linkage to give both RTTI globals.
	static llvm::GlobalVariable::LinkageTypes getTypeInfoLinkage(CodeGenModule &CGM,
	                                                             QualType Ty) {
	  // Itanium C++ ABI 2.9.5p7:
	  //   In addition, it and all of the intermediate abi::__pointer_type_info
	  //   structs in the chain down to the abi::__class_type_info for the
	  //   incomplete class type must be prevented from resolving to the
	  //   corresponding type_info structs for the complete class type, possibly
	  //   by making them local static objects. Finally, a dummy class RTTI is
	  //   generated for the incomplete type that will not resolve to the final
	  //   complete class RTTI (because the latter need not exist), possibly by
	  //   making it a local static object.
	  if (ContainsIncompleteClassType(Ty))
	    return llvm::GlobalValue::InternalLinkage;

	  switch (Ty->getLinkage()) {
	  case NoLinkage:
	  case InternalLinkage:
	  case UniqueExternalLinkage:
	    // Types that are not visible outside this TU get TU-local RTTI.
	    return llvm::GlobalValue::InternalLinkage;

	  case VisibleNoLinkage:
	  case ModuleInternalLinkage:
	  case ModuleLinkage:
	  case ExternalLinkage: {
	    // With RTTI disabled, this type info struct can only be needed for
	    // exception handling, so linkonce_odr is enough.
	    if (!CGM.getLangOpts().RTTI)
	      return llvm::GlobalValue::LinkOnceODRLinkage;

	    // Only class types receive special treatment below.
	    const auto *ClassTy = dyn_cast<RecordType>(Ty);
	    if (!ClassTy)
	      return llvm::GlobalValue::LinkOnceODRLinkage;

	    const auto *Class = cast<CXXRecordDecl>(ClassTy->getDecl());

	    // An explicitly weak class gets weak_odr RTTI.
	    if (Class->hasAttr<WeakAttr>())
	      return llvm::GlobalValue::WeakODRLinkage;

	    // On Windows Itanium, a dllimport'ed class whose RTTI lives elsewhere
	    // is referenced with external linkage.
	    if (CGM.getTriple().isWindowsItaniumEnvironment()) {
	      if (Class->hasAttr<DLLImportAttr>() &&
	          ShouldUseExternalRTTIDescriptor(CGM, Ty))
	        return llvm::GlobalValue::ExternalLinkage;
	    }

	    // Dynamic classes follow the linkage of their vtable, except on MinGW,
	    // which always uses LinkOnceODRLinkage for type info.
	    if (Class->isDynamicClass() &&
	        !CGM.getContext()
	             .getTargetInfo()
	             .getTriple()
	             .isWindowsGNUEnvironment())
	      return CGM.getVTableLinkage(Class);

	    return llvm::GlobalValue::LinkOnceODRLinkage;
	  }
	  }

	  llvm_unreachable("Invalid linkage!");
	}

	/// Build (or look up) the RTTI descriptor for \p Ty and return its address
	/// as an i8*.
	///
	/// \param Ty        the type to describe; it is canonicalized first.
	/// \param Force     when true, emit a definition even if an external
	///                  descriptor would otherwise be referenced.
	/// \param DLLExport when true (and targeting Windows Itanium), mark the
	///                  emitted globals dllexport.
	/// \returns the descriptor global bitcast to CGM.Int8PtrTy.
	llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo(QualType Ty, bool Force,
	                                                  bool DLLExport) {
	  // We want to operate on the canonical type.
	  Ty = Ty.getCanonicalType();

	  // Check if we've already emitted an RTTI descriptor for this type.
	  SmallString<256> Name;
	  llvm::raw_svector_ostream Out(Name);
	  CGM.getCXXABI().getMangleContext().mangleCXXRTTI(Ty, Out);

	  llvm::GlobalVariable *OldGV = CGM.getModule().getNamedGlobal(Name);
	  if (OldGV && !OldGV->isDeclaration()) {
	    assert(!OldGV->hasAvailableExternallyLinkage() &&
	           "available_externally typeinfos not yet implemented");

	    return llvm::ConstantExpr::getBitCast(OldGV, CGM.Int8PtrTy);
	  }

	  // Check if there is already an external RTTI descriptor for this type.
	  bool IsStdLib = IsStandardLibraryRTTIDescriptor(Ty);
	  if (!Force && (IsStdLib || ShouldUseExternalRTTIDescriptor(CGM, Ty)))
	    return GetAddrOfExternalRTTIDescriptor(Ty);

	  // Emit the standard library with external linkage.
	  llvm::GlobalVariable::LinkageTypes Linkage;
	  if (IsStdLib)
	    Linkage = llvm::GlobalValue::ExternalLinkage;
	  else
	    Linkage = getTypeInfoLinkage(CGM, Ty);

	  // Add the vtable pointer.
	  BuildVTablePointer(cast<Type>(Ty));

	  // And the name.
	  llvm::GlobalVariable *TypeName = GetAddrOfTypeName(Ty, Linkage);
	  llvm::Constant *TypeNameField;

	  // If we're supposed to demote the visibility, be sure to set a flag
	  // to use a string comparison for type_info comparisons.
	  ItaniumCXXABI::RTTIUniquenessKind RTTIUniqueness =
	      CXXABI.classifyRTTIUniqueness(Ty, Linkage);
	  if (RTTIUniqueness != ItaniumCXXABI::RUK_Unique) {
	    // The flag is the sign bit, which on ARM64 is defined to be clear
	    // for global pointers. This is very ARM64-specific.
	    TypeNameField = llvm::ConstantExpr::getPtrToInt(TypeName, CGM.Int64Ty);
	    llvm::Constant *flag =
	        llvm::ConstantInt::get(CGM.Int64Ty, ((uint64_t)1) << 63);
	    TypeNameField = llvm::ConstantExpr::getAdd(TypeNameField, flag);
	    TypeNameField =
	        llvm::ConstantExpr::getIntToPtr(TypeNameField, CGM.Int8PtrTy);
	  } else {
	    TypeNameField = llvm::ConstantExpr::getBitCast(TypeName, CGM.Int8PtrTy);
	  }
	  Fields.push_back(TypeNameField);

	  // Append whatever extra fields the specific type_info subclass requires.
	  switch (Ty->getTypeClass()) {
	#define TYPE(Class, Base)
	#define ABSTRACT_TYPE(Class, Base)
	#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(Class, Base) case Type::Class:
	#define NON_CANONICAL_TYPE(Class, Base) case Type::Class:
	#define DEPENDENT_TYPE(Class, Base) case Type::Class:
	#include "clang/AST/TypeNodes.def"
	    llvm_unreachable("Non-canonical and dependent types shouldn't get here");

	  // GCC treats vector types as fundamental types.
	  case Type::Builtin:
	  case Type::Vector:
	  case Type::ExtVector:
	  case Type::Complex:
	  case Type::BlockPointer:
	    // Itanium C++ ABI 2.9.5p4:
	    // abi::__fundamental_type_info adds no data members to std::type_info.
	    break;

	  case Type::LValueReference:
	  case Type::RValueReference:
	    llvm_unreachable("References shouldn't get here");

	  case Type::Auto:
	  case Type::DeducedTemplateSpecialization:
	    llvm_unreachable("Undeduced type shouldn't get here");

	  case Type::Pipe:
	    llvm_unreachable("Pipe type shouldn't get here");

	  case Type::ConstantArray:
	  case Type::IncompleteArray:
	  case Type::VariableArray:
	    // Itanium C++ ABI 2.9.5p5:
	    // abi::__array_type_info adds no data members to std::type_info.
	    break;

	  case Type::FunctionNoProto:
	  case Type::FunctionProto:
	    // Itanium C++ ABI 2.9.5p5:
	    // abi::__function_type_info adds no data members to std::type_info.
	    break;

	  case Type::Enum:
	    // Itanium C++ ABI 2.9.5p5:
	    // abi::__enum_type_info adds no data members to std::type_info.
	    break;

	  case Type::Record: {
	    const CXXRecordDecl *RD =
	        cast<CXXRecordDecl>(cast<RecordType>(Ty)->getDecl());
	    if (!RD->hasDefinition() || !RD->getNumBases()) {
	      // We don't need to emit any fields.
	      break;
	    }

	    if (CanUseSingleInheritance(RD))
	      BuildSIClassTypeInfo(RD);
	    else
	      BuildVMIClassTypeInfo(RD);

	    break;
	  }

	  case Type::ObjCObject:
	  case Type::ObjCInterface:
	    BuildObjCObjectTypeInfo(cast<ObjCObjectType>(Ty));
	    break;

	  case Type::ObjCObjectPointer:
	    BuildPointerTypeInfo(cast<ObjCObjectPointerType>(Ty)->getPointeeType());
	    break;

	  case Type::Pointer:
	    BuildPointerTypeInfo(cast<PointerType>(Ty)->getPointeeType());
	    break;

	  case Type::MemberPointer:
	    BuildPointerToMemberTypeInfo(cast<MemberPointerType>(Ty));
	    break;

	  case Type::Atomic:
	    // No fields, at least for the moment.
	    break;
	  }

	  // Materialize the accumulated fields as an anonymous constant struct.
	  llvm::Constant *Init = llvm::ConstantStruct::getAnon(Fields);

	  llvm::Module &M = CGM.getModule();
	  llvm::GlobalVariable *GV =
	      new llvm::GlobalVariable(M, Init->getType(),
	                               /*Constant=*/true, Linkage, Init, Name);

	  // If there's already an old global variable, replace it with the new one.
	  if (OldGV) {
	    GV->takeName(OldGV);
	    llvm::Constant *NewPtr =
	        llvm::ConstantExpr::getBitCast(GV, OldGV->getType());
	    OldGV->replaceAllUsesWith(NewPtr);
	    OldGV->eraseFromParent();
	  }

	  if (CGM.supportsCOMDAT() && GV->isWeakForLinker())
	    GV->setComdat(M.getOrInsertComdat(GV->getName()));

	  // The Itanium ABI specifies that type_info objects must be globally
	  // unique, with one exception: if the type is an incomplete class
	  // type or a (possibly indirect) pointer to one. That exception
	  // affects the general case of comparing type_info objects produced
	  // by the typeid operator, which is why the comparison operators on
	  // std::type_info generally use the type_info name pointers instead
	  // of the object addresses. However, the language's built-in uses
	  // of RTTI generally require class types to be complete, even when
	  // manipulating pointers to those class types. This allows the
	  // implementation of dynamic_cast to rely on address equality tests,
	  // which is much faster.

	  // All of this is to say that it's important that both the type_info
	  // object and the type_info name be uniqued when weakly emitted.

	  // Give the type_info object and name the formal visibility of the
	  // type itself.
	  llvm::GlobalValue::VisibilityTypes llvmVisibility;
	  if (llvm::GlobalValue::isLocalLinkage(Linkage))
	    // If the linkage is local, only default visibility makes sense.
	    llvmVisibility = llvm::GlobalValue::DefaultVisibility;
	  else if (RTTIUniqueness == ItaniumCXXABI::RUK_NonUniqueHidden)
	    llvmVisibility = llvm::GlobalValue::HiddenVisibility;
	  else
	    llvmVisibility = CodeGenModule::GetLLVMVisibility(Ty->getVisibility());

	  TypeName->setVisibility(llvmVisibility);
	  GV->setVisibility(llvmVisibility);

	  if (CGM.getTriple().isWindowsItaniumEnvironment()) {
	    auto RD = Ty->getAsCXXRecordDecl();
	    if (DLLExport || (RD && RD->hasAttr<DLLExportAttr>())) {
	      TypeName->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
	      GV->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass);
	    } else if (RD && RD->hasAttr<DLLImportAttr>() &&
	               ShouldUseExternalRTTIDescriptor(CGM, Ty)) {
	      TypeName->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);
	      GV->setDLLStorageClass(llvm::GlobalValue::DLLImportStorageClass);

	      // Because the typename and the typeinfo are DLL import, convert them to
	      // declarations rather than definitions. The initializers still need to
	      // be constructed to calculate the type for the declarations.
	      TypeName->setInitializer(nullptr);
	      GV->setInitializer(nullptr);
	    }
	  }

	  return llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy);
	}

	/// BuildObjCObjectTypeInfo - Build the appropriate kind of type_info
	/// for the given Objective-C object type.
	///
	/// Appends at most one field: the type_info of the superclass, when the
	/// interface has one. Builtin base types and root classes add nothing.
	void ItaniumRTTIBuilder::BuildObjCObjectTypeInfo(const ObjCObjectType *OT) {
	  // Drop qualifiers.
	  const Type *T = OT->getBaseType().getTypePtr();
	  assert(isa<BuiltinType>(T) || isa<ObjCInterfaceType>(T));

	  // The builtin types are abi::__class_type_infos and don't require
	  // extra fields.
	  if (isa<BuiltinType>(T))
	    return;

	  ObjCInterfaceDecl *Class = cast<ObjCInterfaceType>(T)->getDecl();
	  ObjCInterfaceDecl *Super = Class->getSuperClass();

	  // Root classes are also __class_type_info.
	  if (!Super)
	    return;

	  QualType SuperTy = CGM.getContext().getObjCInterfaceType(Super);

	  // Everything else is single inheritance: a single field pointing at the
	  // superclass' type_info, built with a fresh builder.
	  llvm::Constant *BaseTypeInfo =
	      ItaniumRTTIBuilder(CXXABI).BuildTypeInfo(SuperTy);
	  Fields.push_back(BaseTypeInfo);
	}

	/// BuildSIClassTypeInfo - Build an abi::__si_class_type_info, used for single
	/// inheritance, according to the Itanium C++ ABI, 2.95p6b.
	void ItaniumRTTIBuilder::BuildSIClassTypeInfo(const CXXRecordDecl *RD) {
	// Itanium C++ ABI 2.9.5p6b:
	// It adds to abi::__class_type_info a single member pointing to the
	// type_info structure for the base type,
	llvm::Constant *BaseTypeInfo =
	ItaniumRTTIBuilder(CXXABI).BuildTypeInfo(RD->bases_begin()->getType());
	Fields.push_back(BaseTypeInfo);
	}

	namespace {
	/// SeenBases - Contains virtual and non-virtual bases seen when traversing
	/// a class hierarchy.
	struct SeenBases {
	llvm::SmallPtrSet<const CXXRecordDecl *, 16> NonVirtualBases;
	llvm::SmallPtrSet<const CXXRecordDecl *, 16> VirtualBases;
	};
	}

	/// ComputeVMIClassTypeInfoFlags - Compute the contribution of one base
	/// specifier (and, recursively, of all of its own bases) to the flags
	/// member of abi::__vmi_class_type_info.
	///
	/// \param Base  the base specifier to inspect.
	/// \param Bases the bases seen so far in this walk; updated in place.
	/// \returns a bitwise OR of VMI_DiamondShaped / VMI_NonDiamondRepeat.
	static unsigned ComputeVMIClassTypeInfoFlags(const CXXBaseSpecifier *Base,
	                                             SeenBases &Bases) {

	  unsigned Flags = 0;

	  const CXXRecordDecl *BaseDecl =
	      cast<CXXRecordDecl>(Base->getType()->getAs<RecordType>()->getDecl());

	  if (Base->isVirtual()) {
	    // Mark the virtual base as seen.
	    if (!Bases.VirtualBases.insert(BaseDecl).second) {
	      // If this virtual base has been seen before, then the class is diamond
	      // shaped.
	      Flags |= ItaniumRTTIBuilder::VMI_DiamondShaped;
	    } else {
	      // First time as a virtual base, but already present non-virtually:
	      // that is a repeated base.
	      if (Bases.NonVirtualBases.count(BaseDecl))
	        Flags |= ItaniumRTTIBuilder::VMI_NonDiamondRepeat;
	    }
	  } else {
	    // Mark the non-virtual base as seen.
	    if (!Bases.NonVirtualBases.insert(BaseDecl).second) {
	      // If this non-virtual base has been seen before, then the class has non-
	      // diamond shaped repeated inheritance.
	      Flags |= ItaniumRTTIBuilder::VMI_NonDiamondRepeat;
	    } else {
	      // First time as a non-virtual base, but already present virtually:
	      // that is a repeated base as well.
	      if (Bases.VirtualBases.count(BaseDecl))
	        Flags |= ItaniumRTTIBuilder::VMI_NonDiamondRepeat;
	    }
	  }

	  // Walk all bases.
	  for (const auto &I : BaseDecl->bases())
	    Flags |= ComputeVMIClassTypeInfoFlags(&I, Bases);

	  return Flags;
	}

	/// Compute the flags member of abi::__vmi_class_type_info for \p RD by
	/// accumulating the flags contributed by each of its direct bases (each of
	/// which is walked recursively with a shared set of seen bases).
	static unsigned ComputeVMIClassTypeInfoFlags(const CXXRecordDecl *RD) {
	  unsigned Flags = 0;
	  SeenBases Bases;

	  // Walk all bases.
	  for (const auto &I : RD->bases())
	    Flags |= ComputeVMIClassTypeInfoFlags(&I, Bases);

	  return Flags;
	}

	/// BuildVMIClassTypeInfo - Build an abi::__vmi_class_type_info, used for
	/// classes with bases that do not satisfy the abi::__si_class_type_info
	/// constraints, according to the Itanium C++ ABI, 2.9.5p6c.
	///
	/// Appends, in order: the flags word, the direct base count, and then one
	/// (type_info pointer, offset/flags) pair per direct base.
	void ItaniumRTTIBuilder::BuildVMIClassTypeInfo(const CXXRecordDecl *RD) {
	  llvm::Type *UnsignedIntLTy =
	      CGM.getTypes().ConvertType(CGM.getContext().UnsignedIntTy);

	  // Itanium C++ ABI 2.9.5p6c:
	  //   __flags is a word with flags describing details about the class
	  //   structure, which may be referenced by using the __flags_masks
	  //   enumeration. These flags refer to both direct and indirect bases.
	  unsigned Flags = ComputeVMIClassTypeInfoFlags(RD);
	  Fields.push_back(llvm::ConstantInt::get(UnsignedIntLTy, Flags));

	  // Itanium C++ ABI 2.9.5p6c:
	  //   __base_count is a word with the number of direct proper base class
	  //   descriptions that follow.
	  Fields.push_back(llvm::ConstantInt::get(UnsignedIntLTy, RD->getNumBases()));

	  if (!RD->getNumBases())
	    return;

	  // Now add the base class descriptions.

	  // Itanium C++ ABI 2.9.5p6c:
	  //   __base_info[] is an array of base class descriptions -- one for every
	  //   direct proper base. Each description is of the type:
	  //
	  //   struct abi::__base_class_type_info {
	  //   public:
	  //     const __class_type_info *__base_type;
	  //     long __offset_flags;
	  //
	  //     enum __offset_flags_masks {
	  //       __virtual_mask = 0x1,
	  //       __public_mask = 0x2,
	  //       __offset_shift = 8
	  //     };
	  //   };

	  // If we're in mingw and 'long' isn't wide enough for a pointer, use 'long
	  // long' instead of 'long' for __offset_flags. libstdc++abi uses long long on
	  // LLP64 platforms.
	  // FIXME: Consider updating libc++abi to match, and extend this logic to all
	  // LLP64 platforms.
	  QualType OffsetFlagsTy = CGM.getContext().LongTy;
	  const TargetInfo &TI = CGM.getContext().getTargetInfo();
	  if (TI.getTriple().isOSCygMing() && TI.getPointerWidth(0) > TI.getLongWidth())
	    OffsetFlagsTy = CGM.getContext().LongLongTy;
	  llvm::Type *OffsetFlagsLTy =
	      CGM.getTypes().ConvertType(OffsetFlagsTy);

	  for (const auto &Base : RD->bases()) {
	    // The __base_type member points to the RTTI for the base type.
	    Fields.push_back(ItaniumRTTIBuilder(CXXABI).BuildTypeInfo(Base.getType()));

	    const CXXRecordDecl *BaseDecl =
	        cast<CXXRecordDecl>(Base.getType()->getAs<RecordType>()->getDecl());

	    // All but the lower 8 bits of __offset_flags are a signed offset.
	    // For a non-virtual base, this is the offset in the object of the base
	    // subobject. For a virtual base, this is the offset in the virtual table of
	    // the virtual base offset for the virtual base referenced (negative).
	    CharUnits Offset;
	    if (Base.isVirtual()) {
	      Offset =
	          CGM.getItaniumVTableContext().getVirtualBaseOffsetOffset(RD, BaseDecl);
	    } else {
	      const ASTRecordLayout &Layout = CGM.getContext().getASTRecordLayout(RD);
	      Offset = Layout.getBaseClassOffset(BaseDecl);
	    }

	    // Shift in the unsigned domain so that a negative offset does not
	    // left-shift a negative signed value.
	    int64_t OffsetFlags = uint64_t(Offset.getQuantity()) << 8;

	    // The low-order byte of __offset_flags contains flags, as given by the
	    // masks from the enumeration __offset_flags_masks.
	    if (Base.isVirtual())
	      OffsetFlags |= BCTI_Virtual;
	    if (Base.getAccessSpecifier() == AS_public)
	      OffsetFlags |= BCTI_Public;

	    Fields.push_back(llvm::ConstantInt::get(OffsetFlagsLTy, OffsetFlags));
	  }
	}

	/// Compute the flags for a __pbase_type_info, and remove the corresponding
	/// pieces from \p Type.
	///
	/// \param Ctx  the AST context, used to test for nothrow and to rebuild a
	///             function type without its exception specification.
	/// \param Type in/out: the pointee type. On return its cv/restrict
	///             qualifiers are stripped and, for a nothrow function
	///             prototype, the exception specification is replaced by
	///             EST_None.
	/// \returns a bitwise OR of the ItaniumRTTIBuilder::PTI_* flags that apply.
	static unsigned extractPBaseFlags(ASTContext &Ctx, QualType &Type) {
	  unsigned Flags = 0;

	  if (Type.isConstQualified())
	    Flags |= ItaniumRTTIBuilder::PTI_Const;
	  if (Type.isVolatileQualified())
	    Flags |= ItaniumRTTIBuilder::PTI_Volatile;
	  if (Type.isRestrictQualified())
	    Flags |= ItaniumRTTIBuilder::PTI_Restrict;
	  Type = Type.getUnqualifiedType();

	  // Itanium C++ ABI 2.9.5p7:
	  //   When the abi::__pbase_type_info is for a direct or indirect pointer to an
	  //   incomplete class type, the incomplete target type flag is set.
	  if (ContainsIncompleteClassType(Type))
	    Flags |= ItaniumRTTIBuilder::PTI_Incomplete;

	  if (auto *Proto = Type->getAs<FunctionProtoType>()) {
	    if (Proto->isNothrow(Ctx)) {
	      // Record noexcept in the flags and describe the pointee as the same
	      // function type without an exception specification.
	      Flags |= ItaniumRTTIBuilder::PTI_Noexcept;
	      Type = Ctx.getFunctionType(
	          Proto->getReturnType(), Proto->getParamTypes(),
	          Proto->getExtProtoInfo().withExceptionSpec(EST_None));
	    }
	  }

	  return Flags;
	}

	/// BuildPointerTypeInfo - Append the fields that an
	/// abi::__pointer_type_info adds for a pointer to \p PointeeTy: the flags
	/// word followed by the pointee's type_info.
	void ItaniumRTTIBuilder::BuildPointerTypeInfo(QualType PointeeTy) {
	  ASTContext &Ctx = CGM.getContext();

	  // Itanium C++ ABI 2.9.5p7:
	  //   __flags is a flag word describing the cv-qualification and other
	  //   attributes of the type pointed to
	  // Note that this also strips the described pieces off PointeeTy.
	  unsigned PointeeFlags = extractPBaseFlags(Ctx, PointeeTy);

	  llvm::Type *FlagsLTy = CGM.getTypes().ConvertType(Ctx.UnsignedIntTy);
	  Fields.push_back(llvm::ConstantInt::get(FlagsLTy, PointeeFlags));

	  // Itanium C++ ABI 2.9.5p7:
	  //  __pointee is a pointer to the std::type_info derivation for the
	  //  unqualified type being pointed to.
	  Fields.push_back(ItaniumRTTIBuilder(CXXABI).BuildTypeInfo(PointeeTy));
	}

	/// BuildPointerToMemberTypeInfo - Build an abi::__pointer_to_member_type_info
	/// struct, used for member pointer types.
	///
	/// Appends, in order: the flags word, the pointee's type_info, and the
	/// type_info of the class containing the member.
	void
	ItaniumRTTIBuilder::BuildPointerToMemberTypeInfo(const MemberPointerType *Ty) {
	  QualType PointeeTy = Ty->getPointeeType();

	  // Itanium C++ ABI 2.9.5p7:
	  //   __flags is a flag word describing the cv-qualification and other
	  //   attributes of the type pointed to.
	  // Note that this also strips the described pieces off PointeeTy.
	  unsigned Flags = extractPBaseFlags(CGM.getContext(), PointeeTy);

	  // Additionally record whether the containing class itself is incomplete.
	  const RecordType *ClassType = cast<RecordType>(Ty->getClass());
	  if (IsIncompleteClassType(ClassType))
	    Flags |= PTI_ContainingClassIncomplete;

	  llvm::Type *UnsignedIntLTy =
	      CGM.getTypes().ConvertType(CGM.getContext().UnsignedIntTy);
	  Fields.push_back(llvm::ConstantInt::get(UnsignedIntLTy, Flags));

	  // Itanium C++ ABI 2.9.5p7:
	  //   __pointee is a pointer to the std::type_info derivation for the
	  //   unqualified type being pointed to.
	  llvm::Constant *PointeeTypeInfo =
	      ItaniumRTTIBuilder(CXXABI).BuildTypeInfo(PointeeTy);
	  Fields.push_back(PointeeTypeInfo);

	  // Itanium C++ ABI 2.9.5p9:
	  //   __context is a pointer to an abi::__class_type_info corresponding to the
	  //   class type containing the member pointed to
	  //   (e.g., the "A" in "int A::*").
	  Fields.push_back(
	      ItaniumRTTIBuilder(CXXABI).BuildTypeInfo(QualType(ClassType, 0)));
	}

	/// Return the address of the RTTI descriptor for \p Ty, building it with a
	/// fresh ItaniumRTTIBuilder.
	llvm::Constant *ItaniumCXXABI::getAddrOfRTTIDescriptor(QualType Ty) {
	  ItaniumRTTIBuilder Builder(*this);
	  return Builder.BuildTypeInfo(Ty);
	}

	/// Force emission of the RTTI descriptors for \p Type, for "Type *" and for
	/// "const Type *".
	///
	/// \param Type      the fundamental type to emit descriptors for.
	/// \param DLLExport forwarded to BuildTypeInfo for each descriptor.
	void ItaniumCXXABI::EmitFundamentalRTTIDescriptor(QualType Type,
	                                                  bool DLLExport) {
	  QualType PointerType = getContext().getPointerType(Type);
	  QualType PointerTypeConst = getContext().getPointerType(Type.withConst());
	  // Each descriptor is built with its own builder, since a builder
	  // accumulates the fields of the descriptor it is building.
	  ItaniumRTTIBuilder(*this).BuildTypeInfo(Type, /*Force=*/true, DLLExport);
	  ItaniumRTTIBuilder(*this).BuildTypeInfo(PointerType, /*Force=*/true,
	                                          DLLExport);
	  ItaniumRTTIBuilder(*this).BuildTypeInfo(PointerTypeConst, /*Force=*/true,
	                                          DLLExport);
	}

	void ItaniumCXXABI::EmitFundamentalRTTIDescriptors(bool DLLExport) {
	// Types added here must also be added to TypeInfoIsInStandardLibrary.
	QualType FundamentalTypes[] = {
	getContext().VoidTy, getContext().NullPtrTy,
	getContext().BoolTy, getContext().WCharTy,
	getContext().CharTy, getContext().UnsignedCharTy,
	getContext().SignedCharTy, getContext().ShortTy,
	getContext().UnsignedShortTy, getContext().IntTy,
	getContext().UnsignedIntTy, getContext().LongTy,
	getContext().UnsignedLongTy, getContext().LongLongTy,
	getContext().UnsignedLongLongTy, getContext().Int128Ty,
	getContext().UnsignedInt128Ty, getContext().HalfTy,
	getContext().FloatTy, getContext().DoubleTy,
	getContext().LongDoubleTy, getContext().Float128Ty,
	getContext().Char16Ty, getContext().Char32Ty
	};
	for (const QualType &FundamentalType : FundamentalTypes)
	EmitFundamentalRTTIDescriptor(FundamentalType, DLLExport);
	}

	/// Decide which uniqueness rules apply to the RTTI emitted for the given
	/// type with the given linkage.
	///
	/// \param CanTy   the (canonical) type whose RTTI is being emitted.
	/// \param Linkage the linkage chosen for the RTTI globals.
	/// \returns RUK_Unique unless non-unique RTTI is both permitted and
	///          relevant; otherwise RUK_NonUniqueHidden for linkonce_odr and
	///          RUK_NonUniqueVisible for weak_odr.
	ItaniumCXXABI::RTTIUniquenessKind ItaniumCXXABI::classifyRTTIUniqueness(
	    QualType CanTy, llvm::GlobalValue::LinkageTypes Linkage) const {
	  if (shouldRTTIBeUnique())
	    return RUK_Unique;

	  const bool IsLinkOnceODR =
	      Linkage == llvm::GlobalValue::LinkOnceODRLinkage;
	  const bool IsWeakODR = Linkage == llvm::GlobalValue::WeakODRLinkage;

	  // It's only necessary for linkonce_odr or weak_odr linkage.
	  if (!IsLinkOnceODR && !IsWeakODR)
	    return RUK_Unique;

	  // It's only necessary with default visibility.
	  if (CanTy->getVisibility() != DefaultVisibility)
	    return RUK_Unique;

	  // If we're not required to publish this symbol, hide it.
	  if (IsLinkOnceODR)
	    return RUK_NonUniqueHidden;

	  // If we're required to publish this symbol, as we might be under an
	  // explicit instantiation, leave it with default visibility but
	  // enable string-comparisons.
	  assert(Linkage == llvm::GlobalValue::WeakODRLinkage);
	  return RUK_NonUniqueVisible;
	}

	// Find out how to codegen the complete destructor and constructor
	namespace {
	/// The strategies for emitting a complete constructor/destructor relative
	/// to its base variant.
	enum class StructorCodegen {
	  Emit,  ///< Emit the structor as a function of its own.
	  RAUW,  ///< Register the base variant as a replacement for the complete one.
	  Alias, ///< Emit the complete variant as an alias of the base variant.
	  COMDAT ///< As Alias, and place the emitted function in a named comdat.
	};
	} // end anonymous namespace
	/// Choose how the complete variant of the constructor or destructor \p MD
	/// should be emitted relative to its base variant.
	///
	/// \param CGM the module being emitted; consulted for codegen options,
	///            function linkage and the target's object format.
	/// \param MD  a constructor or destructor declaration.
	/// \returns the strategy to use; see StructorCodegen.
	static StructorCodegen getCodegenToUse(CodeGenModule &CGM,
	                                       const CXXMethodDecl *MD) {
	  if (!CGM.getCodeGenOpts().CXXCtorDtorAliases)
	    return StructorCodegen::Emit;

	  // The complete and base structors are not equivalent if there are any virtual
	  // bases, so emit separate functions.
	  if (MD->getParent()->getNumVBases())
	    return StructorCodegen::Emit;

	  // The remaining decisions depend on the linkage of the complete variant.
	  GlobalDecl AliasDecl;
	  if (const auto *DD = dyn_cast<CXXDestructorDecl>(MD)) {
	    AliasDecl = GlobalDecl(DD, Dtor_Complete);
	  } else {
	    const auto *CD = cast<CXXConstructorDecl>(MD);
	    AliasDecl = GlobalDecl(CD, Ctor_Complete);
	  }
	  llvm::GlobalValue::LinkageTypes Linkage = CGM.getFunctionLinkage(AliasDecl);

	  if (llvm::GlobalValue::isDiscardableIfUnused(Linkage))
	    return StructorCodegen::RAUW;

	  // FIXME: Should we allow available_externally aliases?
	  if (!llvm::GlobalAlias::isValidLinkage(Linkage))
	    return StructorCodegen::RAUW;

	  if (llvm::GlobalValue::isWeakForLinker(Linkage)) {
	    // Only ELF and wasm support COMDATs with arbitrary names (C5/D5).
	    if (CGM.getTarget().getTriple().isOSBinFormatELF() ||
	        CGM.getTarget().getTriple().isOSBinFormatWasm())
	      return StructorCodegen::COMDAT;
	    return StructorCodegen::Emit;
	  }

	  return StructorCodegen::Alias;
	}

	/// Emit \p AliasDecl as an llvm::GlobalAlias of \p TargetDecl, unless a
	/// definition with the alias' mangled name already exists in the module.
	///
	/// \param CGM        the module being emitted.
	/// \param AliasDecl  the structor variant to emit as an alias.
	/// \param TargetDecl the structor variant the alias points at.
	static void emitConstructorDestructorAlias(CodeGenModule &CGM,
	                                           GlobalDecl AliasDecl,
	                                           GlobalDecl TargetDecl) {
	  llvm::GlobalValue::LinkageTypes AliasLinkage =
	      CGM.getFunctionLinkage(AliasDecl);

	  // Nothing to do if the symbol has already been defined.
	  StringRef AliasName = CGM.getMangledName(AliasDecl);
	  llvm::GlobalValue *Existing = CGM.GetGlobalValue(AliasName);
	  if (Existing && !Existing->isDeclaration())
	    return;

	  auto *Target = cast<llvm::GlobalValue>(CGM.GetAddrOfGlobal(TargetDecl));

	  // Create the alias with no name.
	  auto *NewAlias = llvm::GlobalAlias::create(AliasLinkage, "", Target);

	  if (!Existing) {
	    NewAlias->setName(AliasName);
	  } else {
	    // Switch any previous uses to the alias, which inherits the name of
	    // the declaration it replaces.
	    assert(Existing->getType() == Target->getType() &&
	           "declaration exists with different type");
	    NewAlias->takeName(Existing);
	    Existing->replaceAllUsesWith(NewAlias);
	    Existing->eraseFromParent();
	  }

	  // Finally, set up the alias with its proper name and attributes.
	  CGM.setAliasAttributes(cast<NamedDecl>(AliasDecl.getDecl()), NewAlias);
	}

	/// Emit the \p Type variant of the constructor or destructor \p MD, using
	/// an alias or a registered replacement in place of a full function body
	/// where getCodegenToUse() allows it.
	///
	/// \param MD   a constructor or destructor declaration.
	/// \param Type which structor variant to emit.
	void ItaniumCXXABI::emitCXXStructor(const CXXMethodDecl *MD,
	                                    StructorType Type) {
	  // Exactly one of CD / DD is non-null.
	  auto *CD = dyn_cast<CXXConstructorDecl>(MD);
	  const CXXDestructorDecl *DD = CD ? nullptr : cast<CXXDestructorDecl>(MD);

	  StructorCodegen CGType = getCodegenToUse(CGM, MD);

	  if (Type == StructorType::Complete) {
	    GlobalDecl CompleteDecl;
	    GlobalDecl BaseDecl;
	    if (CD) {
	      CompleteDecl = GlobalDecl(CD, Ctor_Complete);
	      BaseDecl = GlobalDecl(CD, Ctor_Base);
	    } else {
	      CompleteDecl = GlobalDecl(DD, Dtor_Complete);
	      BaseDecl = GlobalDecl(DD, Dtor_Base);
	    }

	    // The complete variant becomes an alias of the base variant.
	    if (CGType == StructorCodegen::Alias || CGType == StructorCodegen::COMDAT) {
	      emitConstructorDestructorAlias(CGM, CompleteDecl, BaseDecl);
	      return;
	    }

	    // The complete variant is not emitted; the base variant is registered
	    // as the replacement for its mangled name.
	    if (CGType == StructorCodegen::RAUW) {
	      StringRef MangledName = CGM.getMangledName(CompleteDecl);
	      auto *Aliasee = CGM.GetAddrOfGlobal(BaseDecl);
	      CGM.addReplacement(MangledName, Aliasee);
	      return;
	    }
	  }

	  // The base destructor is equivalent to the base destructor of its
	  // base class if there is exactly one non-virtual base class with a
	  // non-trivial destructor, there are no fields with a non-trivial
	  // destructor, and the body of the destructor is trivial.
	  // NOTE(review): presumably TryEmitBaseDestructorAsAlias returns false when
	  // it did emit the alias -- confirm against its definition.
	  if (DD && Type == StructorType::Base && CGType != StructorCodegen::COMDAT &&
	      !CGM.TryEmitBaseDestructorAsAlias(DD))
	    return;

	  // FIXME: The deleting destructor is equivalent to the selected operator
	  // delete if:
	  //  * either the delete is a destroying operator delete or the destructor
	  //    would be trivial if it weren't virtual,
	  //  * the conversion from the 'this' parameter to the first parameter of the
	  //    destructor is equivalent to a bitcast,
	  //  * the destructor does not have an implicit "this" return, and
	  //  * the operator delete has the same calling convention and IR function type
	  //    as the destructor.
	  // In such cases we should try to emit the deleting dtor as an alias to the
	  // selected 'operator delete'.

	  llvm::Function *Fn = CGM.codegenCXXStructor(MD, Type);

	  if (CGType == StructorCodegen::COMDAT) {
	    // Place the function in a comdat named after the structor's dedicated
	    // comdat mangling.
	    SmallString<256> Buffer;
	    llvm::raw_svector_ostream Out(Buffer);
	    if (DD)
	      getMangleContext().mangleCXXDtorComdat(DD, Out);
	    else
	      getMangleContext().mangleCXXCtorComdat(CD, Out);
	    llvm::Comdat *C = CGM.getModule().getOrInsertComdat(Out.str());
	    Fn->setComdat(C);
	  } else {
	    CGM.maybeSetTrivialComdat(MD, Fn);
	  }
	}

	/// Return the runtime function __cxa_begin_catch, declared as taking an
	/// i8* and returning an i8*.
	static llvm::Constant *getBeginCatchFn(CodeGenModule &CGM) {
	  // void *__cxa_begin_catch(void*);
	  llvm::FunctionType *FTy = llvm::FunctionType::get(
	      CGM.Int8PtrTy, CGM.Int8PtrTy, /*IsVarArgs=*/false);

	  return CGM.CreateRuntimeFunction(FTy, "__cxa_begin_catch");
	}

	/// Return the runtime function __cxa_end_catch, declared as taking no
	/// arguments and returning void.
	static llvm::Constant *getEndCatchFn(CodeGenModule &CGM) {
	  // void __cxa_end_catch();
	  llvm::FunctionType *FTy =
	      llvm::FunctionType::get(CGM.VoidTy, /*IsVarArgs=*/false);

	  return CGM.CreateRuntimeFunction(FTy, "__cxa_end_catch");
	}

	/// Return the runtime function __cxa_get_exception_ptr, declared as taking
	/// an i8* and returning an i8*.
	static llvm::Constant *getGetExceptionPtrFn(CodeGenModule &CGM) {
	  // void *__cxa_get_exception_ptr(void*);
	  llvm::FunctionType *FTy = llvm::FunctionType::get(
	      CGM.Int8PtrTy, CGM.Int8PtrTy, /*IsVarArgs=*/false);

	  return CGM.CreateRuntimeFunction(FTy, "__cxa_get_exception_ptr");
	}

	namespace {
	/// Cleanup that emits the call to __cxa_end_catch.
	///
	/// The type being caught often proves that the thrown exception object
	/// has no destructor, in which case the call cannot throw:
	///   - A catch-all proves nothing, so the thrown exception must be
	///     conservatively assumed to have a destructor.
	///   - A catch by reference is judged by the referenced type.
	///   - A catch of a non-record type only matches exceptions of
	///     non-record type, and those never have destructors.
	///   - A catch of a record type may match any subclass of that type, so
	///     the thrown object's destructor may throw even when the caught
	///     type's own destructor is trivial or nothrow.
	struct CallEndCatch final : EHScopeStack::Cleanup {
	  CallEndCatch(bool MightThrow) : MightThrow(MightThrow) {}

	  /// Whether the emitted __cxa_end_catch call must be treated as
	  /// potentially throwing.
	  bool MightThrow;

	  void Emit(CodeGenFunction &CGF, Flags flags) override {
	    llvm::Constant *EndCatch = getEndCatchFn(CGF.CGM);
	    if (MightThrow)
	      CGF.EmitRuntimeCallOrInvoke(EndCatch);
	    else
	      CGF.EmitNounwindRuntimeCall(EndCatch);
	  }
	};
	} // end anonymous namespace

	/// Emit a nounwind call to __cxa_begin_catch on \p Exn and push a cleanup
	/// (for both normal and EH exits) that calls __cxa_end_catch.
	///
	/// \param CGF           the function being emitted.
	/// \param Exn           the exception pointer passed to __cxa_begin_catch.
	/// \param EndMightThrow true if __cxa_end_catch might throw.
	/// \returns the value produced by the __cxa_begin_catch call.
	static llvm::Value *CallBeginCatch(CodeGenFunction &CGF,
	                                   llvm::Value *Exn,
	                                   bool EndMightThrow) {
	  llvm::Constant *BeginCatch = getBeginCatchFn(CGF.CGM);
	  llvm::CallInst *BeginCatchResult =
	      CGF.EmitNounwindRuntimeCall(BeginCatch, Exn);

	  CGF.EHStack.pushCleanup<CallEndCatch>(NormalAndEHCleanup, EndMightThrow);

	  return BeginCatchResult;
	}

	/// A "special initializer" callback for initializing a catch
	/// parameter during catch initialization.
	static void InitCatchParam(CodeGenFunction &CGF,
	const VarDecl &CatchParam,
	Address ParamAddr,
	SourceLocation Loc) {
	// Load the exception from where the landing pad saved it.
	llvm::Value *Exn = CGF.getExceptionFromSlot();

	CanQualType CatchType =
	CGF.CGM.getContext().getCanonicalType(CatchParam.getType());
	llvm::Type *LLVMCatchTy = CGF.ConvertTypeForMem(CatchType);

	// If we're catching by reference, we can just cast the object
	// pointer to the appropriate pointer.
	if (isa<ReferenceType>(CatchType)) {
	QualType CaughtType = cast<ReferenceType>(CatchType)->getPointeeType();
	bool EndCatchMightThrow = CaughtType->isRecordType();

	// __cxa_begin_catch returns the adjusted object pointer.
	llvm::Value *AdjustedExn = CallBeginCatch(CGF, Exn, EndCatchMightThrow);

	// We have no way to tell the personality function that we're
	// catching by reference, so if we're catching a pointer,
	// __cxa_begin_catch will actually return that pointer by value.
	if (const PointerType *PT = dyn_cast<PointerType>(CaughtType)) {
	QualType PointeeType = PT->getPointeeType();

	// When catching by reference, generally we should just ignore
	// this by-value pointer and use the exception object instead.
	if (!PointeeType->isRecordType()) {

	// Exn points to the struct _Unwind_Exception header, which
	// we have to skip past in order to reach the exception data.
	unsigned HeaderSize =
	CGF.CGM.getTargetCodeGenInfo().getSizeOfUnwindException();
	AdjustedExn = CGF.Builder.CreateConstGEP1_32(Exn, HeaderSize);

	// However, if we're catching a pointer-to-record type that won't
	// work, because the personality function might have adjusted
	// the pointer. There's actually no way for us to fully satisfy
	// the language/ABI contract here: we can't use Exn because it
	// might have the wrong adjustment, but we can't use the by-value
	// pointer because it's off by a level of abstraction.
	//
	// The current solution is to dump the adjusted pointer into an
	// alloca, which breaks language semantics (because changing the
	// pointer doesn't change the exception) but at least works.
	// The better solution would be to filter out non-exact matches
	// and rethrow them, but this is tricky because the rethrow
	// really needs to be catchable by other sites at this landing
	// pad. The best solution is to fix the personality function.
	} else {
	// Pull the pointer for the reference type off.
	llvm::Type *PtrTy =
	cast<llvm::PointerType>(LLVMCatchTy)->getElementType();

	// Create the temporary and write the adjusted pointer into it.
	Address ExnPtrTmp =
	CGF.CreateTempAlloca(PtrTy, CGF.getPointerAlign(), "exn.byref.tmp");
	llvm::Value *Casted = CGF.Builder.CreateBitCast(AdjustedExn, PtrTy);
	CGF.Builder.CreateStore(Casted, ExnPtrTmp);

	// Bind the reference to the temporary.
	AdjustedExn = ExnPtrTmp.getPointer();
	}
	}

	llvm::Value *ExnCast =
	CGF.Builder.CreateBitCast(AdjustedExn, LLVMCatchTy, "exn.byref");
	CGF.Builder.CreateStore(ExnCast, ParamAddr);
	return;
	}

	// Scalars and complexes.
	TypeEvaluationKind TEK = CGF.getEvaluationKind(CatchType);
	if (TEK != TEK_Aggregate) {
	llvm::Value *AdjustedExn = CallBeginCatch(CGF, Exn, false);

	// If the catch type is a pointer type, __cxa_begin_catch returns
	// the pointer by value.
	if (CatchType->hasPointerRepresentation()) {
	llvm::Value *CastExn =
	CGF.Builder.CreateBitCast(AdjustedExn, LLVMCatchTy, "exn.casted");

	switch (CatchType.getQualifiers().getObjCLifetime()) {
	case Qualifiers::OCL_Strong:
	CastExn = CGF.EmitARCRetainNonBlock(CastExn);
	// fallthrough

	case Qualifiers::OCL_None:
	case Qualifiers::OCL_ExplicitNone:
	case Qualifiers::OCL_Autoreleasing:
	CGF.Builder.CreateStore(CastExn, ParamAddr);
	return;

	case Qualifiers::OCL_Weak:
	CGF.EmitARCInitWeak(ParamAddr, CastExn);
	return;
	}
	llvm_unreachable("bad ownership qualifier!");
	}

	// Otherwise, it returns a pointer into the exception object.

	llvm::Type *PtrTy = LLVMCatchTy->getPointerTo(0); // addrspace 0 ok
	llvm::Value *Cast = CGF.Builder.CreateBitCast(AdjustedExn, PtrTy);

	LValue srcLV = CGF.MakeNaturalAlignAddrLValue(Cast, CatchType);
	LValue destLV = CGF.MakeAddrLValue(ParamAddr, CatchType);
	switch (TEK) {
	case TEK_Complex:
	CGF.EmitStoreOfComplex(CGF.EmitLoadOfComplex(srcLV, Loc), destLV,
	/init/ true);
	return;
	case TEK_Scalar: {
	llvm::Value *ExnLoad = CGF.EmitLoadOfScalar(srcLV, Loc);
	CGF.EmitStoreOfScalar(ExnLoad, destLV, /init/ true);
	return;
	}
	case TEK_Aggregate:
	llvm_unreachable("evaluation kind filtered out!");
	}
	llvm_unreachable("bad evaluation kind");
	}

	assert(isa<RecordType>(CatchType) && "unexpected catch type!");
	auto catchRD = CatchType->getAsCXXRecordDecl();
	CharUnits caughtExnAlignment = CGF.CGM.getClassPointerAlignment(catchRD);

	llvm::Type *PtrTy = LLVMCatchTy->getPointerTo(0); // addrspace 0 ok

	// Check for a copy expression. If we don't have a copy expression,
	// that means a trivial copy is okay.
	const Expr *copyExpr = CatchParam.getInit();
	if (!copyExpr) {
	llvm::Value *rawAdjustedExn = CallBeginCatch(CGF, Exn, true);
	Address adjustedExn(CGF.Builder.CreateBitCast(rawAdjustedExn, PtrTy),
	caughtExnAlignment);
	CGF.EmitAggregateCopy(ParamAddr, adjustedExn, CatchType);
	return;
	}

	// We have to call __cxa_get_exception_ptr to get the adjusted
	// pointer before copying.
	llvm::CallInst *rawAdjustedExn =
	CGF.EmitNounwindRuntimeCall(getGetExceptionPtrFn(CGF.CGM), Exn);

	// Cast that to the appropriate type.
	Address adjustedExn(CGF.Builder.CreateBitCast(rawAdjustedExn, PtrTy),
	caughtExnAlignment);

	// The copy expression is defined in terms of an OpaqueValueExpr.
	// Find it and map it to the adjusted expression.
	CodeGenFunction::OpaqueValueMapping
	opaque(CGF, OpaqueValueExpr::findInCopyConstruct(copyExpr),
	CGF.MakeAddrLValue(adjustedExn, CatchParam.getType()));

	// Call the copy ctor in a terminate scope.
	CGF.EHStack.pushTerminate();

	// Perform the copy construction.
	CGF.EmitAggExpr(copyExpr,
	AggValueSlot::forAddr(ParamAddr, Qualifiers(),
	AggValueSlot::IsNotDestructed,
	AggValueSlot::DoesNotNeedGCBarriers,
	AggValueSlot::IsNotAliased));

	// Leave the terminate scope.
	CGF.EHStack.popTerminate();

	// Undo the opaque value mapping.
	opaque.pop();

	// Finally we can call __cxa_begin_catch.
	CallBeginCatch(CGF, Exn, true);
	}

	/// Emits the start of a catch handler: initializes the catch variable
	/// (when the handler declares one) and calls __cxa_begin_catch.
	void ItaniumCXXABI::emitBeginCatch(CodeGenFunction &CGF,
	                                   const CXXCatchStmt *S) {
	  // The ordering of cleanups needs great care here:
	  //   C++ [except.throw]p4:
	  //     The destruction [of the exception temporary] occurs
	  //     immediately after the destruction of the object declared in
	  //     the exception-declaration in the handler.
	  //
	  // The precise ordering is therefore:
	  //   1. Construct catch variable.
	  //   2. __cxa_begin_catch
	  //   3. Enter __cxa_end_catch cleanup
	  //   4. Enter dtor cleanup
	  //
	  // This is achieved with a slightly abnormal initialization process.
	  // Delegation sequence:
	  //   - ExitCXXTryStmt opens a RunCleanupsScope
	  //     - EmitAutoVarAlloca creates the variable and debug info
	  //       - InitCatchParam initializes the variable from the exception
	  //       - CallBeginCatch calls __cxa_begin_catch
	  //       - CallBeginCatch enters the __cxa_end_catch cleanup
	  //     - EmitAutoVarCleanups enters the variable destructor cleanup
	  //   - EmitCXXTryStmt emits the code for the catch body
	  //   - EmitCXXTryStmt closes the RunCleanupsScope

	  VarDecl *ExceptionDecl = S->getExceptionDecl();

	  if (ExceptionDecl) {
	    // Emit the local, initialize it from the exception, then register
	    // its cleanups.
	    CodeGenFunction::AutoVarEmission Emission =
	        CGF.EmitAutoVarAlloca(*ExceptionDecl);
	    InitCatchParam(CGF, *ExceptionDecl, Emission.getObjectAddress(CGF),
	                   S->getLocStart());
	    CGF.EmitAutoVarCleanups(Emission);
	    return;
	  }

	  // The handler declares no variable: there is nothing to initialize, so
	  // just begin the catch with the exception taken from the slot.
	  CallBeginCatch(CGF, CGF.getExceptionFromSlot(), true);
	}

	/// Get or define the following function:
	///   void @__clang_call_terminate(i8* %exn) nounwind noreturn
	/// This code is used only in C++.
	///
	/// The body is emitted only the first time, i.e. when the runtime function
	/// reference is an llvm::Function that is still empty. It calls
	/// __cxa_begin_catch on the exception pointer and then std::terminate().
	///
	/// \returns the constant referring to __clang_call_terminate.
	static llvm::Constant *getClangCallTerminateFn(CodeGenModule &CGM) {
	  llvm::FunctionType *fnTy =
	      llvm::FunctionType::get(CGM.VoidTy, CGM.Int8PtrTy, /*IsVarArgs=*/false);
	  llvm::Constant *fnRef = CGM.CreateRuntimeFunction(
	      fnTy, "__clang_call_terminate", llvm::AttributeList(), /*Local=*/true);

	  llvm::Function *fn = dyn_cast<llvm::Function>(fnRef);
	  if (fn && fn->empty()) {
	    fn->setDoesNotThrow();
	    fn->setDoesNotReturn();

	    // What we really want is to massively penalize inlining without
	    // forbidding it completely.  The difference between that and
	    // 'noinline' is negligible.
	    fn->addFnAttr(llvm::Attribute::NoInline);

	    // Allow this function to be shared across translation units, but
	    // we don't want it to turn into an exported symbol.
	    fn->setLinkage(llvm::Function::LinkOnceODRLinkage);
	    fn->setVisibility(llvm::Function::HiddenVisibility);
	    if (CGM.supportsCOMDAT())
	      fn->setComdat(CGM.getModule().getOrInsertComdat(fn->getName()));

	    // Set up the function.
	    llvm::BasicBlock *entry =
	        llvm::BasicBlock::Create(CGM.getLLVMContext(), "", fn);
	    CGBuilderTy builder(CGM, entry);

	    // Pull the exception pointer out of the parameter list.
	    llvm::Value *exn = &*fn->arg_begin();

	    // Call __cxa_begin_catch(exn).
	    llvm::CallInst *catchCall = builder.CreateCall(getBeginCatchFn(CGM), exn);
	    catchCall->setDoesNotThrow();
	    catchCall->setCallingConv(CGM.getRuntimeCC());

	    // Call std::terminate().
	    llvm::CallInst *termCall = builder.CreateCall(CGM.getTerminateFn());
	    termCall->setDoesNotThrow();
	    termCall->setDoesNotReturn();
	    termCall->setCallingConv(CGM.getRuntimeCC());

	    // std::terminate cannot return.
	    builder.CreateUnreachable();
	  }

	  return fnRef;
	}

	/// Emits a nounwind call that terminates the program. When \p Exn is
	/// non-null the call goes through __clang_call_terminate with \p Exn as its
	/// argument; otherwise the terminate function is called directly.
	llvm::CallInst *
	ItaniumCXXABI::emitTerminateForUnexpectedException(CodeGenFunction &CGF,
	                                                   llvm::Value *Exn) {
	  // No exception object: call the terminate function directly.
	  if (!Exn)
	    return CGF.EmitNounwindRuntimeCall(CGF.CGM.getTerminateFn());

	  // In C++, we want to call __cxa_begin_catch() before terminating.
	  assert(CGF.CGM.getLangOpts().CPlusPlus);
	  llvm::Constant *CallTerminateFn = getClangCallTerminateFn(CGF.CGM);
	  return CGF.EmitNounwindRuntimeCall(CallTerminateFn, Exn);
	}

	/// Loads the vtable pointer of the object at \p This.
	///
	/// \returns the value produced by CGF.GetVTablePtr (requested with type
	/// CGM.Int8PtrTy) paired with \p RD, which is passed through unchanged.
	std::pair<llvm::Value *, const CXXRecordDecl *>
	ItaniumCXXABI::LoadVTablePtr(CodeGenFunction &CGF, Address This,
	                             const CXXRecordDecl *RD) {
	  return {CGF.GetVTablePtr(This, CGM.Int8PtrTy, RD), RD};
	}
	Index: head/contrib/llvm/tools/clang/lib/Format/TokenAnnotator.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Format/TokenAnnotator.cpp (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/Format/TokenAnnotator.cpp (revision 329410)
	@@ -1,2910 +1,2913 @@
	//===--- TokenAnnotator.cpp - Format C++ code -----------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// \brief This file implements a token annotator, i.e. creates
	/// \c AnnotatedTokens out of \c FormatTokens with required extra information.
	///
	//===----------------------------------------------------------------------===//

	#include "TokenAnnotator.h"
	#include "clang/Basic/SourceManager.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/Support/Debug.h"

	#define DEBUG_TYPE "format-token-annotator"

	namespace clang {
	namespace format {

	namespace {

	/// \brief A parser that gathers additional information about tokens.
	///
	/// The \c TokenAnnotator tries to match parentheses and square brackets and
	/// store the parenthesis levels. It also tries to resolve matching "<" and ">"
	/// into template parameter lists.
	class AnnotatingParser {
	public:
	/// Creates a parser for \p Line. Parsing starts at the line's first token;
	/// an initial context, Context(tok::unknown, 1, /*IsExpression=*/false), is
	/// pushed and resetTokenMetadata is called on that first token.
	AnnotatingParser(const FormatStyle &Style, AnnotatedLine &Line,
	                 const AdditionalKeywords &Keywords)
	    : Style(Style), Line(Line), CurrentToken(Line.First), AutoFound(false),
	      Keywords(Keywords) {
	  Contexts.push_back(Context(tok::unknown, 1, /*IsExpression=*/false));
	  resetTokenMetadata(CurrentToken);
	}

	private:
	bool parseAngle() {
	if (!CurrentToken \|\| !CurrentToken->Previous)
	return false;
	if (NonTemplateLess.count(CurrentToken->Previous))
	return false;

	const FormatToken &Previous = *CurrentToken->Previous; // The '<'.
	if (Previous.Previous) {
	if (Previous.Previous->Tok.isLiteral())
	return false;
	if (Previous.Previous->is(tok::r_paren) && Contexts.size() > 1 &&
	(!Previous.Previous->MatchingParen \|\|
	!Previous.Previous->MatchingParen->is(TT_OverloadedOperatorLParen)))
	return false;
	}

	FormatToken *Left = CurrentToken->Previous;
	Left->ParentBracket = Contexts.back().ContextKind;
	ScopedContextCreator ContextCreator(*this, tok::less, 12);

	// If this angle is in the context of an expression, we need to be more
	// hesitant to detect it as opening template parameters.
	bool InExprContext = Contexts.back().IsExpression;

	Contexts.back().IsExpression = false;
	// If there's a template keyword before the opening angle bracket, this is a
	// template parameter, not an argument.
	Contexts.back().InTemplateArgument =
	Left->Previous && Left->Previous->Tok.isNot(tok::kw_template);

	if (Style.Language == FormatStyle::LK_Java &&
	CurrentToken->is(tok::question))
	next();

	while (CurrentToken) {
	if (CurrentToken->is(tok::greater)) {
	Left->MatchingParen = CurrentToken;
	CurrentToken->MatchingParen = Left;
	CurrentToken->Type = TT_TemplateCloser;
	next();
	return true;
	}
	if (CurrentToken->is(tok::question) &&
	Style.Language == FormatStyle::LK_Java) {
	next();
	continue;
	}
	if (CurrentToken->isOneOf(tok::r_paren, tok::r_square, tok::r_brace) \|\|
	(CurrentToken->isOneOf(tok::colon, tok::question) && InExprContext &&
	Style.Language != FormatStyle::LK_Proto &&
	Style.Language != FormatStyle::LK_TextProto))
	return false;
	// If a && or \|\| is found and interpreted as a binary operator, this set
	// of angles is likely part of something like "a < b && c > d". If the
	// angles are inside an expression, the \|\|/&& might also be a binary
	// operator that was misinterpreted because we are parsing template
	// parameters.
	// FIXME: This is getting out of hand, write a decent parser.
	if (CurrentToken->Previous->isOneOf(tok::pipepipe, tok::ampamp) &&
	CurrentToken->Previous->is(TT_BinaryOperator) &&
	Contexts[Contexts.size() - 2].IsExpression &&
	!Line.startsWith(tok::kw_template))
	return false;
	updateParameterCount(Left, CurrentToken);
	if (Style.Language == FormatStyle::LK_Proto) {
	if (FormatToken *Previous = CurrentToken->getPreviousNonComment()) {
	if (CurrentToken->is(tok::colon) \|\|
	(CurrentToken->isOneOf(tok::l_brace, tok::less) &&
	Previous->isNot(tok::colon)))
	Previous->Type = TT_SelectorName;
	}
	}
	if (!consumeToken())
	return false;
	}
	return false;
	}

	bool parseParens(bool LookForDecls = false) {
	if (!CurrentToken)
	return false;
	FormatToken *Left = CurrentToken->Previous;
	Left->ParentBracket = Contexts.back().ContextKind;
	ScopedContextCreator ContextCreator(*this, tok::l_paren, 1);

	// FIXME: This is a bit of a hack. Do better.
	Contexts.back().ColonIsForRangeExpr =
	Contexts.size() == 2 && Contexts[0].ColonIsForRangeExpr;

	bool StartsObjCMethodExpr = false;
	if (CurrentToken->is(tok::caret)) {
	// (^ can start a block type.
	Left->Type = TT_ObjCBlockLParen;
	} else if (FormatToken *MaybeSel = Left->Previous) {
	// @selector( starts a selector.
	if (MaybeSel->isObjCAtKeyword(tok::objc_selector) && MaybeSel->Previous &&
	MaybeSel->Previous->is(tok::at)) {
	StartsObjCMethodExpr = true;
	}
	}

	if (Left->is(TT_OverloadedOperatorLParen)) {
	Contexts.back().IsExpression = false;
	} else if (Style.Language == FormatStyle::LK_JavaScript &&
	(Line.startsWith(Keywords.kw_type, tok::identifier) \|\|
	Line.startsWith(tok::kw_export, Keywords.kw_type,
	tok::identifier))) {
	// type X = (...);
	// export type X = (...);
	Contexts.back().IsExpression = false;
	} else if (Left->Previous &&
	(Left->Previous->isOneOf(tok::kw_static_assert, tok::kw_decltype,
	tok::kw_if, tok::kw_while, tok::l_paren,
	tok::comma) \|\|
	Left->Previous->endsSequence(tok::kw_constexpr, tok::kw_if) \|\|
	Left->Previous->is(TT_BinaryOperator))) {
	// static_assert, if and while usually contain expressions.
	Contexts.back().IsExpression = true;
	} else if (Style.Language == FormatStyle::LK_JavaScript && Left->Previous &&
	(Left->Previous->is(Keywords.kw_function) \|\|
	(Left->Previous->endsSequence(tok::identifier,
	Keywords.kw_function)))) {
	// function(...) or function f(...)
	Contexts.back().IsExpression = false;
	} else if (Style.Language == FormatStyle::LK_JavaScript && Left->Previous &&
	Left->Previous->is(TT_JsTypeColon)) {
	// let x: (SomeType);
	Contexts.back().IsExpression = false;
	} else if (Left->Previous && Left->Previous->is(tok::r_square) &&
	Left->Previous->MatchingParen &&
	Left->Previous->MatchingParen->is(TT_LambdaLSquare)) {
	// This is a parameter list of a lambda expression.
	Contexts.back().IsExpression = false;
	} else if (Line.InPPDirective &&
	(!Left->Previous \|\| !Left->Previous->is(tok::identifier))) {
	Contexts.back().IsExpression = true;
	} else if (Contexts[Contexts.size() - 2].CaretFound) {
	// This is the parameter list of an ObjC block.
	Contexts.back().IsExpression = false;
	} else if (Left->Previous && Left->Previous->is(tok::kw___attribute)) {
	Left->Type = TT_AttributeParen;
	} else if (Left->Previous && Left->Previous->is(TT_ForEachMacro)) {
	// The first argument to a foreach macro is a declaration.
	Contexts.back().IsForEachMacro = true;
	Contexts.back().IsExpression = false;
	} else if (Left->Previous && Left->Previous->MatchingParen &&
	Left->Previous->MatchingParen->is(TT_ObjCBlockLParen)) {
	Contexts.back().IsExpression = false;
	} else if (!Line.MustBeDeclaration && !Line.InPPDirective) {
	bool IsForOrCatch =
	Left->Previous && Left->Previous->isOneOf(tok::kw_for, tok::kw_catch);
	Contexts.back().IsExpression = !IsForOrCatch;
	}

	if (StartsObjCMethodExpr) {
	Contexts.back().ColonIsObjCMethodExpr = true;
	Left->Type = TT_ObjCMethodExpr;
	}

	bool MightBeFunctionType = !Contexts[Contexts.size() - 2].IsExpression;
	bool ProbablyFunctionType = CurrentToken->isOneOf(tok::star, tok::amp);
	bool HasMultipleLines = false;
	bool HasMultipleParametersOnALine = false;
	bool MightBeObjCForRangeLoop =
	Left->Previous && Left->Previous->is(tok::kw_for);
	while (CurrentToken) {
	// LookForDecls is set when "if (" has been seen. Check for
	// 'identifier' '*' 'identifier' followed by not '=' -- this
	// '*' has to be a binary operator but determineStarAmpUsage() will
	// categorize it as an unary operator, so set the right type here.
	if (LookForDecls && CurrentToken->Next) {
	FormatToken *Prev = CurrentToken->getPreviousNonComment();
	if (Prev) {
	FormatToken *PrevPrev = Prev->getPreviousNonComment();
	FormatToken *Next = CurrentToken->Next;
	if (PrevPrev && PrevPrev->is(tok::identifier) &&
	Prev->isOneOf(tok::star, tok::amp, tok::ampamp) &&
	CurrentToken->is(tok::identifier) && Next->isNot(tok::equal)) {
	Prev->Type = TT_BinaryOperator;
	LookForDecls = false;
	}
	}
	}

	if (CurrentToken->Previous->is(TT_PointerOrReference) &&
	CurrentToken->Previous->Previous->isOneOf(tok::l_paren,
	tok::coloncolon))
	ProbablyFunctionType = true;
	if (CurrentToken->is(tok::comma))
	MightBeFunctionType = false;
	if (CurrentToken->Previous->is(TT_BinaryOperator))
	Contexts.back().IsExpression = true;
	if (CurrentToken->is(tok::r_paren)) {
	if (MightBeFunctionType && ProbablyFunctionType && CurrentToken->Next &&
	(CurrentToken->Next->is(tok::l_paren) \|\|
	(CurrentToken->Next->is(tok::l_square) && Line.MustBeDeclaration)))
	Left->Type = TT_FunctionTypeLParen;
	Left->MatchingParen = CurrentToken;
	CurrentToken->MatchingParen = Left;

	if (CurrentToken->Next && CurrentToken->Next->is(tok::l_brace) &&
	Left->Previous && Left->Previous->is(tok::l_paren)) {
	// Detect the case where macros are used to generate lambdas or
	// function bodies, e.g.:
	// auto my_lambda = MARCO((Type *type, int i) { .. body .. });
	for (FormatToken *Tok = Left; Tok != CurrentToken; Tok = Tok->Next) {
	if (Tok->is(TT_BinaryOperator) &&
	Tok->isOneOf(tok::star, tok::amp, tok::ampamp))
	Tok->Type = TT_PointerOrReference;
	}
	}

	if (StartsObjCMethodExpr) {
	CurrentToken->Type = TT_ObjCMethodExpr;
	if (Contexts.back().FirstObjCSelectorName) {
	Contexts.back().FirstObjCSelectorName->LongestObjCSelectorName =
	Contexts.back().LongestObjCSelectorName;
	}
	}

	if (Left->is(TT_AttributeParen))
	CurrentToken->Type = TT_AttributeParen;
	if (Left->Previous && Left->Previous->is(TT_JavaAnnotation))
	CurrentToken->Type = TT_JavaAnnotation;
	if (Left->Previous && Left->Previous->is(TT_LeadingJavaAnnotation))
	CurrentToken->Type = TT_LeadingJavaAnnotation;

	if (!HasMultipleLines)
	Left->PackingKind = PPK_Inconclusive;
	else if (HasMultipleParametersOnALine)
	Left->PackingKind = PPK_BinPacked;
	else
	Left->PackingKind = PPK_OnePerLine;

	next();
	return true;
	}
	if (CurrentToken->isOneOf(tok::r_square, tok::r_brace))
	return false;

	if (CurrentToken->is(tok::l_brace))
	Left->Type = TT_Unknown; // Not TT_ObjCBlockLParen
	if (CurrentToken->is(tok::comma) && CurrentToken->Next &&
	!CurrentToken->Next->HasUnescapedNewline &&
	!CurrentToken->Next->isTrailingComment())
	HasMultipleParametersOnALine = true;
	if ((CurrentToken->Previous->isOneOf(tok::kw_const, tok::kw_auto) \|\|
	CurrentToken->Previous->isSimpleTypeSpecifier()) &&
	!CurrentToken->is(tok::l_brace))
	Contexts.back().IsExpression = false;
	if (CurrentToken->isOneOf(tok::semi, tok::colon))
	MightBeObjCForRangeLoop = false;
	if (MightBeObjCForRangeLoop && CurrentToken->is(Keywords.kw_in))
	CurrentToken->Type = TT_ObjCForIn;
	// When we discover a 'new', we set CanBeExpression to 'false' in order to
	// parse the type correctly. Reset that after a comma.
	if (CurrentToken->is(tok::comma))
	Contexts.back().CanBeExpression = true;

	FormatToken *Tok = CurrentToken;
	if (!consumeToken())
	return false;
	updateParameterCount(Left, Tok);
	if (CurrentToken && CurrentToken->HasUnescapedNewline)
	HasMultipleLines = true;
	}
	return false;
	}

	bool parseSquare() {
	if (!CurrentToken)
	return false;

	// A '[' could be an index subscript (after an identifier or after
	// ')' or ']'), it could be the start of an Objective-C method
	// expression, or it could the start of an Objective-C array literal.
	FormatToken *Left = CurrentToken->Previous;
	Left->ParentBracket = Contexts.back().ContextKind;
	FormatToken *Parent = Left->getPreviousNonComment();

	// Cases where '>' is followed by '['.
	// In C++, this can happen either in array of templates (foo<int>[10])
	// or when array is a nested template type (unique_ptr<type1<type2>[]>).
	bool CppArrayTemplates =
	Style.isCpp() && Parent && Parent->is(TT_TemplateCloser) &&
	(Contexts.back().CanBeExpression \|\| Contexts.back().IsExpression \|\|
	Contexts.back().InTemplateArgument);

	bool StartsObjCMethodExpr =
	!CppArrayTemplates && Style.isCpp() &&
	Contexts.back().CanBeExpression && Left->isNot(TT_LambdaLSquare) &&
	CurrentToken->isNot(tok::l_brace) &&
	(!Parent \|\|
	Parent->isOneOf(tok::colon, tok::l_square, tok::l_paren,
	tok::kw_return, tok::kw_throw) \|\|
	Parent->isUnaryOperator() \|\|
	Parent->isOneOf(TT_ObjCForIn, TT_CastRParen) \|\|
	getBinOpPrecedence(Parent->Tok.getKind(), true, true) > prec::Unknown);
	bool ColonFound = false;

	unsigned BindingIncrease = 1;
	if (Left->isCppStructuredBinding(Style)) {
	Left->Type = TT_StructuredBindingLSquare;
	} else if (Left->is(TT_Unknown)) {
	if (StartsObjCMethodExpr) {
	Left->Type = TT_ObjCMethodExpr;
	} else if (Style.Language == FormatStyle::LK_JavaScript && Parent &&
	Contexts.back().ContextKind == tok::l_brace &&
	Parent->isOneOf(tok::l_brace, tok::comma)) {
	Left->Type = TT_JsComputedPropertyName;
	} else if (Style.isCpp() && Contexts.back().ContextKind == tok::l_brace &&
	Parent && Parent->isOneOf(tok::l_brace, tok::comma)) {
	Left->Type = TT_DesignatedInitializerLSquare;
	} else if (CurrentToken->is(tok::r_square) && Parent &&
	Parent->is(TT_TemplateCloser)) {
	Left->Type = TT_ArraySubscriptLSquare;
	} else if (Style.Language == FormatStyle::LK_Proto \|\|
	(!CppArrayTemplates && Parent &&
	Parent->isOneOf(TT_BinaryOperator, TT_TemplateCloser, tok::at,
	tok::comma, tok::l_paren, tok::l_square,
	tok::question, tok::colon, tok::kw_return,
	// Should only be relevant to JavaScript:
	tok::kw_default))) {
	Left->Type = TT_ArrayInitializerLSquare;
	} else {
	BindingIncrease = 10;
	Left->Type = TT_ArraySubscriptLSquare;
	}
	}

	ScopedContextCreator ContextCreator(*this, tok::l_square, BindingIncrease);
	Contexts.back().IsExpression = true;
	if (Style.Language == FormatStyle::LK_JavaScript && Parent &&
	Parent->is(TT_JsTypeColon))
	Contexts.back().IsExpression = false;

	Contexts.back().ColonIsObjCMethodExpr = StartsObjCMethodExpr;

	while (CurrentToken) {
	if (CurrentToken->is(tok::r_square)) {
	if (CurrentToken->Next && CurrentToken->Next->is(tok::l_paren) &&
	Left->is(TT_ObjCMethodExpr)) {
	// An ObjC method call is rarely followed by an open parenthesis.
	// FIXME: Do we incorrectly label ":" with this?
	StartsObjCMethodExpr = false;
	Left->Type = TT_Unknown;
	}
	if (StartsObjCMethodExpr && CurrentToken->Previous != Left) {
	CurrentToken->Type = TT_ObjCMethodExpr;
	// determineStarAmpUsage() thinks that '*' '[' is allocating an
	// array of pointers, but if '[' starts a selector then '*' is a
	// binary operator.
	if (Parent && Parent->is(TT_PointerOrReference))
	Parent->Type = TT_BinaryOperator;
	}
	Left->MatchingParen = CurrentToken;
	CurrentToken->MatchingParen = Left;
	if (Contexts.back().FirstObjCSelectorName) {
	Contexts.back().FirstObjCSelectorName->LongestObjCSelectorName =
	Contexts.back().LongestObjCSelectorName;
	if (Left->BlockParameterCount > 1)
	Contexts.back().FirstObjCSelectorName->LongestObjCSelectorName = 0;
	}
	next();
	return true;
	}
	if (CurrentToken->isOneOf(tok::r_paren, tok::r_brace))
	return false;
	if (CurrentToken->is(tok::colon)) {
	if (Left->isOneOf(TT_ArraySubscriptLSquare,
	TT_DesignatedInitializerLSquare)) {
	Left->Type = TT_ObjCMethodExpr;
	StartsObjCMethodExpr = true;
	Contexts.back().ColonIsObjCMethodExpr = true;
	if (Parent && Parent->is(tok::r_paren))
	Parent->Type = TT_CastRParen;
	}
	ColonFound = true;
	}
	if (CurrentToken->is(tok::comma) && Left->is(TT_ObjCMethodExpr) &&
	!ColonFound)
	Left->Type = TT_ArrayInitializerLSquare;
	FormatToken *Tok = CurrentToken;
	if (!consumeToken())
	return false;
	updateParameterCount(Left, Tok);
	}
	return false;
	}

	/// Parses a brace-enclosed token sequence. When there is a current token,
	/// it is the token right after the '{'.
	///
	/// Marks the '{' as an ObjC block brace when a caret was seen in the
	/// enclosing context, tags selector names in front of ':', '{' and '<'
	/// where applicable, and marks the '{' as TT_DictLiteral when a ':' is
	/// found (or, for JavaScript, also on '{', '<' or ',').
	///
	/// \returns false if a ')' or ']' is met before the matching '}' or if a
	/// nested consumeToken() fails. Returns true if the matching '}' was found
	/// and consumed, and also when there is no current token on entry or the
	/// tokens run out before a '}' is seen.
	bool parseBrace() {
	  if (CurrentToken) {
	    FormatToken *Left = CurrentToken->Previous;
	    Left->ParentBracket = Contexts.back().ContextKind;

	    if (Contexts.back().CaretFound)
	      Left->Type = TT_ObjCBlockLBrace;
	    Contexts.back().CaretFound = false;

	    ScopedContextCreator ContextCreator(*this, tok::l_brace, 1);
	    Contexts.back().ColonIsDictLiteral = true;
	    if (Left->BlockKind == BK_BracedInit)
	      Contexts.back().IsExpression = true;
	    if (Style.Language == FormatStyle::LK_JavaScript && Left->Previous &&
	        Left->Previous->is(TT_JsTypeColon))
	      Contexts.back().IsExpression = false;

	    while (CurrentToken) {
	      if (CurrentToken->is(tok::r_brace)) {
	        Left->MatchingParen = CurrentToken;
	        CurrentToken->MatchingParen = Left;
	        next();
	        return true;
	      }
	      if (CurrentToken->isOneOf(tok::r_paren, tok::r_square))
	        return false;
	      updateParameterCount(Left, CurrentToken);
	      if (CurrentToken->isOneOf(tok::colon, tok::l_brace, tok::less)) {
	        FormatToken *Previous = CurrentToken->getPreviousNonComment();
	        // Skip over a JS optional-type '?' to reach the name before it.
	        if (Previous->is(TT_JsTypeOptionalQuestion))
	          Previous = Previous->getPreviousNonComment();
	        if (((CurrentToken->is(tok::colon) &&
	              (!Contexts.back().ColonIsDictLiteral || !Style.isCpp())) ||
	             Style.Language == FormatStyle::LK_Proto ||
	             Style.Language == FormatStyle::LK_TextProto) &&
	            (Previous->Tok.getIdentifierInfo() ||
	             Previous->is(tok::string_literal)))
	          Previous->Type = TT_SelectorName;
	        if (CurrentToken->is(tok::colon) ||
	            Style.Language == FormatStyle::LK_JavaScript)
	          Left->Type = TT_DictLiteral;
	      }
	      if (CurrentToken->is(tok::comma) &&
	          Style.Language == FormatStyle::LK_JavaScript)
	        Left->Type = TT_DictLiteral;
	      if (!consumeToken())
	        return false;
	    }
	  }
	  return true;
	}

	void updateParameterCount(FormatToken Left, FormatToken Current) {
	if (Current->is(tok::l_brace) && Current->BlockKind == BK_Block)
	++Left->BlockParameterCount;
	if (Current->is(tok::comma)) {
	++Left->ParameterCount;
	if (!Left->Role)
	Left->Role.reset(new CommaSeparatedList(Style));
	Left->Role->CommaFound(Current);
	} else if (Left->ParameterCount == 0 && Current->isNot(tok::comment)) {
	Left->ParameterCount = 1;
	}
	}

	/// Consumes tokens up to and including the next ':' seen at this level and
	/// types that ':' as TT_ConditionalExpr.
	///
	/// \returns true if the ':' was found; false if consumeToken() fails or
	/// the tokens run out first.
	bool parseConditional() {
	  // Consume everything in front of the ':'.
	  while (CurrentToken && !CurrentToken->is(tok::colon)) {
	    if (!consumeToken())
	      return false;
	  }

	  // Ran out of tokens without seeing a ':'.
	  if (!CurrentToken)
	    return false;

	  CurrentToken->Type = TT_ConditionalExpr;
	  next();
	  return true;
	}

	bool parseTemplateDeclaration() {
	if (CurrentToken && CurrentToken->is(tok::less)) {
	CurrentToken->Type = TT_TemplateOpener;
	next();
	if (!parseAngle())
	return false;
	if (CurrentToken)
	CurrentToken->Previous->ClosesTemplateDeclaration = true;
	return true;
	}
	return false;
	}

	bool consumeToken() {
	FormatToken *Tok = CurrentToken;
	next();
	switch (Tok->Tok.getKind()) {
	case tok::plus:
	case tok::minus:
	if (!Tok->Previous && Line.MustBeDeclaration)
	Tok->Type = TT_ObjCMethodSpecifier;
	break;
	case tok::colon:
	if (!Tok->Previous)
	return false;
	// Colons from ?: are handled in parseConditional().
	if (Style.Language == FormatStyle::LK_JavaScript) {
	if (Contexts.back().ColonIsForRangeExpr \|\| // colon in for loop
	(Contexts.size() == 1 && // switch/case labels
	!Line.First->isOneOf(tok::kw_enum, tok::kw_case)) \|\|
	Contexts.back().ContextKind == tok::l_paren \|\| // function params
	Contexts.back().ContextKind == tok::l_square \|\| // array type
	(!Contexts.back().IsExpression &&
	Contexts.back().ContextKind == tok::l_brace) \|\| // object type
	(Contexts.size() == 1 &&
	Line.MustBeDeclaration)) { // method/property declaration
	Contexts.back().IsExpression = false;
	Tok->Type = TT_JsTypeColon;
	break;
	}
	}
	if (Contexts.back().ColonIsDictLiteral \|\|
	Style.Language == FormatStyle::LK_Proto \|\|
	Style.Language == FormatStyle::LK_TextProto) {
	Tok->Type = TT_DictLiteral;
	if (Style.Language == FormatStyle::LK_TextProto) {
	if (FormatToken *Previous = Tok->getPreviousNonComment())
	Previous->Type = TT_SelectorName;
	}
	} else if (Contexts.back().ColonIsObjCMethodExpr \|\|
	Line.startsWith(TT_ObjCMethodSpecifier)) {
	Tok->Type = TT_ObjCMethodExpr;
	const FormatToken *BeforePrevious = Tok->Previous->Previous;
	if (!BeforePrevious \|\|
	!(BeforePrevious->is(TT_CastRParen) \|\|
	(BeforePrevious->is(TT_ObjCMethodExpr) &&
	BeforePrevious->is(tok::colon))) \|\|
	BeforePrevious->is(tok::r_square) \|\|
	Contexts.back().LongestObjCSelectorName == 0) {
	Tok->Previous->Type = TT_SelectorName;
	if (Tok->Previous->ColumnWidth >
	Contexts.back().LongestObjCSelectorName)
	Contexts.back().LongestObjCSelectorName =
	Tok->Previous->ColumnWidth;
	if (!Contexts.back().FirstObjCSelectorName)
	Contexts.back().FirstObjCSelectorName = Tok->Previous;
	}
	} else if (Contexts.back().ColonIsForRangeExpr) {
	Tok->Type = TT_RangeBasedForLoopColon;
	} else if (CurrentToken && CurrentToken->is(tok::numeric_constant)) {
	Tok->Type = TT_BitFieldColon;
	} else if (Contexts.size() == 1 &&
	!Line.First->isOneOf(tok::kw_enum, tok::kw_case)) {
	if (Tok->getPreviousNonComment()->isOneOf(tok::r_paren,
	tok::kw_noexcept))
	Tok->Type = TT_CtorInitializerColon;
	else
	Tok->Type = TT_InheritanceColon;
	} else if (Tok->Previous->is(tok::identifier) && Tok->Next &&
	Tok->Next->isOneOf(tok::r_paren, tok::comma)) {
	// This handles a special macro in ObjC code where selectors including
	// the colon are passed as macro arguments.
	Tok->Type = TT_ObjCMethodExpr;
	} else if (Contexts.back().ContextKind == tok::l_paren) {
	Tok->Type = TT_InlineASMColon;
	}
	break;
	case tok::pipe:
	case tok::amp:
	// \| and & in declarations/type expressions represent union and
	// intersection types, respectively.
	if (Style.Language == FormatStyle::LK_JavaScript &&
	!Contexts.back().IsExpression)
	Tok->Type = TT_JsTypeOperator;
	break;
	case tok::kw_if:
	case tok::kw_while:
	if (Tok->is(tok::kw_if) && CurrentToken &&
	CurrentToken->is(tok::kw_constexpr))
	next();
	if (CurrentToken && CurrentToken->is(tok::l_paren)) {
	next();
	if (!parseParens(/LookForDecls=/true))
	return false;
	}
	break;
	case tok::kw_for:
	if (Style.Language == FormatStyle::LK_JavaScript) {
	// x.for and {for: ...}
	if ((Tok->Previous && Tok->Previous->is(tok::period)) \|\|
	(Tok->Next && Tok->Next->is(tok::colon)))
	break;
	// JS' for await ( ...
	if (CurrentToken && CurrentToken->is(Keywords.kw_await))
	next();
	}
	Contexts.back().ColonIsForRangeExpr = true;
	next();
	if (!parseParens())
	return false;
	break;
	case tok::l_paren:
	// When faced with 'operator()()', the kw_operator handler incorrectly
	// marks the first l_paren as a OverloadedOperatorLParen. Here, we make
	// the first two parens OverloadedOperators and the second l_paren an
	// OverloadedOperatorLParen.
	if (Tok->Previous && Tok->Previous->is(tok::r_paren) &&
	Tok->Previous->MatchingParen &&
	Tok->Previous->MatchingParen->is(TT_OverloadedOperatorLParen)) {
	Tok->Previous->Type = TT_OverloadedOperator;
	Tok->Previous->MatchingParen->Type = TT_OverloadedOperator;
	Tok->Type = TT_OverloadedOperatorLParen;
	}

	if (!parseParens())
	return false;
	if (Line.MustBeDeclaration && Contexts.size() == 1 &&
	!Contexts.back().IsExpression && !Line.startsWith(TT_ObjCProperty) &&
	(!Tok->Previous \|\|
	!Tok->Previous->isOneOf(tok::kw_decltype, tok::kw___attribute,
	TT_LeadingJavaAnnotation)))
	Line.MightBeFunctionDecl = true;
	break;
	case tok::l_square:
	if (!parseSquare())
	return false;
	break;
	case tok::l_brace:
	if (Style.Language == FormatStyle::LK_TextProto) {
	FormatToken *Previous = Tok->getPreviousNonComment();
	if (Previous && Previous->Type != TT_DictLiteral)
	Previous->Type = TT_SelectorName;
	}
	if (!parseBrace())
	return false;
	break;
	case tok::less:
	if (parseAngle()) {
	Tok->Type = TT_TemplateOpener;
	if (Style.Language == FormatStyle::LK_TextProto) {
	FormatToken *Previous = Tok->getPreviousNonComment();
	if (Previous && Previous->Type != TT_DictLiteral)
	Previous->Type = TT_SelectorName;
	}
	} else {
	Tok->Type = TT_BinaryOperator;
	NonTemplateLess.insert(Tok);
	CurrentToken = Tok;
	next();
	}
	break;
	case tok::r_paren:
	case tok::r_square:
	return false;
	case tok::r_brace:
	// Lines can start with '}'.
	if (Tok->Previous)
	return false;
	break;
	case tok::greater:
	Tok->Type = TT_BinaryOperator;
	break;
	case tok::kw_operator:
	while (CurrentToken &&
	!CurrentToken->isOneOf(tok::l_paren, tok::semi, tok::r_paren)) {
	if (CurrentToken->isOneOf(tok::star, tok::amp))
	CurrentToken->Type = TT_PointerOrReference;
	consumeToken();
	if (CurrentToken &&
	CurrentToken->Previous->isOneOf(TT_BinaryOperator, TT_UnaryOperator,
	tok::comma))
	CurrentToken->Previous->Type = TT_OverloadedOperator;
	}
	if (CurrentToken) {
	CurrentToken->Type = TT_OverloadedOperatorLParen;
	if (CurrentToken->Previous->is(TT_BinaryOperator))
	CurrentToken->Previous->Type = TT_OverloadedOperator;
	}
	break;
	case tok::question:
	if (Style.Language == FormatStyle::LK_JavaScript && Tok->Next &&
	Tok->Next->isOneOf(tok::semi, tok::comma, tok::colon, tok::r_paren,
	tok::r_brace)) {
	// Question marks before semicolons, colons, etc. indicate optional
	// types (fields, parameters), e.g.
	// function(x?: string, y?) {...}
	// class X { y?; }
	Tok->Type = TT_JsTypeOptionalQuestion;
	break;
	}
	// Declarations cannot be conditional expressions, this can only be part
	// of a type declaration.
	if (Line.MustBeDeclaration && !Contexts.back().IsExpression &&
	Style.Language == FormatStyle::LK_JavaScript)
	break;
	parseConditional();
	break;
	case tok::kw_template:
	parseTemplateDeclaration();
	break;
	case tok::comma:
	if (Contexts.back().InCtorInitializer)
	Tok->Type = TT_CtorInitializerComma;
	else if (Contexts.back().InInheritanceList)
	Tok->Type = TT_InheritanceComma;
	else if (Contexts.back().FirstStartOfName &&
	(Contexts.size() == 1 \|\| Line.startsWith(tok::kw_for))) {
	Contexts.back().FirstStartOfName->PartOfMultiVariableDeclStmt = true;
	Line.IsMultiVariableDeclStmt = true;
	}
	if (Contexts.back().IsForEachMacro)
	Contexts.back().IsExpression = true;
	break;
	case tok::identifier:
	if (Tok->isOneOf(Keywords.kw___has_include,
	Keywords.kw___has_include_next)) {
	parseHasInclude();
	}
	break;
	default:
	break;
	}
	return true;
	}

	/// \brief Annotates the target of an angle-bracket include (`<...>`).
	///
	/// Does nothing unless the current token is '<'. Otherwise every remaining
	/// token on the line that is neither a comment token nor text starting with
	/// "//" is marked as TT_ImplicitStringLiteral, and the whole rest of the line
	/// is consumed.
	void parseIncludeDirective() {
	  if (!CurrentToken || !CurrentToken->is(tok::less))
	    return;

	  // Step over the '<' first, then walk to the end of the line.
	  for (next(); CurrentToken; next()) {
	    // Trailing line comments keep whatever type they already have.
	    const bool LooksLikeComment = CurrentToken->is(tok::comment) ||
	                                  CurrentToken->TokenText.startswith("//");
	    if (!LooksLikeComment)
	      CurrentToken->Type = TT_ImplicitStringLiteral;
	  }
	}

	/// \brief Annotates the message of a `#warning` / `#error` directive.
	///
	/// Skips two tokens and then marks every remaining token on the line as
	/// TT_ImplicitStringLiteral.
	void parseWarningOrError() {
	  // The first skip steps over the directive keyword the caller stopped on.
	  next();
	  // The second skip leaves the first message token unmarked, so that the
	  // whitespace to its left is still formatted.
	  next();
	  for (; CurrentToken; next())
	    CurrentToken->Type = TT_ImplicitStringLiteral;
	}

	/// \brief Annotates a `#pragma` directive.
	///
	/// Only `#pragma mark ...` and `#pragma option ...` get special treatment:
	/// for "mark" every token after the first one is turned into an implicit
	/// string literal; for "option" only tokens that directly follow a binary
	/// operator are. All other pragmas are left untouched after consuming the
	/// "pragma" keyword.
	void parsePragma() {
	  next(); // Consume "pragma".
	  if (CurrentToken &&
	      CurrentToken->isOneOf(Keywords.kw_mark, Keywords.kw_option)) {
	    bool IsMark = CurrentToken->is(Keywords.kw_mark);
	    next(); // Consume "mark" or "option".
	    next(); // Consume first token (so we fix leading whitespace).
	    while (CurrentToken) {
	      if (IsMark || CurrentToken->Previous->is(TT_BinaryOperator))
	        CurrentToken->Type = TT_ImplicitStringLiteral;
	      next();
	    }
	  }
	}

	/// \brief Annotates the argument of `__has_include(...)` /
	/// `__has_include_next(...)`.
	///
	/// Expects the current token to be the opening parenthesis; returns without
	/// consuming anything otherwise. The contents are handed to
	/// parseIncludeDirective(), which only acts on the `<...>` form.
	void parseHasInclude() {
	  if (!CurrentToken || !CurrentToken->is(tok::l_paren))
	    return;
	  next(); // '('
	  parseIncludeDirective();
	  next(); // ')'
	}

	/// \brief Annotates a line whose current token is '#'.
	///
	/// \returns LT_ImportStatement for include/import directives and for
	/// JavaScript lines where the '#' has \c IsFirst set (shebang lines);
	/// LT_PreprocessorDirective otherwise.
	LineType parsePreprocessorDirective() {
	  const bool HashIsFirst = CurrentToken->IsFirst;
	  next(); // Step over the '#'.
	  if (!CurrentToken)
	    return LT_PreprocessorDirective;

	  if (HashIsFirst && Style.Language == FormatStyle::LK_JavaScript) {
	    // JavaScript files can contain shebang lines of the form:
	    //   #!/usr/bin/env node
	    // Treat these like C++ #include directives. Tokens cannot be comments
	    // here, so all of them are marked unconditionally.
	    for (; CurrentToken; next())
	      CurrentToken->Type = TT_ImplicitStringLiteral;
	    return LT_ImportStatement;
	  }

	  if (CurrentToken->Tok.is(tok::numeric_constant)) {
	    CurrentToken->SpacesRequiredBefore = 1;
	    return LT_PreprocessorDirective;
	  }

	  // Hashes in the middle of a line can lead to any strange token sequence;
	  // give up if what follows is not an identifier-like token.
	  const auto *DirectiveInfo = CurrentToken->Tok.getIdentifierInfo();
	  if (!DirectiveInfo)
	    return LT_PreprocessorDirective;

	  LineType Result = LT_PreprocessorDirective;
	  switch (DirectiveInfo->getPPKeywordID()) {
	  case tok::pp_include:
	  case tok::pp_include_next:
	  case tok::pp_import:
	    next();
	    parseIncludeDirective();
	    Result = LT_ImportStatement;
	    break;
	  case tok::pp_error:
	  case tok::pp_warning:
	    parseWarningOrError();
	    break;
	  case tok::pp_pragma:
	    parsePragma();
	    break;
	  case tok::pp_if:
	  case tok::pp_elif:
	    // The condition is annotated as an ordinary expression line.
	    Contexts.back().IsExpression = true;
	    parseLine();
	    break;
	  default:
	    break;
	  }

	  // Walk whatever is left, descending into parentheses and
	  // __has_include(...) expressions.
	  while (CurrentToken) {
	    FormatToken *Consumed = CurrentToken;
	    next();
	    if (Consumed->is(tok::l_paren))
	      parseParens();
	    else if (Consumed->isOneOf(Keywords.kw___has_include,
	                               Keywords.kw___has_include_next))
	      parseHasInclude();
	  }
	  return Result;
	}

	public:
	/// \brief Annotates all tokens of the current line and classifies the line.
	///
	/// \returns
	///  - the result of parsePreprocessorDirective() if the line starts with '#',
	///  - LT_ImportStatement for the import-like constructs recognized below,
	///  - LT_Invalid as soon as consumeToken() fails,
	///  - LT_VirtualFunctionDecl if a 'virtual' keyword was seen,
	///  - LT_ObjCMethodDecl if the line starts with an ObjC method specifier,
	///  - LT_Other otherwise.
	LineType parseLine() {
	  NonTemplateLess.clear();
	  if (CurrentToken->is(tok::hash))
	    return parsePreprocessorDirective();

	  // Directly allow to 'import <string-literal>' to support protocol buffer
	  // definitions (github.com/google/protobuf) or missing "#" (either way we
	  // should not break the line).
	  IdentifierInfo *Info = CurrentToken->Tok.getIdentifierInfo();
	  if ((Style.Language == FormatStyle::LK_Java &&
	       CurrentToken->is(Keywords.kw_package)) ||
	      (Info && Info->getPPKeywordID() == tok::pp_import &&
	       CurrentToken->Next &&
	       CurrentToken->Next->isOneOf(tok::string_literal, tok::identifier,
	                                   tok::kw_static))) {
	    next();
	    parseIncludeDirective();
	    return LT_ImportStatement;
	  }

	  // If this line starts and ends in '<' and '>', respectively, it is likely
	  // part of "#define <a/b.h>".
	  if (CurrentToken->is(tok::less) && Line.Last->is(tok::greater)) {
	    parseIncludeDirective();
	    return LT_ImportStatement;
	  }

	  // In .proto files, top-level options are very similar to import statements
	  // and should not be line-wrapped.
	  if (Style.Language == FormatStyle::LK_Proto && Line.Level == 0 &&
	      CurrentToken->is(Keywords.kw_option)) {
	    next();
	    if (CurrentToken && CurrentToken->is(tok::identifier))
	      return LT_ImportStatement;
	  }

	  bool KeywordVirtualFound = false;
	  bool ImportStatement = false;

	  // import {...} from '...';
	  if (Style.Language == FormatStyle::LK_JavaScript &&
	      CurrentToken->is(Keywords.kw_import))
	    ImportStatement = true;

	  while (CurrentToken) {
	    if (CurrentToken->is(tok::kw_virtual))
	      KeywordVirtualFound = true;
	    if (Style.Language == FormatStyle::LK_JavaScript) {
	      // export {...} from '...';
	      // An export followed by "from 'some string';" is a re-export from
	      // another module identified by a URI and is treated as a
	      // LT_ImportStatement (i.e. prevent wraps on it for long URIs).
	      // Just "export {...};" or "export class ..." should not be treated as
	      // an import in this sense.
	      if (Line.First->is(tok::kw_export) &&
	          CurrentToken->is(Keywords.kw_from) && CurrentToken->Next &&
	          CurrentToken->Next->isStringLiteral())
	        ImportStatement = true;
	      if (isClosureImportStatement(*CurrentToken))
	        ImportStatement = true;
	    }
	    if (!consumeToken())
	      return LT_Invalid;
	  }
	  // 'virtual' takes priority over an import classification.
	  if (KeywordVirtualFound)
	    return LT_VirtualFunctionDecl;
	  if (ImportStatement)
	    return LT_ImportStatement;

	  if (Line.startsWith(TT_ObjCMethodSpecifier)) {
	    // Record the longest selector name on the first selector token of the
	    // outermost context.
	    if (Contexts.back().FirstObjCSelectorName)
	      Contexts.back().FirstObjCSelectorName->LongestObjCSelectorName =
	          Contexts.back().LongestObjCSelectorName;
	    return LT_ObjCMethodDecl;
	  }

	  return LT_Other;
	}

	private:
	bool isClosureImportStatement(const FormatToken &Tok) {
	// FIXME: Closure-library specific stuff should not be hard-coded but be
	// configurable.
	return Tok.TokenText == "goog" && Tok.Next && Tok.Next->is(tok::period) &&
	Tok.Next->Next &&
	(Tok.Next->Next->TokenText == "module" \|\|
	Tok.Next->Next->TokenText == "provide" \|\|
	Tok.Next->Next->TokenText == "require" \|\|
	Tok.Next->Next->TokenText == "forwardDeclare") &&
	Tok.Next->Next->Next && Tok.Next->Next->Next->is(tok::l_paren);
	}

	/// \brief Clears the annotation state of \p Token so it can be (re)annotated.
	///
	/// A null \p Token is ignored. The token type is reset to TT_Unknown unless
	/// it is one of the listed types, which are kept as they are. Role,
	/// MatchingParen and the fake-parenthesis bookkeeping are always cleared.
	void resetTokenMetadata(FormatToken *Token) {
	  if (!Token)
	    return;

	  // Operate on the token that was passed in (and null-checked above) rather
	  // than on CurrentToken; the visible caller passes CurrentToken, so its
	  // behavior is unchanged.

	  // Reset token type in case we have already looked at it and then
	  // recovered from an error (e.g. failure to find the matching >).
	  if (!Token->isOneOf(TT_LambdaLSquare, TT_ForEachMacro,
	                      TT_FunctionLBrace, TT_ImplicitStringLiteral,
	                      TT_InlineASMBrace, TT_JsFatArrow, TT_LambdaArrow,
	                      TT_OverloadedOperator, TT_RegexLiteral,
	                      TT_TemplateString, TT_ObjCStringLiteral))
	    Token->Type = TT_Unknown;
	  Token->Role.reset();
	  Token->MatchingParen = nullptr;
	  Token->FakeLParens.clear();
	  Token->FakeRParens = 0;
	}

	void next() {
	if (CurrentToken) {
	CurrentToken->NestingLevel = Contexts.size() - 1;
	CurrentToken->BindingStrength = Contexts.back().BindingStrength;
	modifyContext(*CurrentToken);
	determineTokenType(*CurrentToken);
	CurrentToken = CurrentToken->Next;
	}

	resetTokenMetadata(CurrentToken);
	}

	/// \brief A struct to hold information valid in a specific context, e.g.
	/// a pair of parentheses.
	///
	/// Contexts live on the \c Contexts stack; the annotation routines read and
	/// update the innermost one through \c Contexts.back().
	struct Context {
	// Only the three constructor arguments vary per context; every other field
	// starts from its in-class default below.
	Context(tok::TokenKind ContextKind, unsigned BindingStrength,
	bool IsExpression)
	: ContextKind(ContextKind), BindingStrength(BindingStrength),
	IsExpression(IsExpression) {}

	// Token kind supplied by whoever created the context.
	tok::TokenKind ContextKind;
	// Copied onto every token consumed while this context is innermost.
	unsigned BindingStrength;
	// Whether tokens are currently treated as part of an expression; switched
	// on and off by modifyContext().
	bool IsExpression;
	// Copied to FirstObjCSelectorName->LongestObjCSelectorName by parseLine()
	// for ObjC method declarations.
	unsigned LongestObjCSelectorName = 0;
	// Set to true when a 'for' keyword is consumed.
	bool ColonIsForRangeExpr = false;
	bool ColonIsDictLiteral = false;
	bool ColonIsObjCMethodExpr = false;
	FormatToken *FirstObjCSelectorName = nullptr;
	// First token typed TT_StartOfName in this context; cleared again when a
	// ';' is typed.
	FormatToken *FirstStartOfName = nullptr;
	// Cleared when a 'new' keyword is seen; combined with IsExpression when
	// classifying '*', '&' and '&&'.
	bool CanBeExpression = true;
	// Forwarded to determineStarAmpUsage().
	bool InTemplateArgument = false;
	// Set for tokens following a TT_CtorInitializerColon.
	bool InCtorInitializer = false;
	// Set for tokens following a TT_InheritanceColon.
	bool InInheritanceList = false;
	// Set once a unary '^' has been seen in this context.
	bool CaretFound = false;
	// When set, a ',' switches IsExpression on.
	bool IsForEachMacro = false;
	};

	/// \brief RAII helper that pushes a new \c Context onto \c Contexts when it
	/// is constructed and pops it again when it is destroyed.
	///
	/// The new context's binding strength is the enclosing context's strength
	/// plus \c Increase, and it starts with the enclosing context's
	/// IsExpression value.
	struct ScopedContextCreator {
	  AnnotatingParser &P;

	  ScopedContextCreator(AnnotatingParser &P, tok::TokenKind ContextKind,
	                       unsigned Increase)
	      : P(P) {
	    // Read both inherited values before the stack is modified.
	    const unsigned NestedStrength =
	        P.Contexts.back().BindingStrength + Increase;
	    const bool InheritedIsExpression = P.Contexts.back().IsExpression;
	    P.Contexts.push_back(
	        Context(ContextKind, NestedStrength, InheritedIsExpression));
	  }

	  ~ScopedContextCreator() { P.Contexts.pop_back(); }
	};

	void modifyContext(const FormatToken &Current) {
	if (Current.getPrecedence() == prec::Assignment &&
	!Line.First->isOneOf(tok::kw_template, tok::kw_using, tok::kw_return) &&
	// Type aliases use `type X = ...;` in TypeScript and can be exported
	// using `export type ...`.
	!(Style.Language == FormatStyle::LK_JavaScript &&
	(Line.startsWith(Keywords.kw_type, tok::identifier) \|\|
	Line.startsWith(tok::kw_export, Keywords.kw_type,
	tok::identifier))) &&
	(!Current.Previous \|\| Current.Previous->isNot(tok::kw_operator))) {
	Contexts.back().IsExpression = true;
	if (!Line.startsWith(TT_UnaryOperator)) {
	for (FormatToken *Previous = Current.Previous;
	Previous && Previous->Previous &&
	!Previous->Previous->isOneOf(tok::comma, tok::semi);
	Previous = Previous->Previous) {
	if (Previous->isOneOf(tok::r_square, tok::r_paren)) {
	Previous = Previous->MatchingParen;
	if (!Previous)
	break;
	}
	if (Previous->opensScope())
	break;
	if (Previous->isOneOf(TT_BinaryOperator, TT_UnaryOperator) &&
	Previous->isOneOf(tok::star, tok::amp, tok::ampamp) &&
	Previous->Previous && Previous->Previous->isNot(tok::equal))
	Previous->Type = TT_PointerOrReference;
	}
	}
	} else if (Current.is(tok::lessless) &&
	(!Current.Previous \|\| !Current.Previous->is(tok::kw_operator))) {
	Contexts.back().IsExpression = true;
	} else if (Current.isOneOf(tok::kw_return, tok::kw_throw)) {
	Contexts.back().IsExpression = true;
	} else if (Current.is(TT_TrailingReturnArrow)) {
	Contexts.back().IsExpression = false;
	} else if (Current.is(TT_LambdaArrow) \|\| Current.is(Keywords.kw_assert)) {
	Contexts.back().IsExpression = Style.Language == FormatStyle::LK_Java;
	} else if (Current.Previous &&
	Current.Previous->is(TT_CtorInitializerColon)) {
	Contexts.back().IsExpression = true;
	Contexts.back().InCtorInitializer = true;
	} else if (Current.Previous && Current.Previous->is(TT_InheritanceColon)) {
	Contexts.back().InInheritanceList = true;
	} else if (Current.isOneOf(tok::r_paren, tok::greater, tok::comma)) {
	for (FormatToken *Previous = Current.Previous;
	Previous && Previous->isOneOf(tok::star, tok::amp);
	Previous = Previous->Previous)
	Previous->Type = TT_PointerOrReference;
	if (Line.MustBeDeclaration && !Contexts.front().InCtorInitializer)
	Contexts.back().IsExpression = false;
	} else if (Current.is(tok::kw_new)) {
	Contexts.back().CanBeExpression = false;
	} else if (Current.isOneOf(tok::semi, tok::exclaim)) {
	// This should be the condition or increment in a for-loop.
	Contexts.back().IsExpression = true;
	}
	}

	/// \brief Assigns a token type to \p Current if it does not have one yet.
	///
	/// Tokens whose type is not TT_Unknown are left alone. Otherwise the first
	/// matching rule of the chain below decides the type; some rules also
	/// update parser state (\c AutoFound, the innermost context's
	/// \c FirstStartOfName and \c CaretFound). A token matching no rule stays
	/// TT_Unknown.
	void determineTokenType(FormatToken &Current) {
	  if (!Current.is(TT_Unknown))
	    // The token type is already known.
	    return;

	  if (Style.Language == FormatStyle::LK_JavaScript) {
	    if (Current.is(tok::exclaim)) {
	      // '!' after an operand-like token is a non-null assertion.
	      if (Current.Previous &&
	          (Current.Previous->isOneOf(tok::identifier, tok::kw_namespace,
	                                     tok::r_paren, tok::r_square,
	                                     tok::r_brace) ||
	           Current.Previous->Tok.isLiteral())) {
	        Current.Type = TT_JsNonNullAssertion;
	        return;
	      }
	      // So is '!' directly before a binary operator or 'as'.
	      if (Current.Next &&
	          Current.Next->isOneOf(TT_BinaryOperator, Keywords.kw_as)) {
	        Current.Type = TT_JsNonNullAssertion;
	        return;
	      }
	    }
	  }

	  if (Current.is(Keywords.kw_instanceof)) {
	    Current.Type = TT_BinaryOperator;
	  } else if (isStartOfName(Current) &&
	             (!Line.MightBeFunctionDecl || Current.NestingLevel != 0)) {
	    // Line.MightBeFunctionDecl can only be true after the parentheses of a
	    // function declaration have been found. In this case, 'Current' is a
	    // trailing token of this declaration and thus cannot be a name.
	    Contexts.back().FirstStartOfName = &Current;
	    Current.Type = TT_StartOfName;
	  } else if (Current.is(tok::semi)) {
	    // Reset FirstStartOfName after finding a semicolon so that a for loop
	    // with multiple increment statements is not confused with a for loop
	    // having multiple variable declarations.
	    Contexts.back().FirstStartOfName = nullptr;
	  } else if (Current.isOneOf(tok::kw_auto, tok::kw___auto_type)) {
	    AutoFound = true;
	  } else if (Current.is(tok::arrow) &&
	             Style.Language == FormatStyle::LK_Java) {
	    Current.Type = TT_LambdaArrow;
	  } else if (Current.is(tok::arrow) && AutoFound && Line.MustBeDeclaration &&
	             Current.NestingLevel == 0) {
	    Current.Type = TT_TrailingReturnArrow;
	  } else if (Current.isOneOf(tok::star, tok::amp, tok::ampamp)) {
	    Current.Type = determineStarAmpUsage(Current,
	                                         Contexts.back().CanBeExpression &&
	                                             Contexts.back().IsExpression,
	                                         Contexts.back().InTemplateArgument);
	  } else if (Current.isOneOf(tok::minus, tok::plus, tok::caret)) {
	    Current.Type = determinePlusMinusCaretUsage(Current);
	    if (Current.is(TT_UnaryOperator) && Current.is(tok::caret))
	      Contexts.back().CaretFound = true;
	  } else if (Current.isOneOf(tok::minusminus, tok::plusplus)) {
	    Current.Type = determineIncrementUsage(Current);
	  } else if (Current.isOneOf(tok::exclaim, tok::tilde)) {
	    Current.Type = TT_UnaryOperator;
	  } else if (Current.is(tok::question)) {
	    if (Style.Language == FormatStyle::LK_JavaScript &&
	        Line.MustBeDeclaration && !Contexts.back().IsExpression) {
	      // In JavaScript, `interface X { foo?(): bar; }` is an optional method
	      // on the interface, not a ternary expression.
	      Current.Type = TT_JsTypeOptionalQuestion;
	    } else {
	      Current.Type = TT_ConditionalExpr;
	    }
	  } else if (Current.isBinaryOperator() &&
	             (!Current.Previous || Current.Previous->isNot(tok::l_square))) {
	    Current.Type = TT_BinaryOperator;
	  } else if (Current.is(tok::comment)) {
	    if (Current.TokenText.startswith("/*")) {
	      if (Current.TokenText.endswith("*/"))
	        Current.Type = TT_BlockComment;
	      else
	        // The lexer has for some reason determined a comment here. But we
	        // cannot really handle it, if it isn't properly terminated.
	        Current.Tok.setKind(tok::unknown);
	    } else {
	      Current.Type = TT_LineComment;
	    }
	  } else if (Current.is(tok::r_paren)) {
	    if (rParenEndsCast(Current))
	      Current.Type = TT_CastRParen;
	    // An all-uppercase identifier at the start of the line (or right after
	    // a template declaration) followed by parentheses is treated as a
	    // function annotation. This may override the cast type set just above.
	    if (Current.MatchingParen && Current.Next &&
	        !Current.Next->isBinaryOperator() &&
	        !Current.Next->isOneOf(tok::semi, tok::colon, tok::l_brace,
	                               tok::comma, tok::period, tok::arrow,
	                               tok::coloncolon)) {
	      if (FormatToken *AfterParen = Current.MatchingParen->Next) {
	        // Make sure this isn't the return type of an Obj-C block declaration
	        if (AfterParen->Tok.isNot(tok::caret)) {
	          if (FormatToken *BeforeParen = Current.MatchingParen->Previous) {
	            if (BeforeParen->is(tok::identifier) &&
	                BeforeParen->TokenText == BeforeParen->TokenText.upper() &&
	                (!BeforeParen->Previous ||
	                 BeforeParen->Previous->ClosesTemplateDeclaration))
	              Current.Type = TT_FunctionAnnotationRParen;
	          }
	        }
	      }
	    }
	  } else if (Current.is(tok::at) && Current.Next &&
	             Style.Language != FormatStyle::LK_JavaScript &&
	             Style.Language != FormatStyle::LK_Java) {
	    // In Java & JavaScript, "@..." is a decorator or annotation. In ObjC, it
	    // marks declarations and properties that need special formatting.
	    switch (Current.Next->Tok.getObjCKeywordID()) {
	    case tok::objc_interface:
	    case tok::objc_implementation:
	    case tok::objc_protocol:
	      Current.Type = TT_ObjCDecl;
	      break;
	    case tok::objc_property:
	      Current.Type = TT_ObjCProperty;
	      break;
	    default:
	      break;
	    }
	  } else if (Current.is(tok::period)) {
	    FormatToken *PreviousNoComment = Current.getPreviousNonComment();
	    if (PreviousNoComment &&
	        PreviousNoComment->isOneOf(tok::comma, tok::l_brace))
	      Current.Type = TT_DesignatedInitializerPeriod;
	    else if (Style.Language == FormatStyle::LK_Java && Current.Previous &&
	             Current.Previous->isOneOf(TT_JavaAnnotation,
	                                       TT_LeadingJavaAnnotation)) {
	      // A '.' inside a qualified annotation name inherits the annotation
	      // type.
	      Current.Type = Current.Previous->Type;
	    }
	  } else if (Current.isOneOf(tok::identifier, tok::kw_const) &&
	             Current.Previous &&
	             !Current.Previous->isOneOf(tok::equal, tok::at) &&
	             Line.MightBeFunctionDecl && Contexts.size() == 1) {
	    // Line.MightBeFunctionDecl can only be true after the parentheses of a
	    // function declaration have been found.
	    Current.Type = TT_TrailingAnnotation;
	  } else if ((Style.Language == FormatStyle::LK_Java ||
	              Style.Language == FormatStyle::LK_JavaScript) &&
	             Current.Previous) {
	    if (Current.Previous->is(tok::at) &&
	        Current.isNot(Keywords.kw_interface)) {
	      // Annotations at the very start, or following other leading
	      // annotations, are "leading"; all others are plain annotations.
	      const FormatToken &AtToken = *Current.Previous;
	      const FormatToken *Previous = AtToken.getPreviousNonComment();
	      if (!Previous || Previous->is(TT_LeadingJavaAnnotation))
	        Current.Type = TT_LeadingJavaAnnotation;
	      else
	        Current.Type = TT_JavaAnnotation;
	    } else if (Current.Previous->is(tok::period) &&
	               Current.Previous->isOneOf(TT_JavaAnnotation,
	                                         TT_LeadingJavaAnnotation)) {
	      Current.Type = Current.Previous->Type;
	    }
	  }
	}

	/// \brief Take a guess at whether \p Tok starts a name of a function or
	/// variable declaration.
	///
	/// This is a heuristic based on whether \p Tok is an identifier following
	/// something that is likely a type.
	///
	/// \returns false if \p Tok is not an identifier or is the first token;
	/// otherwise true if the preceding non-comment, non-'const' token looks
	/// like the end of a type (see the individual checks below).
	bool isStartOfName(const FormatToken &Tok) {
	  if (Tok.isNot(tok::identifier) || !Tok.Previous)
	    return false;

	  if (Tok.Previous->isOneOf(TT_LeadingJavaAnnotation, Keywords.kw_instanceof,
	                            Keywords.kw_as))
	    return false;
	  if (Style.Language == FormatStyle::LK_JavaScript &&
	      Tok.Previous->is(Keywords.kw_in))
	    return false;

	  // Skip "const" as it does not have an influence on whether this is a name.
	  FormatToken *PreviousNotConst = Tok.getPreviousNonComment();
	  while (PreviousNotConst && PreviousNotConst->is(tok::kw_const))
	    PreviousNotConst = PreviousNotConst->getPreviousNonComment();

	  if (!PreviousNotConst)
	    return false;

	  // An identifier directly after '#' is a preprocessor keyword, not a type.
	  bool IsPPKeyword = PreviousNotConst->is(tok::identifier) &&
	                     PreviousNotConst->Previous &&
	                     PreviousNotConst->Previous->is(tok::hash);

	  // After a template closer: the token before the matching opener decides.
	  // PreviousNotConst is known to be non-null here.
	  if (PreviousNotConst->is(TT_TemplateCloser))
	    return PreviousNotConst->MatchingParen &&
	           PreviousNotConst->MatchingParen->Previous &&
	           PreviousNotConst->MatchingParen->Previous->isNot(tok::period) &&
	           PreviousNotConst->MatchingParen->Previous->isNot(tok::kw_template);

	  // After `decltype(...)`.
	  if (PreviousNotConst->is(tok::r_paren) && PreviousNotConst->MatchingParen &&
	      PreviousNotConst->MatchingParen->Previous &&
	      PreviousNotConst->MatchingParen->Previous->is(tok::kw_decltype))
	    return true;

	  return (!IsPPKeyword &&
	          PreviousNotConst->isOneOf(tok::identifier, tok::kw_auto)) ||
	         PreviousNotConst->is(TT_PointerOrReference) ||
	         PreviousNotConst->isSimpleTypeSpecifier();
	}

	/// \brief Determine whether ')' is ending a cast.
	bool rParenEndsCast(const FormatToken &Tok) {
	// C-style casts are only used in C++ and Java.
	if (!Style.isCpp() && Style.Language != FormatStyle::LK_Java)
	return false;

	// Empty parens aren't casts and there are no casts at the end of the line.
	if (Tok.Previous == Tok.MatchingParen \|\| !Tok.Next \|\| !Tok.MatchingParen)
	return false;

	FormatToken *LeftOfParens = Tok.MatchingParen->getPreviousNonComment();
	if (LeftOfParens) {
	// If there is a closing parenthesis left of the current parentheses,
	// look past it as these might be chained casts.
	if (LeftOfParens->is(tok::r_paren)) {
	if (!LeftOfParens->MatchingParen \|\|
	!LeftOfParens->MatchingParen->Previous)
	return false;
	LeftOfParens = LeftOfParens->MatchingParen->Previous;
	}

	// If there is an identifier (or with a few exceptions a keyword) right
	// before the parentheses, this is unlikely to be a cast.
	if (LeftOfParens->Tok.getIdentifierInfo() &&
	!LeftOfParens->isOneOf(Keywords.kw_in, tok::kw_return, tok::kw_case,
	tok::kw_delete))
	return false;

	// Certain other tokens right before the parentheses are also signals that
	// this cannot be a cast.
	if (LeftOfParens->isOneOf(tok::at, tok::r_square, TT_OverloadedOperator,
	TT_TemplateCloser, tok::ellipsis))
	return false;
	}

	if (Tok.Next->is(tok::question))
	return false;

	// As Java has no function types, a "(" after the ")" likely means that this
	// is a cast.
	if (Style.Language == FormatStyle::LK_Java && Tok.Next->is(tok::l_paren))
	return true;

	// If a (non-string) literal follows, this is likely a cast.
	if (Tok.Next->isNot(tok::string_literal) &&
	(Tok.Next->Tok.isLiteral() \|\|
	Tok.Next->isOneOf(tok::kw_sizeof, tok::kw_alignof)))
	return true;

	// Heuristically try to determine whether the parentheses contain a type.
	bool ParensAreType =
	!Tok.Previous \|\|
	Tok.Previous->isOneOf(TT_PointerOrReference, TT_TemplateCloser) \|\|
	Tok.Previous->isSimpleTypeSpecifier();
	bool ParensCouldEndDecl =
	Tok.Next->isOneOf(tok::equal, tok::semi, tok::l_brace, tok::greater);
	if (ParensAreType && !ParensCouldEndDecl)
	return true;

	// At this point, we heuristically assume that there are no casts at the
	// start of the line. We assume that we have found most cases where there
	// are by the logic above, e.g. "(void)x;".
	if (!LeftOfParens)
	return false;

	// Certain token types inside the parentheses mean that this can't be a
	// cast.
	for (const FormatToken *Token = Tok.MatchingParen->Next; Token != &Tok;
	Token = Token->Next)
	if (Token->is(TT_BinaryOperator))
	return false;

	// If the following token is an identifier or 'this', this is a cast. All
	// cases where this can be something else are handled above.
	if (Tok.Next->isOneOf(tok::identifier, tok::kw_this))
	return true;

	if (!Tok.Next->Next)
	return false;

	// If the next token after the parenthesis is a unary operator, assume
	// that this is cast, unless there are unexpected tokens inside the
	// parenthesis.
	bool NextIsUnary =
	Tok.Next->isUnaryOperator() \|\| Tok.Next->isOneOf(tok::amp, tok::star);
	if (!NextIsUnary \|\| Tok.Next->is(tok::plus) \|\|
	!Tok.Next->Next->isOneOf(tok::identifier, tok::numeric_constant))
	return false;
	// Search for unexpected tokens.
	for (FormatToken *Prev = Tok.Previous; Prev != Tok.MatchingParen;
	Prev = Prev->Previous) {
	if (!Prev->isOneOf(tok::kw_const, tok::identifier, tok::coloncolon))
	return false;
	}
	return true;
	}

	/// \brief Return the type of the given token assuming it is * or &.
	///
	/// \param Tok the '*', '&' or '&&' token to classify.
	/// \param IsExpression whether the surrounding context is currently treated
	///        as an expression.
	/// \param InTemplateArgument whether the token is inside a template
	///        argument.
	/// \returns TT_UnaryOperator, TT_BinaryOperator or TT_PointerOrReference;
	/// the checks below are applied in order and the first match wins.
	TokenType determineStarAmpUsage(const FormatToken &Tok, bool IsExpression,
	                                bool InTemplateArgument) {
	  if (Style.Language == FormatStyle::LK_JavaScript)
	    return TT_BinaryOperator;

	  const FormatToken *PrevToken = Tok.getPreviousNonComment();
	  if (!PrevToken)
	    return TT_UnaryOperator;

	  const FormatToken *NextToken = Tok.getNextNonComment();
	  if (!NextToken ||
	      NextToken->isOneOf(tok::arrow, tok::equal, tok::kw_const) ||
	      (NextToken->is(tok::l_brace) && !NextToken->getNextNonComment()))
	    return TT_PointerOrReference;

	  if (PrevToken->is(tok::coloncolon))
	    return TT_PointerOrReference;

	  // Directly after a token that cannot end an operand, the operator must be
	  // a prefix operator.
	  if (PrevToken->isOneOf(tok::l_paren, tok::l_square, tok::l_brace,
	                         tok::comma, tok::semi, tok::kw_return, tok::colon,
	                         tok::equal, tok::kw_delete, tok::kw_sizeof,
	                         tok::kw_throw) ||
	      PrevToken->isOneOf(TT_BinaryOperator, TT_ConditionalExpr,
	                         TT_UnaryOperator, TT_CastRParen))
	    return TT_UnaryOperator;

	  if (NextToken->is(tok::l_square) && NextToken->isNot(TT_LambdaLSquare))
	    return TT_PointerOrReference;
	  if (NextToken->is(tok::kw_operator) && !IsExpression)
	    return TT_PointerOrReference;
	  if (NextToken->isOneOf(tok::comma, tok::semi))
	    return TT_PointerOrReference;

	  // `typeof(...) *` and `decltype(...) &` declare pointers/references.
	  if (PrevToken->is(tok::r_paren) && PrevToken->MatchingParen) {
	    FormatToken *TokenBeforeMatchingParen =
	        PrevToken->MatchingParen->getPreviousNonComment();
	    if (TokenBeforeMatchingParen &&
	        TokenBeforeMatchingParen->isOneOf(tok::kw_typeof, tok::kw_decltype))
	      return TT_PointerOrReference;
	  }

	  if (PrevToken->Tok.isLiteral() ||
	      PrevToken->isOneOf(tok::r_paren, tok::r_square, tok::kw_true,
	                         tok::kw_false, tok::r_brace) ||
	      NextToken->Tok.isLiteral() ||
	      NextToken->isOneOf(tok::kw_true, tok::kw_false) ||
	      NextToken->isUnaryOperator() ||
	      // If we know we're in a template argument, there are no named
	      // declarations. Thus, having an identifier on the right-hand side
	      // indicates a binary operator.
	      (InTemplateArgument && NextToken->Tok.isAnyIdentifier()))
	    return TT_BinaryOperator;

	  // "&&(" is quite unlikely to be two successive unary "&".
	  // (NextToken is known to be non-null at this point.)
	  if (Tok.is(tok::ampamp) && NextToken->is(tok::l_paren))
	    return TT_BinaryOperator;

	  // This catches some cases where evaluation order is used as control flow:
	  //   aaa && aaa->f();
	  const FormatToken *NextNextToken = NextToken->getNextNonComment();
	  if (NextNextToken && NextNextToken->is(tok::arrow))
	    return TT_BinaryOperator;

	  // It is very unlikely that we are going to find a pointer or reference type
	  // definition on the RHS of an assignment.
	  if (IsExpression && !Contexts.back().CaretFound)
	    return TT_BinaryOperator;

	  return TT_PointerOrReference;
	}

	/// \brief Return the type of the given token assuming it is '+', '-' or '^'.
	///
	/// \returns TT_UnaryOperator if the previous non-comment token indicates a
	/// prefix position (or there is none), TT_BinaryOperator otherwise.
	TokenType determinePlusMinusCaretUsage(const FormatToken &Tok) {
	  const FormatToken *Before = Tok.getPreviousNonComment();
	  if (!Before)
	    return TT_UnaryOperator;

	  // There aren't any trailing unary operators except for TypeScript's
	  // non-null operator (!). Thus, after a cast or another unary operator
	  // this must be part of a sequence of leading operators.
	  const bool ContinuesPrefixSequence =
	      Before->isOneOf(TT_CastRParen, TT_UnaryOperator) &&
	      !Before->is(tok::exclaim);

	  // Heuristic: these tokens cannot end an operand.
	  const bool FollowsNonOperand =
	      Before->isOneOf(tok::equal, tok::l_paren, tok::comma, tok::l_square,
	                      tok::question, tok::colon, tok::kw_return,
	                      tok::kw_case, tok::at, tok::l_brace);

	  // There can't be two consecutive binary operators.
	  const bool FollowsBinaryOperator = Before->is(TT_BinaryOperator);

	  // Fall back to marking the token as binary operator.
	  return (ContinuesPrefixSequence || FollowsNonOperand ||
	          FollowsBinaryOperator)
	             ? TT_UnaryOperator
	             : TT_BinaryOperator;
	}

	/// \brief Determine whether ++/-- are pre- or post-increments/-decrements.
	///
	/// \returns TT_TrailingUnaryOperator if the previous non-comment token is
	/// ')', ']' or an identifier (and not the end of a cast);
	/// TT_UnaryOperator otherwise.
	TokenType determineIncrementUsage(const FormatToken &Tok) {
	  const FormatToken *PrevToken = Tok.getPreviousNonComment();
	  // Nothing before it, or directly after a cast: must be a prefix operator.
	  if (!PrevToken || PrevToken->is(TT_CastRParen))
	    return TT_UnaryOperator;
	  if (PrevToken->isOneOf(tok::r_paren, tok::r_square, tok::identifier))
	    return TT_TrailingUnaryOperator;

	  return TT_UnaryOperator;
	}

	// Stack of contexts; the innermost one is Contexts.back(). Entries are
	// pushed and popped by ScopedContextCreator.
	SmallVector<Context, 8> Contexts;

	const FormatStyle &Style;
	// The line being annotated.
	AnnotatedLine &Line;
	// Token currently being looked at; advanced by next() and null once the
	// end of the line has been reached.
	FormatToken *CurrentToken;
	// Set once an 'auto' / '__auto_type' keyword has been typed; used to
	// recognize trailing return arrows.
	bool AutoFound;
	const AdditionalKeywords &Keywords;

	// Set of "<" tokens that do not open a template parameter list. If parseAngle
	// determines that a specific token can't be a template opener, it will make
	// same decision irrespective of the decisions for tokens leading up to it.
	// Store this information to prevent this from causing exponential runtime.
	llvm::SmallPtrSet<FormatToken *, 16> NonTemplateLess;
	};

	// Pseudo precedence levels placed above prec::PointerToMember. The
	// expression parser uses the first to dispatch to its unary-operator
	// handling and reports the second for '.' and '->' tokens.
	static const int PrecedenceUnaryOperator = prec::PointerToMember + 1;
	static const int PrecedenceArrowAndPeriod = prec::PointerToMember + 2;

	/// \brief Parses binary expressions by inserting fake parenthesis based on
	/// operator precedence.
	class ExpressionParser {
	public:
	/// \brief Creates a parser positioned at the first token of \p Line.
	ExpressionParser(const FormatStyle &Style, const AdditionalKeywords &Keywords,
	AnnotatedLine &Line)
	: Style(Style), Keywords(Keywords), Current(Line.First) {}

	/// \brief Parse expressions with the given operator precedence.
	///
	/// Recursively parses operands at the next higher precedence and chains
	/// together all operators found at exactly \p Precedence (via NextOperator
	/// and OperatorIndex). If at least one such operator was found, the parsed
	/// range is wrapped in fake parentheses.
	void parse(int Precedence = 0) {
	  // Skip 'return' and ObjC selector colons as they are not part of a binary
	  // expression.
	  while (Current && (Current->is(tok::kw_return) ||
	                     (Current->is(tok::colon) &&
	                      Current->isOneOf(TT_ObjCMethodExpr, TT_DictLiteral))))
	    next();

	  if (!Current || Precedence > PrecedenceArrowAndPeriod)
	    return;

	  // Conditional expressions need to be parsed separately for proper nesting.
	  if (Precedence == prec::Conditional) {
	    parseConditionalExpr();
	    return;
	  }

	  // Parse unary operators, which all have a higher precedence than binary
	  // operators.
	  if (Precedence == PrecedenceUnaryOperator) {
	    parseUnaryOperator();
	    return;
	  }

	  FormatToken *Start = Current;
	  FormatToken *LatestOperator = nullptr;
	  unsigned OperatorIndex = 0;

	  while (Current) {
	    // Consume operators with higher precedence.
	    parse(Precedence + 1);

	    int CurrentPrecedence = getCurrentPrecedence();

	    // A selector name at this precedence starts a new sub-expression; close
	    // the previous one first if it contained an operator.
	    if (Current && Current->is(TT_SelectorName) &&
	        Precedence == CurrentPrecedence) {
	      if (LatestOperator)
	        addFakeParenthesis(Start, prec::Level(Precedence));
	      Start = Current;
	    }

	    // At the end of the line or when an operator with higher precedence is
	    // found, insert fake parenthesis and return.
	    if (!Current ||
	        (Current->closesScope() &&
	         (Current->MatchingParen || Current->is(TT_TemplateString))) ||
	        (CurrentPrecedence != -1 && CurrentPrecedence < Precedence) ||
	        (CurrentPrecedence == prec::Conditional &&
	         Precedence == prec::Assignment && Current->is(tok::colon))) {
	      break;
	    }

	    // Consume scopes: (), [], <> and {}
	    if (Current->opensScope()) {
	      // A fragment of a JavaScript template string can look like '}..${' and
	      // thus close a scope and open a new one at the same time.
	      while (Current && (!Current->closesScope() || Current->opensScope())) {
	        next();
	        parse();
	      }
	      next();
	    } else {
	      // Operator found.
	      if (CurrentPrecedence == Precedence) {
	        if (LatestOperator)
	          LatestOperator->NextOperator = Current;
	        LatestOperator = Current;
	        Current->OperatorIndex = OperatorIndex;
	        ++OperatorIndex;
	      }
	      next(/*SkipPastLeadingComments=*/Precedence > 0);
	    }
	  }

	  if (LatestOperator && (Current || Precedence > 0)) {
	    if (Precedence == PrecedenceArrowAndPeriod) {
	      // Call expressions don't have a binary operator precedence.
	      addFakeParenthesis(Start, prec::Unknown);
	    } else {
	      addFakeParenthesis(Start, prec::Level(Precedence));
	    }
	  }
	}

	private:
	/// \brief Gets the precedence (+1) of the given token for binary operators
	/// and other tokens that we treat like binary operators.
	int getCurrentPrecedence() {
	if (Current) {
	const FormatToken *NextNonComment = Current->getNextNonComment();
	if (Current->is(TT_ConditionalExpr))
	return prec::Conditional;
	if (NextNonComment && Current->is(TT_SelectorName) &&
	(NextNonComment->isOneOf(TT_DictLiteral, TT_JsTypeColon) \|\|
	((Style.Language == FormatStyle::LK_Proto \|\|
	Style.Language == FormatStyle::LK_TextProto) &&
	NextNonComment->is(tok::less))))
	return prec::Assignment;
	if (Current->is(TT_JsComputedPropertyName))
	return prec::Assignment;
	if (Current->is(TT_LambdaArrow))
	return prec::Comma;
	if (Current->is(TT_JsFatArrow))
	return prec::Assignment;
	if (Current->isOneOf(tok::semi, TT_InlineASMColon, TT_SelectorName) \|\|
	(Current->is(tok::comment) && NextNonComment &&
	NextNonComment->is(TT_SelectorName)))
	return 0;
	if (Current->is(TT_RangeBasedForLoopColon))
	return prec::Comma;
	if ((Style.Language == FormatStyle::LK_Java \|\|
	Style.Language == FormatStyle::LK_JavaScript) &&
	Current->is(Keywords.kw_instanceof))
	return prec::Relational;
	if (Style.Language == FormatStyle::LK_JavaScript &&
	Current->isOneOf(Keywords.kw_in, Keywords.kw_as))
	return prec::Relational;
	if (Current->is(TT_BinaryOperator) \|\| Current->is(tok::comma))
	return Current->getPrecedence();
	if (Current->isOneOf(tok::period, tok::arrow))
	return PrecedenceArrowAndPeriod;
	if ((Style.Language == FormatStyle::LK_Java \|\|
	Style.Language == FormatStyle::LK_JavaScript) &&
	Current->isOneOf(Keywords.kw_extends, Keywords.kw_implements,
	Keywords.kw_throws))
	return 0;
	}
	return -1;
	}

	/// Records a fake opening parenthesis of the given precedence on \p Start
	/// and, if there is a current token, a matching fake closing parenthesis on
	/// the token before it (stepping back over comment tokens where possible).
	/// For precedences above prec::Unknown both ends are additionally flagged as
	/// the start / end of a binary expression.
	void addFakeParenthesis(FormatToken *Start, prec::Level Precedence) {
	  const bool IsBinaryExpr = Precedence > prec::Unknown;

	  Start->FakeLParens.push_back(Precedence);
	  if (IsBinaryExpr)
	    Start->StartsBinaryExpression = true;

	  if (!Current)
	    return;

	  // Walk back past comments, but never off the front of the token list.
	  FormatToken *Closer = Current->Previous;
	  for (; Closer->is(tok::comment) && Closer->Previous;
	       Closer = Closer->Previous) {
	  }
	  ++Closer->FakeRParens;
	  if (IsBinaryExpr)
	    Closer->EndsBinaryExpression = true;
	}

	/// \brief Parse unary operator expressions and surround them with fake
	/// parentheses if appropriate.
	void parseUnaryOperator() {
	llvm::SmallVector<FormatToken *, 2> Tokens;
	while (Current && Current->is(TT_UnaryOperator)) {
	Tokens.push_back(Current);
	next();
	}
	parse(PrecedenceArrowAndPeriod);
	for (FormatToken *Token : llvm::reverse(Tokens))
	// The actual precedence doesn't matter.
	addFakeParenthesis(Token, prec::Unknown);
	}

	/// Parses a conditional expression starting at the current token.
	///
	/// Leading trailing-comment tokens are skipped first. Fake parentheses of
	/// prec::Conditional are added around the expression only when a '?' and a
	/// following TT_ConditionalExpr token are both found; otherwise the function
	/// returns after parsing whatever part was present.
	void parseConditionalExpr() {
	  while (Current && Current->isTrailingComment()) {
	    next();
	  }
	  FormatToken *Start = Current;
	  // Condition part.
	  parse(prec::LogicalOr);
	  if (!Current || !Current->is(tok::question))
	    return;
	  next();
	  // Operand after the '?'.
	  parse(prec::Assignment);
	  if (!Current || Current->isNot(TT_ConditionalExpr))
	    return;
	  next();
	  // Operand after the TT_ConditionalExpr token.
	  parse(prec::Assignment);
	  addFakeParenthesis(Start, prec::Conditional);
	}

	/// Advances \c Current to the next token and then skips over trailing
	/// comments.
	///
	/// \param SkipPastLeadingComments when false, a trailing comment that has
	/// newlines before it (NewlinesBefore != 0) is not skipped.
	void next(bool SkipPastLeadingComments = true) {
	  if (Current)
	    Current = Current->Next;
	  while (Current &&
	         (Current->NewlinesBefore == 0 || SkipPastLeadingComments) &&
	         Current->isTrailingComment())
	    Current = Current->Next;
	}

	const FormatStyle &Style;
	const AdditionalKeywords &Keywords;
	FormatToken *Current;
	};

	} // end anonymous namespace

	// Walks Lines from last to first and adjusts the Level of lines that consist
	// only of comment tokens, using the most recently recorded following line
	// (NextNonCommentLine); afterwards recurses into each line's Children.
	void TokenAnnotator::setCommentLineLevels(
	SmallVectorImpl<AnnotatedLine *> &Lines) {
	const AnnotatedLine *NextNonCommentLine = nullptr;
	for (SmallVectorImpl<AnnotatedLine *>::reverse_iterator I = Lines.rbegin(),
	E = Lines.rend();
	I != E; ++I) {
	// A line is a comment line only if every one of its tokens is a comment.
	bool CommentLine = true;
	for (const FormatToken *Tok = (*I)->First; Tok; Tok = Tok->Next) {
	if (!Tok->is(tok::comment)) {
	CommentLine = false;
	break;
	}
	}

	- if (NextNonCommentLine && CommentLine) {
	- // If the comment is currently aligned with the line immediately following
	- // it, that's probably intentional and we should keep it.
	- bool AlignedWithNextLine =
	- NextNonCommentLine->First->NewlinesBefore <= 1 &&
	- NextNonCommentLine->First->OriginalColumn ==
	- (*I)->First->OriginalColumn;
	- if (AlignedWithNextLine)
	- (*I)->Level = NextNonCommentLine->Level;
	+ // If the comment is currently aligned with the line immediately following
	+ // it, that's probably intentional and we should keep it.
	+ if (NextNonCommentLine && CommentLine &&
	+ NextNonCommentLine->First->NewlinesBefore <= 1 &&
	+ NextNonCommentLine->First->OriginalColumn ==
	+ (*I)->First->OriginalColumn) {
	+ // Align comments for preprocessor lines with the # in column 0.
	+ // Otherwise, align with the next line.
	+ (*I)->Level = (NextNonCommentLine->Type == LT_PreprocessorDirective ||
	+ NextNonCommentLine->Type == LT_ImportStatement)
	+ ? 0
	+ : NextNonCommentLine->Level;
	} else {
	// Remember this line for the lines before it, unless it starts with '}'.
	NextNonCommentLine = (*I)->First->isNot(tok::r_brace) ? (*I) : nullptr;
	}

	setCommentLineLevels((*I)->Children);
	}
	}

	// Returns the largest NestingLevel found among the tokens of 'Line', or 0 if
	// the line has no tokens.
	static unsigned maxNestingDepth(const AnnotatedLine &Line) {
	  unsigned Deepest = 0;
	  const auto *Tok = Line.First;
	  while (Tok != nullptr) {
	    Deepest = std::max(Deepest, Tok->NestingLevel);
	    Tok = Tok->Next;
	  }
	  return Deepest;
	}

	// Annotates 'Line' after first annotating all of its child lines: determines
	// the line type with AnnotatingParser, runs ExpressionParser over it, refines
	// the type for Objective-C lines and initializes the first token's spacing
	// and break flags. Lines that end up as LT_Invalid are left untouched beyond
	// the type assignment.
	void TokenAnnotator::annotate(AnnotatedLine &Line) {
	  for (AnnotatedLine *Child : Line.Children)
	    annotate(*Child);

	  AnnotatingParser Parser(Style, Line, Keywords);
	  Line.Type = Parser.parseLine();

	  // With very deep nesting, ExpressionParser uses lots of stack and the
	  // formatting algorithm is very slow. We're not going to do a good job here
	  // anyway - it's probably generated code being formatted by mistake.
	  // Just skip the whole line.
	  const bool TooDeeplyNested = maxNestingDepth(Line) > 50;
	  if (TooDeeplyNested)
	    Line.Type = LT_Invalid;
	  if (Line.Type == LT_Invalid)
	    return;

	  ExpressionParser ExprParser(Style, Keywords, Line);
	  ExprParser.parse();

	  // Objective-C lines are recognized by the type of their first token.
	  if (Line.startsWith(TT_ObjCMethodSpecifier)) {
	    Line.Type = LT_ObjCMethodDecl;
	  } else if (Line.startsWith(TT_ObjCDecl)) {
	    Line.Type = LT_ObjCDecl;
	  } else if (Line.startsWith(TT_ObjCProperty)) {
	    Line.Type = LT_ObjCProperty;
	  }

	  FormatToken *FirstTok = Line.First;
	  FirstTok->SpacesRequiredBefore = 1;
	  FirstTok->CanBreakBefore = FirstTok->MustBreakBefore;
	}

	// This function heuristically determines whether 'Current' starts the name of a
	// function declaration.
	static bool isFunctionDeclarationName(const FormatToken &Current,
	const AnnotatedLine &Line) {
	auto skipOperatorName = [](const FormatToken Next) -> const FormatToken {
	for (; Next; Next = Next->Next) {
	if (Next->is(TT_OverloadedOperatorLParen))
	return Next;
	if (Next->is(TT_OverloadedOperator))
	continue;
	if (Next->isOneOf(tok::kw_new, tok::kw_delete)) {
	// For 'new[]' and 'delete[]'.
	if (Next->Next && Next->Next->is(tok::l_square) && Next->Next->Next &&
	Next->Next->Next->is(tok::r_square))
	Next = Next->Next->Next;
	continue;
	}

	break;
	}
	return nullptr;
	};

	// Find parentheses of parameter list.
	const FormatToken *Next = Current.Next;
	if (Current.is(tok::kw_operator)) {
	if (Current.Previous && Current.Previous->is(tok::coloncolon))
	return false;
	Next = skipOperatorName(Next);
	} else {
	if (!Current.is(TT_StartOfName) \|\| Current.NestingLevel != 0)
	return false;
	for (; Next; Next = Next->Next) {
	if (Next->is(TT_TemplateOpener)) {
	Next = Next->MatchingParen;
	} else if (Next->is(tok::coloncolon)) {
	Next = Next->Next;
	if (!Next)
	return false;
	if (Next->is(tok::kw_operator)) {
	Next = skipOperatorName(Next->Next);
	break;
	}
	if (!Next->is(tok::identifier))
	return false;
	} else if (Next->is(tok::l_paren)) {
	break;
	} else {
	return false;
	}
	}
	}

	// Check whether parameter list can belong to a function declaration.
	if (!Next \|\| !Next->is(tok::l_paren) \|\| !Next->MatchingParen)
	return false;
	// If the lines ends with "{", this is likely an function definition.
	if (Line.Last->is(tok::l_brace))
	return true;
	if (Next->Next == Next->MatchingParen)
	return true; // Empty parentheses.
	// If there is an &/&& after the r_paren, this is likely a function.
	if (Next->MatchingParen->Next &&
	Next->MatchingParen->Next->is(TT_PointerOrReference))
	return true;
	for (const FormatToken *Tok = Next->Next; Tok && Tok != Next->MatchingParen;
	Tok = Tok->Next) {
	if (Tok->is(tok::l_paren) && Tok->MatchingParen) {
	Tok = Tok->MatchingParen;
	continue;
	}
	if (Tok->is(tok::kw_const) \|\| Tok->isSimpleTypeSpecifier() \|\|
	Tok->isOneOf(TT_PointerOrReference, TT_StartOfName, tok::ellipsis))
	return true;
	if (Tok->isOneOf(tok::l_brace, tok::string_literal, TT_ObjCMethodExpr) \|\|
	Tok->Tok.isLiteral())
	return false;
	}
	return false;
	}

	// Returns whether Style.AlwaysBreakAfterReturnType demands a line break after
	// the return type for 'Line'. Asserts that Line.MightBeFunctionDecl is set.
	bool TokenAnnotator::mustBreakForReturnType(const AnnotatedLine &Line) const {
	  assert(Line.MightBeFunctionDecl);

	  // The "TopLevel" variants never apply to nested lines (Level > 0).
	  if ((Style.AlwaysBreakAfterReturnType == FormatStyle::RTBS_TopLevel ||
	       Style.AlwaysBreakAfterReturnType ==
	           FormatStyle::RTBS_TopLevelDefinitions) &&
	      Line.Level > 0)
	    return false;

	  switch (Style.AlwaysBreakAfterReturnType) {
	  case FormatStyle::RTBS_None:
	    return false;
	  case FormatStyle::RTBS_All:
	  case FormatStyle::RTBS_TopLevel:
	    return true;
	  case FormatStyle::RTBS_AllDefinitions:
	  case FormatStyle::RTBS_TopLevelDefinitions:
	    return Line.mightBeFunctionDefinition();
	  }

	  return false;
	}

	// Computes the per-token formatting data for 'Line' (after doing the same for
	// all child lines): spacing, must/can-break flags, accumulated TotalLength,
	// SplitPenalty, unbreakable tail lengths and IndentLevel. Tokens recognized
	// by isFunctionDeclarationName are retyped as TT_FunctionDeclarationName.
	void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) {
	  for (SmallVectorImpl<AnnotatedLine *>::iterator I = Line.Children.begin(),
	                                                  E = Line.Children.end();
	       I != E; ++I) {
	    calculateFormattingInformation(**I);
	  }

	  Line.First->TotalLength =
	      Line.First->IsMultiline ? Style.ColumnLimit
	                              : Line.FirstStartColumn + Line.First->ColumnWidth;
	  // The loop starts at the second token, so Current->Previous is always set.
	  FormatToken *Current = Line.First->Next;
	  bool InFunctionDecl = Line.MightBeFunctionDecl;
	  while (Current) {
	    if (isFunctionDeclarationName(*Current, Line))
	      Current->Type = TT_FunctionDeclarationName;
	    if (Current->is(TT_LineComment)) {
	      if (Current->Previous->BlockKind == BK_BracedInit &&
	          Current->Previous->opensScope())
	        Current->SpacesRequiredBefore = Style.Cpp11BracedListStyle ? 0 : 1;
	      else
	        Current->SpacesRequiredBefore = Style.SpacesBeforeTrailingComments;

	      // If we find a trailing comment, iterate backwards to determine whether
	      // it seems to relate to a specific parameter. If so, break before that
	      // parameter to avoid changing the comment's meaning. E.g. don't move 'b'
	      // to the previous line in:
	      //   SomeFunction(a,
	      //                b, // comment
	      //                c);
	      if (!Current->HasUnescapedNewline) {
	        for (FormatToken *Parameter = Current->Previous; Parameter;
	             Parameter = Parameter->Previous) {
	          if (Parameter->isOneOf(tok::comment, tok::r_brace))
	            break;
	          if (Parameter->Previous && Parameter->Previous->is(tok::comma)) {
	            if (!Parameter->Previous->is(TT_CtorInitializerComma) &&
	                Parameter->HasUnescapedNewline)
	              Parameter->MustBreakBefore = true;
	            break;
	          }
	        }
	      }
	    } else if (Current->SpacesRequiredBefore == 0 &&
	               spaceRequiredBefore(Line, *Current)) {
	      Current->SpacesRequiredBefore = 1;
	    }

	    Current->MustBreakBefore =
	        Current->MustBreakBefore || mustBreakBefore(Line, *Current);

	    if (!Current->MustBreakBefore && InFunctionDecl &&
	        Current->is(TT_FunctionDeclarationName))
	      Current->MustBreakBefore = mustBreakForReturnType(Line);

	    Current->CanBreakBefore =
	        Current->MustBreakBefore || canBreakBefore(Line, *Current);
	    // A single child line attached to the previous token contributes to the
	    // length; a child ending in a trailing comment counts as a full column
	    // limit.
	    unsigned ChildSize = 0;
	    if (Current->Previous->Children.size() == 1) {
	      FormatToken &LastOfChild = *Current->Previous->Children[0]->Last;
	      ChildSize = LastOfChild.isTrailingComment() ? Style.ColumnLimit
	                                                  : LastOfChild.TotalLength + 1;
	    }
	    const FormatToken *Prev = Current->Previous;
	    // Whenever a break is forced here, a full ColumnLimit is added to the
	    // running length instead of the token's own width.
	    if (Current->MustBreakBefore || Prev->Children.size() > 1 ||
	        (Prev->Children.size() == 1 &&
	         Prev->Children[0]->First->MustBreakBefore) ||
	        Current->IsMultiline)
	      Current->TotalLength = Prev->TotalLength + Style.ColumnLimit;
	    else
	      Current->TotalLength = Prev->TotalLength + Current->ColumnWidth +
	                             ChildSize + Current->SpacesRequiredBefore;

	    if (Current->is(TT_CtorInitializerColon))
	      InFunctionDecl = false;

	    // FIXME: Only calculate this if CanBreakBefore is true once static
	    // initializers etc. are sorted out.
	    // FIXME: Move magic numbers to a better place.
	    Current->SplitPenalty = 20 * Current->BindingStrength +
	                            splitPenalty(Line, *Current, InFunctionDecl);

	    Current = Current->Next;
	  }

	  calculateUnbreakableTailLengths(Line);
	  // Second pass over all tokens: precompute role information and assign each
	  // token its IndentLevel, decreasing at a closer whose opener
	  // opensBlockOrBlockTypeList and increasing after such an opener.
	  unsigned IndentLevel = Line.Level;
	  for (Current = Line.First; Current != nullptr; Current = Current->Next) {
	    if (Current->Role)
	      Current->Role->precomputeFormattingInfos(Current);
	    if (Current->MatchingParen &&
	        Current->MatchingParen->opensBlockOrBlockTypeList(Style)) {
	      assert(IndentLevel > 0);
	      --IndentLevel;
	    }
	    Current->IndentLevel = IndentLevel;
	    if (Current->opensBlockOrBlockTypeList(Style))
	      ++IndentLevel;
	  }

	  DEBUG({ printDebugInfo(Line); });
	}

	// Walks 'Line' from its last token to its first and stores in each token's
	// UnbreakableTailLength the accumulated width (ColumnWidth plus
	// SpacesRequiredBefore) of the tokens after it, counted up to the nearest
	// following token that can be broken before or is a comment / string literal.
	void TokenAnnotator::calculateUnbreakableTailLengths(AnnotatedLine &Line) {
	  unsigned UnbreakableTailLength = 0;
	  FormatToken *Current = Line.Last;
	  while (Current) {
	    // The value stored excludes the current token itself.
	    Current->UnbreakableTailLength = UnbreakableTailLength;
	    if (Current->CanBreakBefore ||
	        Current->isOneOf(tok::comment, tok::string_literal)) {
	      UnbreakableTailLength = 0;
	    } else {
	      UnbreakableTailLength +=
	          Current->ColumnWidth + Current->SpacesRequiredBefore;
	    }
	    Current = Current->Previous;
	  }
	}

	unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line,
	const FormatToken &Tok,
	bool InFunctionDecl) {
	const FormatToken &Left = *Tok.Previous;
	const FormatToken &Right = Tok;

	if (Left.is(tok::semi))
	return 0;

	if (Style.Language == FormatStyle::LK_Java) {
	if (Right.isOneOf(Keywords.kw_extends, Keywords.kw_throws))
	return 1;
	if (Right.is(Keywords.kw_implements))
	return 2;
	if (Left.is(tok::comma) && Left.NestingLevel == 0)
	return 3;
	} else if (Style.Language == FormatStyle::LK_JavaScript) {
	if (Right.is(Keywords.kw_function) && Left.isNot(tok::comma))
	return 100;
	if (Left.is(TT_JsTypeColon))
	return 35;
	if ((Left.is(TT_TemplateString) && Left.TokenText.endswith("${")) \|\|
	(Right.is(TT_TemplateString) && Right.TokenText.startswith("}")))
	return 100;
	// Prefer breaking call chains (".foo") over empty "{}", "[]" or "()".
	if (Left.opensScope() && Right.closesScope())
	return 200;
	}

	if (Right.is(tok::identifier) && Right.Next && Right.Next->is(TT_DictLiteral))
	return 1;
	if (Right.is(tok::l_square)) {
	if (Style.Language == FormatStyle::LK_Proto)
	return 1;
	if (Left.is(tok::r_square))
	return 200;
	// Slightly prefer formatting local lambda definitions like functions.
	if (Right.is(TT_LambdaLSquare) && Left.is(tok::equal))
	return 35;
	if (!Right.isOneOf(TT_ObjCMethodExpr, TT_LambdaLSquare,
	TT_ArrayInitializerLSquare,
	TT_DesignatedInitializerLSquare))
	return 500;
	}

	if (Right.isOneOf(TT_StartOfName, TT_FunctionDeclarationName) \|\|
	Right.is(tok::kw_operator)) {
	if (Line.startsWith(tok::kw_for) && Right.PartOfMultiVariableDeclStmt)
	return 3;
	if (Left.is(TT_StartOfName))
	return 110;
	if (InFunctionDecl && Right.NestingLevel == 0)
	return Style.PenaltyReturnTypeOnItsOwnLine;
	return 200;
	}
	if (Right.is(TT_PointerOrReference))
	return 190;
	if (Right.is(TT_LambdaArrow))
	return 110;
	if (Left.is(tok::equal) && Right.is(tok::l_brace))
	return 160;
	if (Left.is(TT_CastRParen))
	return 100;
	if (Left.is(tok::coloncolon) \|\|
	(Right.is(tok::period) && Style.Language == FormatStyle::LK_Proto))
	return 500;
	if (Left.isOneOf(tok::kw_class, tok::kw_struct))
	return 5000;
	if (Left.is(tok::comment))
	return 1000;

	if (Left.isOneOf(TT_RangeBasedForLoopColon, TT_InheritanceColon,
	TT_CtorInitializerColon))
	return 2;

	if (Right.isMemberAccess()) {
	// Breaking before the "./->" of a chained call/member access is reasonably
	// cheap, as formatting those with one call per line is generally
	// desirable. In particular, it should be cheaper to break before the call
	// than it is to break inside a call's parameters, which could lead to weird
	// "hanging" indents. The exception is the very last "./->" to support this
	// frequent pattern:
	//
	// aaaaaaaa.aaaaaaaa.bbbbbbb().ccccccccccccccccccccc(
	// dddddddd);
	//
	// which might otherwise be blown up onto many lines. Here, clang-format
	// won't produce "hanging" indents anyway as there is no other trailing
	// call.
	//
	// Also apply higher penalty is not a call as that might lead to a wrapping
	// like:
	//
	// aaaaaaa
	// .aaaaaaaaa.bbbbbbbb(cccccccc);
	return !Right.NextOperator \|\| !Right.NextOperator->Previous->closesScope()
	? 150
	: 35;
	}

	if (Right.is(TT_TrailingAnnotation) &&
	(!Right.Next \|\| Right.Next->isNot(tok::l_paren))) {
	// Moving trailing annotations to the next line is fine for ObjC method
	// declarations.
	if (Line.startsWith(TT_ObjCMethodSpecifier))
	return 10;
	// Generally, breaking before a trailing annotation is bad unless it is
	// function-like. It seems to be especially preferable to keep standard
	// annotations (i.e. "const", "final" and "override") on the same line.
	// Use a slightly higher penalty after ")" so that annotations like
	// "const override" are kept together.
	bool is_short_annotation = Right.TokenText.size() < 10;
	return (Left.is(tok::r_paren) ? 100 : 120) + (is_short_annotation ? 50 : 0);
	}

	// In for-loops, prefer breaking at ',' and ';'.
	if (Line.startsWith(tok::kw_for) && Left.is(tok::equal))
	return 4;

	// In Objective-C method expressions, prefer breaking before "param:" over
	// breaking after it.
	if (Right.is(TT_SelectorName))
	return 0;
	if (Left.is(tok::colon) && Left.is(TT_ObjCMethodExpr))
	return Line.MightBeFunctionDecl ? 50 : 500;

	if (Left.is(tok::l_paren) && InFunctionDecl &&
	Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign)
	return 100;
	if (Left.is(tok::l_paren) && Left.Previous &&
	(Left.Previous->isOneOf(tok::kw_if, tok::kw_for) \|\|
	Left.Previous->endsSequence(tok::kw_constexpr, tok::kw_if)))
	return 1000;
	if (Left.is(tok::equal) && InFunctionDecl)
	return 110;
	if (Right.is(tok::r_brace))
	return 1;
	if (Left.is(TT_TemplateOpener))
	return 100;
	if (Left.opensScope()) {
	if (Style.AlignAfterOpenBracket == FormatStyle::BAS_DontAlign)
	return 0;
	return Left.ParameterCount > 1 ? Style.PenaltyBreakBeforeFirstCallParameter
	: 19;
	}
	if (Left.is(TT_JavaAnnotation))
	return 50;

	if (Left.isOneOf(tok::plus, tok::comma) && Left.Previous &&
	Left.Previous->isLabelString() &&
	(Left.NextOperator \|\| Left.OperatorIndex != 0))
	return 50;
	if (Right.is(tok::plus) && Left.isLabelString() &&
	(Right.NextOperator \|\| Right.OperatorIndex != 0))
	return 25;
	if (Left.is(tok::comma))
	return 1;
	if (Right.is(tok::lessless) && Left.isLabelString() &&
	(Right.NextOperator \|\| Right.OperatorIndex != 1))
	return 25;
	if (Right.is(tok::lessless)) {
	// Breaking at a << is really cheap.
	if (!Left.is(tok::r_paren) \|\| Right.OperatorIndex > 0)
	// Slightly prefer to break before the first one in log-like statements.
	return 2;
	return 1;
	}
	if (Left.is(TT_ConditionalExpr))
	return prec::Conditional;
	prec::Level Level = Left.getPrecedence();
	if (Level == prec::Unknown)
	Level = Right.getPrecedence();
	if (Level == prec::Assignment)
	return Style.PenaltyBreakAssignment;
	if (Level != prec::Unknown)
	return Level;

	return 3;
	}

	bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line,
	const FormatToken &Left,
	const FormatToken &Right) {
	if (Left.is(tok::kw_return) && Right.isNot(tok::semi))
	return true;
	if (Left.is(Keywords.kw_assert) && Style.Language == FormatStyle::LK_Java)
	return true;
	if (Style.ObjCSpaceAfterProperty && Line.Type == LT_ObjCProperty &&
	Left.Tok.getObjCKeywordID() == tok::objc_property)
	return true;
	if (Right.is(tok::hashhash))
	return Left.is(tok::hash);
	if (Left.isOneOf(tok::hashhash, tok::hash))
	return Right.is(tok::hash);
	if (Left.is(tok::l_paren) && Right.is(tok::r_paren))
	return Style.SpaceInEmptyParentheses;
	if (Left.is(tok::l_paren) \|\| Right.is(tok::r_paren))
	return (Right.is(TT_CastRParen) \|\|
	(Left.MatchingParen && Left.MatchingParen->is(TT_CastRParen)))
	? Style.SpacesInCStyleCastParentheses
	: Style.SpacesInParentheses;
	if (Right.isOneOf(tok::semi, tok::comma))
	return false;
	if (Right.is(tok::less) && Line.Type == LT_ObjCDecl &&
	Style.ObjCSpaceBeforeProtocolList)
	return true;
	if (Right.is(tok::less) && Left.is(tok::kw_template))
	return Style.SpaceAfterTemplateKeyword;
	if (Left.isOneOf(tok::exclaim, tok::tilde))
	return false;
	if (Left.is(tok::at) &&
	Right.isOneOf(tok::identifier, tok::string_literal, tok::char_constant,
	tok::numeric_constant, tok::l_paren, tok::l_brace,
	tok::kw_true, tok::kw_false))
	return false;
	if (Left.is(tok::colon))
	return !Left.is(TT_ObjCMethodExpr);
	if (Left.is(tok::coloncolon))
	return false;
	if (Left.is(tok::less) \|\| Right.isOneOf(tok::greater, tok::less))
	return false;
	if (Right.is(tok::ellipsis))
	return Left.Tok.isLiteral() \|\| (Left.is(tok::identifier) && Left.Previous &&
	Left.Previous->is(tok::kw_case));
	if (Left.is(tok::l_square) && Right.is(tok::amp))
	return false;
	if (Right.is(TT_PointerOrReference)) {
	if (Left.is(tok::r_paren) && Line.MightBeFunctionDecl) {
	if (!Left.MatchingParen)
	return true;
	FormatToken *TokenBeforeMatchingParen =
	Left.MatchingParen->getPreviousNonComment();
	if (!TokenBeforeMatchingParen \|\|
	!TokenBeforeMatchingParen->isOneOf(tok::kw_typeof, tok::kw_decltype))
	return true;
	}
	return (Left.Tok.isLiteral() \|\|
	(!Left.isOneOf(TT_PointerOrReference, tok::l_paren) &&
	(Style.PointerAlignment != FormatStyle::PAS_Left \|\|
	(Line.IsMultiVariableDeclStmt &&
	(Left.NestingLevel == 0 \|\|
	(Left.NestingLevel == 1 && Line.First->is(tok::kw_for)))))));
	}
	if (Right.is(TT_FunctionTypeLParen) && Left.isNot(tok::l_paren) &&
	(!Left.is(TT_PointerOrReference) \|\|
	(Style.PointerAlignment != FormatStyle::PAS_Right &&
	!Line.IsMultiVariableDeclStmt)))
	return true;
	if (Left.is(TT_PointerOrReference))
	return Right.Tok.isLiteral() \|\| Right.is(TT_BlockComment) \|\|
	(Right.isOneOf(Keywords.kw_override, Keywords.kw_final) &&
	!Right.is(TT_StartOfName)) \|\|
	(Right.is(tok::l_brace) && Right.BlockKind == BK_Block) \|\|
	(!Right.isOneOf(TT_PointerOrReference, TT_ArraySubscriptLSquare,
	tok::l_paren) &&
	(Style.PointerAlignment != FormatStyle::PAS_Right &&
	!Line.IsMultiVariableDeclStmt) &&
	Left.Previous &&
	!Left.Previous->isOneOf(tok::l_paren, tok::coloncolon));
	if (Right.is(tok::star) && Left.is(tok::l_paren))
	return false;
	if (Left.is(tok::l_square))
	return (Left.is(TT_ArrayInitializerLSquare) &&
	Style.SpacesInContainerLiterals && Right.isNot(tok::r_square)) \|\|
	(Left.isOneOf(TT_ArraySubscriptLSquare,
	TT_StructuredBindingLSquare) &&
	Style.SpacesInSquareBrackets && Right.isNot(tok::r_square));
	if (Right.is(tok::r_square))
	return Right.MatchingParen &&
	((Style.SpacesInContainerLiterals &&
	Right.MatchingParen->is(TT_ArrayInitializerLSquare)) \|\|
	(Style.SpacesInSquareBrackets &&
	Right.MatchingParen->isOneOf(TT_ArraySubscriptLSquare,
	TT_StructuredBindingLSquare)));
	if (Right.is(tok::l_square) &&
	!Right.isOneOf(TT_ObjCMethodExpr, TT_LambdaLSquare,
	TT_DesignatedInitializerLSquare,
	TT_StructuredBindingLSquare) &&
	!Left.isOneOf(tok::numeric_constant, TT_DictLiteral))
	return false;
	if (Left.is(tok::l_brace) && Right.is(tok::r_brace))
	return !Left.Children.empty(); // No spaces in "{}".
	if ((Left.is(tok::l_brace) && Left.BlockKind != BK_Block) \|\|
	(Right.is(tok::r_brace) && Right.MatchingParen &&
	Right.MatchingParen->BlockKind != BK_Block))
	return !Style.Cpp11BracedListStyle;
	if (Left.is(TT_BlockComment))
	return !Left.TokenText.endswith("=*/");
	if (Right.is(tok::l_paren)) {
	if (Left.is(tok::r_paren) && Left.is(TT_AttributeParen))
	return true;
	return Line.Type == LT_ObjCDecl \|\| Left.is(tok::semi) \|\|
	(Style.SpaceBeforeParens != FormatStyle::SBPO_Never &&
	(Left.isOneOf(tok::kw_if, tok::pp_elif, tok::kw_for, tok::kw_while,
	tok::kw_switch, tok::kw_case, TT_ForEachMacro,
	TT_ObjCForIn) \|\|
	Left.endsSequence(tok::kw_constexpr, tok::kw_if) \|\|
	(Left.isOneOf(tok::kw_try, Keywords.kw___except, tok::kw_catch,
	tok::kw_new, tok::kw_delete) &&
	(!Left.Previous \|\| Left.Previous->isNot(tok::period))))) \|\|
	(Style.SpaceBeforeParens == FormatStyle::SBPO_Always &&
	(Left.is(tok::identifier) \|\| Left.isFunctionLikeKeyword() \|\|
	Left.is(tok::r_paren)) &&
	Line.Type != LT_PreprocessorDirective);
	}
	if (Left.is(tok::at) && Right.Tok.getObjCKeywordID() != tok::objc_not_keyword)
	return false;
	if (Right.is(TT_UnaryOperator))
	return !Left.isOneOf(tok::l_paren, tok::l_square, tok::at) &&
	(Left.isNot(tok::colon) \|\| Left.isNot(TT_ObjCMethodExpr));
	if ((Left.isOneOf(tok::identifier, tok::greater, tok::r_square,
	tok::r_paren) \|\|
	Left.isSimpleTypeSpecifier()) &&
	Right.is(tok::l_brace) && Right.getNextNonComment() &&
	Right.BlockKind != BK_Block)
	return false;
	if (Left.is(tok::period) \|\| Right.is(tok::period))
	return false;
	if (Right.is(tok::hash) && Left.is(tok::identifier) && Left.TokenText == "L")
	return false;
	if (Left.is(TT_TemplateCloser) && Left.MatchingParen &&
	Left.MatchingParen->Previous &&
	Left.MatchingParen->Previous->is(tok::period))
	// A.<B<C<...>>>DoSomething();
	return false;
	if (Left.is(TT_TemplateCloser) && Right.is(tok::l_square))
	return false;
	return true;
	}

	bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
	const FormatToken &Right) {
	const FormatToken &Left = *Right.Previous;
	if (Right.Tok.getIdentifierInfo() && Left.Tok.getIdentifierInfo())
	return true; // Never ever merge two identifiers.
	if (Style.isCpp()) {
	if (Left.is(tok::kw_operator))
	return Right.is(tok::coloncolon);
	} else if (Style.Language == FormatStyle::LK_Proto \|\|
	Style.Language == FormatStyle::LK_TextProto) {
	if (Right.is(tok::period) &&
	Left.isOneOf(Keywords.kw_optional, Keywords.kw_required,
	Keywords.kw_repeated, Keywords.kw_extend))
	return true;
	if (Right.is(tok::l_paren) &&
	Left.isOneOf(Keywords.kw_returns, Keywords.kw_option))
	return true;
	if (Right.isOneOf(tok::l_brace, tok::less) && Left.is(TT_SelectorName))
	return true;
	} else if (Style.Language == FormatStyle::LK_JavaScript) {
	if (Left.is(TT_JsFatArrow))
	return true;
	// for await ( ...
	if (Right.is(tok::l_paren) && Left.is(Keywords.kw_await) && Left.Previous &&
	Left.Previous->is(tok::kw_for))
	return true;
	if (Left.is(Keywords.kw_async) && Right.is(tok::l_paren) &&
	Right.MatchingParen) {
	const FormatToken *Next = Right.MatchingParen->getNextNonComment();
	// An async arrow function, for example: `x = async () => foo();`,
	// as opposed to calling a function called async: `x = async();`
	if (Next && Next->is(TT_JsFatArrow))
	return true;
	}
	if ((Left.is(TT_TemplateString) && Left.TokenText.endswith("${")) \|\|
	(Right.is(TT_TemplateString) && Right.TokenText.startswith("}")))
	return false;
	// In tagged template literals ("html`bar baz`"), there is no space between
	// the tag identifier and the template string. getIdentifierInfo makes sure
	// that the identifier is not a pseudo keyword like `yield`, either.
	if (Left.is(tok::identifier) && Keywords.IsJavaScriptIdentifier(Left) &&
	Right.is(TT_TemplateString))
	return false;
	if (Right.is(tok::star) &&
	Left.isOneOf(Keywords.kw_function, Keywords.kw_yield))
	return false;
	if (Right.isOneOf(tok::l_brace, tok::l_square) &&
	Left.isOneOf(Keywords.kw_function, Keywords.kw_yield,
	Keywords.kw_extends, Keywords.kw_implements))
	return true;
	if (Right.is(tok::l_paren)) {
	// JS methods can use some keywords as names (e.g. `delete()`).
	if (Line.MustBeDeclaration && Left.Tok.getIdentifierInfo())
	return false;
	// Valid JS method names can include keywords, e.g. `foo.delete()` or
	// `bar.instanceof()`. Recognize call positions by preceding period.
	if (Left.Previous && Left.Previous->is(tok::period) &&
	Left.Tok.getIdentifierInfo())
	return false;
	// Additional unary JavaScript operators that need a space after.
	if (Left.isOneOf(tok::kw_throw, Keywords.kw_await, Keywords.kw_typeof,
	tok::kw_void))
	return true;
	}
	if ((Left.isOneOf(Keywords.kw_let, Keywords.kw_var, Keywords.kw_in,
	tok::kw_const) \|\|
	// "of" is only a keyword if it appears after another identifier
	// (e.g. as "const x of y" in a for loop), or after a destructuring
	// operation (const [x, y] of z, const {a, b} of c).
	(Left.is(Keywords.kw_of) && Left.Previous &&
	(Left.Previous->Tok.getIdentifierInfo() \|\|
	Left.Previous->isOneOf(tok::r_square, tok::r_brace)))) &&
	(!Left.Previous \|\| !Left.Previous->is(tok::period)))
	return true;
	if (Left.isOneOf(tok::kw_for, Keywords.kw_as) && Left.Previous &&
	Left.Previous->is(tok::period) && Right.is(tok::l_paren))
	return false;
	if (Left.is(Keywords.kw_as) &&
	Right.isOneOf(tok::l_square, tok::l_brace, tok::l_paren))
	return true;
	if (Left.is(tok::kw_default) && Left.Previous &&
	Left.Previous->is(tok::kw_export))
	return true;
	if (Left.is(Keywords.kw_is) && Right.is(tok::l_brace))
	return true;
	if (Right.isOneOf(TT_JsTypeColon, TT_JsTypeOptionalQuestion))
	return false;
	if (Left.is(TT_JsTypeOperator) \|\| Right.is(TT_JsTypeOperator))
	return false;
	if ((Left.is(tok::l_brace) \|\| Right.is(tok::r_brace)) &&
	Line.First->isOneOf(Keywords.kw_import, tok::kw_export))
	return false;
	if (Left.is(tok::ellipsis))
	return false;
	if (Left.is(TT_TemplateCloser) &&
	!Right.isOneOf(tok::equal, tok::l_brace, tok::comma, tok::l_square,
	Keywords.kw_implements, Keywords.kw_extends))
	// Type assertions ('<type>expr') are not followed by whitespace. Other
	// locations that should have whitespace following are identified by the
	// above set of follower tokens.
	return false;
	if (Right.is(TT_JsNonNullAssertion))
	return false;
	if (Left.is(TT_JsNonNullAssertion) &&
	Right.isOneOf(Keywords.kw_as, Keywords.kw_in))
	return true; // "x! as string", "x! in y"
	} else if (Style.Language == FormatStyle::LK_Java) {
	if (Left.is(tok::r_square) && Right.is(tok::l_brace))
	return true;
	if (Left.is(Keywords.kw_synchronized) && Right.is(tok::l_paren))
	return Style.SpaceBeforeParens != FormatStyle::SBPO_Never;
	if ((Left.isOneOf(tok::kw_static, tok::kw_public, tok::kw_private,
	tok::kw_protected) \|\|
	Left.isOneOf(Keywords.kw_final, Keywords.kw_abstract,
	Keywords.kw_native)) &&
	Right.is(TT_TemplateOpener))
	return true;
	}
	if (Left.is(TT_ImplicitStringLiteral))
	return Right.WhitespaceRange.getBegin() != Right.WhitespaceRange.getEnd();
	if (Line.Type == LT_ObjCMethodDecl) {
	if (Left.is(TT_ObjCMethodSpecifier))
	return true;
	if (Left.is(tok::r_paren) && Right.is(tok::identifier))
	// Don't space between ')' and <id>
	return false;
	}
	if (Line.Type == LT_ObjCProperty &&
	(Right.is(tok::equal) \|\| Left.is(tok::equal)))
	return false;

	if (Right.isOneOf(TT_TrailingReturnArrow, TT_LambdaArrow) \|\|
	Left.isOneOf(TT_TrailingReturnArrow, TT_LambdaArrow))
	return true;
	if (Right.is(TT_OverloadedOperatorLParen))
	return Style.SpaceBeforeParens == FormatStyle::SBPO_Always;
	if (Left.is(tok::comma))
	return true;
	if (Right.is(tok::comma))
	return false;
	if (Right.isOneOf(TT_CtorInitializerColon, TT_ObjCBlockLParen))
	return true;
	if (Right.is(tok::colon)) {
	if (Line.First->isOneOf(tok::kw_case, tok::kw_default) \|\|
	!Right.getNextNonComment() \|\| Right.getNextNonComment()->is(tok::semi))
	return false;
	if (Right.is(TT_ObjCMethodExpr))
	return false;
	if (Left.is(tok::question))
	return false;
	if (Right.is(TT_InlineASMColon) && Left.is(tok::coloncolon))
	return false;
	if (Right.is(TT_DictLiteral))
	return Style.SpacesInContainerLiterals;
	return true;
	}
	if (Left.is(TT_UnaryOperator))
	return Right.is(TT_BinaryOperator);

	// If the next token is a binary operator or a selector name, we have
	// incorrectly classified the parenthesis as a cast. FIXME: Detect correctly.
	if (Left.is(TT_CastRParen))
	return Style.SpaceAfterCStyleCast \|\|
	Right.isOneOf(TT_BinaryOperator, TT_SelectorName);

	if (Left.is(tok::greater) && Right.is(tok::greater))
	return Right.is(TT_TemplateCloser) && Left.is(TT_TemplateCloser) &&
	(Style.Standard != FormatStyle::LS_Cpp11 \|\| Style.SpacesInAngles);
	if (Right.isOneOf(tok::arrow, tok::arrowstar, tok::periodstar) \|\|
	Left.isOneOf(tok::arrow, tok::period, tok::arrowstar, tok::periodstar) \|\|
	(Right.is(tok::period) && Right.isNot(TT_DesignatedInitializerPeriod)))
	return false;
	if (!Style.SpaceBeforeAssignmentOperators &&
	Right.getPrecedence() == prec::Assignment)
	return false;
	if (Right.is(tok::coloncolon) && Left.is(tok::identifier))
	// Generally don't remove existing spaces between an identifier and "::".
	// The identifier might actually be a macro name such as ALWAYS_INLINE. If
	// this turns out to be too lenient, add analysis of the identifier itself.
	return Right.WhitespaceRange.getBegin() != Right.WhitespaceRange.getEnd();
	if (Right.is(tok::coloncolon) && !Left.isOneOf(tok::l_brace, tok::comment))
	return (Left.is(TT_TemplateOpener) &&
	Style.Standard == FormatStyle::LS_Cpp03) \|\|
	!(Left.isOneOf(tok::l_paren, tok::r_paren, tok::l_square,
	tok::kw___super, TT_TemplateCloser,
	TT_TemplateOpener));
	if ((Left.is(TT_TemplateOpener)) != (Right.is(TT_TemplateCloser)))
	return Style.SpacesInAngles;
	// Space before TT_StructuredBindingLSquare.
	if (Right.is(TT_StructuredBindingLSquare))
	return !Left.isOneOf(tok::amp, tok::ampamp) \|\|
	Style.PointerAlignment != FormatStyle::PAS_Right;
	// Space before & or && following a TT_StructuredBindingLSquare.
	if (Right.Next && Right.Next->is(TT_StructuredBindingLSquare) &&
	Right.isOneOf(tok::amp, tok::ampamp))
	return Style.PointerAlignment != FormatStyle::PAS_Left;
	if ((Right.is(TT_BinaryOperator) && !Left.is(tok::l_paren)) \|\|
	(Left.isOneOf(TT_BinaryOperator, TT_ConditionalExpr) &&
	!Right.is(tok::r_paren)))
	return true;
	if (Left.is(TT_TemplateCloser) && Right.is(tok::l_paren) &&
	Right.isNot(TT_FunctionTypeLParen))
	return Style.SpaceBeforeParens == FormatStyle::SBPO_Always;
	if (Right.is(TT_TemplateOpener) && Left.is(tok::r_paren) &&
	Left.MatchingParen && Left.MatchingParen->is(TT_OverloadedOperatorLParen))
	return false;
	if (Right.is(tok::less) && Left.isNot(tok::l_paren) &&
	Line.startsWith(tok::hash))
	return true;
	if (Right.is(TT_TrailingUnaryOperator))
	return false;
	if (Left.is(TT_RegexLiteral))
	return false;
	return spaceRequiredBetween(Line, Left, Right);
	}

	// Tells whether 'Tok' is an opening brace of a real block (BK_Block) that is
	// neither an ObjC block brace nor a dict literal, i.e. a brace we'd want to
	// break before in Allman style.
	static bool isAllmanBrace(const FormatToken &Tok) {
	  if (!Tok.is(tok::l_brace))
	    return false;
	  if (Tok.BlockKind != BK_Block)
	    return false;
	  return !Tok.isOneOf(TT_ObjCBlockLBrace, TT_DictLiteral);
	}

	// Decides whether a line break is mandatory immediately before 'Right'.
	// 'Left' is Right.Previous, which is dereferenced unconditionally, so 'Right'
	// must have a predecessor. The checks form an ordered cascade: the first
	// rule whose condition matches decides the result, and 'false' is returned
	// when no rule applies.
	bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line,
	const FormatToken &Right) {
	const FormatToken &Left = *Right.Previous;
	// Preserve an existing blank line (more than one newline before 'Right') as
	// long as the style keeps at least one empty line.
	if (Right.NewlinesBefore > 1 && Style.MaxEmptyLinesToKeep > 0)
	return true;

	// Language-specific rules.
	if (Style.Language == FormatStyle::LK_JavaScript) {
	// FIXME: This might apply to other languages and token kinds.
	if (Right.is(tok::string_literal) && Left.is(tok::plus) && Left.Previous &&
	Left.Previous->is(tok::string_literal))
	return true;
	if (Left.is(TT_DictLiteral) && Left.is(tok::l_brace) && Line.Level == 0 &&
	Left.Previous && Left.Previous->is(tok::equal) &&
	Line.First->isOneOf(tok::identifier, Keywords.kw_import, tok::kw_export,
	tok::kw_const) &&
	// kw_var/kw_let are pseudo-tokens that are tok::identifier, so match
	// above.
	!Line.First->isOneOf(Keywords.kw_var, Keywords.kw_let))
	// Object literals on the top level of a file are treated as "enum-style".
	// Each key/value pair is put on a separate line, instead of bin-packing.
	return true;
	if (Left.is(tok::l_brace) && Line.Level == 0 &&
	(Line.startsWith(tok::kw_enum) \|\|
	Line.startsWith(tok::kw_const, tok::kw_enum) \|\|
	Line.startsWith(tok::kw_export, tok::kw_enum) \|\|
	Line.startsWith(tok::kw_export, tok::kw_const, tok::kw_enum)))
	// JavaScript top-level enum key/value pairs are put on separate lines
	// instead of bin-packing.
	return true;
	if (Right.is(tok::r_brace) && Left.is(tok::l_brace) &&
	!Left.Children.empty())
	// Support AllowShortFunctionsOnASingleLine for JavaScript.
	return Style.AllowShortFunctionsOnASingleLine == FormatStyle::SFS_None \|\|
	Style.AllowShortFunctionsOnASingleLine == FormatStyle::SFS_Empty \|\|
	(Left.NestingLevel == 0 && Line.Level == 0 &&
	Style.AllowShortFunctionsOnASingleLine &
	FormatStyle::SFS_InlineOnly);
	} else if (Style.Language == FormatStyle::LK_Java) {
	if (Right.is(tok::plus) && Left.is(tok::string_literal) && Right.Next &&
	Right.Next->is(tok::string_literal))
	return true;
	} else if (Style.Language == FormatStyle::LK_Cpp \|\|
	Style.Language == FormatStyle::LK_ObjC \|\|
	Style.Language == FormatStyle::LK_Proto) {
	// Two adjacent string literals are always put on separate lines.
	if (Left.isStringLiteral() && Right.isStringLiteral())
	return true;
	}

	// If the last token before a '}', ']', or ')' is a comma or a trailing
	// comment, the intention is to insert a line break after it in order to make
	// shuffling around entries easier. Import statements, especially in
	// JavaScript, can be an exception to this rule.
	if (Style.JavaScriptWrapImports \|\| Line.Type != LT_ImportStatement) {
	const FormatToken *BeforeClosingBrace = nullptr;
	if ((Left.isOneOf(tok::l_brace, TT_ArrayInitializerLSquare) \|\|
	(Style.Language == FormatStyle::LK_JavaScript &&
	Left.is(tok::l_paren))) &&
	Left.BlockKind != BK_Block && Left.MatchingParen)
	BeforeClosingBrace = Left.MatchingParen->Previous;
	else if (Right.MatchingParen &&
	(Right.MatchingParen->isOneOf(tok::l_brace,
	TT_ArrayInitializerLSquare) \|\|
	(Style.Language == FormatStyle::LK_JavaScript &&
	Right.MatchingParen->is(tok::l_paren))))
	BeforeClosingBrace = &Left;
	if (BeforeClosingBrace && (BeforeClosingBrace->is(tok::comma) \|\|
	BeforeClosingBrace->isTrailingComment()))
	return true;
	}

	// A comment keeps its own line only if it already started on a new,
	// unescaped line and does not directly follow a braced-init '{' or a
	// constructor initializer colon.
	if (Right.is(tok::comment))
	return Left.BlockKind != BK_BracedInit &&
	Left.isNot(TT_CtorInitializerColon) &&
	(Right.NewlinesBefore > 0 && Right.HasUnescapedNewline);
	if (Left.isTrailingComment())
	return true;
	if (Right.Previous->IsUnterminatedLiteral)
	return true;
	// "str" << "str": break before the '<<' between two string literals.
	if (Right.is(tok::lessless) && Right.Next &&
	Right.Previous->is(tok::string_literal) &&
	Right.Next->is(tok::string_literal))
	return true;
	if (Right.Previous->ClosesTemplateDeclaration &&
	Right.Previous->MatchingParen &&
	Right.Previous->MatchingParen->NestingLevel == 0 &&
	Style.AlwaysBreakTemplateDeclarations)
	return true;
	// With BCIS_BeforeComma (and initializers not forced onto one line or one
	// per line), both the initializer commas and the initializer colon start a
	// new line.
	if (Right.is(TT_CtorInitializerComma) &&
	Style.BreakConstructorInitializers == FormatStyle::BCIS_BeforeComma &&
	!Style.ConstructorInitializerAllOnOneLineOrOnePerLine)
	return true;
	if (Right.is(TT_CtorInitializerColon) &&
	Style.BreakConstructorInitializers == FormatStyle::BCIS_BeforeComma &&
	!Style.ConstructorInitializerAllOnOneLineOrOnePerLine)
	return true;
	// Break only if we have multiple inheritance.
	if (Style.BreakBeforeInheritanceComma && Right.is(TT_InheritanceComma))
	return true;
	if (Right.is(tok::string_literal) && Right.TokenText.startswith("R\""))
	// Raw string literals are special wrt. line breaks. The author has made a
	// deliberate choice and might have aligned the contents of the string
	// literal accordingly. Thus, we try to keep existing line breaks.
	return Right.NewlinesBefore > 0;
	if ((Right.Previous->is(tok::l_brace) \|\|
	(Right.Previous->is(tok::less) && Right.Previous->Previous &&
	Right.Previous->Previous->is(tok::equal))) &&
	Right.NestingLevel == 1 && Style.Language == FormatStyle::LK_Proto) {
	// Don't put enums or option definitions onto single lines in protocol
	// buffers.
	return true;
	}
	if (Right.is(TT_InlineASMBrace))
	return Right.HasUnescapedNewline;
	// Around a block brace, break only for enum/class/struct lines whose
	// corresponding BraceWrapping option is enabled.
	if (isAllmanBrace(Left) \|\| isAllmanBrace(Right))
	return (Line.startsWith(tok::kw_enum) && Style.BraceWrapping.AfterEnum) \|\|
	(Line.startsWith(tok::kw_typedef, tok::kw_enum) &&
	Style.BraceWrapping.AfterEnum) \|\|
	(Line.startsWith(tok::kw_class) && Style.BraceWrapping.AfterClass) \|\|
	(Line.startsWith(tok::kw_struct) && Style.BraceWrapping.AfterStruct);
	if (Left.is(TT_ObjCBlockLBrace) && !Style.AllowShortBlocksOnASingleLine)
	return true;

	// Java/JavaScript: break after a leading annotation when the line ends in
	// '{' or the style asks for a break after field annotations.
	if ((Style.Language == FormatStyle::LK_Java \|\|
	Style.Language == FormatStyle::LK_JavaScript) &&
	Left.is(TT_LeadingJavaAnnotation) &&
	Right.isNot(TT_LeadingJavaAnnotation) && Right.isNot(tok::l_paren) &&
	(Line.Last->is(tok::l_brace) \|\| Style.BreakAfterJavaFieldAnnotations))
	return true;

	return false;
	}

	// Decides whether a line break is permitted immediately before 'Right'.
	// 'Left' is Right.Previous, which is dereferenced unconditionally, so 'Right'
	// must have a predecessor. The checks form an ordered cascade: the first
	// rule whose condition matches decides the result; the final return lists
	// the remaining token pairs between which a break is allowed.
	bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
	const FormatToken &Right) {
	const FormatToken &Left = *Right.Previous;

	// Language-specific stuff.
	if (Style.Language == FormatStyle::LK_Java) {
	if (Left.isOneOf(Keywords.kw_throws, Keywords.kw_extends,
	Keywords.kw_implements))
	return false;
	if (Right.isOneOf(Keywords.kw_throws, Keywords.kw_extends,
	Keywords.kw_implements))
	return true;
	} else if (Style.Language == FormatStyle::LK_JavaScript) {
	const FormatToken *NonComment = Right.getPreviousNonComment();
	if (NonComment &&
	NonComment->isOneOf(
	tok::kw_return, Keywords.kw_yield, tok::kw_continue, tok::kw_break,
	tok::kw_throw, Keywords.kw_interface, Keywords.kw_type,
	tok::kw_static, tok::kw_public, tok::kw_private, tok::kw_protected,
	Keywords.kw_readonly, Keywords.kw_abstract, Keywords.kw_get,
	Keywords.kw_set, Keywords.kw_async, Keywords.kw_await))
	return false; // Otherwise automatic semicolon insertion would trigger.
	if (Left.Tok.getIdentifierInfo() &&
	Right.startsSequence(tok::l_square, tok::r_square))
	return false; // breaking in "foo[]" creates illegal TS type syntax.
	if (Left.is(TT_JsFatArrow) && Right.is(tok::l_brace))
	return false;
	if (Left.is(TT_JsTypeColon))
	return true;
	if (Right.NestingLevel == 0 && Right.is(Keywords.kw_is))
	return false;
	// Around 'in', break on the side selected by BreakBeforeBinaryOperators.
	if (Left.is(Keywords.kw_in))
	return Style.BreakBeforeBinaryOperators == FormatStyle::BOS_None;
	if (Right.is(Keywords.kw_in))
	return Style.BreakBeforeBinaryOperators != FormatStyle::BOS_None;
	if (Right.is(Keywords.kw_as))
	return false; // must not break before as in 'x as type' casts
	if (Left.is(Keywords.kw_as))
	return true;
	if (Left.is(TT_JsNonNullAssertion))
	return true;
	if (Left.is(Keywords.kw_declare) &&
	Right.isOneOf(Keywords.kw_module, tok::kw_namespace,
	Keywords.kw_function, tok::kw_class, tok::kw_enum,
	Keywords.kw_interface, Keywords.kw_type, Keywords.kw_var,
	Keywords.kw_let, tok::kw_const))
	// See grammar for 'declare' statements at:
	// https://github.com/Microsoft/TypeScript/blob/master/doc/spec.md#A.10
	return false;
	if (Left.isOneOf(Keywords.kw_module, tok::kw_namespace) &&
	Right.isOneOf(tok::identifier, tok::string_literal))
	return false; // must not break in "module foo { ...}"
	if (Right.is(TT_TemplateString) && Right.closesScope())
	return false;
	if (Left.is(TT_TemplateString) && Left.opensScope())
	return true;
	}

	// Rules that apply to every language.
	if (Left.is(tok::at))
	return false;
	if (Left.Tok.getObjCKeywordID() == tok::objc_interface)
	return false;
	if (Left.isOneOf(TT_JavaAnnotation, TT_LeadingJavaAnnotation))
	return !Right.is(tok::l_paren);
	if (Right.is(TT_PointerOrReference))
	return Line.IsMultiVariableDeclStmt \|\|
	(Style.PointerAlignment == FormatStyle::PAS_Right &&
	(!Right.Next \|\| Right.Next->isNot(TT_FunctionDeclarationName)));
	if (Right.isOneOf(TT_StartOfName, TT_FunctionDeclarationName) \|\|
	Right.is(tok::kw_operator))
	return true;
	if (Left.is(TT_PointerOrReference))
	return false;
	if (Right.isTrailingComment())
	// We rely on MustBreakBefore being set correctly here as we should not
	// change the "binding" behavior of a comment.
	// The first comment in a braced list is always interpreted as belonging to
	// the first list element. Otherwise, it should be placed outside of the
	// list.
	return Left.BlockKind == BK_BracedInit \|\|
	(Left.is(TT_CtorInitializerColon) &&
	Style.BreakConstructorInitializers == FormatStyle::BCIS_AfterColon);
	if (Left.is(tok::question) && Right.is(tok::colon))
	return false;
	// Ternary operators: break before or after them per
	// BreakBeforeTernaryOperators.
	if (Right.is(TT_ConditionalExpr) \|\| Right.is(tok::question))
	return Style.BreakBeforeTernaryOperators;
	if (Left.is(TT_ConditionalExpr) \|\| Left.is(tok::question))
	return !Style.BreakBeforeTernaryOperators;
	if (Right.is(TT_InheritanceColon))
	return true;
	if (Right.is(TT_ObjCMethodExpr) && !Right.is(tok::r_square) &&
	Left.isNot(TT_SelectorName))
	return true;
	if (Right.is(tok::colon) &&
	!Right.isOneOf(TT_CtorInitializerColon, TT_InlineASMColon))
	return false;
	if (Left.is(tok::colon) && Left.isOneOf(TT_DictLiteral, TT_ObjCMethodExpr))
	return true;
	if (Right.is(TT_SelectorName) \|\| (Right.is(tok::identifier) && Right.Next &&
	Right.Next->is(TT_ObjCMethodExpr)))
	return Left.isNot(tok::period); // FIXME: Properly parse ObjC calls.
	if (Left.is(tok::r_paren) && Line.Type == LT_ObjCProperty)
	return true;
	if (Left.ClosesTemplateDeclaration \|\| Left.is(TT_FunctionAnnotationRParen))
	return true;
	if (Right.isOneOf(TT_RangeBasedForLoopColon, TT_OverloadedOperatorLParen,
	TT_OverloadedOperator))
	return false;
	if (Left.is(TT_RangeBasedForLoopColon))
	return true;
	if (Right.is(TT_RangeBasedForLoopColon))
	return false;
	if (Left.is(TT_TemplateCloser) && Right.is(TT_TemplateOpener))
	return true;
	if (Left.isOneOf(TT_TemplateCloser, TT_UnaryOperator) \|\|
	Left.is(tok::kw_operator))
	return false;
	if (Left.is(tok::equal) && !Right.isOneOf(tok::kw_default, tok::kw_delete) &&
	Line.Type == LT_VirtualFunctionDecl && Left.NestingLevel == 0)
	return false;
	if (Left.is(tok::l_paren) && Left.is(TT_AttributeParen))
	return false;
	if (Left.is(tok::l_paren) && Left.Previous &&
	(Left.Previous->isOneOf(TT_BinaryOperator, TT_CastRParen)))
	return false;
	if (Right.is(TT_ImplicitStringLiteral))
	return false;

	if (Right.is(tok::r_paren) \|\| Right.is(TT_TemplateCloser))
	return false;
	if (Right.is(tok::r_square) && Right.MatchingParen &&
	Right.MatchingParen->is(TT_LambdaLSquare))
	return false;

	// We only break before r_brace if there was a corresponding break before
	// the l_brace, which is tracked by BreakBeforeClosingBrace.
	if (Right.is(tok::r_brace))
	return Right.MatchingParen && Right.MatchingParen->BlockKind == BK_Block;

	// Allow breaking after a trailing annotation, e.g. after a method
	// declaration.
	if (Left.is(TT_TrailingAnnotation))
	return !Right.isOneOf(tok::l_brace, tok::semi, tok::equal, tok::l_paren,
	tok::less, tok::coloncolon);

	if (Right.is(tok::kw___attribute))
	return true;

	if (Left.is(tok::identifier) && Right.is(tok::string_literal))
	return true;

	if (Right.is(tok::identifier) && Right.Next && Right.Next->is(TT_DictLiteral))
	return true;

	// Constructor initializers and inheritance lists: the allowed side of the
	// colon/comma follows the corresponding style option.
	if (Left.is(TT_CtorInitializerColon))
	return Style.BreakConstructorInitializers == FormatStyle::BCIS_AfterColon;
	if (Right.is(TT_CtorInitializerColon))
	return Style.BreakConstructorInitializers != FormatStyle::BCIS_AfterColon;
	if (Left.is(TT_CtorInitializerComma) &&
	Style.BreakConstructorInitializers == FormatStyle::BCIS_BeforeComma)
	return false;
	if (Right.is(TT_CtorInitializerComma) &&
	Style.BreakConstructorInitializers == FormatStyle::BCIS_BeforeComma)
	return true;
	if (Left.is(TT_InheritanceComma) && Style.BreakBeforeInheritanceComma)
	return false;
	if (Right.is(TT_InheritanceComma) && Style.BreakBeforeInheritanceComma)
	return true;
	// Never split '>' '>' or '<' '<' pairs.
	if ((Left.is(tok::greater) && Right.is(tok::greater)) \|\|
	(Left.is(tok::less) && Right.is(tok::less)))
	return false;
	// Break before a binary operator when the style asks for it (assignment
	// operators only with BOS_All).
	if (Right.is(TT_BinaryOperator) &&
	Style.BreakBeforeBinaryOperators != FormatStyle::BOS_None &&
	(Style.BreakBeforeBinaryOperators == FormatStyle::BOS_All \|\|
	Right.getPrecedence() != prec::Assignment))
	return true;
	if (Left.is(TT_ArrayInitializerLSquare))
	return true;
	if (Right.is(tok::kw_typename) && Left.isNot(tok::kw_const))
	return true;
	// Break after a binary operator (except '->*' and '<<') unless the style
	// breaks before it instead.
	if ((Left.isBinaryOperator() \|\| Left.is(TT_BinaryOperator)) &&
	!Left.isOneOf(tok::arrowstar, tok::lessless) &&
	Style.BreakBeforeBinaryOperators != FormatStyle::BOS_All &&
	(Style.BreakBeforeBinaryOperators == FormatStyle::BOS_None \|\|
	Left.getPrecedence() == prec::Assignment))
	return true;
	return Left.isOneOf(tok::comma, tok::coloncolon, tok::semi, tok::l_brace,
	tok::kw_class, tok::kw_struct, tok::comment) \|\|
	Right.isMemberAccess() \|\|
	Right.isOneOf(TT_TrailingReturnArrow, TT_LambdaArrow, tok::lessless,
	tok::colon, tok::l_square, tok::at) \|\|
	(Left.is(tok::r_paren) &&
	Right.isOneOf(tok::identifier, tok::kw_const)) \|\|
	(Left.is(tok::l_paren) && !Right.is(tok::r_paren)) \|\|
	(Left.is(TT_TemplateOpener) && !Right.is(TT_TemplateCloser));
	}

	void TokenAnnotator::printDebugInfo(const AnnotatedLine &Line) {
	llvm::errs() << "AnnotatedTokens(L=" << Line.Level << "):\n";
	const FormatToken *Tok = Line.First;
	while (Tok) {
	llvm::errs() << " M=" << Tok->MustBreakBefore
	<< " C=" << Tok->CanBreakBefore
	<< " T=" << getTokenTypeName(Tok->Type)
	<< " S=" << Tok->SpacesRequiredBefore
	<< " B=" << Tok->BlockParameterCount
	<< " BK=" << Tok->BlockKind << " P=" << Tok->SplitPenalty
	<< " Name=" << Tok->Tok.getName() << " L=" << Tok->TotalLength
	<< " PPK=" << Tok->PackingKind << " FakeLParens=";
	for (unsigned i = 0, e = Tok->FakeLParens.size(); i != e; ++i)
	llvm::errs() << Tok->FakeLParens[i] << "/";
	llvm::errs() << " FakeRParens=" << Tok->FakeRParens;
	llvm::errs() << " Text='" << Tok->TokenText << "'\n";
	if (!Tok->Next)
	assert(Tok == Line.Last);
	Tok = Tok->Next;
	}
	llvm::errs() << "----\n";
	}

	} // namespace format
	} // namespace clang
	Index: head/contrib/llvm/tools/clang/lib/Format/UnwrappedLineParser.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Format/UnwrappedLineParser.cpp (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/Format/UnwrappedLineParser.cpp (revision 329410)
	@@ -1,2523 +1,2537 @@
	//===--- UnwrappedLineParser.cpp - Format C++ code ------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// \brief This file contains the implementation of the UnwrappedLineParser,
	/// which turns a stream of tokens into UnwrappedLines.
	///
	//===----------------------------------------------------------------------===//

	#include "UnwrappedLineParser.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"

	#include <algorithm>

	#define DEBUG_TYPE "format-parser"

	namespace clang {
	namespace format {

	// Abstract, repositionable source of FormatTokens used by the parser.
	class FormatTokenSource {
	public:
	  virtual ~FormatTokenSource() = default;

	  // Advances the source and returns the token it now points at.
	  virtual FormatToken *getNextToken() = 0;

	  // Returns the current position, suitable for a later setPosition() call.
	  virtual unsigned getPosition() = 0;

	  // Repositions the source at 'Position' and returns the token found there.
	  virtual FormatToken *setPosition(unsigned Position) = 0;
	};

	namespace {

	// Scope guard for the "must be declaration" state. On construction it pushes
	// the new state onto 'Stack' and applies it to 'Line'; on destruction it pops
	// the stack and restores 'Line' to the enclosing state, or to 'true' when the
	// stack has become empty.
	class ScopedDeclarationState {
	public:
	  ScopedDeclarationState(UnwrappedLine &Line, std::vector<bool> &Stack,
	                         bool MustBeDeclaration)
	      : Line(Line), Stack(Stack) {
	    Stack.push_back(MustBeDeclaration);
	    Line.MustBeDeclaration = MustBeDeclaration;
	  }

	  ~ScopedDeclarationState() {
	    Stack.pop_back();
	    bool Restored = true;
	    if (!Stack.empty())
	      Restored = Stack.back();
	    Line.MustBeDeclaration = Restored;
	  }

	private:
	  UnwrappedLine &Line;
	  std::vector<bool> &Stack;
	};

	// A line comment is any comment token whose text does not start with "/*".
	static bool isLineComment(const FormatToken &FormatTok) {
	  if (!FormatTok.is(tok::comment))
	    return false;
	  return !FormatTok.TokenText.startswith("/*");
	}

	// Checks if \p FormatTok is a line comment that continues the line comment
	// \p Previous. The original column of \p MinColumnToken is used to determine
	// whether \p FormatTok is indented enough to the right to continue \p Previous.
	//
	// Returns false when either \p Previous or \p MinColumnToken is null.
	// Otherwise \p FormatTok must be a line comment preceded by exactly one
	// newline, \p Previous must be a line comment, and \p FormatTok must start at
	// or right of the minimum column: the column of \p MinColumnToken if that
	// token is itself a line comment, one column further right otherwise.
	static bool continuesLineComment(const FormatToken &FormatTok,
	                                 const FormatToken *Previous,
	                                 const FormatToken *MinColumnToken) {
	  if (!Previous)
	    return false;
	  if (!MinColumnToken)
	    return false;

	  const unsigned Slack = isLineComment(*MinColumnToken) ? 0 : 1;
	  const unsigned MinContinueColumn = MinColumnToken->OriginalColumn + Slack;

	  if (!isLineComment(FormatTok))
	    return false;
	  if (FormatTok.NewlinesBefore != 1)
	    return false;
	  if (!isLineComment(*Previous))
	    return false;
	  return FormatTok.OriginalColumn >= MinContinueColumn;
	}

	// Token source installed while a preprocessor directive is being parsed.
	//
	// On construction it replaces 'TokenSource' with itself, sets the line level
	// to 0 and marks the line as being inside a PP directive. Tokens are
	// forwarded from the previous source until one starts on a new unescaped
	// line (and is not a continuation of a line comment); from then on a fake
	// eof token is returned. On destruction the previous source, line level and
	// PP state are restored and 'ResetToken' receives the last token read.
	class ScopedMacroState : public FormatTokenSource {
	public:
	  ScopedMacroState(UnwrappedLine &Line, FormatTokenSource *&TokenSource,
	                   FormatToken *&ResetToken)
	      : Line(Line), TokenSource(TokenSource), ResetToken(ResetToken),
	        PreviousLineLevel(Line.Level), PreviousTokenSource(TokenSource),
	        Token(nullptr), PreviousToken(nullptr) {
	    TokenSource = this;
	    Line.Level = 0;
	    Line.InPPDirective = true;
	  }

	  ~ScopedMacroState() override {
	    TokenSource = PreviousTokenSource;
	    ResetToken = Token;
	    Line.InPPDirective = false;
	    Line.Level = PreviousLineLevel;
	  }

	  // Reads the next token from the wrapped source; yields the fake eof token
	  // once the directive's last line has been passed.
	  FormatToken *getNextToken() override {
	    // The \c UnwrappedLineParser guards against this by never calling
	    // \c getNextToken() after it has encountered the first eof token.
	    assert(!eof());
	    PreviousToken = Token;
	    Token = PreviousTokenSource->getNextToken();
	    return eof() ? getFakeEOF() : Token;
	  }

	  unsigned getPosition() override { return PreviousTokenSource->getPosition(); }

	  // Repositions the wrapped source; the remembered previous token is dropped
	  // because it no longer precedes the current one.
	  FormatToken *setPosition(unsigned Position) override {
	    PreviousToken = nullptr;
	    Token = PreviousTokenSource->setPosition(Position);
	    return Token;
	  }

	private:
	  // True when the current token begins a new unescaped line and does not
	  // continue a line comment started by the previous token.
	  bool eof() {
	    if (!Token)
	      return false;
	    if (!Token->HasUnescapedNewline)
	      return false;
	    return !continuesLineComment(*Token, PreviousToken,
	                                 /*MinColumnToken=*/PreviousToken);
	  }

	  // Returns a function-local static token of kind tok::eof, initialised on
	  // first use.
	  FormatToken *getFakeEOF() {
	    static bool EOFInitialized = false;
	    static FormatToken FormatTok;
	    if (!EOFInitialized) {
	      FormatTok.Tok.startToken();
	      FormatTok.Tok.setKind(tok::eof);
	      EOFInitialized = true;
	    }
	    return &FormatTok;
	  }

	  UnwrappedLine &Line;
	  FormatTokenSource *&TokenSource;
	  FormatToken *&ResetToken;
	  unsigned PreviousLineLevel;
	  FormatTokenSource *PreviousTokenSource;

	  FormatToken *Token;
	  FormatToken *PreviousToken;
	};

	} // end anonymous namespace

	// Scope guard that parks the parser's current line and lets parsing continue
	// on a fresh one (same Level and InPPDirective). New lines are collected
	// either in the preprocessor-directive list or, if the parked line has
	// tokens, in the children of its last token. The destructor flushes any
	// pending line and restores both the parked line and the original line list.
	class ScopedLineState {
	public:
	  ScopedLineState(UnwrappedLineParser &Parser,
	                  bool SwitchToPreprocessorLines = false)
	      : Parser(Parser), OriginalLines(Parser.CurrentLines) {
	    if (SwitchToPreprocessorLines) {
	      Parser.CurrentLines = &Parser.PreprocessorDirectives;
	    } else if (!Parser.Line->Tokens.empty()) {
	      Parser.CurrentLines = &Parser.Line->Tokens.back().Children;
	    }

	    PreBlockLine = std::move(Parser.Line);
	    auto Fresh = llvm::make_unique<UnwrappedLine>();
	    Fresh->Level = PreBlockLine->Level;
	    Fresh->InPPDirective = PreBlockLine->InPPDirective;
	    Parser.Line = std::move(Fresh);
	  }

	  ~ScopedLineState() {
	    const bool HasPendingTokens = !Parser.Line->Tokens.empty();
	    if (HasPendingTokens)
	      Parser.addUnwrappedLine();
	    assert(Parser.Line->Tokens.empty());

	    Parser.Line = std::move(PreBlockLine);
	    // Leaving the preprocessor line list forces a break before the next token.
	    if (Parser.CurrentLines == &Parser.PreprocessorDirectives)
	      Parser.MustBreakBeforeNextToken = true;
	    Parser.CurrentLines = OriginalLines;
	  }

	private:
	  UnwrappedLineParser &Parser;

	  std::unique_ptr<UnwrappedLine> PreBlockLine;
	  SmallVectorImpl<UnwrappedLine> *OriginalLines;
	};

	// Scope guard applied around a compound statement's braces: emits the pending
	// line first when BraceWrapping.AfterControlStatement is set, bumps the line
	// level by one when BraceWrapping.IndentBraces is set, and puts the level
	// recorded at construction back on destruction.
	class CompoundStatementIndenter {
	public:
	  CompoundStatementIndenter(UnwrappedLineParser *Parser,
	                            const FormatStyle &Style, unsigned &LineLevel)
	      : LineLevel(LineLevel), OldLineLevel(LineLevel) {
	    const auto &Wrapping = Style.BraceWrapping;
	    if (Wrapping.AfterControlStatement) {
	      Parser->addUnwrappedLine();
	    }
	    if (Wrapping.IndentBraces) {
	      LineLevel += 1;
	    }
	  }

	  ~CompoundStatementIndenter() { LineLevel = OldLineLevel; }

	private:
	  unsigned &LineLevel;
	  unsigned OldLineLevel;
	};

	namespace {

	// FormatTokenSource backed by an array of tokens and an integer cursor. The
	// cursor starts at -1, i.e. before the first token; no bounds checking is
	// done on any access.
	class IndexedTokenSource : public FormatTokenSource {
	public:
	  IndexedTokenSource(ArrayRef<FormatToken *> Tokens)
	      : Tokens(Tokens), Position(-1) {}

	  // Steps the cursor forward by one and returns the token it lands on.
	  FormatToken *getNextToken() override { return Tokens[++Position]; }

	  // Only valid after at least one token has been read.
	  unsigned getPosition() override {
	    assert(Position >= 0);
	    return Position;
	  }

	  // Jumps to index 'P' and returns the token stored there.
	  FormatToken *setPosition(unsigned P) override {
	    Position = P;
	    return Tokens[Position];
	  }

	  // Rewinds the cursor to just before the first token.
	  void reset() { Position = -1; }

	private:
	  ArrayRef<FormatToken *> Tokens;
	  int Position;
	};

	} // end anonymous namespace

	// Constructs a parser over 'Tokens' that reports its lines to 'Callback'.
	// The parser starts with a fresh UnwrappedLine, collects into 'Lines', has no
	// active token source (Tokens member is null) and PPBranchLevel -1.
	// NOTE: this span is a diff hunk. In the new revision ('+' lines) the include
	// guard state starts as IG_Rejected when Style.IndentPPDirectives is
	// PPDIS_None and as IG_Inited otherwise, replacing the three members
	// initialised on the '-' lines.
	UnwrappedLineParser::UnwrappedLineParser(const FormatStyle &Style,
	const AdditionalKeywords &Keywords,
	unsigned FirstStartColumn,
	ArrayRef<FormatToken *> Tokens,
	UnwrappedLineConsumer &Callback)
	: Line(new UnwrappedLine), MustBreakBeforeNextToken(false),
	CurrentLines(&Lines), Style(Style), Keywords(Keywords),
	CommentPragmasRegex(Style.CommentPragmas), Tokens(nullptr),
	Callback(Callback), AllTokens(Tokens), PPBranchLevel(-1),
	- IfNdefCondition(nullptr), FoundIncludeGuardStart(false),
	- IncludeGuardRejected(false), FirstStartColumn(FirstStartColumn) {}
	+ IncludeGuard(Style.IndentPPDirectives == FormatStyle::PPDIS_None
	+ ? IG_Rejected
	+ : IG_Inited),
	+ IncludeGuardToken(nullptr), FirstStartColumn(FirstStartColumn) {}

	// Puts the parser back into its initial state: the include guard state is
	// re-initialised, a fresh UnwrappedLine is installed, pending comments,
	// preprocessor directives and both scope stacks are cleared, and 'Lines'
	// becomes the current line list again.
	// NOTE: this span is a diff hunk; '-' lines are the old revision and '+'
	// lines the new one.
	void UnwrappedLineParser::reset() {
	PPBranchLevel = -1;
	- IfNdefCondition = nullptr;
	- FoundIncludeGuardStart = false;
	- IncludeGuardRejected = false;
	+ IncludeGuard = Style.IndentPPDirectives == FormatStyle::PPDIS_None
	+ ? IG_Rejected
	+ : IG_Inited;
	+ IncludeGuardToken = nullptr;
	Line.reset(new UnwrappedLine);
	CommentsBeforeNextToken.clear();
	FormatTok = nullptr;
	MustBreakBeforeNextToken = false;
	PreprocessorDirectives.clear();
	CurrentLines = &Lines;
	DeclarationScopeStack.clear();
	PPStack.clear();
	Line->FirstStartColumn = FirstStartColumn;
	}

	// Parses all tokens into unwrapped lines and hands them to the callback.
	// Each pass resets the parser and the token source, parses the whole file,
	// appends a line holding the eof token, forwards every collected line to
	// Callback.consumeUnwrappedLine() and then calls Callback.finishRun().
	// Passes repeat while PPLevelBranchIndex is non-empty; after each pass the
	// exhausted trailing entries are dropped and the last remaining branch index
	// is advanced.
	// NOTE: this span is a diff hunk; '+' lines were added in the new revision.
	void UnwrappedLineParser::parse() {
	IndexedTokenSource TokenSource(AllTokens);
	Line->FirstStartColumn = FirstStartColumn;
	do {
	DEBUG(llvm::dbgs() << "----\n");
	reset();
	Tokens = &TokenSource;
	TokenSource.reset();

	readToken();
	parseFile();
	+
	+ // If we found an include guard then all preprocessor directives (other than
	+ // the guard) are over-indented by one.
	+ if (IncludeGuard == IG_Found)
	+ for (auto &Line : Lines)
	+ if (Line.InPPDirective && Line.Level > 0)
	+ --Line.Level;
	+
	// Create line with eof token.
	pushToken(FormatTok);
	addUnwrappedLine();

	for (SmallVectorImpl<UnwrappedLine>::iterator I = Lines.begin(),
	E = Lines.end();
	I != E; ++I) {
	Callback.consumeUnwrappedLine(*I);
	}
	Callback.finishRun();
	Lines.clear();
	// Drop trailing branch levels whose last branch has just been processed.
	while (!PPLevelBranchIndex.empty() &&
	PPLevelBranchIndex.back() + 1 >= PPLevelBranchCount.back()) {
	PPLevelBranchIndex.resize(PPLevelBranchIndex.size() - 1);
	PPLevelBranchCount.resize(PPLevelBranchCount.size() - 1);
	}
	// Select the next branch of the innermost remaining level for the next pass.
	if (!PPLevelBranchIndex.empty()) {
	++PPLevelBranchIndex.back();
	assert(PPLevelBranchIndex.size() == PPLevelBranchCount.size());
	assert(PPLevelBranchIndex.back() <= PPLevelBranchCount.back());
	}
	} while (!PPLevelBranchIndex.empty());
	}

	// Parses the top level of the input. TextProto input is parsed as a single
	// braced list; every other language goes through parseLevel() without an
	// opening brace. Afterwards remaining comments are flushed and the last
	// pending line is emitted.
	void UnwrappedLineParser::parseFile() {
	  // The top-level context in a file always has declarations, except for pre-
	  // processor directives and JavaScript files.
	  const bool IsJavaScript = Style.Language == FormatStyle::LK_JavaScript;
	  const bool MustBeDeclaration = !Line->InPPDirective && !IsJavaScript;
	  ScopedDeclarationState DeclarationState(*Line, DeclarationScopeStack,
	                                          MustBeDeclaration);

	  if (Style.Language == FormatStyle::LK_TextProto) {
	    parseBracedList();
	  } else {
	    parseLevel(/*HasOpeningBrace=*/false);
	  }

	  // Make sure to format the remaining tokens.
	  flushComments(true);
	  addUnwrappedLine();
	}

	void UnwrappedLineParser::parseLevel(bool HasOpeningBrace) {
	bool SwitchLabelEncountered = false;
	do {
	tok::TokenKind kind = FormatTok->Tok.getKind();
	if (FormatTok->Type == TT_MacroBlockBegin) {
	kind = tok::l_brace;
	} else if (FormatTok->Type == TT_MacroBlockEnd) {
	kind = tok::r_brace;
	}

	switch (kind) {
	case tok::comment:
	nextToken();
	addUnwrappedLine();
	break;
	case tok::l_brace:
	// FIXME: Add parameter whether this can happen - if this happens, we must
	// be in a non-declaration context.
	if (!FormatTok->is(TT_MacroBlockBegin) && tryToParseBracedList())
	continue;
	parseBlock(/MustBeDeclaration=/false);
	addUnwrappedLine();
	break;
	case tok::r_brace:
	if (HasOpeningBrace)
	return;
	nextToken();
	addUnwrappedLine();
	break;
	case tok::kw_default:
	case tok::kw_case:
	if (Style.Language == FormatStyle::LK_JavaScript &&
	Line->MustBeDeclaration) {
	// A 'case: string' style field declaration.
	parseStructuralElement();
	break;
	}
	if (!SwitchLabelEncountered &&
	(Style.IndentCaseLabels \|\| (Line->InPPDirective && Line->Level == 1)))
	++Line->Level;
	SwitchLabelEncountered = true;
	parseStructuralElement();
	break;
	default:
	parseStructuralElement();
	break;
	}
	} while (!eof());
	}

	// Looks ahead from the current '{' (asserted) and assigns a BlockKind
	// (BK_Block or BK_BracedInit) to the brace pairs it encounters, using the
	// token following each closing brace as the main hint. The scan stops at eof
	// or once the brace opened by the current token has been closed; braces left
	// open are treated as blocks. The token source is restored to the position
	// it had on entry before returning.
	// \p ExpectClassBody: when true, a ';' after the closing brace of the
	// outermost pair does not mark that pair as a braced list.
	void UnwrappedLineParser::calculateBraceTypes(bool ExpectClassBody) {
	// We'll parse forward through the tokens until we hit
	// a closing brace or eof - note that getNextToken() will
	// parse macros, so this will magically work inside macro
	// definitions, too.
	unsigned StoredPosition = Tokens->getPosition();
	FormatToken *Tok = FormatTok;
	const FormatToken *PrevTok = Tok->Previous;
	// Keep a stack of positions of lbrace tokens. We will
	// update information about whether an lbrace starts a
	// braced init list or a different block during the loop.
	SmallVector<FormatToken *, 8> LBraceStack;
	assert(Tok->Tok.is(tok::l_brace));
	do {
	// Get next non-comment token.
	FormatToken *NextTok;
	unsigned ReadTokens = 0;
	do {
	NextTok = Tokens->getNextToken();
	++ReadTokens;
	} while (NextTok->is(tok::comment));

	switch (Tok->Tok.getKind()) {
	case tok::l_brace:
	if (Style.Language == FormatStyle::LK_JavaScript && PrevTok) {
	if (PrevTok->isOneOf(tok::colon, tok::less))
	// A ':' indicates this code is in a type, or a braced list
	// following a label in an object literal ({a: {b: 1}}).
	// A '<' could be an object used in a comparison, but that is nonsense
	// code (can never return true), so more likely it is a generic type
	// argument (`X<{a: string; b: number}>`).
	// The code below could be confused by semicolons between the
	// individual members in a type member list, which would normally
	// trigger BK_Block. In both cases, this must be parsed as an inline
	// braced init.
	Tok->BlockKind = BK_BracedInit;
	else if (PrevTok->is(tok::r_paren))
	// `) { }` can only occur in function or method declarations in JS.
	Tok->BlockKind = BK_Block;
	} else {
	Tok->BlockKind = BK_Unknown;
	}
	LBraceStack.push_back(Tok);
	break;
	case tok::r_brace:
	if (LBraceStack.empty())
	break;
	// Classify the pair only if the opening brace is still undecided.
	if (LBraceStack.back()->BlockKind == BK_Unknown) {
	bool ProbablyBracedList = false;
	if (Style.Language == FormatStyle::LK_Proto) {
	ProbablyBracedList = NextTok->isOneOf(tok::comma, tok::r_square);
	} else {
	// Using OriginalColumn to distinguish between ObjC methods and
	// binary operators is a bit hacky.
	bool NextIsObjCMethod = NextTok->isOneOf(tok::plus, tok::minus) &&
	NextTok->OriginalColumn == 0;

	// If there is a comma, semicolon or right paren after the closing
	// brace, we assume this is a braced initializer list. Note that
	// regardless how we mark inner braces here, we will overwrite the
	// BlockKind later if we parse a braced list (where all blocks
	// inside are by default braced lists), or when we explicitly detect
	// blocks (for example while parsing lambdas).
	// FIXME: Some of these do not apply to JS, e.g. "} {" can never be a
	// braced list in JS.
	ProbablyBracedList =
	(Style.Language == FormatStyle::LK_JavaScript &&
	NextTok->isOneOf(Keywords.kw_of, Keywords.kw_in,
	Keywords.kw_as)) \|\|
	(Style.isCpp() && NextTok->is(tok::l_paren)) \|\|
	NextTok->isOneOf(tok::comma, tok::period, tok::colon,
	tok::r_paren, tok::r_square, tok::l_brace,
	tok::l_square, tok::ellipsis) \|\|
	(NextTok->is(tok::identifier) &&
	!PrevTok->isOneOf(tok::semi, tok::r_brace, tok::l_brace)) \|\|
	(NextTok->is(tok::semi) &&
	(!ExpectClassBody \|\| LBraceStack.size() != 1)) \|\|
	(NextTok->isBinaryOperator() && !NextIsObjCMethod);
	}
	// Mark both the closing brace and its matching opening brace.
	if (ProbablyBracedList) {
	Tok->BlockKind = BK_BracedInit;
	LBraceStack.back()->BlockKind = BK_BracedInit;
	} else {
	Tok->BlockKind = BK_Block;
	LBraceStack.back()->BlockKind = BK_Block;
	}
	}
	LBraceStack.pop_back();
	break;
	// Any of these tokens directly inside an undecided brace marks it as a
	// block.
	case tok::at:
	case tok::semi:
	case tok::kw_if:
	case tok::kw_while:
	case tok::kw_for:
	case tok::kw_switch:
	case tok::kw_try:
	case tok::kw___try:
	if (!LBraceStack.empty() && LBraceStack.back()->BlockKind == BK_Unknown)
	LBraceStack.back()->BlockKind = BK_Block;
	break;
	default:
	break;
	}
	PrevTok = Tok;
	Tok = NextTok;
	} while (Tok->Tok.isNot(tok::eof) && !LBraceStack.empty());

	// Assume other blocks for all unclosed opening braces.
	for (unsigned i = 0, e = LBraceStack.size(); i != e; ++i) {
	if (LBraceStack[i]->BlockKind == BK_Unknown)
	LBraceStack[i]->BlockKind = BK_Block;
	}

	// Rewind the token source so that parsing resumes where the scan started.
	FormatTok = Tokens->setPosition(StoredPosition);
	}

	// Folds std::hash<T>()(v) into 'seed' using the constant 0x9e3779b9 and
	// shifted copies of the current seed.
	template <class T>
	static inline void hash_combine(std::size_t &seed, const T &v) {
	  const std::size_t mixed =
	      std::hash<T>()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
	  seed ^= mixed;
	}

	size_t UnwrappedLineParser::computePPHash() const {
	size_t h = 0;
	for (const auto &i : PPStack) {
	hash_combine(h, size_t(i.Kind));
	hash_combine(h, i.Line);
	}
	return h;
	}

	// Parses a block that starts at the current '{' or macro-block-begin token
	// (asserted) up to and including its closing token.
	// \p MustBeDeclaration: declaration state installed for the block's contents.
	// \p AddLevel: when true, the contents are parsed one line level deeper.
	// \p MunchSemi: when true, a ';' directly after the closing token is consumed.
	// Returns early, without linking the opening and closing lines, if eof is hit
	// or the token after the contents is not the expected closing token.
	void UnwrappedLineParser::parseBlock(bool MustBeDeclaration, bool AddLevel,
	bool MunchSemi) {
	assert(FormatTok->isOneOf(tok::l_brace, TT_MacroBlockBegin) &&
	"'{' or macro block token expected");
	const bool MacroBlock = FormatTok->is(TT_MacroBlockBegin);
	FormatTok->BlockKind = BK_Block;

	// Preprocessor state at block entry; compared with the state at block exit
	// below.
	size_t PPStartHash = computePPHash();

	unsigned InitialLevel = Line->Level;
	nextToken(/LevelDifference=/AddLevel ? 1 : 0);

	if (MacroBlock && FormatTok->is(tok::l_paren))
	parseParens();

	size_t NbPreprocessorDirectives =
	CurrentLines == &Lines ? PreprocessorDirectives.size() : 0;
	addUnwrappedLine();
	// Index of the line that holds the opening token, or kInvalidIndex when
	// there is no such line in the current list.
	size_t OpeningLineIndex =
	CurrentLines->empty()
	? (UnwrappedLine::kInvalidIndex)
	: (CurrentLines->size() - 1 - NbPreprocessorDirectives);

	ScopedDeclarationState DeclarationState(*Line, DeclarationScopeStack,
	MustBeDeclaration);
	if (AddLevel)
	++Line->Level;
	parseLevel(/HasOpeningBrace=/true);

	if (eof())
	return;

	// The contents did not end at the expected closing token: restore the level
	// and give up on this block.
	if (MacroBlock ? !FormatTok->is(TT_MacroBlockEnd)
	: !FormatTok->is(tok::r_brace)) {
	Line->Level = InitialLevel;
	FormatTok->BlockKind = BK_Block;
	return;
	}

	size_t PPEndHash = computePPHash();

	// Munch the closing brace.
	nextToken(/LevelDifference=/AddLevel ? -1 : 0);

	if (MacroBlock && FormatTok->is(tok::l_paren))
	parseParens();

	if (MunchSemi && FormatTok->Tok.is(tok::semi))
	nextToken();
	Line->Level = InitialLevel;

	// Cross-link the opening and closing lines only when the preprocessor state
	// is the same at both ends of the block.
	if (PPStartHash == PPEndHash) {
	Line->MatchingOpeningBlockLineIndex = OpeningLineIndex;
	if (OpeningLineIndex != UnwrappedLine::kInvalidIndex) {
	// Update the opening line to add the forward reference as well
	(*CurrentLines)[OpeningLineIndex].MatchingOpeningBlockLineIndex =
	CurrentLines->size() - 1;
	}
	}
	}

	// Returns true when \p Line starts with the four tokens `goog . scope (`.
	static bool isGoogScope(const UnwrappedLine &Line) {
	  // FIXME: Closure-library specific stuff should not be hard-coded but be
	  // configurable.
	  if (Line.Tokens.size() < 4)
	    return false;
	  auto Node = Line.Tokens.begin();
	  if (Node->Tok->TokenText != "goog")
	    return false;
	  if ((++Node)->Tok->isNot(tok::period))
	    return false;
	  if ((++Node)->Tok->TokenText != "scope")
	    return false;
	  return (++Node)->Tok->is(tok::l_paren);
	}

	// Returns true when \p Line starts with `( function (`, i.e. the beginning of
	// an immediately invoked anonymous function such as
	//   (function() { ... })()
	// See https://en.wikipedia.org/wiki/Immediately-invoked_function_expression
	// This is commonly done in JavaScript to create a new, anonymous scope.
	static bool isIIFE(const UnwrappedLine &Line,
	                   const AdditionalKeywords &Keywords) {
	  if (Line.Tokens.size() < 3)
	    return false;
	  auto Node = Line.Tokens.begin();
	  if (Node->Tok->isNot(tok::l_paren))
	    return false;
	  if ((++Node)->Tok->isNot(Keywords.kw_function))
	    return false;
	  return (++Node)->Tok->is(tok::l_paren);
	}

	static bool ShouldBreakBeforeBrace(const FormatStyle &Style,
	const FormatToken &InitialToken) {
	if (InitialToken.is(tok::kw_namespace))
	return Style.BraceWrapping.AfterNamespace;
	if (InitialToken.is(tok::kw_class))
	return Style.BraceWrapping.AfterClass;
	if (InitialToken.is(tok::kw_union))
	return Style.BraceWrapping.AfterUnion;
	if (InitialToken.is(tok::kw_struct))
	return Style.BraceWrapping.AfterStruct;
	return false;
	}

	// Parses a brace-delimited block that is nested inside the current unwrapped
	// line. The current token is marked as BK_Block and consumed, the contents
	// are parsed under a fresh ScopedLineState, and the token following the
	// contents is consumed afterwards.
	void UnwrappedLineParser::parseChildBlock() {
	  FormatTok->BlockKind = BK_Block;
	  nextToken();
	  {
	    // For JavaScript, the body of goog.scope(...) and of an immediately
	    // invoked function expression is not indented an extra level.
	    // Must be computed before ScopedLineState is constructed below.
	    bool SkipIndent = (Style.Language == FormatStyle::LK_JavaScript &&
	                       (isGoogScope(*Line) || isIIFE(*Line, Keywords)));
	    ScopedLineState LineState(*this);
	    ScopedDeclarationState DeclarationState(*Line, DeclarationScopeStack,
	                                            /*MustBeDeclaration=*/false);
	    Line->Level += SkipIndent ? 0 : 1;
	    parseLevel(/*HasOpeningBrace=*/true);
	    flushComments(isOnNewLine(*FormatTok));
	    Line->Level -= SkipIndent ? 0 : 1;
	  }
	  nextToken();
	}

	// Parses one preprocessor directive, starting at its '#' token, and
	// dispatches on the directive keyword. Directives without identifier info,
	// and any keyword not listed below, are handled by parsePPUnknown().
	void UnwrappedLineParser::parsePPDirective() {
	  assert(FormatTok->Tok.is(tok::hash) && "'#' expected");
	  ScopedMacroState MacroState(*Line, Tokens, FormatTok);
	  nextToken();

	  if (!FormatTok->Tok.getIdentifierInfo()) {
	    parsePPUnknown();
	    return;
	  }

	  switch (FormatTok->Tok.getIdentifierInfo()->getPPKeywordID()) {
	  case tok::pp_define:
	    parsePPDefine();
	    return;
	  case tok::pp_if:
	    parsePPIf(/*IfDef=*/false);
	    break;
	  case tok::pp_ifdef:
	  case tok::pp_ifndef:
	    parsePPIf(/*IfDef=*/true);
	    break;
	  case tok::pp_else:
	    parsePPElse();
	    break;
	  case tok::pp_elif:
	    parsePPElIf();
	    break;
	  case tok::pp_endif:
	    parsePPEndIf();
	    break;
	  default:
	    parsePPUnknown();
	    break;
	  }
	}

	// Pushes a new entry onto PPStack for a conditional branch that begins at
	// the current line index. The entry is PP_Unreachable when \p Unreachable is
	// set or when the enclosing branch (top of PPStack) is already unreachable;
	// otherwise it is PP_Conditional.
	void UnwrappedLineParser::conditionalCompilationCondition(bool Unreachable) {
	  size_t Line = CurrentLines->size();
	  // While collecting preprocessor directives, offset by the lines already
	  // stored in Lines.
	  if (CurrentLines == &PreprocessorDirectives)
	    Line += Lines.size();

	  if (Unreachable ||
	      (!PPStack.empty() && PPStack.back().Kind == PP_Unreachable))
	    PPStack.push_back({PP_Unreachable, Line});
	  else
	    PPStack.push_back({PP_Conditional, Line});
	}

	// Enters a new conditional-compilation nesting level: bumps PPBranchLevel,
	// grows the per-level bookkeeping vectors when this level is seen for the
	// first time, starts a new chain index at 0 and records the branch on
	// PPStack. The branch is treated as unreachable when \p Unreachable is set
	// or when PPLevelBranchIndex for this level is greater than zero.
	void UnwrappedLineParser::conditionalCompilationStart(bool Unreachable) {
	  ++PPBranchLevel;
	  assert(PPBranchLevel >= 0 && PPBranchLevel <= (int)PPLevelBranchIndex.size());
	  if (PPBranchLevel == (int)PPLevelBranchIndex.size()) {
	    PPLevelBranchIndex.push_back(0);
	    PPLevelBranchCount.push_back(0);
	  }
	  PPChainBranchIndex.push(0);
	  bool Skip = PPLevelBranchIndex[PPBranchLevel] > 0;
	  conditionalCompilationCondition(Unreachable || Skip);
	}

	// Switches to the next alternative of the current conditional chain: drops
	// the PPStack entry of the previous alternative, advances the chain index
	// and pushes an entry for the new alternative.
	void UnwrappedLineParser::conditionalCompilationAlternative() {
	  if (!PPStack.empty())
	    PPStack.pop_back();
	  assert(PPBranchLevel < (int)PPLevelBranchIndex.size());
	  if (!PPChainBranchIndex.empty())
	    ++PPChainBranchIndex.top();
	  // The alternative is unreachable when it is not the branch selected for
	  // this nesting level.
	  const bool NotSelectedBranch =
	      PPBranchLevel >= 0 && !PPChainBranchIndex.empty() &&
	      PPLevelBranchIndex[PPBranchLevel] != PPChainBranchIndex.top();
	  conditionalCompilationCondition(NotSelectedBranch);
	}

	// Leaves the current conditional-compilation level: records how many
	// alternatives the finished chain had (keeping the maximum seen for this
	// level), then pops the level, the chain index and the PPStack entry.
	void UnwrappedLineParser::conditionalCompilationEnd() {
	  assert(PPBranchLevel < (int)PPLevelBranchIndex.size());
	  if (PPBranchLevel >= 0 && !PPChainBranchIndex.empty() &&
	      PPChainBranchIndex.top() + 1 > PPLevelBranchCount[PPBranchLevel])
	    PPLevelBranchCount[PPBranchLevel] = PPChainBranchIndex.top() + 1;

	  // Guard against #endif's without #if.
	  if (PPBranchLevel > -1)
	    --PPBranchLevel;
	  if (!PPChainBranchIndex.empty())
	    PPChainBranchIndex.pop();
	  if (!PPStack.empty())
	    PPStack.pop_back();
	}

	// Handles '#if' (IfDef == false) and '#ifdef' / '#ifndef' (IfDef == true).
	// Opens a new conditional-compilation level, consumes the rest of the
	// directive, and tracks whether a '#ifndef' may be the start of an include
	// guard.
	// NOTE(review): this span is a unified-diff hunk; lines prefixed with '-'
	// are the removed version and lines prefixed with '+' the added version.
	void UnwrappedLineParser::parsePPIf(bool IfDef) {
	bool IfNDef = FormatTok->is(tok::pp_ifndef);
	nextToken();
	bool Unreachable = false;
	// '#if false' and '#if 0' open an unreachable branch.
	if (!IfDef && (FormatTok->is(tok::kw_false) \|\| FormatTok->TokenText == "0"))
	Unreachable = true;
	// '#ifdef SWIG' is treated as unreachable as well.
	if (IfDef && !IfNDef && FormatTok->TokenText == "SWIG")
	Unreachable = true;
	conditionalCompilationStart(Unreachable);
	FormatToken *IfCondition = FormatTok;
	// If there's a #ifndef on the first line, and the only lines before it are
	// comments, it could be an include guard.
	bool MaybeIncludeGuard = IfNDef;
	- if (!IncludeGuardRejected && !FoundIncludeGuardStart && MaybeIncludeGuard) {
	+ if (IncludeGuard == IG_Inited && MaybeIncludeGuard)
	for (auto &Line : Lines) {
	if (!Line.Tokens.front().Tok->is(tok::comment)) {
	MaybeIncludeGuard = false;
	- IncludeGuardRejected = true;
	+ IncludeGuard = IG_Rejected;
	break;
	}
	}
	- }
	// The directive's own tokens are parsed one branch level lower; the level
	// is restored afterwards.
	--PPBranchLevel;
	parsePPUnknown();
	++PPBranchLevel;
	- if (!IncludeGuardRejected && !FoundIncludeGuardStart && MaybeIncludeGuard)
	- IfNdefCondition = IfCondition;
	+ if (IncludeGuard == IG_Inited && MaybeIncludeGuard) {
	+ IncludeGuard = IG_IfNdefed;
	+ IncludeGuardToken = IfCondition;
	+ }
	}

	// Handles '#else' (and '#elif', via parsePPElIf): switches to the next
	// alternative of the current conditional chain and consumes the rest of the
	// directive.
	// NOTE(review): this span is a unified-diff hunk; lines prefixed with '-'
	// are the removed version and lines prefixed with '+' the added version.
	void UnwrappedLineParser::parsePPElse() {
	// If a potential include guard has an #else, it's not an include guard.
	- if (FoundIncludeGuardStart && PPBranchLevel == 0)
	- FoundIncludeGuardStart = false;
	+ if (IncludeGuard == IG_Defined && PPBranchLevel == 0)
	+ IncludeGuard = IG_Rejected;
	conditionalCompilationAlternative();
	// The directive's own tokens are parsed one branch level lower (when there
	// is an enclosing level); the level is then incremented again.
	if (PPBranchLevel > -1)
	--PPBranchLevel;
	parsePPUnknown();
	++PPBranchLevel;
	}

	// '#elif' is handled exactly like '#else'.
	void UnwrappedLineParser::parsePPElIf() {
	  parsePPElse();
	}

	// Handles '#endif': closes the current conditional-compilation level,
	// consumes the rest of the directive, and decides whether a candidate
	// include guard has been confirmed.
	// NOTE(review): this span is a unified-diff hunk; lines prefixed with '-'
	// are the removed version and lines prefixed with '+' the added version.
	void UnwrappedLineParser::parsePPEndIf() {
	conditionalCompilationEnd();
	parsePPUnknown();
	// If the #endif of a potential include guard is the last thing in the file,
	- // then we count it as a real include guard and subtract one from every
	- // preprocessor indent.
	+ // then we found an include guard.
	// Peek at the token at the current stream position without consuming it.
	unsigned TokenPosition = Tokens->getPosition();
	FormatToken *PeekNext = AllTokens[TokenPosition];
	- if (FoundIncludeGuardStart && PPBranchLevel == -1 && PeekNext->is(tok::eof) &&
	+ if (IncludeGuard == IG_Defined && PPBranchLevel == -1 &&
	+ PeekNext->is(tok::eof) &&
	Style.IndentPPDirectives != FormatStyle::PPDIS_None)
	- for (auto &Line : Lines)
	- if (Line.InPPDirective && Line.Level > 0)
	- --Line.Level;
	+ IncludeGuard = IG_Found;
	}

	// Handles '#define': checks whether the defined name completes a candidate
	// include guard, emits the directive head as its own unwrapped line and
	// parses the macro body with parseFile().
	// NOTE(review): this span is a unified-diff hunk; lines prefixed with '-'
	// are the removed version and lines prefixed with '+' the added version.
	void UnwrappedLineParser::parsePPDefine() {
	nextToken();

	// A '#define' that is not followed by an identifier is handed to
	// parsePPUnknown().
	if (FormatTok->Tok.getKind() != tok::identifier) {
	+ IncludeGuard = IG_Rejected;
	+ IncludeGuardToken = nullptr;
	parsePPUnknown();
	return;
	}
	- if (IfNdefCondition && IfNdefCondition->TokenText == FormatTok->TokenText) {
	- FoundIncludeGuardStart = true;
	+
	+ if (IncludeGuard == IG_IfNdefed &&
	+ IncludeGuardToken->TokenText == FormatTok->TokenText) {
	+ IncludeGuard = IG_Defined;
	+ IncludeGuardToken = nullptr;
	// The candidate is rejected if any line collected so far starts with
	// something other than a comment or a '#'.
	for (auto &Line : Lines) {
	if (!Line.Tokens.front().Tok->isOneOf(tok::comment, tok::hash)) {
	- FoundIncludeGuardStart = false;
	+ IncludeGuard = IG_Rejected;
	break;
	}
	}
	}
	- IfNdefCondition = nullptr;
	+
	nextToken();
	// A '(' with an empty whitespace range before it (i.e. directly after the
	// macro name) is parsed as a parenthesized list.
	if (FormatTok->Tok.getKind() == tok::l_paren &&
	FormatTok->WhitespaceRange.getBegin() ==
	FormatTok->WhitespaceRange.getEnd()) {
	parseParens();
	}
	if (Style.IndentPPDirectives == FormatStyle::PPDIS_AfterHash)
	Line->Level += PPBranchLevel + 1;
	addUnwrappedLine();
	++Line->Level;

	// Errors during a preprocessor directive can only affect the layout of the
	// preprocessor directive, and thus we ignore them. An alternative approach
	// would be to use the same approach we use on the file level (no
	// re-indentation if there was a structural error) within the macro
	// definition.
	parseFile();
	}

	// Consumes tokens until eof() reports true, then emits them as one
	// unwrapped line. With PPDIS_AfterHash the line level is raised by
	// PPBranchLevel + 1 first.
	// NOTE(review): this span is a unified-diff hunk; the line prefixed with '-'
	// is removed by the change.
	void UnwrappedLineParser::parsePPUnknown() {
	do {
	nextToken();
	} while (!eof());
	if (Style.IndentPPDirectives == FormatStyle::PPDIS_AfterHash)
	Line->Level += PPBranchLevel + 1;
	addUnwrappedLine();
	- IfNdefCondition = nullptr;
	}

	// Returns false for token kinds that are not usually the first token in an
	// unwrapped line, and true for everything else. This is used in an attempt
	// to distinguish macro calls without trailing semicolons from other
	// constructs split over several lines.
	static bool tokenCanStartNewLine(const clang::Token &Tok) {
	  switch (Tok.getKind()) {
	  // Semicolon can be a null-statement, l_square can be a start of a macro or
	  // a C++11 attribute, but this doesn't seem to be common.
	  case tok::semi:
	  case tok::l_brace:
	  case tok::l_square:
	  // Tokens that can only be used as binary operators and a part of
	  // overloaded operator names.
	  case tok::period:
	  case tok::periodstar:
	  case tok::arrow:
	  case tok::arrowstar:
	  case tok::less:
	  case tok::greater:
	  case tok::slash:
	  case tok::percent:
	  case tok::lessless:
	  case tok::greatergreater:
	  case tok::equal:
	  case tok::plusequal:
	  case tok::minusequal:
	  case tok::starequal:
	  case tok::slashequal:
	  case tok::percentequal:
	  case tok::ampequal:
	  case tok::pipeequal:
	  case tok::caretequal:
	  case tok::greatergreaterequal:
	  case tok::lesslessequal:
	  // Colon is used in labels, base class lists, initializer lists,
	  // range-based for loops, ternary operator, but should never be the
	  // first token in an unwrapped line.
	  case tok::colon:
	  // 'noexcept' is a trailing annotation.
	  case tok::kw_noexcept:
	    return false;
	  default:
	    return true;
	  }
	}

	// Returns true if \p FormatTok is an identifier token that either has no
	// identifier info or is none of the JavaScript pseudo keywords listed
	// below.
	static bool mustBeJSIdent(const AdditionalKeywords &Keywords,
	                          const FormatToken *FormatTok) {
	  // FIXME: This returns true for C/C++ keywords like 'struct'.
	  return FormatTok->is(tok::identifier) &&
	         (FormatTok->Tok.getIdentifierInfo() == nullptr ||
	          !FormatTok->isOneOf(
	              Keywords.kw_in, Keywords.kw_of, Keywords.kw_as, Keywords.kw_async,
	              Keywords.kw_await, Keywords.kw_yield, Keywords.kw_finally,
	              Keywords.kw_function, Keywords.kw_import, Keywords.kw_is,
	              Keywords.kw_let, Keywords.kw_var, tok::kw_const,
	              Keywords.kw_abstract, Keywords.kw_extends, Keywords.kw_implements,
	              Keywords.kw_instanceof, Keywords.kw_interface, Keywords.kw_throws,
	              Keywords.kw_from));
	}

	// Returns true if \p FormatTok is a literal, 'true'/'false', or a token
	// accepted by mustBeJSIdent().
	static bool mustBeJSIdentOrValue(const AdditionalKeywords &Keywords,
	                                 const FormatToken *FormatTok) {
	  return FormatTok->Tok.isLiteral() ||
	         FormatTok->isOneOf(tok::kw_true, tok::kw_false) ||
	         mustBeJSIdent(Keywords, FormatTok);
	}

	// isJSDeclOrStmt returns true if FormatTok is one of the keywords that
	// start a declaration or statement when encountered after a value (see
	// mustBeJSIdentOrValue).
	static bool isJSDeclOrStmt(const AdditionalKeywords &Keywords,
	                           const FormatToken *FormatTok) {
	  // return/yield, conditionals, loops, switch/case and exceptions.
	  const bool StartsStatement = FormatTok->isOneOf(
	      tok::kw_return, Keywords.kw_yield, tok::kw_if, tok::kw_else,
	      tok::kw_for, tok::kw_while, tok::kw_do, tok::kw_continue, tok::kw_break,
	      tok::kw_switch, tok::kw_case, tok::kw_throw, tok::kw_try, tok::kw_catch,
	      Keywords.kw_finally);
	  // Declarations and import/export.
	  const bool StartsDeclaration = FormatTok->isOneOf(
	      tok::kw_const, tok::kw_class, Keywords.kw_var, Keywords.kw_let,
	      Keywords.kw_async, Keywords.kw_function, Keywords.kw_import,
	      tok::kw_export);
	  return StartsStatement || StartsDeclaration;
	}

	// readTokenWithJavaScriptASI reads the next token and terminates the current
	// line if JavaScript Automatic Semicolon Insertion must
	// happen between the current token and the next token.
	//
	// This method is conservative - it cannot cover all edge cases of JavaScript,
	// but only aims to correctly handle certain well known cases. It must not
	// terminate the line in speculative cases.
	void UnwrappedLineParser::readTokenWithJavaScriptASI() {
	  FormatToken *Previous = FormatTok;
	  readToken();
	  FormatToken *Next = FormatTok;

	  // No semicolon is inserted when the next token (or the first pending
	  // comment before it) is on the same line.
	  bool IsOnSameLine =
	      CommentsBeforeNextToken.empty()
	          ? Next->NewlinesBefore == 0
	          : CommentsBeforeNextToken.front()->NewlinesBefore == 0;
	  if (IsOnSameLine)
	    return;

	  bool PreviousMustBeValue = mustBeJSIdentOrValue(Keywords, Previous);
	  bool PreviousStartsTemplateExpr =
	      Previous->is(TT_TemplateString) && Previous->TokenText.endswith("${");
	  if (PreviousMustBeValue || Previous->is(tok::r_paren)) {
	    // If the line contains an '@' sign, the previous token might be an
	    // annotation, which can precede another identifier/value.
	    bool HasAt = std::find_if(Line->Tokens.begin(), Line->Tokens.end(),
	                              [](UnwrappedLineNode &LineNode) {
	                                return LineNode.Tok->is(tok::at);
	                              }) != Line->Tokens.end();
	    if (HasAt)
	      return;
	  }
	  if (Next->is(tok::exclaim) && PreviousMustBeValue)
	    return addUnwrappedLine();
	  bool NextMustBeValue = mustBeJSIdentOrValue(Keywords, Next);
	  bool NextEndsTemplateExpr =
	      Next->is(TT_TemplateString) && Next->TokenText.startswith("}");
	  if (NextMustBeValue && !NextEndsTemplateExpr && !PreviousStartsTemplateExpr &&
	      (PreviousMustBeValue ||
	       Previous->isOneOf(tok::r_square, tok::r_paren, tok::plusplus,
	                         tok::minusminus)))
	    return addUnwrappedLine();
	  if ((PreviousMustBeValue || Previous->is(tok::r_paren)) &&
	      isJSDeclOrStmt(Keywords, Next))
	    return addUnwrappedLine();
	}

	void UnwrappedLineParser::parseStructuralElement() {
	assert(!FormatTok->is(tok::l_brace));
	if (Style.Language == FormatStyle::LK_TableGen &&
	FormatTok->is(tok::pp_include)) {
	nextToken();
	if (FormatTok->is(tok::string_literal))
	nextToken();
	addUnwrappedLine();
	return;
	}
	switch (FormatTok->Tok.getKind()) {
	case tok::at:
	nextToken();
	if (FormatTok->Tok.is(tok::l_brace)) {
	nextToken();
	parseBracedList();
	break;
	}
	switch (FormatTok->Tok.getObjCKeywordID()) {
	case tok::objc_public:
	case tok::objc_protected:
	case tok::objc_package:
	case tok::objc_private:
	return parseAccessSpecifier();
	case tok::objc_interface:
	case tok::objc_implementation:
	return parseObjCInterfaceOrImplementation();
	case tok::objc_protocol:
	return parseObjCProtocol();
	case tok::objc_end:
	return; // Handled by the caller.
	case tok::objc_optional:
	case tok::objc_required:
	nextToken();
	addUnwrappedLine();
	return;
	case tok::objc_autoreleasepool:
	nextToken();
	if (FormatTok->Tok.is(tok::l_brace)) {
	if (Style.BraceWrapping.AfterObjCDeclaration)
	addUnwrappedLine();
	parseBlock(/MustBeDeclaration=/false);
	}
	addUnwrappedLine();
	return;
	case tok::objc_try:
	// This branch isn't strictly necessary (the kw_try case below would
	// do this too after the tok::at is parsed above). But be explicit.
	parseTryCatch();
	return;
	default:
	break;
	}
	break;
	case tok::kw_asm:
	nextToken();
	if (FormatTok->is(tok::l_brace)) {
	FormatTok->Type = TT_InlineASMBrace;
	nextToken();
	while (FormatTok && FormatTok->isNot(tok::eof)) {
	if (FormatTok->is(tok::r_brace)) {
	FormatTok->Type = TT_InlineASMBrace;
	nextToken();
	addUnwrappedLine();
	break;
	}
	FormatTok->Finalized = true;
	nextToken();
	}
	}
	break;
	case tok::kw_namespace:
	parseNamespace();
	return;
	case tok::kw_inline:
	nextToken();
	if (FormatTok->Tok.is(tok::kw_namespace)) {
	parseNamespace();
	return;
	}
	break;
	case tok::kw_public:
	case tok::kw_protected:
	case tok::kw_private:
	if (Style.Language == FormatStyle::LK_Java \|\|
	Style.Language == FormatStyle::LK_JavaScript)
	nextToken();
	else
	parseAccessSpecifier();
	return;
	case tok::kw_if:
	parseIfThenElse();
	return;
	case tok::kw_for:
	case tok::kw_while:
	parseForOrWhileLoop();
	return;
	case tok::kw_do:
	parseDoWhile();
	return;
	case tok::kw_switch:
	if (Style.Language == FormatStyle::LK_JavaScript && Line->MustBeDeclaration)
	// 'switch: string' field declaration.
	break;
	parseSwitch();
	return;
	case tok::kw_default:
	if (Style.Language == FormatStyle::LK_JavaScript && Line->MustBeDeclaration)
	// 'default: string' field declaration.
	break;
	nextToken();
	parseLabel();
	return;
	case tok::kw_case:
	if (Style.Language == FormatStyle::LK_JavaScript && Line->MustBeDeclaration)
	// 'case: string' field declaration.
	break;
	parseCaseLabel();
	return;
	case tok::kw_try:
	case tok::kw___try:
	parseTryCatch();
	return;
	case tok::kw_extern:
	nextToken();
	if (FormatTok->Tok.is(tok::string_literal)) {
	nextToken();
	if (FormatTok->Tok.is(tok::l_brace)) {
	if (Style.BraceWrapping.AfterExternBlock) {
	addUnwrappedLine();
	parseBlock(/MustBeDeclaration=/true);
	} else {
	parseBlock(/MustBeDeclaration=/true, /AddLevel=/false);
	}
	addUnwrappedLine();
	return;
	}
	}
	break;
	case tok::kw_export:
	if (Style.Language == FormatStyle::LK_JavaScript) {
	parseJavaScriptEs6ImportExport();
	return;
	}
	break;
	case tok::identifier:
	if (FormatTok->is(TT_ForEachMacro)) {
	parseForOrWhileLoop();
	return;
	}
	if (FormatTok->is(TT_MacroBlockBegin)) {
	parseBlock(/MustBeDeclaration=/false, /AddLevel=/true,
	/MunchSemi=/false);
	return;
	}
	if (FormatTok->is(Keywords.kw_import)) {
	if (Style.Language == FormatStyle::LK_JavaScript) {
	parseJavaScriptEs6ImportExport();
	return;
	}
	if (Style.Language == FormatStyle::LK_Proto) {
	nextToken();
	if (FormatTok->is(tok::kw_public))
	nextToken();
	if (!FormatTok->is(tok::string_literal))
	return;
	nextToken();
	if (FormatTok->is(tok::semi))
	nextToken();
	addUnwrappedLine();
	return;
	}
	}
	if (Style.isCpp() &&
	FormatTok->isOneOf(Keywords.kw_signals, Keywords.kw_qsignals,
	Keywords.kw_slots, Keywords.kw_qslots)) {
	nextToken();
	if (FormatTok->is(tok::colon)) {
	nextToken();
	addUnwrappedLine();
	return;
	}
	}
	// In all other cases, parse the declaration.
	break;
	default:
	break;
	}
	do {
	const FormatToken *Previous = FormatTok->Previous;
	switch (FormatTok->Tok.getKind()) {
	case tok::at:
	nextToken();
	if (FormatTok->Tok.is(tok::l_brace)) {
	nextToken();
	parseBracedList();
	}
	break;
	case tok::kw_enum:
	// Ignore if this is part of "template <enum ...".
	if (Previous && Previous->is(tok::less)) {
	nextToken();
	break;
	}

	// parseEnum falls through and does not yet add an unwrapped line as an
	// enum definition can start a structural element.
	if (!parseEnum())
	break;
	// This only applies for C++.
	if (!Style.isCpp()) {
	addUnwrappedLine();
	return;
	}
	break;
	case tok::kw_typedef:
	nextToken();
	if (FormatTok->isOneOf(Keywords.kw_NS_ENUM, Keywords.kw_NS_OPTIONS,
	Keywords.kw_CF_ENUM, Keywords.kw_CF_OPTIONS))
	parseEnum();
	break;
	case tok::kw_struct:
	case tok::kw_union:
	case tok::kw_class:
	// parseRecord falls through and does not yet add an unwrapped line as a
	// record declaration or definition can start a structural element.
	parseRecord();
	// This does not apply for Java and JavaScript.
	if (Style.Language == FormatStyle::LK_Java \|\|
	Style.Language == FormatStyle::LK_JavaScript) {
	if (FormatTok->is(tok::semi))
	nextToken();
	addUnwrappedLine();
	return;
	}
	break;
	case tok::period:
	nextToken();
	// In Java, classes have an implicit static member "class".
	if (Style.Language == FormatStyle::LK_Java && FormatTok &&
	FormatTok->is(tok::kw_class))
	nextToken();
	if (Style.Language == FormatStyle::LK_JavaScript && FormatTok &&
	FormatTok->Tok.getIdentifierInfo())
	// JavaScript only has pseudo keywords, all keywords are allowed to
	// appear in "IdentifierName" positions. See http://es5.github.io/#x7.6
	nextToken();
	break;
	case tok::semi:
	nextToken();
	addUnwrappedLine();
	return;
	case tok::r_brace:
	addUnwrappedLine();
	return;
	case tok::l_paren:
	parseParens();
	break;
	case tok::kw_operator:
	nextToken();
	if (FormatTok->isBinaryOperator())
	nextToken();
	break;
	case tok::caret:
	nextToken();
	if (FormatTok->Tok.isAnyIdentifier() \|\|
	FormatTok->isSimpleTypeSpecifier())
	nextToken();
	if (FormatTok->is(tok::l_paren))
	parseParens();
	if (FormatTok->is(tok::l_brace))
	parseChildBlock();
	break;
	case tok::l_brace:
	if (!tryToParseBracedList()) {
	// A block outside of parentheses must be the last part of a
	// structural element.
	// FIXME: Figure out cases where this is not true, and add projections
	// for them (the one we know is missing are lambdas).
	if (Style.BraceWrapping.AfterFunction)
	addUnwrappedLine();
	FormatTok->Type = TT_FunctionLBrace;
	parseBlock(/MustBeDeclaration=/false);
	addUnwrappedLine();
	return;
	}
	// Otherwise this was a braced init list, and the structural
	// element continues.
	break;
	case tok::kw_try:
	// We arrive here when parsing function-try blocks.
	parseTryCatch();
	return;
	case tok::identifier: {
	if (FormatTok->is(TT_MacroBlockEnd)) {
	addUnwrappedLine();
	return;
	}

	// Function declarations (as opposed to function expressions) are parsed
	// on their own unwrapped line by continuing this loop. Function
	// expressions (functions that are not on their own line) must not create
	// a new unwrapped line, so they are special cased below.
	size_t TokenCount = Line->Tokens.size();
	if (Style.Language == FormatStyle::LK_JavaScript &&
	FormatTok->is(Keywords.kw_function) &&
	(TokenCount > 1 \|\| (TokenCount == 1 && !Line->Tokens.front().Tok->is(
	Keywords.kw_async)))) {
	tryToParseJSFunction();
	break;
	}
	if ((Style.Language == FormatStyle::LK_JavaScript \|\|
	Style.Language == FormatStyle::LK_Java) &&
	FormatTok->is(Keywords.kw_interface)) {
	if (Style.Language == FormatStyle::LK_JavaScript) {
	// In JavaScript/TypeScript, "interface" can be used as a standalone
	// identifier, e.g. in `var interface = 1;`. If "interface" is
	// followed by another identifier, it is very like to be an actual
	// interface declaration.
	unsigned StoredPosition = Tokens->getPosition();
	FormatToken *Next = Tokens->getNextToken();
	FormatTok = Tokens->setPosition(StoredPosition);
	if (Next && !mustBeJSIdent(Keywords, Next)) {
	nextToken();
	break;
	}
	}
	parseRecord();
	addUnwrappedLine();
	return;
	}

	// See if the following token should start a new unwrapped line.
	StringRef Text = FormatTok->TokenText;
	nextToken();
	if (Line->Tokens.size() == 1 &&
	// JS doesn't have macros, and within classes colons indicate fields,
	// not labels.
	Style.Language != FormatStyle::LK_JavaScript) {
	if (FormatTok->Tok.is(tok::colon) && !Line->MustBeDeclaration) {
	Line->Tokens.begin()->Tok->MustBreakBefore = true;
	parseLabel();
	return;
	}
	// Recognize function-like macro usages without trailing semicolon as
	// well as free-standing macros like Q_OBJECT.
	bool FunctionLike = FormatTok->is(tok::l_paren);
	if (FunctionLike)
	parseParens();

	bool FollowedByNewline =
	CommentsBeforeNextToken.empty()
	? FormatTok->NewlinesBefore > 0
	: CommentsBeforeNextToken.front()->NewlinesBefore > 0;

	if (FollowedByNewline && (Text.size() >= 5 \|\| FunctionLike) &&
	tokenCanStartNewLine(FormatTok->Tok) && Text == Text.upper()) {
	addUnwrappedLine();
	return;
	}
	}
	break;
	}
	case tok::equal:
	// Fat arrows (=>) have tok::TokenKind tok::equal but TokenType
	// TT_JsFatArrow. The always start an expression or a child block if
	// followed by a curly.
	if (FormatTok->is(TT_JsFatArrow)) {
	nextToken();
	if (FormatTok->is(tok::l_brace))
	parseChildBlock();
	break;
	}

	nextToken();
	if (FormatTok->Tok.is(tok::l_brace)) {
	nextToken();
	parseBracedList();
	} else if (Style.Language == FormatStyle::LK_Proto &&
	FormatTok->Tok.is(tok::less)) {
	nextToken();
	parseBracedList(/ContinueOnSemicolons=/false,
	/ClosingBraceKind=/tok::greater);
	}
	break;
	case tok::l_square:
	parseSquare();
	break;
	case tok::kw_new:
	parseNew();
	break;
	default:
	nextToken();
	break;
	}
	} while (!eof());
	}

	bool UnwrappedLineParser::tryToParseLambda() {
	if (!Style.isCpp()) {
	nextToken();
	return false;
	}
	assert(FormatTok->is(tok::l_square));
	FormatToken &LSquare = *FormatTok;
	if (!tryToParseLambdaIntroducer())
	return false;

	while (FormatTok->isNot(tok::l_brace)) {
	if (FormatTok->isSimpleTypeSpecifier()) {
	nextToken();
	continue;
	}
	switch (FormatTok->Tok.getKind()) {
	case tok::l_brace:
	break;
	case tok::l_paren:
	parseParens();
	break;
	case tok::amp:
	case tok::star:
	case tok::kw_const:
	case tok::comma:
	case tok::less:
	case tok::greater:
	case tok::identifier:
	case tok::numeric_constant:
	case tok::coloncolon:
	case tok::kw_mutable:
	nextToken();
	break;
	case tok::arrow:
	FormatTok->Type = TT_LambdaArrow;
	nextToken();
	break;
	default:
	return true;
	}
	}
	LSquare.Type = TT_LambdaLSquare;
	parseChildBlock();
	return true;
	}

	// Decides whether the current '[' can begin a lambda introducer and, if so,
	// parses the bracketed list.
	//
	// Returns false (after consuming only the '[') when the previous token is an
	// identifier, 'operator', 'new', 'delete', a scope-closing token or a simple
	// type specifier, or when the '[' starts a C++ structured binding. Otherwise
	// consumes the '[', parses up to the matching ']' and returns true.
	bool UnwrappedLineParser::tryToParseLambdaIntroducer() {
	  const FormatToken *Previous = FormatTok->Previous;
	  if (Previous &&
	      (Previous->isOneOf(tok::identifier, tok::kw_operator, tok::kw_new,
	                         tok::kw_delete) ||
	       FormatTok->isCppStructuredBinding(Style) || Previous->closesScope() ||
	       Previous->isSimpleTypeSpecifier())) {
	    nextToken();
	    return false;
	  }
	  nextToken();
	  parseSquare(/*LambdaIntroducer=*/true);
	  return true;
	}

	// Parses a JavaScript function starting at 'function' or 'async function':
	// optional '*', optional name, parameter list, optional ': type', and the
	// body as a child block. Returns without parsing a body when no '(' follows
	// the name or when a ';' is found where the body would start.
	void UnwrappedLineParser::tryToParseJSFunction() {
	  assert(FormatTok->is(Keywords.kw_function) ||
	         FormatTok->startsSequence(Keywords.kw_async, Keywords.kw_function));
	  if (FormatTok->is(Keywords.kw_async))
	    nextToken();
	  // Consume "function".
	  nextToken();

	  // Consume * (generator function). Treat it like C++'s overloaded operators.
	  if (FormatTok->is(tok::star)) {
	    FormatTok->Type = TT_OverloadedOperator;
	    nextToken();
	  }

	  // Consume function name.
	  if (FormatTok->is(tok::identifier))
	    nextToken();

	  if (FormatTok->isNot(tok::l_paren))
	    return;

	  // Parse formal parameter list.
	  parseParens();

	  if (FormatTok->is(tok::colon)) {
	    // Parse a type definition.
	    nextToken();

	    // Eat the type declaration. For braced inline object types, balance braces,
	    // otherwise just parse until finding an l_brace for the function body.
	    if (FormatTok->is(tok::l_brace))
	      tryToParseBracedList();
	    else
	      while (!FormatTok->isOneOf(tok::l_brace, tok::semi) && !eof())
	        nextToken();
	  }

	  if (FormatTok->is(tok::semi))
	    return;

	  parseChildBlock();
	}

	bool UnwrappedLineParser::tryToParseBracedList() {
	if (FormatTok->BlockKind == BK_Unknown)
	calculateBraceTypes();
	assert(FormatTok->BlockKind != BK_Unknown);
	if (FormatTok->BlockKind == BK_Block)
	return false;
	nextToken();
	parseBracedList();
	return true;
	}

	// Parses the contents of a braced list up to and including the closing
	// token of kind \p ClosingBraceKind. The opening token must already have
	// been consumed.
	//
	// \param ContinueOnSemicolons when false, parsing stops at the first ';'
	//        (outside JavaScript) and the function returns false.
	// \returns true if the closing token was found and no ';' error was seen;
	//          false otherwise, including when eof() is reached first.
	bool UnwrappedLineParser::parseBracedList(bool ContinueOnSemicolons,
	                                          tok::TokenKind ClosingBraceKind) {
	  bool HasError = false;

	  // FIXME: Once we have an expression parser in the UnwrappedLineParser,
	  // replace this by using parseAssignmentExpression() inside.
	  do {
	    if (Style.Language == FormatStyle::LK_JavaScript) {
	      if (FormatTok->is(Keywords.kw_function) ||
	          FormatTok->startsSequence(Keywords.kw_async, Keywords.kw_function)) {
	        tryToParseJSFunction();
	        continue;
	      }
	      if (FormatTok->is(TT_JsFatArrow)) {
	        nextToken();
	        // Fat arrows can be followed by simple expressions or by child blocks
	        // in curly braces.
	        if (FormatTok->is(tok::l_brace)) {
	          parseChildBlock();
	          continue;
	        }
	      }
	      if (FormatTok->is(tok::l_brace)) {
	        // Could be a method inside of a braced list `{a() { return 1; }}`.
	        if (tryToParseBracedList())
	          continue;
	        parseChildBlock();
	      }
	    }
	    if (FormatTok->Tok.getKind() == ClosingBraceKind) {
	      nextToken();
	      return !HasError;
	    }
	    switch (FormatTok->Tok.getKind()) {
	    case tok::caret:
	      nextToken();
	      if (FormatTok->is(tok::l_brace)) {
	        parseChildBlock();
	      }
	      break;
	    case tok::l_square:
	      tryToParseLambda();
	      break;
	    case tok::l_paren:
	      parseParens();
	      // JavaScript can just have free standing methods and getters/setters in
	      // object literals. Detect them by a "{" following ")".
	      if (Style.Language == FormatStyle::LK_JavaScript &&
	          FormatTok->is(tok::l_brace))
	        parseChildBlock();
	      break;
	    case tok::l_brace:
	      // Assume there are no blocks inside a braced init list apart
	      // from the ones we explicitly parse out (like lambdas).
	      FormatTok->BlockKind = BK_BracedInit;
	      nextToken();
	      parseBracedList();
	      break;
	    case tok::less:
	      if (Style.Language == FormatStyle::LK_Proto) {
	        nextToken();
	        parseBracedList(/*ContinueOnSemicolons=*/false,
	                        /*ClosingBraceKind=*/tok::greater);
	      } else {
	        nextToken();
	      }
	      break;
	    case tok::semi:
	      // JavaScript (or more precisely TypeScript) can have semicolons in braced
	      // lists (in so-called TypeMemberLists). Thus, the semicolon cannot be
	      // used for error recovery if we have otherwise determined that this is
	      // a braced list.
	      if (Style.Language == FormatStyle::LK_JavaScript) {
	        nextToken();
	        break;
	      }
	      HasError = true;
	      if (!ContinueOnSemicolons)
	        return !HasError;
	      nextToken();
	      break;
	    case tok::comma:
	      nextToken();
	      break;
	    default:
	      nextToken();
	      break;
	    }
	  } while (!eof());
	  return false;
	}

	// Parses a parenthesized token sequence starting at the current '(' up to
	// and including the matching ')'. Nested parentheses, lambdas, braced lists
	// and child blocks are handled recursively. Stops without consuming at a '}'
	// and stops when eof() is reached.
	void UnwrappedLineParser::parseParens() {
	  assert(FormatTok->Tok.is(tok::l_paren) && "'(' expected.");
	  nextToken();
	  do {
	    switch (FormatTok->Tok.getKind()) {
	    case tok::l_paren:
	      parseParens();
	      if (Style.Language == FormatStyle::LK_Java && FormatTok->is(tok::l_brace))
	        parseChildBlock();
	      break;
	    case tok::r_paren:
	      nextToken();
	      return;
	    case tok::r_brace:
	      // A "}" inside parenthesis is an error if there wasn't a matching "{".
	      return;
	    case tok::l_square:
	      tryToParseLambda();
	      break;
	    case tok::l_brace:
	      if (!tryToParseBracedList())
	        parseChildBlock();
	      break;
	    case tok::at:
	      nextToken();
	      if (FormatTok->Tok.is(tok::l_brace)) {
	        nextToken();
	        parseBracedList();
	      }
	      break;
	    case tok::kw_class:
	      if (Style.Language == FormatStyle::LK_JavaScript)
	        parseRecord(/*ParseAsExpr=*/true);
	      else
	        nextToken();
	      break;
	    case tok::identifier:
	      if (Style.Language == FormatStyle::LK_JavaScript &&
	          (FormatTok->is(Keywords.kw_function) ||
	           FormatTok->startsSequence(Keywords.kw_async, Keywords.kw_function)))
	        tryToParseJSFunction();
	      else
	        nextToken();
	      break;
	    default:
	      nextToken();
	      break;
	    }
	  } while (!eof());
	}

	void UnwrappedLineParser::parseSquare(bool LambdaIntroducer) {
	if (!LambdaIntroducer) {
	assert(FormatTok->Tok.is(tok::l_square) && "'[' expected.");
	if (tryToParseLambda())
	return;
	}
	do {
	switch (FormatTok->Tok.getKind()) {
	case tok::l_paren:
	parseParens();
	break;
	case tok::r_square:
	nextToken();
	return;
	case tok::r_brace:
	// A "}" inside parenthesis is an error if there wasn't a matching "{".
	return;
	case tok::l_square:
	parseSquare();
	break;
	case tok::l_brace: {
	if (!tryToParseBracedList())
	parseChildBlock();
	break;
	}
	case tok::at:
	nextToken();
	if (FormatTok->Tok.is(tok::l_brace)) {
	nextToken();
	parseBracedList();
	}
	break;
	default:
	nextToken();
	break;
	}
	} while (!eof());
	}

	// Parses an 'if' statement starting at the 'if' keyword: optional
	// 'constexpr', the parenthesized condition, the then-branch, and an optional
	// 'else' branch ('else if' chains are handled by recursion).
	void UnwrappedLineParser::parseIfThenElse() {
	  assert(FormatTok->Tok.is(tok::kw_if) && "'if' expected");
	  nextToken();
	  if (FormatTok->Tok.is(tok::kw_constexpr))
	    nextToken();
	  if (FormatTok->Tok.is(tok::l_paren))
	    parseParens();
	  // Set when a braced then-branch was parsed without ending the line, so
	  // that a following 'else' can stay on the line of the closing brace.
	  bool NeedsUnwrappedLine = false;
	  if (FormatTok->Tok.is(tok::l_brace)) {
	    CompoundStatementIndenter Indenter(this, Style, Line->Level);
	    parseBlock(/*MustBeDeclaration=*/false);
	    if (Style.BraceWrapping.BeforeElse)
	      addUnwrappedLine();
	    else
	      NeedsUnwrappedLine = true;
	  } else {
	    // Unbraced then-branch: a single structural element, one level deeper.
	    addUnwrappedLine();
	    ++Line->Level;
	    parseStructuralElement();
	    --Line->Level;
	  }
	  if (FormatTok->Tok.is(tok::kw_else)) {
	    nextToken();
	    if (FormatTok->Tok.is(tok::l_brace)) {
	      CompoundStatementIndenter Indenter(this, Style, Line->Level);
	      parseBlock(/*MustBeDeclaration=*/false);
	      addUnwrappedLine();
	    } else if (FormatTok->Tok.is(tok::kw_if)) {
	      parseIfThenElse();
	    } else {
	      addUnwrappedLine();
	      ++Line->Level;
	      parseStructuralElement();
	      if (FormatTok->is(tok::eof))
	        addUnwrappedLine();
	      --Line->Level;
	    }
	  } else if (NeedsUnwrappedLine) {
	    addUnwrappedLine();
	  }
	}

	// Parses a 'try' / '__try' statement together with all handlers following
	// it.
	//
	// Handles, in order: a function-try-block initializer list (after ':'), a
	// Java try-with-resources parenthesized clause, the try body, and then any
	// number of catch / __except / __finally / finally / @catch / @finally
	// clauses. Returns early, without finishing the current line, if a handler
	// runs into ';', '}' or EOF before its '{'.
	void UnwrappedLineParser::parseTryCatch() {
	  assert(FormatTok->isOneOf(tok::kw_try, tok::kw___try) && "'try' expected");
	  nextToken();
	  bool NeedsUnwrappedLine = false;
	  if (FormatTok->is(tok::colon)) {
	    // We are in a function try block, what comes is an initializer list.
	    nextToken();
	    while (FormatTok->is(tok::identifier)) {
	      nextToken();
	      if (FormatTok->is(tok::l_paren))
	        parseParens();
	      if (FormatTok->is(tok::comma))
	        nextToken();
	    }
	  }
	  // Parse try with resource.
	  if (Style.Language == FormatStyle::LK_Java && FormatTok->is(tok::l_paren)) {
	    parseParens();
	  }
	  if (FormatTok->is(tok::l_brace)) {
	    CompoundStatementIndenter Indenter(this, Style, Line->Level);
	    parseBlock(/*MustBeDeclaration=*/false);
	    if (Style.BraceWrapping.BeforeCatch) {
	      addUnwrappedLine();
	    } else {
	      NeedsUnwrappedLine = true;
	    }
	  } else if (!FormatTok->is(tok::kw_catch)) {
	    // The C++ standard requires a compound-statement after a try.
	    // If there's none, we try to assume there's a structuralElement
	    // and try to continue.
	    addUnwrappedLine();
	    ++Line->Level;
	    parseStructuralElement();
	    --Line->Level;
	  }
	  while (1) {
	    // Skip the '@' of an Objective-C @catch / @finally.
	    if (FormatTok->is(tok::at))
	      nextToken();
	    if (!(FormatTok->isOneOf(tok::kw_catch, Keywords.kw___except,
	                             tok::kw___finally) ||
	          ((Style.Language == FormatStyle::LK_Java ||
	            Style.Language == FormatStyle::LK_JavaScript) &&
	           FormatTok->is(Keywords.kw_finally)) ||
	          (FormatTok->Tok.isObjCAtKeyword(tok::objc_catch) ||
	           FormatTok->Tok.isObjCAtKeyword(tok::objc_finally))))
	      break;
	    nextToken();
	    // Skip everything between the handler keyword and its body.
	    while (FormatTok->isNot(tok::l_brace)) {
	      if (FormatTok->is(tok::l_paren)) {
	        parseParens();
	        continue;
	      }
	      if (FormatTok->isOneOf(tok::semi, tok::r_brace, tok::eof))
	        return;
	      nextToken();
	    }
	    NeedsUnwrappedLine = false;
	    CompoundStatementIndenter Indenter(this, Style, Line->Level);
	    parseBlock(/*MustBeDeclaration=*/false);
	    if (Style.BraceWrapping.BeforeCatch)
	      addUnwrappedLine();
	    else
	      NeedsUnwrappedLine = true;
	  }
	  if (NeedsUnwrappedLine)
	    addUnwrappedLine();
	}

	// Parses a namespace: the 'namespace' keyword, an optional (possibly
	// '::'-qualified) name and, if present, the braced body.
	//
	// The body gets an extra indentation level when NamespaceIndentation is
	// NI_All, or when it is NI_Inner and DeclarationScopeStack has more than one
	// entry. If no '{' follows the name, nothing further is consumed.
	void UnwrappedLineParser::parseNamespace() {
	  assert(FormatTok->Tok.is(tok::kw_namespace) && "'namespace' expected");

	  const FormatToken &InitialToken = *FormatTok;
	  nextToken();
	  while (FormatTok->isOneOf(tok::identifier, tok::coloncolon))
	    nextToken();
	  if (FormatTok->Tok.is(tok::l_brace)) {
	    if (ShouldBreakBeforeBrace(Style, InitialToken))
	      addUnwrappedLine();

	    bool AddLevel = Style.NamespaceIndentation == FormatStyle::NI_All ||
	                    (Style.NamespaceIndentation == FormatStyle::NI_Inner &&
	                     DeclarationScopeStack.size() > 1);
	    parseBlock(/*MustBeDeclaration=*/true, AddLevel);
	    // Munch the semicolon after a namespace. This is more common than one
	    // would think. Putting the semicolon into its own line is very ugly.
	    if (FormatTok->Tok.is(tok::semi))
	      nextToken();
	    addUnwrappedLine();
	  }
	  // FIXME: Add error handling.
	}

	// Parses a 'new' expression.
	//
	// For every language except Java only the 'new' keyword itself is consumed.
	// For Java, tokens are consumed up to and including the argument parens,
	// plus an anonymous class body if one follows; parsing stops early (without
	// consuming the token) on ';', '{' or '}' before the parens, or at EOF.
	void UnwrappedLineParser::parseNew() {
	assert(FormatTok->is(tok::kw_new) && "'new' expected");
	nextToken();
	if (Style.Language != FormatStyle::LK_Java)
	return;

	// In Java, we can parse everything up to the parens, which aren't optional.
	do {
	// There should not be a ;, { or } before the new's open paren.
	if (FormatTok->isOneOf(tok::semi, tok::l_brace, tok::r_brace))
	return;

	// Consume the parens.
	if (FormatTok->is(tok::l_paren)) {
	parseParens();

	// If there is a class body of an anonymous class, consume that as child.
	if (FormatTok->is(tok::l_brace))
	parseChildBlock();
	return;
	}
	nextToken();
	} while (!eof());
	}

	// Parses a 'for' loop, a 'while' loop or a foreach-macro loop: the keyword
	// (plus JavaScript's 'await' in "for await (...)"), the parenthesized header
	// and the loop body, which is either a braced block or a single structural
	// element indented one level deeper.
	void UnwrappedLineParser::parseForOrWhileLoop() {
	  assert(FormatTok->isOneOf(tok::kw_for, tok::kw_while, TT_ForEachMacro) &&
	         "'for', 'while' or foreach macro expected");
	  nextToken();
	  // JS' for await ( ...
	  if (Style.Language == FormatStyle::LK_JavaScript &&
	      FormatTok->is(Keywords.kw_await))
	    nextToken();
	  if (FormatTok->Tok.is(tok::l_paren))
	    parseParens();
	  if (FormatTok->Tok.is(tok::l_brace)) {
	    CompoundStatementIndenter Indenter(this, Style, Line->Level);
	    parseBlock(/*MustBeDeclaration=*/false);
	    addUnwrappedLine();
	  } else {
	    addUnwrappedLine();
	    ++Line->Level;
	    parseStructuralElement();
	    --Line->Level;
	  }
	}

	// Parses a do-while loop: the 'do' keyword, the body (braced block or a
	// single structural element) and the trailing 'while (...)' part.
	//
	// After a braced body the line is only ended when
	// Style.BraceWrapping.IndentBraces is set, so that otherwise the 'while' can
	// stay on the line of the closing brace. If no 'while' follows the body, the
	// current line is finished and parsing returns.
	void UnwrappedLineParser::parseDoWhile() {
	  assert(FormatTok->Tok.is(tok::kw_do) && "'do' expected");
	  nextToken();
	  if (FormatTok->Tok.is(tok::l_brace)) {
	    CompoundStatementIndenter Indenter(this, Style, Line->Level);
	    parseBlock(/*MustBeDeclaration=*/false);
	    if (Style.BraceWrapping.IndentBraces)
	      addUnwrappedLine();
	  } else {
	    addUnwrappedLine();
	    ++Line->Level;
	    parseStructuralElement();
	    --Line->Level;
	  }

	  // FIXME: Add error handling.
	  if (!FormatTok->Tok.is(tok::kw_while)) {
	    addUnwrappedLine();
	    return;
	  }

	  // Consume 'while' and let the generic parser handle "(...);".
	  nextToken();
	  parseStructuralElement();
	}

	// Parses the remainder of a label, starting at the token that ends it (the
	// current token is consumed unconditionally).
	//
	// The label line itself is emitted one level less indented than the
	// surrounding code where possible; the previous level is restored
	// afterwards. A '{' directly following the label (with no comments in
	// between) is parsed as the label's block.
	void UnwrappedLineParser::parseLabel() {
	  nextToken();
	  unsigned OldLineLevel = Line->Level;
	  // Outdent the label, but never below level 0 (or level 1 inside a
	  // preprocessor directive).
	  if (Line->Level > 1 || (!Line->InPPDirective && Line->Level > 0))
	    --Line->Level;
	  if (CommentsBeforeNextToken.empty() && FormatTok->Tok.is(tok::l_brace)) {
	    CompoundStatementIndenter Indenter(this, Style, Line->Level);
	    parseBlock(/*MustBeDeclaration=*/false);
	    // A 'break' right after the block is parsed as part of this label.
	    if (FormatTok->Tok.is(tok::kw_break)) {
	      if (Style.BraceWrapping.AfterControlStatement)
	        addUnwrappedLine();
	      parseStructuralElement();
	    }
	    addUnwrappedLine();
	  } else {
	    if (FormatTok->is(tok::semi))
	      nextToken();
	    addUnwrappedLine();
	  }
	  Line->Level = OldLineLevel;
	  if (FormatTok->isNot(tok::l_brace)) {
	    parseStructuralElement();
	    addUnwrappedLine();
	  }
	}

	// Parses a 'case' label: skips the keyword and everything after it up to
	// (but not including) the next ':' or EOF, then hands over to parseLabel().
	void UnwrappedLineParser::parseCaseLabel() {
	  assert(FormatTok->Tok.is(tok::kw_case) && "'case' expected");
	  // FIXME: fix handling of complex expressions here.
	  nextToken();
	  while (!eof() && FormatTok->Tok.isNot(tok::colon))
	    nextToken();
	  parseLabel();
	}

	// Parses a 'switch' statement: the keyword, the parenthesized condition and
	// the body, which is either a braced block or a single structural element
	// indented one level deeper.
	void UnwrappedLineParser::parseSwitch() {
	  assert(FormatTok->Tok.is(tok::kw_switch) && "'switch' expected");
	  nextToken();
	  if (FormatTok->Tok.is(tok::l_paren))
	    parseParens();
	  if (FormatTok->Tok.is(tok::l_brace)) {
	    CompoundStatementIndenter Indenter(this, Style, Line->Level);
	    parseBlock(/*MustBeDeclaration=*/false);
	    addUnwrappedLine();
	  } else {
	    addUnwrappedLine();
	    ++Line->Level;
	    parseStructuralElement();
	    --Line->Level;
	  }
	}

	// Parses an access specifier: consumes the current token, an optional Qt
	// 'slots' / 'Q_SLOTS'-style keyword and an optional ':', then finishes the
	// current unwrapped line.
	void UnwrappedLineParser::parseAccessSpecifier() {
	nextToken();
	// Understand Qt's slots.
	if (FormatTok->isOneOf(Keywords.kw_slots, Keywords.kw_qslots))
	nextToken();
	// Otherwise, we don't know what it is, and we'd better keep the next token.
	if (FormatTok->Tok.is(tok::colon))
	nextToken();
	addUnwrappedLine();
	}

	// Parses an enum declaration or definition.
	//
	// Returns false when the tokens turn out not to be an enum after all: in
	// JavaScript when 'enum' is followed by ':' or '?' (a property name), and in
	// C++ when two identifiers follow each other (likely an elaborated return
	// type). Returns true in every other case, including a plain declaration
	// without a body.
	bool UnwrappedLineParser::parseEnum() {
	  // Won't be 'enum' for NS_ENUMs.
	  if (FormatTok->Tok.is(tok::kw_enum))
	    nextToken();

	  // In TypeScript, "enum" can also be used as property name, e.g. in
	  // interface declarations. An "enum" keyword followed by a colon would be a
	  // syntax error and thus assume it is just an identifier.
	  if (Style.Language == FormatStyle::LK_JavaScript &&
	      FormatTok->isOneOf(tok::colon, tok::question))
	    return false;

	  // Eat up enum class ...
	  if (FormatTok->Tok.is(tok::kw_class) || FormatTok->Tok.is(tok::kw_struct))
	    nextToken();

	  while (FormatTok->Tok.getIdentifierInfo() ||
	         FormatTok->isOneOf(tok::colon, tok::coloncolon, tok::less,
	                            tok::greater, tok::comma, tok::question)) {
	    nextToken();
	    // We can have macros or attributes in between 'enum' and the enum name.
	    if (FormatTok->is(tok::l_paren))
	      parseParens();
	    if (FormatTok->is(tok::identifier)) {
	      nextToken();
	      // If there are two identifiers in a row, this is likely an elaborate
	      // return type. In Java, this can be "implements", etc.
	      if (Style.isCpp() && FormatTok->is(tok::identifier))
	        return false;
	    }
	  }

	  // Just a declaration or something is wrong.
	  if (FormatTok->isNot(tok::l_brace))
	    return true;
	  FormatTok->BlockKind = BK_Block;

	  if (Style.Language == FormatStyle::LK_Java) {
	    // Java enums are different.
	    parseJavaEnumBody();
	    return true;
	  }
	  if (Style.Language == FormatStyle::LK_Proto) {
	    parseBlock(/*MustBeDeclaration=*/true);
	    return true;
	  }

	  // Parse enum body.
	  nextToken();
	  bool HasError = !parseBracedList(/*ContinueOnSemicolons=*/true);
	  if (HasError) {
	    if (FormatTok->is(tok::semi))
	      nextToken();
	    addUnwrappedLine();
	  }

	  // There is no addUnwrappedLine() on the success path so that we fall
	  // through to parsing a structural element afterwards. Thus, in
	  // "enum A {} n, m;", "} n, m;" will end up in one unwrapped line.
	  return true;
	}

	void UnwrappedLineParser::parseJavaEnumBody() {
	// Determine whether the enum is simple, i.e. does not have a semicolon or
	// constants with class bodies. Simple enums can be formatted like braced
	// lists, contracted to a single line, etc.
	unsigned StoredPosition = Tokens->getPosition();
	bool IsSimple = true;
	FormatToken *Tok = Tokens->getNextToken();
	while (Tok) {
	if (Tok->is(tok::r_brace))
	break;
	if (Tok->isOneOf(tok::l_brace, tok::semi)) {
	IsSimple = false;
	break;
	}
	// FIXME: This will also mark enums with braces in the arguments to enum
	// constants as "not simple". This is probably fine in practice, though.
	Tok = Tokens->getNextToken();
	}
	FormatTok = Tokens->setPosition(StoredPosition);

	if (IsSimple) {
	nextToken();
	parseBracedList();
	addUnwrappedLine();
	return;
	}

	// Parse the body of a more complex enum.
	// First add a line for everything up to the "{".
	nextToken();
	addUnwrappedLine();
	++Line->Level;

	// Parse the enum constants.
	while (FormatTok) {
	if (FormatTok->is(tok::l_brace)) {
	// Parse the constant's class body.
	parseBlock(/MustBeDeclaration=/true, /AddLevel=/true,
	/MunchSemi=/false);
	} else if (FormatTok->is(tok::l_paren)) {
	parseParens();
	} else if (FormatTok->is(tok::comma)) {
	nextToken();
	addUnwrappedLine();
	} else if (FormatTok->is(tok::semi)) {
	nextToken();
	addUnwrappedLine();
	break;
	} else if (FormatTok->is(tok::r_brace)) {
	addUnwrappedLine();
	break;
	} else {
	nextToken();
	}
	}

	// Parse the class body after the enum's ";" if any.
	parseLevel(/HasOpeningBrace=/true);
	nextToken();
	--Line->Level;
	addUnwrappedLine();
	}

	void UnwrappedLineParser::parseRecord(bool ParseAsExpr) {
	const FormatToken &InitialToken = *FormatTok;
	nextToken();

	// The actual identifier can be a nested name specifier, and in macros
	// it is often token-pasted.
	while (FormatTok->isOneOf(tok::identifier, tok::coloncolon, tok::hashhash,
	tok::kw___attribute, tok::kw___declspec,
	tok::kw_alignas) \|\|
	((Style.Language == FormatStyle::LK_Java \|\|
	Style.Language == FormatStyle::LK_JavaScript) &&
	FormatTok->isOneOf(tok::period, tok::comma))) {
	if (Style.Language == FormatStyle::LK_JavaScript &&
	FormatTok->isOneOf(Keywords.kw_extends, Keywords.kw_implements)) {
	// JavaScript/TypeScript supports inline object types in
	// extends/implements positions:
	// class Foo implements {bar: number} { }
	nextToken();
	if (FormatTok->is(tok::l_brace)) {
	tryToParseBracedList();
	continue;
	}
	}
	bool IsNonMacroIdentifier =
	FormatTok->is(tok::identifier) &&
	FormatTok->TokenText != FormatTok->TokenText.upper();
	nextToken();
	// We can have macros or attributes in between 'class' and the class name.
	if (!IsNonMacroIdentifier && FormatTok->Tok.is(tok::l_paren))
	parseParens();
	}

	// Note that parsing away template declarations here leads to incorrectly
	// accepting function declarations as record declarations.
	// In general, we cannot solve this problem. Consider:
	// class A<int> B() {}
	// which can be a function definition or a class definition when B() is a
	// macro. If we find enough real-world cases where this is a problem, we
	// can parse for the 'template' keyword in the beginning of the statement,
	// and thus rule out the record production in case there is no template
	// (this would still leave us with an ambiguity between template function
	// and class declarations).
	if (FormatTok->isOneOf(tok::colon, tok::less)) {
	while (!eof()) {
	if (FormatTok->is(tok::l_brace)) {
	calculateBraceTypes(/ExpectClassBody=/true);
	if (!tryToParseBracedList())
	break;
	}
	if (FormatTok->Tok.is(tok::semi))
	return;
	nextToken();
	}
	}
	if (FormatTok->Tok.is(tok::l_brace)) {
	if (ParseAsExpr) {
	parseChildBlock();
	} else {
	if (ShouldBreakBeforeBrace(Style, InitialToken))
	addUnwrappedLine();

	parseBlock(/MustBeDeclaration=/true, /AddLevel=/true,
	/MunchSemi=/false);
	}
	}
	// There is no addUnwrappedLine() here so that we fall through to parsing a
	// structural element afterwards. Thus, in "class A {} n, m;",
	// "} n, m;" will end up in one unwrapped line.
	}

	// Parses an Objective-C protocol list "<...>": the current token must be
	// '<'; everything up to and including the next '>' is consumed (or up to EOF
	// if there is none).
	void UnwrappedLineParser::parseObjCProtocolList() {
	  assert(FormatTok->Tok.is(tok::less) && "'<' expected.");
	  // Consume '<', then keep going until '>' or EOF is the current token.
	  nextToken();
	  while (!eof() && FormatTok->Tok.isNot(tok::greater)) {
	    nextToken();
	  }
	  // Skip '>'.
	  nextToken();
	}

	// Parses the contents of an Objective-C container up to and including the
	// terminating @end, or up to EOF if there is none. Braced blocks are parsed
	// as blocks, stray '}' tokens are consumed on a line of their own, and
	// everything else is parsed as a structural element.
	void UnwrappedLineParser::parseObjCUntilAtEnd() {
	  do {
	    if (FormatTok->Tok.isObjCAtKeyword(tok::objc_end)) {
	      nextToken();
	      addUnwrappedLine();
	      break;
	    }
	    if (FormatTok->is(tok::l_brace)) {
	      parseBlock(/*MustBeDeclaration=*/false);
	      // In ObjC interfaces, nothing should be following the "}".
	      addUnwrappedLine();
	    } else if (FormatTok->is(tok::r_brace)) {
	      // Ignore stray "}". parseStructuralElement doesn't consume them.
	      nextToken();
	      addUnwrappedLine();
	    } else {
	      parseStructuralElement();
	    }
	  } while (!eof());
	}

	// Parses an Objective-C @interface or @implementation: the keyword, the
	// name, an optional base class or category, an optional protocol list, an
	// optional instance-variable block, and then the body up to @end.
	void UnwrappedLineParser::parseObjCInterfaceOrImplementation() {
	  nextToken();
	  nextToken(); // interface name

	  // @interface can be followed by either a base class, or a category.
	  if (FormatTok->Tok.is(tok::colon)) {
	    nextToken();
	    nextToken(); // base class name
	  } else if (FormatTok->Tok.is(tok::l_paren))
	    // Skip category, if present.
	    parseParens();

	  if (FormatTok->Tok.is(tok::less))
	    parseObjCProtocolList();

	  if (FormatTok->Tok.is(tok::l_brace)) {
	    if (Style.BraceWrapping.AfterObjCDeclaration)
	      addUnwrappedLine();
	    parseBlock(/*MustBeDeclaration=*/true);
	  }

	  // With instance variables, this puts '}' on its own line.  Without
	  // instance variables, this ends the @interface line.
	  addUnwrappedLine();

	  parseObjCUntilAtEnd();
	}

	// Parses an Objective-C @protocol: the keyword, the protocol name and an
	// optional protocol list. If a ';' follows, this is only a declaration and
	// the line is finished right away; otherwise the body is parsed up to @end.
	void UnwrappedLineParser::parseObjCProtocol() {
	nextToken();
	nextToken(); // protocol name

	if (FormatTok->Tok.is(tok::less))
	parseObjCProtocolList();

	// Check for protocol declaration.
	if (FormatTok->Tok.is(tok::semi)) {
	nextToken();
	return addUnwrappedLine();
	}

	addUnwrappedLine();
	parseObjCUntilAtEnd();
	}

	// Parses the start of a JavaScript ES6 'import' or 'export' statement.
	//
	// For "export [default] [async] function" only the leading keywords are
	// consumed so that the function is parsed as a free-standing one. For
	// imports and for `export *`, `export {...}` and `export "..."`, the rest of
	// the statement up to (not including) the terminating ';' is consumed. In
	// all other cases the function returns right after the keywords and leaves
	// the declaration or expression to the caller.
	void UnwrappedLineParser::parseJavaScriptEs6ImportExport() {
	  bool IsImport = FormatTok->is(Keywords.kw_import);
	  assert(IsImport || FormatTok->is(tok::kw_export));
	  nextToken();

	  // Consume the "default" in "export default class/function".
	  if (FormatTok->is(tok::kw_default))
	    nextToken();

	  // Consume "async function", "function" and "default function", so that
	  // these get parsed as free-standing JS functions, i.e. do not require a
	  // trailing semicolon.
	  if (FormatTok->is(Keywords.kw_async))
	    nextToken();
	  if (FormatTok->is(Keywords.kw_function)) {
	    nextToken();
	    return;
	  }

	  // For imports, `export *`, `export {...}`, consume the rest of the line up
	  // to the terminating `;`. For everything else, just return and continue
	  // parsing the structural element, i.e. the declaration or expression for
	  // `export default`.
	  if (!IsImport && !FormatTok->isOneOf(tok::l_brace, tok::star) &&
	      !FormatTok->isStringLiteral())
	    return;

	  while (!eof()) {
	    if (FormatTok->is(tok::semi))
	      return;
	    if (Line->Tokens.empty()) {
	      // Common issue: Automatic Semicolon Insertion wrapped the line, so the
	      // import statement should terminate.
	      return;
	    }
	    if (FormatTok->is(tok::l_brace)) {
	      FormatTok->BlockKind = BK_Block;
	      nextToken();
	      parseBracedList();
	    } else {
	      nextToken();
	    }
	  }
	}

	// Dumps \p Line to llvm::dbgs(): a header with level, first start column and
	// a MACRO marker for preprocessor lines, then one entry per token, then
	// (recursively, prefixed with "\nChild: ") every child line of every token.
	LLVM_ATTRIBUTE_UNUSED static void printDebugInfo(const UnwrappedLine &Line,
	                                                 StringRef Prefix = "") {
	  llvm::dbgs() << Prefix << "Line(" << Line.Level
	               << ", FSC=" << Line.FirstStartColumn << ")"
	               << (Line.InPPDirective ? " MACRO" : "") << ": ";
	  // First pass: the tokens of this line.
	  for (const UnwrappedLineNode &Node : Line.Tokens) {
	    llvm::dbgs() << Node.Tok->Tok.getName() << "["
	                 << "T=" << Node.Tok->Type
	                 << ", OC=" << Node.Tok->OriginalColumn << "] ";
	  }
	  // Second pass: the child lines hanging off each token.
	  for (const UnwrappedLineNode &Node : Line.Tokens) {
	    for (const UnwrappedLine &Child : Node.Children) {
	      printDebugInfo(Child, "\nChild: ");
	    }
	  }
	  llvm::dbgs() << "\n";
	}

	// Finishes the unwrapped line currently being built: moves it to the end of
	// \c CurrentLines and resets the per-line state so that \c Line can be
	// reused. Does nothing if the current line has no tokens.
	void UnwrappedLineParser::addUnwrappedLine() {
	if (Line->Tokens.empty())
	return;
	DEBUG({
	if (CurrentLines == &Lines)
	printDebugInfo(*Line);
	});
	CurrentLines->push_back(std::move(*Line));
	// *Line was moved from; put it back into a well-defined empty state.
	Line->Tokens.clear();
	Line->MatchingOpeningBlockLineIndex = UnwrappedLine::kInvalidIndex;
	Line->FirstStartColumn = 0;
	// When adding to the main list, append the preprocessor directive lines
	// collected so far right after the line that was just finished.
	if (CurrentLines == &Lines && !PreprocessorDirectives.empty()) {
	CurrentLines->append(
	std::make_move_iterator(PreprocessorDirectives.begin()),
	std::make_move_iterator(PreprocessorDirectives.end()));
	PreprocessorDirectives.clear();
	}
	// Disconnect the current token from the last token on the previous line.
	FormatTok->Previous = nullptr;
	}

	// Returns true if the current token is the end-of-file token.
	bool UnwrappedLineParser::eof() const {
	  const bool AtEndOfFile = FormatTok->Tok.is(tok::eof);
	  return AtEndOfFile;
	}

	// Returns true if \p FormatTok is preceded by at least one newline and
	// either the current line is part of a preprocessor directive or the token
	// has an unescaped newline before it.
	bool UnwrappedLineParser::isOnNewLine(const FormatToken &FormatTok) {
	  return (Line->InPPDirective || FormatTok.HasUnescapedNewline) &&
	         FormatTok.NewlinesBefore > 0;
	}

	// Checks if \p FormatTok is a line comment that continues the line comment
	// section on \p Line.
	//
	// Returns false if \p Line has no tokens, or if the comment text (without
	// its leading "//" or "/*") matches \p CommentPragmasRegex. Otherwise the
	// decision is delegated to continuesLineComment() with the last token of
	// \p Line and the "min column token" computed below.
	static bool continuesLineCommentSection(const FormatToken &FormatTok,
	                                        const UnwrappedLine &Line,
	                                        llvm::Regex &CommentPragmasRegex) {
	  if (Line.Tokens.empty())
	    return false;

	  StringRef IndentContent = FormatTok.TokenText;
	  if (FormatTok.TokenText.startswith("//") ||
	      FormatTok.TokenText.startswith("/*"))
	    IndentContent = FormatTok.TokenText.substr(2);
	  if (CommentPragmasRegex.match(IndentContent))
	    return false;

	  // If Line starts with a line comment, then FormatTok continues the comment
	  // section if its original column is greater or equal to the original start
	  // column of the line.
	  //
	  // Define the min column token of a line as follows: if a line ends in '{'
	  // or contains a '{' followed by a line comment, then the min column token
	  // is that '{'. Otherwise, the min column token of the line is the first
	  // token of the line.
	  //
	  // If Line starts with a token other than a line comment, then FormatTok
	  // continues the comment section if its original column is greater than the
	  // original start column of the min column token of the line.
	  //
	  // For example, the second line comment continues the first in these cases:
	  //
	  // // first line
	  // // second line
	  //
	  // and:
	  //
	  // // first line
	  //  // second line
	  //
	  // and:
	  //
	  // int i; // first line
	  //  // second line
	  //
	  // and:
	  //
	  // do { // first line
	  //      // second line
	  //   int i;
	  // } while (true);
	  //
	  // and:
	  //
	  // enum {
	  //   a, // first line
	  //    // second line
	  //   b
	  // };
	  //
	  // The second line comment doesn't continue the first in these cases:
	  //
	  //   // first line
	  //  // second line
	  //
	  // and:
	  //
	  // int i; // first line
	  // // second line
	  //
	  // and:
	  //
	  // do { // first line
	  //   // second line
	  //   int i;
	  // } while (true);
	  //
	  // and:
	  //
	  // enum {
	  //   a, // first line
	  //   // second line
	  // };
	  const FormatToken *MinColumnToken = Line.Tokens.front().Tok;

	  // Scan for '{//'. If found, use the column of '{' as a min column for line
	  // comment section continuation.
	  const FormatToken *PreviousToken = nullptr;
	  for (const UnwrappedLineNode &Node : Line.Tokens) {
	    if (PreviousToken && PreviousToken->is(tok::l_brace) &&
	        isLineComment(*Node.Tok)) {
	      MinColumnToken = PreviousToken;
	      break;
	    }
	    PreviousToken = Node.Tok;

	    // Grab the last newline preceding a token in this unwrapped line.
	    if (Node.Tok->NewlinesBefore > 0) {
	      MinColumnToken = Node.Tok;
	    }
	  }
	  // The line ends in '{' (or the scan stopped right after one).
	  if (PreviousToken && PreviousToken->is(tok::l_brace)) {
	    MinColumnToken = PreviousToken;
	  }

	  return continuesLineComment(FormatTok, /*Previous=*/Line.Tokens.back().Tok,
	                              MinColumnToken);
	}

	void UnwrappedLineParser::flushComments(bool NewlineBeforeNext) {
	bool JustComments = Line->Tokens.empty();
	for (SmallVectorImpl<FormatToken *>::const_iterator
	I = CommentsBeforeNextToken.begin(),
	E = CommentsBeforeNextToken.end();
	I != E; ++I) {
	// Line comments that belong to the same line comment section are put on the
	// same line since later we might want to reflow content between them.
	// Additional fine-grained breaking of line comment sections is controlled
	// by the class BreakableLineCommentSection in case it is desirable to keep
	// several line comment sections in the same unwrapped line.
	//
	// FIXME: Consider putting separate line comment sections as children to the
	// unwrapped line instead.
	(*I)->ContinuesLineCommentSection =
	continuesLineCommentSection(*I, Line, CommentPragmasRegex);
	if (isOnNewLine(*I) && JustComments && !(I)->ContinuesLineCommentSection)
	addUnwrappedLine();
	pushToken(*I);
	}
	if (NewlineBeforeNext && JustComments)
	addUnwrappedLine();
	CommentsBeforeNextToken.clear();
	}

	// Appends the current token to the current line and reads the next one.
	//
	// Does nothing at EOF. Pending comments are flushed before the current token
	// is pushed. \p LevelDifference is forwarded to readToken(); it is not used
	// for JavaScript, where readTokenWithJavaScriptASI() is called instead.
	void UnwrappedLineParser::nextToken(int LevelDifference) {
	if (eof())
	return;
	flushComments(isOnNewLine(*FormatTok));
	pushToken(FormatTok);
	FormatToken *Previous = FormatTok;
	if (Style.Language != FormatStyle::LK_JavaScript)
	readToken(LevelDifference);
	else
	readTokenWithJavaScriptASI();
	// Link the newly read token back to the one that was just pushed.
	FormatTok->Previous = Previous;
	}

	// Decides for each comment in \p Comments whether it is appended to the
	// current line or stored in \c CommentsBeforeNextToken. \p NextTok is the
	// token following the comments and may be null.
	//
	// Once one comment has been deferred to \c CommentsBeforeNextToken, all
	// following comments are deferred as well.
	void UnwrappedLineParser::distributeComments(
	    const SmallVectorImpl<FormatToken *> &Comments,
	    const FormatToken *NextTok) {
	  // Whether or not a line comment token continues a line is controlled by
	  // the method continuesLineCommentSection, with the following caveat:
	  //
	  // Define a trail of Comments to be a nonempty proper postfix of Comments
	  // such that each comment line from the trail is aligned with the next
	  // token, if the next token exists. If a trail exists, the beginning of the
	  // maximal trail is marked as a start of a new comment section.
	  //
	  // For example in this code:
	  //
	  // int a; // line about a
	  // // line 1 about b
	  // // line 2 about b
	  // int b;
	  //
	  // the two lines about b form a maximal trail, so there are two sections,
	  // the first one consisting of the single comment "// line about a" and the
	  // second one consisting of the next two comments.
	  if (Comments.empty())
	    return;
	  bool ShouldPushCommentsInCurrentLine = true;
	  bool HasTrailAlignedWithNextToken = false;
	  unsigned StartOfTrailAlignedWithNextToken = 0;
	  if (NextTok) {
	    // We are skipping the first element intentionally.
	    for (unsigned i = Comments.size() - 1; i > 0; --i) {
	      if (Comments[i]->OriginalColumn == NextTok->OriginalColumn) {
	        HasTrailAlignedWithNextToken = true;
	        StartOfTrailAlignedWithNextToken = i;
	      }
	    }
	  }
	  for (unsigned i = 0, e = Comments.size(); i < e; ++i) {
	    FormatToken *FormatTok = Comments[i];
	    if (HasTrailAlignedWithNextToken &&
	        i == StartOfTrailAlignedWithNextToken) {
	      FormatTok->ContinuesLineCommentSection = false;
	    } else {
	      FormatTok->ContinuesLineCommentSection =
	          continuesLineCommentSection(*FormatTok, *Line, CommentPragmasRegex);
	    }
	    if (!FormatTok->ContinuesLineCommentSection &&
	        (isOnNewLine(*FormatTok) || FormatTok->IsFirst)) {
	      ShouldPushCommentsInCurrentLine = false;
	    }
	    if (ShouldPushCommentsInCurrentLine) {
	      pushToken(FormatTok);
	    } else {
	      CommentsBeforeNextToken.push_back(FormatTok);
	    }
	  }
	}

	// Reads the next non-comment token into \c FormatTok.
	//
	// On the way it parses preprocessor directives that start at the beginning
	// of a line, processes merge-conflict markers, skips tokens inside
	// unreachable preprocessor branches, and collects comments, which are
	// handed to distributeComments() once the next real token (or EOF) is
	// known. \p LevelDifference is added to the line level while a preprocessor
	// directive is parsed.
	void UnwrappedLineParser::readToken(int LevelDifference) {
	  SmallVector<FormatToken *, 1> Comments;
	  do {
	    FormatTok = Tokens->getNextToken();
	    assert(FormatTok);
	    while (!Line->InPPDirective && FormatTok->Tok.is(tok::hash) &&
	           (FormatTok->HasUnescapedNewline || FormatTok->IsFirst)) {
	      distributeComments(Comments, FormatTok);
	      Comments.clear();
	      // If there is an unfinished unwrapped line, we flush the preprocessor
	      // directives only after that unwrapped line was finished later.
	      bool SwitchToPreprocessorLines = !Line->Tokens.empty();
	      ScopedLineState BlockState(*this, SwitchToPreprocessorLines);
	      assert((LevelDifference >= 0 ||
	              static_cast<unsigned>(-LevelDifference) <= Line->Level) &&
	             "LevelDifference makes Line->Level negative");
	      Line->Level += LevelDifference;
	      // Comments stored before the preprocessor directive need to be output
	      // before the preprocessor directive, at the same level as the
	      // preprocessor directive, as we consider them to apply to the
	      // directive.
	      flushComments(isOnNewLine(*FormatTok));
	      parsePPDirective();
	    }
	    // Merge-conflict markers are handled like conditional compilation.
	    while (FormatTok->Type == TT_ConflictStart ||
	           FormatTok->Type == TT_ConflictEnd ||
	           FormatTok->Type == TT_ConflictAlternative) {
	      if (FormatTok->Type == TT_ConflictStart) {
	        conditionalCompilationStart(/*Unreachable=*/false);
	      } else if (FormatTok->Type == TT_ConflictAlternative) {
	        conditionalCompilationAlternative();
	      } else if (FormatTok->Type == TT_ConflictEnd) {
	        conditionalCompilationEnd();
	      }
	      FormatTok = Tokens->getNextToken();
	      FormatTok->MustBreakBefore = true;
	    }

	    // Skip tokens that sit in an unreachable preprocessor branch.
	    if (!PPStack.empty() && (PPStack.back().Kind == PP_Unreachable) &&
	        !Line->InPPDirective) {
	      continue;
	    }

	    if (!FormatTok->Tok.is(tok::comment)) {
	      distributeComments(Comments, FormatTok);
	      Comments.clear();
	      return;
	    }

	    Comments.push_back(FormatTok);
	  } while (!eof());

	  // EOF was reached: there is no next token to align the comments with.
	  distributeComments(Comments, nullptr);
	  Comments.clear();
	}

	// Appends \p Tok to the current line. If a forced break is pending
	// (\c MustBreakBeforeNextToken), it is applied to the appended token and the
	// pending flag is cleared.
	void UnwrappedLineParser::pushToken(FormatToken *Tok) {
	  Line->Tokens.push_back(UnwrappedLineNode(Tok));
	  if (!MustBreakBeforeNextToken)
	    return;
	  FormatToken *Appended = Line->Tokens.back().Tok;
	  Appended->MustBreakBefore = true;
	  MustBreakBeforeNextToken = false;
	}

	} // end namespace format
	} // end namespace clang
	Index: head/contrib/llvm/tools/clang/lib/Format/UnwrappedLineParser.h
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Format/UnwrappedLineParser.h (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/Format/UnwrappedLineParser.h (revision 329410)
	@@ -1,279 +1,292 @@
	//===--- UnwrappedLineParser.h - Format C++ code ----------------- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// \brief This file contains the declaration of the UnwrappedLineParser,
	/// which turns a stream of tokens into UnwrappedLines.
	///
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
	#define LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H

	#include "FormatToken.h"
	#include "clang/Basic/IdentifierTable.h"
	#include "clang/Format/Format.h"
	#include "llvm/Support/Regex.h"
	#include <list>
	#include <stack>

	namespace clang {
	namespace format {

	struct UnwrappedLineNode;

	/// \brief An unwrapped line is a sequence of \c Token, that we would like to
	/// put on a single line if there was no column limit.
	///
	/// This is used as a main interface between the \c UnwrappedLineParser and the
	/// \c UnwrappedLineFormatter. The key property is that changing the formatting
	/// within an unwrapped line does not affect any other unwrapped lines.
	struct UnwrappedLine {
	UnwrappedLine();

	// FIXME: Don't use std::list here.
	/// \brief The \c Tokens comprising this \c UnwrappedLine.
	std::list<UnwrappedLineNode> Tokens;

	/// \brief The indent level of the \c UnwrappedLine.
	unsigned Level;

	/// \brief Whether this \c UnwrappedLine is part of a preprocessor directive.
	bool InPPDirective;

	// NOTE(review): presumably records whether this line sits in a scope where
	// only declarations are expected (cf. the parser's DeclarationScopeStack) -
	// confirm against the code that sets it.
	bool MustBeDeclaration;

	/// \brief If this \c UnwrappedLine closes a block in a sequence of lines,
	/// \c MatchingOpeningBlockLineIndex stores the index of the corresponding
	/// opening line. Otherwise, \c MatchingOpeningBlockLineIndex must be
	/// \c kInvalidIndex.
	size_t MatchingOpeningBlockLineIndex;

	/// \brief Sentinel for "no index": -1 converted to \c size_t, i.e. the
	/// largest value \c size_t can hold.
	static const size_t kInvalidIndex = -1;

	// Defaults to 0 and is reset to 0 whenever the parser finishes a line.
	// NOTE(review): presumably the column at which the first token of the line
	// starts - confirm.
	unsigned FirstStartColumn = 0;
	};

	/// \brief Callback interface receiving \c UnwrappedLine objects.
	///
	/// NOTE(review): presumably implemented by the client of
	/// \c UnwrappedLineParser, which takes an \c UnwrappedLineConsumer in its
	/// constructor - the call sites are not part of this header excerpt.
	class UnwrappedLineConsumer {
	public:
	virtual ~UnwrappedLineConsumer() {}
	/// \brief Called with a finished unwrapped line.
	virtual void consumeUnwrappedLine(const UnwrappedLine &Line) = 0;
	/// \brief NOTE(review): presumably signals the end of one parsing run -
	/// confirm against the parser implementation.
	virtual void finishRun() = 0;
	};

	class FormatTokenSource;

	class UnwrappedLineParser {
	public:
	UnwrappedLineParser(const FormatStyle &Style,
	const AdditionalKeywords &Keywords,
	unsigned FirstStartColumn,
	ArrayRef<FormatToken *> Tokens,
	UnwrappedLineConsumer &Callback);

	void parse();

	private:
	void reset();
	void parseFile();
	void parseLevel(bool HasOpeningBrace);
	void parseBlock(bool MustBeDeclaration, bool AddLevel = true,
	bool MunchSemi = true);
	void parseChildBlock();
	void parsePPDirective();
	void parsePPDefine();
	void parsePPIf(bool IfDef);
	void parsePPElIf();
	void parsePPElse();
	void parsePPEndIf();
	void parsePPUnknown();
	void readTokenWithJavaScriptASI();
	void parseStructuralElement();
	bool tryToParseBracedList();
	bool parseBracedList(bool ContinueOnSemicolons = false,
	tok::TokenKind ClosingBraceKind = tok::r_brace);
	void parseParens();
	void parseSquare(bool LambdaIntroducer = false);
	void parseIfThenElse();
	void parseTryCatch();
	void parseForOrWhileLoop();
	void parseDoWhile();
	void parseLabel();
	void parseCaseLabel();
	void parseSwitch();
	void parseNamespace();
	void parseNew();
	void parseAccessSpecifier();
	bool parseEnum();
	void parseJavaEnumBody();
	// Parses a record (aka class) as a top level element. If ParseAsExpr is true,
	// parses the record as a child block, i.e. if the class declaration is an
	// expression.
	void parseRecord(bool ParseAsExpr = false);
	void parseObjCProtocolList();
	void parseObjCUntilAtEnd();
	void parseObjCInterfaceOrImplementation();
	void parseObjCProtocol();
	void parseJavaScriptEs6ImportExport();
	bool tryToParseLambda();
	bool tryToParseLambdaIntroducer();
	void tryToParseJSFunction();
	void addUnwrappedLine();
	bool eof() const;
	// LevelDifference is the difference of levels after and before the current
	// token. For example:
	// - if the token is '{' and opens a block, LevelDifference is 1.
	// - if the token is '}' and closes a block, LevelDifference is -1.
	void nextToken(int LevelDifference = 0);
	void readToken(int LevelDifference = 0);

	// Decides which comment tokens should be added to the current line and which
	// should be added as comments before the next token.
	//
	// Comments specifies the sequence of comment tokens to analyze. They get
	// either pushed to the current line or added to the comments before the next
	// token.
	//
	// NextTok specifies the next token. A null pointer NextTok is supported, and
	// signifies either the absence of a next token, or that the next token
	// shouldn't be taken into account for the analysis.
	void distributeComments(const SmallVectorImpl<FormatToken *> &Comments,
	const FormatToken *NextTok);

	// Adds the comment preceding the next token to unwrapped lines.
	void flushComments(bool NewlineBeforeNext);
	void pushToken(FormatToken *Tok);
	void calculateBraceTypes(bool ExpectClassBody = false);

	// Marks a conditional compilation edge (for example, an '#if', '#ifdef',
	// '#else' or merge conflict marker). If 'Unreachable' is true, assumes
	// this branch either cannot be taken (for example '#if false'), or should
	// not be taken in this round.
	void conditionalCompilationCondition(bool Unreachable);
	void conditionalCompilationStart(bool Unreachable);
	void conditionalCompilationAlternative();
	void conditionalCompilationEnd();

	bool isOnNewLine(const FormatToken &FormatTok);

	// Compute hash of the current preprocessor branch.
	// This is used to identify the different branches, and thus track if block
	// open and close in the same branch.
	size_t computePPHash() const;

	// FIXME: We are constantly running into bugs where Line.Level is incorrectly
	// subtracted from beyond 0. Introduce a method to subtract from Line.Level
	// and use that everywhere in the Parser.
	std::unique_ptr<UnwrappedLine> Line;

	// Comments are sorted into unwrapped lines by whether they are in the same
	// line as the previous token, or not. If not, they belong to the next token.
	// Since the next token might already be in a new unwrapped line, we need to
	// store the comments belonging to that token.
	SmallVector<FormatToken *, 1> CommentsBeforeNextToken;
	FormatToken *FormatTok;
	bool MustBreakBeforeNextToken;

	// The parsed lines. Only added to through \c CurrentLines.
	SmallVector<UnwrappedLine, 8> Lines;

	// Preprocessor directives are parsed out-of-order from other unwrapped lines.
	// Thus, we need to keep a list of preprocessor directives to be reported
	// after an unwrapped line that has been started was finished.
	SmallVector<UnwrappedLine, 4> PreprocessorDirectives;

	// New unwrapped lines are added via CurrentLines.
	// Usually points to \c &Lines. While parsing a preprocessor directive when
	// there is an unfinished previous unwrapped line, will point to
	// \c &PreprocessorDirectives.
	SmallVectorImpl<UnwrappedLine> *CurrentLines;

	// We store for each line whether it must be a declaration depending on
	// whether we are in a compound statement or not.
	std::vector<bool> DeclarationScopeStack;

	const FormatStyle &Style;
	const AdditionalKeywords &Keywords;

	llvm::Regex CommentPragmasRegex;

	FormatTokenSource *Tokens;
	UnwrappedLineConsumer &Callback;

	// FIXME: This is a temporary measure until we have reworked the ownership
	// of the format tokens. The goal is to have the actual tokens created and
	// owned outside of and handed into the UnwrappedLineParser.
	ArrayRef<FormatToken *> AllTokens;

	// Represents preprocessor branch type, so we can find matching
	// #if/#else/#endif directives.
	// The two kinds of preprocessor branch distinguished here.
	enum PPBranchKind {
	PP_Conditional, // Any #if, #ifdef, #ifndef or #elif block outside of '#if 0'.
	PP_Unreachable // '#if 0', or a conditional preprocessor block inside '#if 0'.
	};

	// A single preprocessor branch record: its kind plus a caller-supplied
	// size_t stored verbatim in Line.
	struct PPBranch {
	// Stores both arguments unchanged.
	PPBranch(PPBranchKind Kind, size_t Line) : Kind(Kind), Line(Line) {}
	PPBranchKind Kind;
	// NOTE(review): presumably identifies the line on which the branch was
	// opened; confirm against the code that constructs PPBranch objects.
	size_t Line;
	};

	// Keeps a stack of currently active preprocessor branching directives.
	SmallVector<PPBranch, 16> PPStack;

	// The \c UnwrappedLineParser re-parses the code for each combination
	// of preprocessor branches that can be taken.
	// To that end, we take the same branch (#if, #else, or one of the #elif
	// branches) for each nesting level of preprocessor branches.
	// \c PPBranchLevel stores the current nesting level of preprocessor
	// branches during one pass over the code.
	int PPBranchLevel;

	// Contains the current branch (#if, #else or one of the #elif branches)
	// for each nesting level.
	SmallVector<int, 8> PPLevelBranchIndex;

	// Contains the maximum number of branches at each nesting level.
	SmallVector<int, 8> PPLevelBranchCount;

	// Contains the number of branches per nesting level we are currently
	// in while parsing a preprocessor branch sequence.
	// This is used to update PPLevelBranchCount at the end of a branch
	// sequence.
	std::stack<int> PPChainBranchIndex;

	- // Contains the #ifndef condition for a potential include guard.
	- FormatToken *IfNdefCondition;
	- bool FoundIncludeGuardStart;
	- bool IncludeGuardRejected;
	+ // Include guard search state. Used to fixup preprocessor indent levels
	+ // so that include guards do not participate in indentation.
	+ enum IncludeGuardState {
	+ IG_Inited, // Search started, looking for #ifndef.
	+ IG_IfNdefed, // #ifndef found, IncludeGuardToken points to condition.
	+ IG_Defined, // Matching #define found, checking other requirements.
	+ IG_Found, // All requirements met, need to fix indents.
	+ IG_Rejected, // Search failed or never started.
	+ };
	+
	+ // Current state of include guard search.
	+ IncludeGuardState IncludeGuard;
	+
	+ // Points to the #ifndef condition for a potential include guard. Null unless
	+ // IncludeGuardState == IG_IfNdefed.
	+ FormatToken *IncludeGuardToken;
	+
	// Contains the first start column where the source begins. This is zero for
	// normal source code and may be nonzero when formatting a code fragment that
	// does not start at the beginning of the file.
	unsigned FirstStartColumn;

	friend class ScopedLineState;
	friend class CompoundStatementIndenter;
	};

	struct UnwrappedLineNode {
	UnwrappedLineNode() : Tok(nullptr) {}
	UnwrappedLineNode(FormatToken *Tok) : Tok(Tok) {}

	FormatToken *Tok;
	SmallVector<UnwrappedLine, 0> Children;
	};

	// Default constructor: sets Level to 0, clears InPPDirective and
	// MustBeDeclaration, and sets MatchingOpeningBlockLineIndex to kInvalidIndex.
	// Members not named here are left to their own default initialization.
	inline UnwrappedLine::UnwrappedLine()
	: Level(0), InPPDirective(false), MustBeDeclaration(false),
	MatchingOpeningBlockLineIndex(kInvalidIndex) {}

	} // end namespace format
	} // end namespace clang

	#endif
	Index: head/contrib/llvm/tools/clang/lib/Headers/avx512bwintrin.h
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Headers/avx512bwintrin.h (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/Headers/avx512bwintrin.h (revision 329410)
	@@ -1,2138 +1,2140 @@
	/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------===
	*
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*
	*===-----------------------------------------------------------------------===
	*/
	#ifndef __IMMINTRIN_H
	#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
	#endif

	#ifndef __AVX512BWINTRIN_H
	#define __AVX512BWINTRIN_H

	/* Mask types used throughout this header: __mmask32 is the mask operand of
	 * the __v32hi operations below, __mmask64 that of the __v64qi operations. */
	typedef unsigned int __mmask32;
	typedef unsigned long long __mmask64;

	/* Define the default attributes for the functions in this file: always
	 * inlined, no debug info, and compiled for the "avx512bw" target feature. */
	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))

	/* Returns a __m512i built from a __v64qi compound literal whose 64
	 * initializers are all 0. Used below as the zero operand of the byte
	 * (__v64qi) operations. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_setzero_qi(void) {
	return (__m512i)(__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0 };
	}

	/* Returns a __m512i built from a __v32hi compound literal whose 32
	 * initializers are all 0. Used below as the zero operand of the word
	 * (__v32hi) operations. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_setzero_hi(void) {
	return (__m512i)(__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0 };
	}

	/* Integer compare */

	/* Generic compare macros. Each expands to a statement expression that calls
	 * one compare builtin: cmpb/ucmpb on __v64qi operands (result cast to
	 * __mmask64) or cmpw/ucmpw on __v32hi operands (result cast to __mmask32).
	 * The predicate p is cast to int. The plain forms pass an all-ones mask as
	 * the last builtin argument; the _mask_ forms pass m instead. Every macro
	 * argument appears exactly once in the expansion. */
	#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \
	(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
	(__v64qi)(__m512i)(b), (int)(p), \
	(__mmask64)-1); })

	#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \
	(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
	(__v64qi)(__m512i)(b), (int)(p), \
	(__mmask64)(m)); })

	#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \
	(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
	(__v64qi)(__m512i)(b), (int)(p), \
	(__mmask64)-1); })

	#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \
	(__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
	(__v64qi)(__m512i)(b), (int)(p), \
	(__mmask64)(m)); })

	#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \
	(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
	(__v32hi)(__m512i)(b), (int)(p), \
	(__mmask32)-1); })

	#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \
	(__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
	(__v32hi)(__m512i)(b), (int)(p), \
	(__mmask32)(m)); })

	#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \
	(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
	(__v32hi)(__m512i)(b), (int)(p), \
	(__mmask32)-1); })

	#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \
	(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
	(__v32hi)(__m512i)(b), (int)(p), \
	(__mmask32)(m)); })

	/* Predicate-specific shorthands for the epi8 compares: each forwards to
	 * _mm512_cmp_epi8_mask (or _mm512_mask_cmp_epi8_mask when a mask k is given)
	 * with a fixed _MM_CMPINT_* predicate: EQ, GE, GT, LE, LT or NE. */
	#define _mm512_cmpeq_epi8_mask(A, B) \
	_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
	#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \
	_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
	#define _mm512_cmpge_epi8_mask(A, B) \
	_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
	#define _mm512_mask_cmpge_epi8_mask(k, A, B) \
	_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
	#define _mm512_cmpgt_epi8_mask(A, B) \
	_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
	#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \
	_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
	#define _mm512_cmple_epi8_mask(A, B) \
	_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
	#define _mm512_mask_cmple_epi8_mask(k, A, B) \
	_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
	#define _mm512_cmplt_epi8_mask(A, B) \
	_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
	#define _mm512_mask_cmplt_epi8_mask(k, A, B) \
	_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
	#define _mm512_cmpneq_epi8_mask(A, B) \
	_mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
	#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \
	_mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)

	/* Predicate-specific shorthands for the epu8 compares: each forwards to
	 * _mm512_cmp_epu8_mask (or _mm512_mask_cmp_epu8_mask when a mask k is given)
	 * with a fixed _MM_CMPINT_* predicate: EQ, GE, GT, LE, LT or NE. */
	#define _mm512_cmpeq_epu8_mask(A, B) \
	_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
	#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \
	_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
	#define _mm512_cmpge_epu8_mask(A, B) \
	_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
	#define _mm512_mask_cmpge_epu8_mask(k, A, B) \
	_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
	#define _mm512_cmpgt_epu8_mask(A, B) \
	_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
	#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \
	_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
	#define _mm512_cmple_epu8_mask(A, B) \
	_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
	#define _mm512_mask_cmple_epu8_mask(k, A, B) \
	_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
	#define _mm512_cmplt_epu8_mask(A, B) \
	_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
	#define _mm512_mask_cmplt_epu8_mask(k, A, B) \
	_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
	#define _mm512_cmpneq_epu8_mask(A, B) \
	_mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
	#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \
	_mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)

	/* Predicate-specific shorthands for the epi16 compares: each forwards to
	 * _mm512_cmp_epi16_mask (or _mm512_mask_cmp_epi16_mask when a mask k is
	 * given) with a fixed _MM_CMPINT_* predicate: EQ, GE, GT, LE, LT or NE. */
	#define _mm512_cmpeq_epi16_mask(A, B) \
	_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
	#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \
	_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
	#define _mm512_cmpge_epi16_mask(A, B) \
	_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
	#define _mm512_mask_cmpge_epi16_mask(k, A, B) \
	_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
	#define _mm512_cmpgt_epi16_mask(A, B) \
	_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
	#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \
	_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
	#define _mm512_cmple_epi16_mask(A, B) \
	_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
	#define _mm512_mask_cmple_epi16_mask(k, A, B) \
	_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
	#define _mm512_cmplt_epi16_mask(A, B) \
	_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
	#define _mm512_mask_cmplt_epi16_mask(k, A, B) \
	_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
	#define _mm512_cmpneq_epi16_mask(A, B) \
	_mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
	#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \
	_mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)

	/* Predicate-specific shorthands for the epu16 compares: each forwards to
	 * _mm512_cmp_epu16_mask (or _mm512_mask_cmp_epu16_mask when a mask k is
	 * given) with a fixed _MM_CMPINT_* predicate: EQ, GE, GT, LE, LT or NE. */
	#define _mm512_cmpeq_epu16_mask(A, B) \
	_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
	#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \
	_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
	#define _mm512_cmpge_epu16_mask(A, B) \
	_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
	#define _mm512_mask_cmpge_epu16_mask(k, A, B) \
	_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
	#define _mm512_cmpgt_epu16_mask(A, B) \
	_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
	#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \
	_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
	#define _mm512_cmple_epu16_mask(A, B) \
	_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
	#define _mm512_mask_cmple_epu16_mask(k, A, B) \
	_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
	#define _mm512_cmplt_epu16_mask(A, B) \
	_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
	#define _mm512_mask_cmplt_epu16_mask(k, A, B) \
	_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
	#define _mm512_cmpneq_epu16_mask(A, B) \
	_mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
	#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \
	_mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)

	/* Lane-wise sum of __A and __B, computed on their __v64qu views. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_add_epi8 (__m512i __A, __m512i __B) {
	  __v64qu __lhs = (__v64qu) __A;
	  __v64qu __rhs = (__v64qu) __B;
	  return (__m512i) (__lhs + __rhs);
	}

	/* Masked add: the _mm512_add_epi8 result and __W are passed to the byte
	 * select builtin, which combines them under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
	  __v64qi __sum = (__v64qi)_mm512_add_epi8(__A, __B);
	  __v64qi __src = (__v64qi)__W;
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __sum, __src);
	}

	/* Zero-masked add: the _mm512_add_epi8 result and an all-zero vector are
	 * passed to the byte select builtin, which combines them under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
	  __v64qi __sum = (__v64qi)_mm512_add_epi8(__A, __B);
	  __v64qi __zero = (__v64qi)_mm512_setzero_qi();
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __sum, __zero);
	}

	/* Lane-wise difference __A - __B, computed on their __v64qu views. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sub_epi8 (__m512i __A, __m512i __B) {
	  __v64qu __lhs = (__v64qu) __A;
	  __v64qu __rhs = (__v64qu) __B;
	  return (__m512i) (__lhs - __rhs);
	}

	/* Masked subtract: the _mm512_sub_epi8 result and __W are passed to the
	 * byte select builtin, which combines them under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
	  __v64qi __diff = (__v64qi)_mm512_sub_epi8(__A, __B);
	  __v64qi __src = (__v64qi)__W;
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __diff, __src);
	}

	/* Zero-masked subtract: the _mm512_sub_epi8 result and an all-zero vector
	 * are passed to the byte select builtin under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
	  __v64qi __diff = (__v64qi)_mm512_sub_epi8(__A, __B);
	  __v64qi __zero = (__v64qi)_mm512_setzero_qi();
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __diff, __zero);
	}

	/* Lane-wise sum of __A and __B, computed on their __v32hu views. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_add_epi16 (__m512i __A, __m512i __B) {
	  __v32hu __lhs = (__v32hu) __A;
	  __v32hu __rhs = (__v32hu) __B;
	  return (__m512i) (__lhs + __rhs);
	}

	/* Masked add: the _mm512_add_epi16 result and __W are passed to the word
	 * select builtin, which combines them under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
	  __v32hi __sum = (__v32hi)_mm512_add_epi16(__A, __B);
	  __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __sum, __src);
	}

	/* Zero-masked add: the _mm512_add_epi16 result and an all-zero vector are
	 * passed to the word select builtin under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
	  __v32hi __sum = (__v32hi)_mm512_add_epi16(__A, __B);
	  __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __sum, __zero);
	}

	/* Lane-wise difference __A - __B, computed on their __v32hu views. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sub_epi16 (__m512i __A, __m512i __B) {
	  __v32hu __lhs = (__v32hu) __A;
	  __v32hu __rhs = (__v32hu) __B;
	  return (__m512i) (__lhs - __rhs);
	}

	/* Masked subtract: the _mm512_sub_epi16 result and __W are passed to the
	 * word select builtin, which combines them under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
	  __v32hi __diff = (__v32hi)_mm512_sub_epi16(__A, __B);
	  __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __diff, __src);
	}

	/* Zero-masked subtract: the _mm512_sub_epi16 result and an all-zero vector
	 * are passed to the word select builtin under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
	  __v32hi __diff = (__v32hi)_mm512_sub_epi16(__A, __B);
	  __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __diff, __zero);
	}

	/* Lane-wise product of __A and __B, computed on their __v32hu views; the
	 * product keeps the lane type of the operands. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mullo_epi16 (__m512i __A, __m512i __B) {
	  __v32hu __lhs = (__v32hu) __A;
	  __v32hu __rhs = (__v32hu) __B;
	  return (__m512i) (__lhs * __rhs);
	}

	/* Masked multiply: the _mm512_mullo_epi16 result and __W are passed to the
	 * word select builtin, which combines them under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
	  __v32hi __prod = (__v32hi)_mm512_mullo_epi16(__A, __B);
	  __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __prod, __src);
	}

	/* Zero-masked multiply: the _mm512_mullo_epi16 result and an all-zero
	 * vector are passed to the word select builtin under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
	  __v32hi __prod = (__v32hi)_mm512_mullo_epi16(__A, __B);
	  __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __prod, __zero);
	}

	/* Byte blend: passes __W and then __A (in that order) to the byte select
	 * builtin together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
	{
	  __v64qi __first = (__v64qi) __W;
	  __v64qi __second = (__v64qi) __A;
	  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, __first, __second);
	}

	/* Word blend: passes __W and then __A (in that order) to the word select
	 * builtin together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
	{
	  __v32hi __first = (__v32hi) __W;
	  __v32hi __second = (__v32hi) __A;
	  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, __first, __second);
	}

	/* Unmasked form: calls __builtin_ia32_pabsb512_mask on __A with a zero
	 * vector as second operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_abs_epi8 (__m512i __A)
	{
	  __v64qi __val = (__v64qi) __A;
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_pabsb512_mask (__val, __zero, (__mmask64) -1);
	}

	/* Masked form: calls __builtin_ia32_pabsb512_mask on __A with __W as second
	 * operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
	{
	  __v64qi __val = (__v64qi) __A;
	  __v64qi __src = (__v64qi) __W;
	  return (__m512i) __builtin_ia32_pabsb512_mask (__val, __src, (__mmask64) __U);
	}

	/* Zero-masked form: calls __builtin_ia32_pabsb512_mask on __A with a zero
	 * vector as second operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
	{
	  __v64qi __val = (__v64qi) __A;
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_pabsb512_mask (__val, __zero, (__mmask64) __U);
	}

	/* Unmasked form: calls __builtin_ia32_pabsw512_mask on __A with a zero
	 * vector as second operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_abs_epi16 (__m512i __A)
	{
	  __v32hi __val = (__v32hi) __A;
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_pabsw512_mask (__val, __zero, (__mmask32) -1);
	}

	/* Masked form: calls __builtin_ia32_pabsw512_mask on __A with __W as second
	 * operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
	{
	  __v32hi __val = (__v32hi) __A;
	  __v32hi __src = (__v32hi) __W;
	  return (__m512i) __builtin_ia32_pabsw512_mask (__val, __src, (__mmask32) __U);
	}

	/* Zero-masked form: calls __builtin_ia32_pabsw512_mask on __A with a zero
	 * vector as second operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
	{
	  __v32hi __val = (__v32hi) __A;
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_pabsw512_mask (__val, __zero, (__mmask32) __U);
	}

	/* Forwards the __v16si views of __A and __B to __builtin_ia32_packssdw512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_packs_epi32(__m512i __A, __m512i __B)
	{
	  __v16si __lhs = (__v16si)__A;
	  __v16si __rhs = (__v16si)__B;
	  return (__m512i)__builtin_ia32_packssdw512(__lhs, __rhs);
	}

	/* Zero-masked pack: the _mm512_packs_epi32 result and an all-zero vector
	 * are passed to the word select builtin under mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
	{
	  __v32hi __packed = (__v32hi)_mm512_packs_epi32(__A, __B);
	  __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, __packed, __zero);
	}

	/* Masked pack: the _mm512_packs_epi32 result and __W are passed to the word
	 * select builtin under mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
	{
	  __v32hi __packed = (__v32hi)_mm512_packs_epi32(__A, __B);
	  __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, __packed, __src);
	}

	/* Forwards the __v32hi views of __A and __B to __builtin_ia32_packsswb512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_packs_epi16(__m512i __A, __m512i __B)
	{
	  __v32hi __lhs = (__v32hi)__A;
	  __v32hi __rhs = (__v32hi) __B;
	  return (__m512i)__builtin_ia32_packsswb512(__lhs, __rhs);
	}

	/* Masked pack: the _mm512_packs_epi16 result and __W are passed to the byte
	 * select builtin under mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
	{
	  __v64qi __packed = (__v64qi)_mm512_packs_epi16(__A, __B);
	  __v64qi __src = (__v64qi)__W;
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, __packed, __src);
	}

	/* Zero-masked pack: the _mm512_packs_epi16 result and an all-zero vector
	 * are passed to the byte select builtin under mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
	{
	  __v64qi __packed = (__v64qi)_mm512_packs_epi16(__A, __B);
	  __v64qi __zero = (__v64qi)_mm512_setzero_qi();
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, __packed, __zero);
	}

	/* Forwards the __v16si views of __A and __B to __builtin_ia32_packusdw512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_packus_epi32(__m512i __A, __m512i __B)
	{
	  __v16si __lhs = (__v16si) __A;
	  __v16si __rhs = (__v16si) __B;
	  return (__m512i)__builtin_ia32_packusdw512(__lhs, __rhs);
	}

	/* Zero-masked pack: the _mm512_packus_epi32 result and an all-zero vector
	 * are passed to the word select builtin under mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
	{
	  __v32hi __packed = (__v32hi)_mm512_packus_epi32(__A, __B);
	  __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, __packed, __zero);
	}

	/* Masked pack: the _mm512_packus_epi32 result and __W are passed to the
	 * word select builtin under mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
	{
	  __v32hi __packed = (__v32hi)_mm512_packus_epi32(__A, __B);
	  __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, __packed, __src);
	}

	/* Forwards the __v32hi views of __A and __B to __builtin_ia32_packuswb512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_packus_epi16(__m512i __A, __m512i __B)
	{
	  __v32hi __lhs = (__v32hi) __A;
	  __v32hi __rhs = (__v32hi) __B;
	  return (__m512i)__builtin_ia32_packuswb512(__lhs, __rhs);
	}

	/* Masked pack: the _mm512_packus_epi16 result and __W are passed to the
	 * byte select builtin under mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
	{
	  __v64qi __packed = (__v64qi)_mm512_packus_epi16(__A, __B);
	  __v64qi __src = (__v64qi)__W;
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, __packed, __src);
	}

	/* Zero-masked pack: the _mm512_packus_epi16 result and an all-zero vector
	 * are passed to the byte select builtin under mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
	{
	  __v64qi __packed = (__v64qi)_mm512_packus_epi16(__A, __B);
	  __v64qi __zero = (__v64qi)_mm512_setzero_qi();
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, __packed, __zero);
	}

	/* Unmasked form: calls __builtin_ia32_paddsb512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_adds_epi8 (__m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __zero, (__mmask64) -1);
	}

	/* Masked form: calls __builtin_ia32_paddsb512_mask on __A and __B with __W
	 * as third operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
	__m512i __B)
	{
	  __v64qi __src = (__v64qi) __W;
	  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __src, (__mmask64) __U);
	}

	/* Zero-masked form: calls __builtin_ia32_paddsb512_mask on __A and __B with
	 * a zero vector as third operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __zero, (__mmask64) __U);
	}

	/* Unmasked form: calls __builtin_ia32_paddsw512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_adds_epi16 (__m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __zero, (__mmask32) -1);
	}

	/* Masked form: calls __builtin_ia32_paddsw512_mask on __A and __B with __W
	 * as third operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
	__m512i __B)
	{
	  __v32hi __src = (__v32hi) __W;
	  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __src, (__mmask32) __U);
	}

	/* Zero-masked form: calls __builtin_ia32_paddsw512_mask on __A and __B with
	 * a zero vector as third operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __zero, (__mmask32) __U);
	}

	/* Unmasked form: calls __builtin_ia32_paddusb512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_adds_epu8 (__m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                   __zero, (__mmask64) -1);
	}

	/* Masked form: calls __builtin_ia32_paddusb512_mask on __A and __B with __W
	 * as third operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
	__m512i __B)
	{
	  __v64qi __src = (__v64qi) __W;
	  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                   __src, (__mmask64) __U);
	}

	/* Zero-masked form: calls __builtin_ia32_paddusb512_mask on __A and __B
	 * with a zero vector as third operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                   __zero, (__mmask64) __U);
	}

	/* Unmasked form: calls __builtin_ia32_paddusw512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_adds_epu16 (__m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                   __zero, (__mmask32) -1);
	}

	/* Masked form: calls __builtin_ia32_paddusw512_mask on __A and __B with __W
	 * as third operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
	__m512i __B)
	{
	  __v32hi __src = (__v32hi) __W;
	  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                   __src, (__mmask32) __U);
	}

	/* Zero-masked form: calls __builtin_ia32_paddusw512_mask on __A and __B
	 * with a zero vector as third operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                   __zero, (__mmask32) __U);
	}

	/* Computes (a + b + 1) >> 1 per lane for the __v64qu views of __A and __B.
	 * Both operands are first converted to a local vector type with unsigned
	 * short lanes (128 bytes in total), so the sum and the +1 are evaluated in
	 * the wider lane type before the result is converted back to __v64qu. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_avg_epu8 (__m512i __A, __m512i __B)
	{
	/* Widened lane type used only for the intermediate sum. */
	typedef unsigned short __v64hu __attribute__((__vector_size__(128)));
	return (__m512i)__builtin_convertvector(
	((__builtin_convertvector((__v64qu) __A, __v64hu) +
	__builtin_convertvector((__v64qu) __B, __v64hu)) + 1)
	>> 1, __v64qu);
	}

	/* Masked average: the _mm512_avg_epu8 result and __W are passed to the byte
	 * select builtin under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
	__m512i __B)
	{
	  __v64qi __avg = (__v64qi)_mm512_avg_epu8(__A, __B);
	  __v64qi __src = (__v64qi)__W;
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __avg, __src);
	}

	/* Zero-masked average: the _mm512_avg_epu8 result and an all-zero vector
	 * are passed to the byte select builtin under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
	{
	  __v64qi __avg = (__v64qi)_mm512_avg_epu8(__A, __B);
	  __v64qi __zero = (__v64qi)_mm512_setzero_qi();
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __avg, __zero);
	}

	/* Computes (a + b + 1) >> 1 per lane for the __v32hu views of __A and __B.
	 * Both operands are first converted to a local vector type with unsigned
	 * int lanes (128 bytes in total), so the sum and the +1 are evaluated in
	 * the wider lane type before the result is converted back to __v32hu. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_avg_epu16 (__m512i __A, __m512i __B)
	{
	/* Widened lane type used only for the intermediate sum. */
	typedef unsigned int __v32su __attribute__((__vector_size__(128)));
	return (__m512i)__builtin_convertvector(
	((__builtin_convertvector((__v32hu) __A, __v32su) +
	__builtin_convertvector((__v32hu) __B, __v32su)) + 1)
	>> 1, __v32hu);
	}

	/* Masked average: the _mm512_avg_epu16 result and __W are passed to the
	 * word select builtin under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
	__m512i __B)
	{
	  __v32hi __avg = (__v32hi)_mm512_avg_epu16(__A, __B);
	  __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __avg, __src);
	}

	/* Zero-masked average: the _mm512_avg_epu16 result and an all-zero vector
	 * are passed to the word select builtin under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
	{
	  __v32hi __avg = (__v32hi)_mm512_avg_epu16(__A, __B);
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __avg, __zero);
	}

	/* Unmasked form: calls __builtin_ia32_pmaxsb512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epi8 (__m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __zero, (__mmask64) -1);
	}

	/* Zero-masked form: calls __builtin_ia32_pmaxsb512_mask on __A and __B with
	 * a zero vector as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __zero, (__mmask64) __M);
	}

	/* Masked form: calls __builtin_ia32_pmaxsb512_mask on __A and __B with __W
	 * as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
	__m512i __B)
	{
	  __v64qi __src = (__v64qi) __W;
	  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __src, (__mmask64) __M);
	}

	/* Unmasked form: calls __builtin_ia32_pmaxsw512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epi16 (__m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __zero, (__mmask32) -1);
	}

	/* Zero-masked form: calls __builtin_ia32_pmaxsw512_mask on __A and __B with
	 * a zero vector as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __zero, (__mmask32) __M);
	}

	/* Masked form: calls __builtin_ia32_pmaxsw512_mask on __A and __B with __W
	 * as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
	__m512i __B)
	{
	  __v32hi __src = (__v32hi) __W;
	  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __src, (__mmask32) __M);
	}

	/* Unmasked form: calls __builtin_ia32_pmaxub512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epu8 (__m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __zero, (__mmask64) -1);
	}

	/* Zero-masked form: calls __builtin_ia32_pmaxub512_mask on __A and __B with
	 * a zero vector as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __zero, (__mmask64) __M);
	}

	/* Masked form: calls __builtin_ia32_pmaxub512_mask on __A and __B with __W
	 * as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
	__m512i __B)
	{
	  __v64qi __src = (__v64qi) __W;
	  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __src, (__mmask64) __M);
	}

	/* Unmasked form: calls __builtin_ia32_pmaxuw512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epu16 (__m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __zero, (__mmask32) -1);
	}

	/* Zero-masked form: calls __builtin_ia32_pmaxuw512_mask on __A and __B with
	 * a zero vector as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __zero, (__mmask32) __M);
	}

	/* Masked form: calls __builtin_ia32_pmaxuw512_mask on __A and __B with __W
	 * as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
	__m512i __B)
	{
	  __v32hi __src = (__v32hi) __W;
	  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __src, (__mmask32) __M);
	}

	/* Unmasked form: calls __builtin_ia32_pminsb512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epi8 (__m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __zero, (__mmask64) -1);
	}

	/* Zero-masked form: calls __builtin_ia32_pminsb512_mask on __A and __B with
	 * a zero vector as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __zero, (__mmask64) __M);
	}

	/* Masked form: calls __builtin_ia32_pminsb512_mask on __A and __B with __W
	 * as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
	__m512i __B)
	{
	  __v64qi __src = (__v64qi) __W;
	  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __src, (__mmask64) __M);
	}

	/* Unmasked form: calls __builtin_ia32_pminsw512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epi16 (__m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __zero, (__mmask32) -1);
	}

	/* Zero-masked form: calls __builtin_ia32_pminsw512_mask on __A and __B with
	 * a zero vector as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __zero, (__mmask32) __M);
	}

	/* Masked form: calls __builtin_ia32_pminsw512_mask on __A and __B with __W
	 * as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
	__m512i __B)
	{
	  __v32hi __src = (__v32hi) __W;
	  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __src, (__mmask32) __M);
	}

	/* Unmasked form: calls __builtin_ia32_pminub512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epu8 (__m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __zero, (__mmask64) -1);
	}

	/* Zero-masked form: calls __builtin_ia32_pminub512_mask on __A and __B with
	 * a zero vector as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __zero, (__mmask64) __M);
	}

	/* Masked form: calls __builtin_ia32_pminub512_mask on __A and __B with __W
	 * as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
	__m512i __B)
	{
	  __v64qi __src = (__v64qi) __W;
	  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __src, (__mmask64) __M);
	}

	/* Unmasked form: calls __builtin_ia32_pminuw512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epu16 (__m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __zero, (__mmask32) -1);
	}

	/* Zero-masked form: calls __builtin_ia32_pminuw512_mask on __A and __B with
	 * a zero vector as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __zero, (__mmask32) __M);
	}

	/* Masked form: calls __builtin_ia32_pminuw512_mask on __A and __B with __W
	 * as third operand and __M as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
	__m512i __B)
	{
	  __v32hi __src = (__v32hi) __W;
	  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, (__v32hi) __B,
	                                                  __src, (__mmask32) __M);
	}

	/* Forwards the __v64qi views of __A and __B to __builtin_ia32_pshufb512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_shuffle_epi8(__m512i __A, __m512i __B)
	{
	  __v64qi __data = (__v64qi)__A;
	  __v64qi __ctrl = (__v64qi)__B;
	  return (__m512i)__builtin_ia32_pshufb512(__data, __ctrl);
	}

	/* Masked shuffle: the _mm512_shuffle_epi8 result and __W are passed to the
	 * byte select builtin under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
	{
	  __v64qi __shuf = (__v64qi)_mm512_shuffle_epi8(__A, __B);
	  __v64qi __src = (__v64qi)__W;
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __shuf, __src);
	}

	/* Zero-masked shuffle: the _mm512_shuffle_epi8 result and an all-zero
	 * vector are passed to the byte select builtin under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
	{
	  __v64qi __shuf = (__v64qi)_mm512_shuffle_epi8(__A, __B);
	  __v64qi __zero = (__v64qi)_mm512_setzero_qi();
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __shuf, __zero);
	}

	/* Unmasked form: calls __builtin_ia32_psubsb512_mask on __A and __B with a
	 * zero vector as third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_subs_epi8 (__m512i __A, __m512i __B)
	{
	  __v64qi __zero = (__v64qi) _mm512_setzero_qi();
	  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __zero, (__mmask64) -1);
	}

	/* Masked form: calls __builtin_ia32_psubsb512_mask on __A and __B with __W
	 * as third operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
	__m512i __B)
	{
	  __v64qi __src = (__v64qi) __W;
	  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, (__v64qi) __B,
	                                                  __src, (__mmask64) __U);
	}

	/* Zero-source variant: psubsb512 masked builtin on __A and __B with a zero
	   vector as third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_subs_epi8(__mmask64 __U, __m512i __A, __m512i __B)
	{
	  const __v64qi __lhs = (__v64qi)__A;
	  const __v64qi __rhs = (__v64qi)__B;
	  const __v64qi __zero = (__v64qi)_mm512_setzero_qi();
	  return (__m512i)__builtin_ia32_psubsb512_mask(__lhs, __rhs, __zero,
	                                                (__mmask64)__U);
	}

	/* Calls the psubsw512 masked builtin on __A and __B (viewed as __v32hi)
	   with a zero vector as third operand and an all-ones (-1) mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_subs_epi16(__m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_psubsw512_mask(__lhs, __rhs, __zero,
	                                                (__mmask32)-1);
	}

	/* Masked variant: psubsw512 masked builtin on __A and __B with __W as the
	   third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_subs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_psubsw512_mask(__lhs, __rhs, __src,
	                                                (__mmask32)__U);
	}

	/* Zero-source variant: psubsw512 masked builtin on __A and __B with a zero
	   vector as third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_subs_epi16(__mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_psubsw512_mask(__lhs, __rhs, __zero,
	                                                (__mmask32)__U);
	}

	/* Calls the psubusb512 masked builtin on __A and __B (viewed as __v64qi)
	   with a zero vector as third operand and an all-ones (-1) mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_subs_epu8(__m512i __A, __m512i __B)
	{
	  const __v64qi __lhs = (__v64qi)__A;
	  const __v64qi __rhs = (__v64qi)__B;
	  const __v64qi __zero = (__v64qi)_mm512_setzero_qi();
	  return (__m512i)__builtin_ia32_psubusb512_mask(__lhs, __rhs, __zero,
	                                                 (__mmask64)-1);
	}

	/* Masked variant: psubusb512 masked builtin on __A and __B with __W as the
	   third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_subs_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
	{
	  const __v64qi __lhs = (__v64qi)__A;
	  const __v64qi __rhs = (__v64qi)__B;
	  const __v64qi __src = (__v64qi)__W;
	  return (__m512i)__builtin_ia32_psubusb512_mask(__lhs, __rhs, __src,
	                                                 (__mmask64)__U);
	}

	/* Zero-source variant: psubusb512 masked builtin on __A and __B with a zero
	   vector as third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_subs_epu8(__mmask64 __U, __m512i __A, __m512i __B)
	{
	  const __v64qi __lhs = (__v64qi)__A;
	  const __v64qi __rhs = (__v64qi)__B;
	  const __v64qi __zero = (__v64qi)_mm512_setzero_qi();
	  return (__m512i)__builtin_ia32_psubusb512_mask(__lhs, __rhs, __zero,
	                                                 (__mmask64)__U);
	}

	/* Calls the psubusw512 masked builtin on __A and __B (viewed as __v32hi)
	   with a zero vector as third operand and an all-ones (-1) mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_subs_epu16(__m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_psubusw512_mask(__lhs, __rhs, __zero,
	                                                 (__mmask32)-1);
	}

	/* Masked variant: psubusw512 masked builtin on __A and __B with __W as the
	   third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_subs_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_psubusw512_mask(__lhs, __rhs, __src,
	                                                 (__mmask32)__U);
	}

	/* Zero-source variant: psubusw512 masked builtin on __A and __B with a zero
	   vector as third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_subs_epu16(__mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_psubusw512_mask(__lhs, __rhs, __zero,
	                                                 (__mmask32)__U);
	}

	/* Wraps the vpermi2varhi512 masked builtin.  Operand order handed to the
	   builtin is (__A, __I, __B, __U), with __I being the index vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U,
	                                __m512i __B)
	{
	  const __v32hi __first = (__v32hi)__A;
	  const __v32hi __idx = (__v32hi)__I;
	  const __v32hi __second = (__v32hi)__B;
	  return (__m512i)__builtin_ia32_vpermi2varhi512_mask(__first, __idx, __second,
	                                                      (__mmask32)__U);
	}

	/* Wraps the vpermt2varhi512 masked builtin with an all-ones (-1) mask.  The
	   index vector __I is passed first, followed by __A and __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B)
	{
	  const __v32hi __idx = (__v32hi)__I;
	  const __v32hi __first = (__v32hi)__A;
	  const __v32hi __second = (__v32hi)__B;
	  return (__m512i)__builtin_ia32_vpermt2varhi512_mask(__idx, __first, __second,
	                                                      (__mmask32)-1);
	}

	/* Wraps the vpermt2varhi512 masked builtin with mask __U.  The index vector
	   __I is passed first, followed by __A and __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I,
	                               __m512i __B)
	{
	  const __v32hi __idx = (__v32hi)__I;
	  const __v32hi __first = (__v32hi)__A;
	  const __v32hi __second = (__v32hi)__B;
	  return (__m512i)__builtin_ia32_vpermt2varhi512_mask(__idx, __first, __second,
	                                                      (__mmask32)__U);
	}

	/* Wraps the vpermt2varhi512 "maskz" builtin with mask __U.  The index vector
	   __I is passed first, followed by __A and __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I,
	                                __m512i __B)
	{
	  const __v32hi __idx = (__v32hi)__I;
	  const __v32hi __first = (__v32hi)__A;
	  const __v32hi __second = (__v32hi)__B;
	  return (__m512i)__builtin_ia32_vpermt2varhi512_maskz(__idx, __first,
	                                                       __second,
	                                                       (__mmask32)__U);
	}

	/* Calls the pmulhrsw512 masked builtin on __A and __B (viewed as __v32hi)
	   with a zero vector as third operand and an all-ones (-1) mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mulhrs_epi16(__m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_pmulhrsw512_mask(__lhs, __rhs, __zero,
	                                                  (__mmask32)-1);
	}

	/* Masked variant: pmulhrsw512 masked builtin on __A and __B with __W as the
	   third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_pmulhrsw512_mask(__lhs, __rhs, __src,
	                                                  (__mmask32)__U);
	}

	/* Zero-source variant: pmulhrsw512 masked builtin on __A and __B with a zero
	   vector as third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_pmulhrsw512_mask(__lhs, __rhs, __zero,
	                                                  (__mmask32)__U);
	}

	/* Calls the pmulhw512 masked builtin on __A and __B (viewed as __v32hi)
	   with a zero vector as third operand and an all-ones (-1) mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mulhi_epi16(__m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_pmulhw512_mask(__lhs, __rhs, __zero,
	                                                (__mmask32)-1);
	}

	/* Masked variant: pmulhw512 masked builtin on __A and __B with __W as the
	   third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_pmulhw512_mask(__lhs, __rhs, __src,
	                                                (__mmask32)__U);
	}

	/* Zero-source variant: pmulhw512 masked builtin on __A and __B with a zero
	   vector as third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_pmulhw512_mask(__lhs, __rhs, __zero,
	                                                (__mmask32)__U);
	}

	/* Calls the pmulhuw512 masked builtin on __A and __B (viewed as __v32hi)
	   with a zero vector as third operand and an all-ones (-1) mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mulhi_epu16(__m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_pmulhuw512_mask(__lhs, __rhs, __zero,
	                                                 (__mmask32)-1);
	}

	/* Masked variant: pmulhuw512 masked builtin on __A and __B with __W as the
	   third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_pmulhuw512_mask(__lhs, __rhs, __src,
	                                                 (__mmask32)__U);
	}

	/* Zero-source variant: pmulhuw512 masked builtin on __A and __B with a zero
	   vector as third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_pmulhuw512_mask(__lhs, __rhs, __zero,
	                                                 (__mmask32)__U);
	}

	/* Calls the pmaddubsw512 masked builtin: __X and __Y go in as __v64qi, the
	   third operand is a __v32hi zero vector, and the mask is all ones (-1). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maddubs_epi16(__m512i __X, __m512i __Y)
	{
	  const __v64qi __lhs = (__v64qi)__X;
	  const __v64qi __rhs = (__v64qi)__Y;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_pmaddubsw512_mask(__lhs, __rhs, __zero,
	                                                   (__mmask32)-1);
	}

	/* Masked variant: pmaddubsw512 masked builtin on __X and __Y (as __v64qi)
	   with __W (as __v32hi) as the third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, __m512i __Y)
	{
	  const __v64qi __lhs = (__v64qi)__X;
	  const __v64qi __rhs = (__v64qi)__Y;
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_pmaddubsw512_mask(__lhs, __rhs, __src,
	                                                   (__mmask32)__U);
	}

	/* Zero-source variant: pmaddubsw512 masked builtin on __X and __Y (as
	   __v64qi) with a __v32hi zero vector as third operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y)
	{
	  const __v64qi __lhs = (__v64qi)__X;
	  const __v64qi __rhs = (__v64qi)__Y;
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_pmaddubsw512_mask(__lhs, __rhs, __zero,
	                                                   (__mmask32)__U);
	}

	/* Calls the pmaddwd512 masked builtin: __A and __B go in as __v32hi, the
	   third operand is a __v16si zero vector, and the 16-bit mask is -1. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_madd_epi16(__m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaddwd512_mask(__lhs, __rhs, __zero,
	                                                 (__mmask16)-1);
	}

	/* Masked variant: pmaddwd512 masked builtin on __A and __B (as __v32hi)
	   with __W (as __v16si) as the third operand and __U as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v16si __src = (__v16si)__W;
	  return (__m512i)__builtin_ia32_pmaddwd512_mask(__lhs, __rhs, __src,
	                                                 (__mmask16)__U);
	}

	/* Zero-source variant: pmaddwd512 masked builtin on __A and __B (as
	   __v32hi) with a __v16si zero vector as third operand and __U as mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __lhs = (__v32hi)__A;
	  const __v32hi __rhs = (__v32hi)__B;
	  const __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaddwd512_mask(__lhs, __rhs, __zero,
	                                                 (__mmask16)__U);
	}

	/* 512-bit to 256-bit conversion through the pmovswb512 masked builtin, with
	   a zero __v32qi vector as second operand and an all-ones (-1) mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtsepi16_epi8(__m512i __A)
	{
	  const __v32hi __wide = (__v32hi)__A;
	  const __v32qi __zero = (__v32qi)_mm256_setzero_si256();
	  return (__m256i)__builtin_ia32_pmovswb512_mask(__wide, __zero,
	                                                 (__mmask32)-1);
	}

	/* Masked variant: pmovswb512 masked builtin on __A with __O as the second
	   operand and __M as the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A)
	{
	  const __v32hi __wide = (__v32hi)__A;
	  const __v32qi __src = (__v32qi)__O;
	  return (__m256i)__builtin_ia32_pmovswb512_mask(__wide, __src, __M);
	}

	/* Zero-source variant: pmovswb512 masked builtin on __A with a zero vector
	   as the second operand and __M as the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtsepi16_epi8(__mmask32 __M, __m512i __A)
	{
	  const __v32hi __wide = (__v32hi)__A;
	  const __v32qi __zero = (__v32qi)_mm256_setzero_si256();
	  return (__m256i)__builtin_ia32_pmovswb512_mask(__wide, __zero, __M);
	}

	/* 512-bit to 256-bit conversion through the pmovuswb512 masked builtin,
	   with a zero __v32qi vector as second operand and an all-ones (-1) mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtusepi16_epi8(__m512i __A)
	{
	  const __v32hi __wide = (__v32hi)__A;
	  const __v32qi __zero = (__v32qi)_mm256_setzero_si256();
	  return (__m256i)__builtin_ia32_pmovuswb512_mask(__wide, __zero,
	                                                  (__mmask32)-1);
	}

	/* Masked variant: pmovuswb512 masked builtin on __A with __O as the second
	   operand and __M as the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A)
	{
	  const __v32hi __wide = (__v32hi)__A;
	  const __v32qi __src = (__v32qi)__O;
	  return (__m256i)__builtin_ia32_pmovuswb512_mask(__wide, __src, __M);
	}

	/* Zero-source variant: pmovuswb512 masked builtin on __A with a zero vector
	   as the second operand and __M as the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtusepi16_epi8(__mmask32 __M, __m512i __A)
	{
	  const __v32hi __wide = (__v32hi)__A;
	  const __v32qi __zero = (__v32qi)_mm256_setzero_si256();
	  return (__m256i)__builtin_ia32_pmovuswb512_mask(__wide, __zero, __M);
	}

	/* 512-bit to 256-bit conversion through the pmovwb512 masked builtin, with
	   a zero __v32qi vector as second operand and an all-ones (-1) mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtepi16_epi8(__m512i __A)
	{
	  const __v32hi __wide = (__v32hi)__A;
	  const __v32qi __zero = (__v32qi)_mm256_setzero_si256();
	  return (__m256i)__builtin_ia32_pmovwb512_mask(__wide, __zero,
	                                                (__mmask32)-1);
	}

	/* Masked variant: pmovwb512 masked builtin on __A with __O as the second
	   operand and __M as the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi16_epi8(__m256i __O, __mmask32 __M, __m512i __A)
	{
	  const __v32hi __wide = (__v32hi)__A;
	  const __v32qi __src = (__v32qi)__O;
	  return (__m256i)__builtin_ia32_pmovwb512_mask(__wide, __src, __M);
	}

	/* Zero-source variant: pmovwb512 masked builtin on __A with a zero vector
	   as the second operand and __M as the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi16_epi8(__mmask32 __M, __m512i __A)
	{
	  const __v32hi __wide = (__v32hi)__A;
	  const __v32qi __zero = (__v32qi)_mm256_setzero_si256();
	  return (__m256i)__builtin_ia32_pmovwb512_mask(__wide, __zero, __M);
	}

	/* Store form: passes __P (as a __v32qi pointer), __A (as __v32hi) and mask
	   __M to the pmovwb512mem masked builtin.  Returns nothing. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi16_storeu_epi8(void *__P, __mmask32 __M, __m512i __A)
	{
	  __v32qi *const __dst = (__v32qi *)__P;
	  const __v32hi __wide = (__v32hi)__A;
	  __builtin_ia32_pmovwb512mem_mask(__dst, __wide, __M);
	}

	/* Store form: passes __P (as a __v32qi pointer), __A (as __v32hi) and mask
	   __M to the pmovswb512mem masked builtin.  Returns nothing. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi16_storeu_epi8(void *__P, __mmask32 __M, __m512i __A)
	{
	  __v32qi *const __dst = (__v32qi *)__P;
	  const __v32hi __wide = (__v32hi)__A;
	  __builtin_ia32_pmovswb512mem_mask(__dst, __wide, __M);
	}

	/* Store form: passes __P (as a __v32qi pointer), __A (as __v32hi) and mask
	   __M to the pmovuswb512mem masked builtin.  Returns nothing. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi16_storeu_epi8(void *__P, __mmask32 __M, __m512i __A)
	{
	  __v32qi *const __dst = (__v32qi *)__P;
	  const __v32hi __wide = (__v32hi)__A;
	  __builtin_ia32_pmovuswb512mem_mask(__dst, __wide, __M);
	}

	/* Interleaves bytes of __A and __B with a fixed shuffle: for each 16-byte
	   group, elements 8..15 of __A alternate with the same-numbered elements
	   of __B (indices 64+k address __B).  One row below per 16-byte group. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpackhi_epi8(__m512i __A, __m512i __B)
	{
	  const __v64qi __a = (__v64qi)__A;
	  const __v64qi __b = (__v64qi)__B;
	  return (__m512i)__builtin_shufflevector(__a, __b,
	    8, 64+8, 9, 64+9, 10, 64+10, 11, 64+11, 12, 64+12, 13, 64+13, 14, 64+14, 15, 64+15,
	    24, 64+24, 25, 64+25, 26, 64+26, 27, 64+27, 28, 64+28, 29, 64+29, 30, 64+30, 31, 64+31,
	    40, 64+40, 41, 64+41, 42, 64+42, 43, 64+43, 44, 64+44, 45, 64+45, 46, 64+46, 47, 64+47,
	    56, 64+56, 57, 64+57, 58, 64+58, 59, 64+59, 60, 64+60, 61, 64+61, 62, 64+62, 63, 64+63);
	}

	/* Feeds mask __U, the result of _mm512_unpackhi_epi8(__A, __B) and __W to
	   the selectb_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
	{
	  const __v64qi __unpacked = (__v64qi)_mm512_unpackhi_epi8(__A, __B);
	  const __v64qi __src = (__v64qi)__W;
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __unpacked,
	                                             __src);
	}

	/* Feeds mask __U, the result of _mm512_unpackhi_epi8(__A, __B) and a zero
	   vector to the selectb_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B)
	{
	  const __v64qi __unpacked = (__v64qi)_mm512_unpackhi_epi8(__A, __B);
	  const __v64qi __zero = (__v64qi)_mm512_setzero_qi();
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __unpacked,
	                                             __zero);
	}

	/* Interleaves 16-bit elements of __A and __B with a fixed shuffle: for each
	   group of 8 elements, elements 4..7 of __A alternate with the
	   same-numbered elements of __B (indices 32+k address __B).  One row below
	   per group. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpackhi_epi16(__m512i __A, __m512i __B)
	{
	  const __v32hi __a = (__v32hi)__A;
	  const __v32hi __b = (__v32hi)__B;
	  return (__m512i)__builtin_shufflevector(__a, __b,
	    4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7,
	    12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15,
	    20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23,
	    28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
	}

	/* Feeds mask __U, the result of _mm512_unpackhi_epi16(__A, __B) and __W to
	   the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __unpacked = (__v32hi)_mm512_unpackhi_epi16(__A, __B);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __unpacked,
	                                             __src);
	}

	/* Feeds mask __U, the result of _mm512_unpackhi_epi16(__A, __B) and a zero
	   vector to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __unpacked = (__v32hi)_mm512_unpackhi_epi16(__A, __B);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __unpacked,
	                                             __zero);
	}

	/* Interleaves bytes of __A and __B with a fixed shuffle: for each 16-byte
	   group, elements 0..7 of __A alternate with the same-numbered elements
	   of __B (indices 64+k address __B).  One row below per 16-byte group. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpacklo_epi8(__m512i __A, __m512i __B)
	{
	  const __v64qi __a = (__v64qi)__A;
	  const __v64qi __b = (__v64qi)__B;
	  return (__m512i)__builtin_shufflevector(__a, __b,
	    0, 64+0, 1, 64+1, 2, 64+2, 3, 64+3, 4, 64+4, 5, 64+5, 6, 64+6, 7, 64+7,
	    16, 64+16, 17, 64+17, 18, 64+18, 19, 64+19, 20, 64+20, 21, 64+21, 22, 64+22, 23, 64+23,
	    32, 64+32, 33, 64+33, 34, 64+34, 35, 64+35, 36, 64+36, 37, 64+37, 38, 64+38, 39, 64+39,
	    48, 64+48, 49, 64+49, 50, 64+50, 51, 64+51, 52, 64+52, 53, 64+53, 54, 64+54, 55, 64+55);
	}

	/* Feeds mask __U, the result of _mm512_unpacklo_epi8(__A, __B) and __W to
	   the selectb_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
	{
	  const __v64qi __unpacked = (__v64qi)_mm512_unpacklo_epi8(__A, __B);
	  const __v64qi __src = (__v64qi)__W;
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __unpacked,
	                                             __src);
	}

	/* Feeds mask __U, the result of _mm512_unpacklo_epi8(__A, __B) and a zero
	   vector to the selectb_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B)
	{
	  const __v64qi __unpacked = (__v64qi)_mm512_unpacklo_epi8(__A, __B);
	  const __v64qi __zero = (__v64qi)_mm512_setzero_qi();
	  return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, __unpacked,
	                                             __zero);
	}

	/* Interleaves 16-bit elements of __A and __B with a fixed shuffle: for each
	   group of 8 elements, elements 0..3 of __A alternate with the
	   same-numbered elements of __B (indices 32+k address __B).  One row below
	   per group. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpacklo_epi16(__m512i __A, __m512i __B)
	{
	  const __v32hi __a = (__v32hi)__A;
	  const __v32hi __b = (__v32hi)__B;
	  return (__m512i)__builtin_shufflevector(__a, __b,
	    0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3,
	    8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11,
	    16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19,
	    24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27);
	}

	/* Feeds mask __U, the result of _mm512_unpacklo_epi16(__A, __B) and __W to
	   the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __unpacked = (__v32hi)_mm512_unpacklo_epi16(__A, __B);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __unpacked,
	                                             __src);
	}

	/* Feeds mask __U, the result of _mm512_unpacklo_epi16(__A, __B) and a zero
	   vector to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __unpacked = (__v32hi)_mm512_unpacklo_epi16(__A, __B);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __unpacked,
	                                             __zero);
	}

	/* Widens the 32 bytes of __A to 32 16-bit elements with
	   __builtin_convertvector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepi8_epi16(__m256i __A)
	{
	  /* The extension here must always be signed.  __v32qi is built on plain
	     char, whose signedness can vary, so the source is viewed as __v32qs. */
	  const __v32qs __bytes = (__v32qs)__A;
	  return (__m512i)__builtin_convertvector(__bytes, __v32hi);
	}

	/* Feeds mask __U, the result of _mm512_cvtepi8_epi16(__A) and __W to the
	   selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
	{
	  const __v32hi __widened = (__v32hi)_mm512_cvtepi8_epi16(__A);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __widened, __src);
	}

	/* Feeds mask __U, the result of _mm512_cvtepi8_epi16(__A) and a zero vector
	   to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A)
	{
	  const __v32hi __widened = (__v32hi)_mm512_cvtepi8_epi16(__A);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __widened,
	                                             __zero);
	}

	/* Widens the 32 bytes of __A, viewed through the __v32qu type, to 32 16-bit
	   elements with __builtin_convertvector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepu8_epi16(__m256i __A)
	{
	  const __v32qu __bytes = (__v32qu)__A;
	  return (__m512i)__builtin_convertvector(__bytes, __v32hi);
	}

	/* Feeds mask __U, the result of _mm512_cvtepu8_epi16(__A) and __W to the
	   selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
	{
	  const __v32hi __widened = (__v32hi)_mm512_cvtepu8_epi16(__A);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __widened, __src);
	}

	/* Feeds mask __U, the result of _mm512_cvtepu8_epi16(__A) and a zero vector
	   to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
	{
	  const __v32hi __widened = (__v32hi)_mm512_cvtepu8_epi16(__A);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __widened,
	                                             __zero);
	}


	/* Shuffles A viewed as 32 16-bit elements.  Within every group of 8
	   elements the first four (0-3, 8-11, 16-19, 24-27) keep their position;
	   each of the last four is picked from that group's upper four elements
	   by a 2-bit field of imm: bits 1:0, 3:2, 5:4 and 7:6 in output order.
	   The same imm fields are reused for all four groups.  Every index is
	   below 32, so the second shuffle operand (_mm512_undefined_epi32()) is
	   never selected. */
	#define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
	(__v32hi)_mm512_undefined_epi32(), \
	0, 1, 2, 3, \
	4 + (((imm) >> 0) & 0x3), \
	4 + (((imm) >> 2) & 0x3), \
	4 + (((imm) >> 4) & 0x3), \
	4 + (((imm) >> 6) & 0x3), \
	8, 9, 10, 11, \
	12 + (((imm) >> 0) & 0x3), \
	12 + (((imm) >> 2) & 0x3), \
	12 + (((imm) >> 4) & 0x3), \
	12 + (((imm) >> 6) & 0x3), \
	16, 17, 18, 19, \
	20 + (((imm) >> 0) & 0x3), \
	20 + (((imm) >> 2) & 0x3), \
	20 + (((imm) >> 4) & 0x3), \
	20 + (((imm) >> 6) & 0x3), \
	24, 25, 26, 27, \
	28 + (((imm) >> 0) & 0x3), \
	28 + (((imm) >> 2) & 0x3), \
	28 + (((imm) >> 4) & 0x3), \
	28 + (((imm) >> 6) & 0x3)); })

	/* Masked form of _mm512_shufflehi_epi16: passes mask U, the shuffle result
	   for (A, imm) and W (all as __v32hi) to the selectw_512 builtin. */
	#define _mm512_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
	(__v32hi)_mm512_shufflehi_epi16((A), \
	(imm)), \
	(__v32hi)(__m512i)(W)); })

	/* Zero-source form of _mm512_shufflehi_epi16: passes mask U, the shuffle
	   result for (A, imm) and _mm512_setzero_hi() to the selectw_512 builtin. */
	#define _mm512_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
	(__v32hi)_mm512_shufflehi_epi16((A), \
	(imm)), \
	(__v32hi)_mm512_setzero_hi()); })

	/* Shuffles A viewed as 32 16-bit elements.  Within every group of 8
	   elements each of the first four is picked from that group's lower four
	   elements by a 2-bit field of imm: bits 1:0, 3:2, 5:4 and 7:6 in output
	   order; the last four (4-7, 12-15, 20-23, 28-31) keep their position.
	   The same imm fields are reused for all four groups.  Every index is
	   below 32, so the second shuffle operand (_mm512_undefined_epi32()) is
	   never selected. */
	#define _mm512_shufflelo_epi16(A, imm) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \
	(__v32hi)_mm512_undefined_epi32(), \
	0 + (((imm) >> 0) & 0x3), \
	0 + (((imm) >> 2) & 0x3), \
	0 + (((imm) >> 4) & 0x3), \
	0 + (((imm) >> 6) & 0x3), \
	4, 5, 6, 7, \
	8 + (((imm) >> 0) & 0x3), \
	8 + (((imm) >> 2) & 0x3), \
	8 + (((imm) >> 4) & 0x3), \
	8 + (((imm) >> 6) & 0x3), \
	12, 13, 14, 15, \
	16 + (((imm) >> 0) & 0x3), \
	16 + (((imm) >> 2) & 0x3), \
	16 + (((imm) >> 4) & 0x3), \
	16 + (((imm) >> 6) & 0x3), \
	20, 21, 22, 23, \
	24 + (((imm) >> 0) & 0x3), \
	24 + (((imm) >> 2) & 0x3), \
	24 + (((imm) >> 4) & 0x3), \
	24 + (((imm) >> 6) & 0x3), \
	28, 29, 30, 31); })


	/* Masked form of _mm512_shufflelo_epi16: passes mask U, the shuffle result
	   for (A, imm) and W (all as __v32hi) to the selectw_512 builtin. */
	#define _mm512_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
	(__v32hi)_mm512_shufflelo_epi16((A), \
	(imm)), \
	(__v32hi)(__m512i)(W)); })


	/* Zero-source form of _mm512_shufflelo_epi16: passes mask U, the shuffle
	   result for (A, imm) and _mm512_setzero_hi() to the selectw_512 builtin. */
	#define _mm512_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
	(__v32hi)_mm512_shufflelo_epi16((A), \
	(imm)), \
	(__v32hi)_mm512_setzero_hi()); })

	/* Thin wrapper: passes __A and __B, both viewed as __v32hi, to the
	   psllv32hi builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sllv_epi16(__m512i __A, __m512i __B)
	{
	  const __v32hi __vals = (__v32hi)__A;
	  const __v32hi __counts = (__v32hi)__B;
	  return (__m512i)__builtin_ia32_psllv32hi(__vals, __counts);
	}

	/* Feeds mask __U, the result of _mm512_sllv_epi16(__A, __B) and __W to the
	   selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sllv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_sllv_epi16(__A, __B);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted, __src);
	}

	/* Feeds mask __U, the result of _mm512_sllv_epi16(__A, __B) and a zero
	   vector to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_sllv_epi16(__A, __B);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted,
	                                             __zero);
	}

	/* Thin wrapper: passes __A (as __v32hi) and the 128-bit __B (as __v8hi) to
	   the psllw512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sll_epi16(__m512i __A, __m128i __B)
	{
	  const __v32hi __vals = (__v32hi)__A;
	  const __v8hi __count = (__v8hi)__B;
	  return (__m512i)__builtin_ia32_psllw512(__vals, __count);
	}

	/* Feeds mask __U, the result of _mm512_sll_epi16(__A, __B) and __W to the
	   selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_sll_epi16(__A, __B);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted, __src);
	}

	/* Feeds mask __U, the result of _mm512_sll_epi16(__A, __B) and a zero
	   vector to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_sll_epi16(__A, __B);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted,
	                                             __zero);
	}

	/* Thin wrapper: passes __A (as __v32hi) and the integer count __B to the
	   psllwi512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_slli_epi16(__m512i __A, int __B)
	{
	  const __v32hi __vals = (__v32hi)__A;
	  return (__m512i)__builtin_ia32_psllwi512(__vals, __B);
	}

	/* Feeds mask __U, the result of _mm512_slli_epi16(__A, __B) and __W to the
	   selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_slli_epi16(__A, __B);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted, __src);
	}

	/* Feeds mask __U, the result of _mm512_slli_epi16(__A, __B) and a zero
	   vector to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_slli_epi16(__A, __B);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted,
	                                             __zero);
	}

	/* Byte shuffle of a zero vector (first operand, indices 0-63) with a
	   (second operand, indices 64-127), one index expression per result byte.
	   If any of bits 4-7 of (char)imm is set, every index names the zero
	   vector, so the whole result is zero.  Otherwise, for byte position p
	   (0-15) inside each 16-byte group: when imm > p the index stays below 64
	   and the byte is zero; when imm <= p the index resolves to byte
	   (p - imm) of the same 16-byte group of a.  Net effect per group: bytes
	   move up by imm positions and the vacated low bytes are zero. */
	#define _mm512_bslli_epi128(a, imm) __extension__ ({ \
	(__m512i)__builtin_shufflevector( \
	(__v64qi)_mm512_setzero_si512(), \
	(__v64qi)(__m512i)(a), \
	((char)(imm)&0xF0) ? 0 : ((char)(imm)>0x0 ? 16 : 64) - (char)(imm), \
	((char)(imm)&0xF0) ? 1 : ((char)(imm)>0x1 ? 17 : 65) - (char)(imm), \
	((char)(imm)&0xF0) ? 2 : ((char)(imm)>0x2 ? 18 : 66) - (char)(imm), \
	((char)(imm)&0xF0) ? 3 : ((char)(imm)>0x3 ? 19 : 67) - (char)(imm), \
	((char)(imm)&0xF0) ? 4 : ((char)(imm)>0x4 ? 20 : 68) - (char)(imm), \
	((char)(imm)&0xF0) ? 5 : ((char)(imm)>0x5 ? 21 : 69) - (char)(imm), \
	((char)(imm)&0xF0) ? 6 : ((char)(imm)>0x6 ? 22 : 70) - (char)(imm), \
	((char)(imm)&0xF0) ? 7 : ((char)(imm)>0x7 ? 23 : 71) - (char)(imm), \
	((char)(imm)&0xF0) ? 8 : ((char)(imm)>0x8 ? 24 : 72) - (char)(imm), \
	((char)(imm)&0xF0) ? 9 : ((char)(imm)>0x9 ? 25 : 73) - (char)(imm), \
	((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 : 74) - (char)(imm), \
	((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 : 75) - (char)(imm), \
	((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 : 76) - (char)(imm), \
	((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 : 77) - (char)(imm), \
	((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 : 78) - (char)(imm), \
	((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 : 79) - (char)(imm), \
	((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 : 80) - (char)(imm), \
	((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 : 81) - (char)(imm), \
	((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 : 82) - (char)(imm), \
	((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 35 : 83) - (char)(imm), \
	((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 : 84) - (char)(imm), \
	((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 : 85) - (char)(imm), \
	((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 : 86) - (char)(imm), \
	((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 : 87) - (char)(imm), \
	((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 : 88) - (char)(imm), \
	((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 : 89) - (char)(imm), \
	((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 : 90) - (char)(imm), \
	((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 : 91) - (char)(imm), \
	((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 : 92) - (char)(imm), \
	((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 : 93) - (char)(imm), \
	((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 : 94) - (char)(imm), \
	((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 : 95) - (char)(imm), \
	((char)(imm)&0xF0) ? 32 : ((char)(imm)>0x0 ? 48 : 96) - (char)(imm), \
	((char)(imm)&0xF0) ? 33 : ((char)(imm)>0x1 ? 49 : 97) - (char)(imm), \
	((char)(imm)&0xF0) ? 34 : ((char)(imm)>0x2 ? 50 : 98) - (char)(imm), \
	((char)(imm)&0xF0) ? 35 : ((char)(imm)>0x3 ? 51 : 99) - (char)(imm), \
	((char)(imm)&0xF0) ? 36 : ((char)(imm)>0x4 ? 52 : 100) - (char)(imm), \
	((char)(imm)&0xF0) ? 37 : ((char)(imm)>0x5 ? 53 : 101) - (char)(imm), \
	((char)(imm)&0xF0) ? 38 : ((char)(imm)>0x6 ? 54 : 102) - (char)(imm), \
	((char)(imm)&0xF0) ? 39 : ((char)(imm)>0x7 ? 55 : 103) - (char)(imm), \
	((char)(imm)&0xF0) ? 40 : ((char)(imm)>0x8 ? 56 : 104) - (char)(imm), \
	((char)(imm)&0xF0) ? 41 : ((char)(imm)>0x9 ? 57 : 105) - (char)(imm), \
	((char)(imm)&0xF0) ? 42 : ((char)(imm)>0xA ? 58 : 106) - (char)(imm), \
	((char)(imm)&0xF0) ? 43 : ((char)(imm)>0xB ? 59 : 107) - (char)(imm), \
	((char)(imm)&0xF0) ? 44 : ((char)(imm)>0xC ? 60 : 108) - (char)(imm), \
	((char)(imm)&0xF0) ? 45 : ((char)(imm)>0xD ? 61 : 109) - (char)(imm), \
	((char)(imm)&0xF0) ? 46 : ((char)(imm)>0xE ? 62 : 110) - (char)(imm), \
	((char)(imm)&0xF0) ? 47 : ((char)(imm)>0xF ? 63 : 111) - (char)(imm), \
	((char)(imm)&0xF0) ? 48 : ((char)(imm)>0x0 ? 64 : 112) - (char)(imm), \
	((char)(imm)&0xF0) ? 49 : ((char)(imm)>0x1 ? 65 : 113) - (char)(imm), \
	((char)(imm)&0xF0) ? 50 : ((char)(imm)>0x2 ? 66 : 114) - (char)(imm), \
	((char)(imm)&0xF0) ? 51 : ((char)(imm)>0x3 ? 67 : 115) - (char)(imm), \
	((char)(imm)&0xF0) ? 52 : ((char)(imm)>0x4 ? 68 : 116) - (char)(imm), \
	((char)(imm)&0xF0) ? 53 : ((char)(imm)>0x5 ? 69 : 117) - (char)(imm), \
	((char)(imm)&0xF0) ? 54 : ((char)(imm)>0x6 ? 70 : 118) - (char)(imm), \
	((char)(imm)&0xF0) ? 55 : ((char)(imm)>0x7 ? 71 : 119) - (char)(imm), \
	((char)(imm)&0xF0) ? 56 : ((char)(imm)>0x8 ? 72 : 120) - (char)(imm), \
	((char)(imm)&0xF0) ? 57 : ((char)(imm)>0x9 ? 73 : 121) - (char)(imm), \
	((char)(imm)&0xF0) ? 58 : ((char)(imm)>0xA ? 74 : 122) - (char)(imm), \
	((char)(imm)&0xF0) ? 59 : ((char)(imm)>0xB ? 75 : 123) - (char)(imm), \
	((char)(imm)&0xF0) ? 60 : ((char)(imm)>0xC ? 76 : 124) - (char)(imm), \
	((char)(imm)&0xF0) ? 61 : ((char)(imm)>0xD ? 77 : 125) - (char)(imm), \
	((char)(imm)&0xF0) ? 62 : ((char)(imm)>0xE ? 78 : 126) - (char)(imm), \
	((char)(imm)&0xF0) ? 63 : ((char)(imm)>0xF ? 79 : 127) - (char)(imm)); })

	/* Thin wrapper: passes __A and __B, both viewed as __v32hi, to the
	   psrlv32hi builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srlv_epi16(__m512i __A, __m512i __B)
	{
	  const __v32hi __vals = (__v32hi)__A;
	  const __v32hi __counts = (__v32hi)__B;
	  return (__m512i)__builtin_ia32_psrlv32hi(__vals, __counts);
	}

	/* Feeds mask __U, the result of _mm512_srlv_epi16(__A, __B) and __W to the
	   selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_srlv_epi16(__A, __B);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted, __src);
	}

	/* Feeds mask __U, the result of _mm512_srlv_epi16(__A, __B) and a zero
	   vector to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_srlv_epi16(__A, __B);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted,
	                                             __zero);
	}

	/* Thin wrapper: passes __A and __B, both viewed as __v32hi, to the
	   psrav32hi builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srav_epi16(__m512i __A, __m512i __B)
	{
	  const __v32hi __vals = (__v32hi)__A;
	  const __v32hi __counts = (__v32hi)__B;
	  return (__m512i)__builtin_ia32_psrav32hi(__vals, __counts);
	}

	/* Feeds mask __U, the result of _mm512_srav_epi16(__A, __B) and __W to the
	   selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_srav_epi16(__A, __B);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted, __src);
	}

	/* Feeds mask __U, the result of _mm512_srav_epi16(__A, __B) and a zero
	   vector to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_srav_epi16(__A, __B);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted,
	                                             __zero);
	}

	/* Thin wrapper: passes __A (as __v32hi) and the 128-bit __B (as __v8hi) to
	   the psraw512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sra_epi16(__m512i __A, __m128i __B)
	{
	  const __v32hi __vals = (__v32hi)__A;
	  const __v8hi __count = (__v8hi)__B;
	  return (__m512i)__builtin_ia32_psraw512(__vals, __count);
	}

	/* Feeds mask __U, the result of _mm512_sra_epi16(__A, __B) and __W to the
	   selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_sra_epi16(__A, __B);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted, __src);
	}

	/* Feeds mask __U, the result of _mm512_sra_epi16(__A, __B) and a zero
	   vector to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_sra_epi16(__A, __B);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted,
	                                             __zero);
	}

	/* Thin wrapper: passes __A (as __v32hi) and the integer count __B to the
	   psrawi512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srai_epi16(__m512i __A, int __B)
	{
	  const __v32hi __vals = (__v32hi)__A;
	  return (__m512i)__builtin_ia32_psrawi512(__vals, __B);
	}

	/* Feeds mask __U, the result of _mm512_srai_epi16(__A, __B) and __W to the
	   selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_srai_epi16(__A, __B);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted, __src);
	}

	/* Feeds mask __U, the result of _mm512_srai_epi16(__A, __B) and a zero
	   vector to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_srai_epi16(__A, __B);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted,
	                                             __zero);
	}

	/* Thin wrapper: passes __A (as __v32hi) and the 128-bit __B (as __v8hi) to
	   the psrlw512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srl_epi16(__m512i __A, __m128i __B)
	{
	  const __v32hi __vals = (__v32hi)__A;
	  const __v8hi __count = (__v8hi)__B;
	  return (__m512i)__builtin_ia32_psrlw512(__vals, __count);
	}

	/* Feeds mask __U, the result of _mm512_srl_epi16(__A, __B) and __W to the
	   selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_srl_epi16(__A, __B);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted, __src);
	}

	/* Feeds mask __U, the result of _mm512_srl_epi16(__A, __B) and a zero
	   vector to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_srl_epi16(__A, __B);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted,
	                                             __zero);
	}

	/* Thin wrapper: passes __A (as __v32hi) and the integer count __B to the
	   psrlwi512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srli_epi16(__m512i __A, int __B)
	{
	  const __v32hi __vals = (__v32hi)__A;
	  return (__m512i)__builtin_ia32_psrlwi512(__vals, __B);
	}

	/* Feeds mask __U, the result of _mm512_srli_epi16(__A, __B) and __W to the
	   selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, int __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_srli_epi16(__A, __B);
	  const __v32hi __src = (__v32hi)__W;
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted, __src);
	}

	/* Feeds mask __U, the result of _mm512_srli_epi16(__A, __B) and a zero
	   vector to the selectw_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B)
	{
	  const __v32hi __shifted = (__v32hi)_mm512_srli_epi16(__A, __B);
	  const __v32hi __zero = (__v32hi)_mm512_setzero_hi();
	  return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, __shifted,
	                                             __zero);
	}

	/* Byte shuffle of a (first operand, indices 0-63) with a zero vector
	   (second operand, indices 64-127), one index expression per result byte.
	   If any of bits 4-7 of (char)imm is set, every index names the zero
	   vector, so the whole result is zero.  Otherwise, for byte position p
	   (0-15) inside each 16-byte group: when p + imm <= 15 the index resolves
	   to byte (p + imm) of the same 16-byte group of a; when p + imm > 15 the
	   index is 64 or above and the byte is zero.  Net effect per group: bytes
	   move down by imm positions and the vacated high bytes are zero. */
	#define _mm512_bsrli_epi128(a, imm) __extension__ ({ \
	(__m512i)__builtin_shufflevector( \
	(__v64qi)(__m512i)(a), \
	(__v64qi)_mm512_setzero_si512(), \
	((char)(imm)&0xF0) ? 64 : (char)(imm) + ((char)(imm)>0xF ? 48 : 0), \
	((char)(imm)&0xF0) ? 65 : (char)(imm) + ((char)(imm)>0xE ? 49 : 1), \
	((char)(imm)&0xF0) ? 66 : (char)(imm) + ((char)(imm)>0xD ? 50 : 2), \
	((char)(imm)&0xF0) ? 67 : (char)(imm) + ((char)(imm)>0xC ? 51 : 3), \
	((char)(imm)&0xF0) ? 68 : (char)(imm) + ((char)(imm)>0xB ? 52 : 4), \
	((char)(imm)&0xF0) ? 69 : (char)(imm) + ((char)(imm)>0xA ? 53 : 5), \
	((char)(imm)&0xF0) ? 70 : (char)(imm) + ((char)(imm)>0x9 ? 54 : 6), \
	((char)(imm)&0xF0) ? 71 : (char)(imm) + ((char)(imm)>0x8 ? 55 : 7), \
	((char)(imm)&0xF0) ? 72 : (char)(imm) + ((char)(imm)>0x7 ? 56 : 8), \
	((char)(imm)&0xF0) ? 73 : (char)(imm) + ((char)(imm)>0x6 ? 57 : 9), \
	((char)(imm)&0xF0) ? 74 : (char)(imm) + ((char)(imm)>0x5 ? 58 : 10), \
	((char)(imm)&0xF0) ? 75 : (char)(imm) + ((char)(imm)>0x4 ? 59 : 11), \
	((char)(imm)&0xF0) ? 76 : (char)(imm) + ((char)(imm)>0x3 ? 60 : 12), \
	((char)(imm)&0xF0) ? 77 : (char)(imm) + ((char)(imm)>0x2 ? 61 : 13), \
	((char)(imm)&0xF0) ? 78 : (char)(imm) + ((char)(imm)>0x1 ? 62 : 14), \
	((char)(imm)&0xF0) ? 79 : (char)(imm) + ((char)(imm)>0x0 ? 63 : 15), \
	((char)(imm)&0xF0) ? 80 : (char)(imm) + ((char)(imm)>0xF ? 64 : 16), \
	((char)(imm)&0xF0) ? 81 : (char)(imm) + ((char)(imm)>0xE ? 65 : 17), \
	((char)(imm)&0xF0) ? 82 : (char)(imm) + ((char)(imm)>0xD ? 66 : 18), \
	((char)(imm)&0xF0) ? 83 : (char)(imm) + ((char)(imm)>0xC ? 67 : 19), \
	((char)(imm)&0xF0) ? 84 : (char)(imm) + ((char)(imm)>0xB ? 68 : 20), \
	((char)(imm)&0xF0) ? 85 : (char)(imm) + ((char)(imm)>0xA ? 69 : 21), \
	((char)(imm)&0xF0) ? 86 : (char)(imm) + ((char)(imm)>0x9 ? 70 : 22), \
	((char)(imm)&0xF0) ? 87 : (char)(imm) + ((char)(imm)>0x8 ? 71 : 23), \
	((char)(imm)&0xF0) ? 88 : (char)(imm) + ((char)(imm)>0x7 ? 72 : 24), \
	((char)(imm)&0xF0) ? 89 : (char)(imm) + ((char)(imm)>0x6 ? 73 : 25), \
	((char)(imm)&0xF0) ? 90 : (char)(imm) + ((char)(imm)>0x5 ? 74 : 26), \
	((char)(imm)&0xF0) ? 91 : (char)(imm) + ((char)(imm)>0x4 ? 75 : 27), \
	((char)(imm)&0xF0) ? 92 : (char)(imm) + ((char)(imm)>0x3 ? 76 : 28), \
	((char)(imm)&0xF0) ? 93 : (char)(imm) + ((char)(imm)>0x2 ? 77 : 29), \
	((char)(imm)&0xF0) ? 94 : (char)(imm) + ((char)(imm)>0x1 ? 78 : 30), \
	((char)(imm)&0xF0) ? 95 : (char)(imm) + ((char)(imm)>0x0 ? 79 : 31), \
	((char)(imm)&0xF0) ? 96 : (char)(imm) + ((char)(imm)>0xF ? 80 : 32), \
	((char)(imm)&0xF0) ? 97 : (char)(imm) + ((char)(imm)>0xE ? 81 : 33), \
	((char)(imm)&0xF0) ? 98 : (char)(imm) + ((char)(imm)>0xD ? 82 : 34), \
	((char)(imm)&0xF0) ? 99 : (char)(imm) + ((char)(imm)>0xC ? 83 : 35), \
	((char)(imm)&0xF0) ? 100 : (char)(imm) + ((char)(imm)>0xB ? 84 : 36), \
	((char)(imm)&0xF0) ? 101 : (char)(imm) + ((char)(imm)>0xA ? 85 : 37), \
	((char)(imm)&0xF0) ? 102 : (char)(imm) + ((char)(imm)>0x9 ? 86 : 38), \
	((char)(imm)&0xF0) ? 103 : (char)(imm) + ((char)(imm)>0x8 ? 87 : 39), \
	((char)(imm)&0xF0) ? 104 : (char)(imm) + ((char)(imm)>0x7 ? 88 : 40), \
	((char)(imm)&0xF0) ? 105 : (char)(imm) + ((char)(imm)>0x6 ? 89 : 41), \
	((char)(imm)&0xF0) ? 106 : (char)(imm) + ((char)(imm)>0x5 ? 90 : 42), \
	((char)(imm)&0xF0) ? 107 : (char)(imm) + ((char)(imm)>0x4 ? 91 : 43), \
	((char)(imm)&0xF0) ? 108 : (char)(imm) + ((char)(imm)>0x3 ? 92 : 44), \
	((char)(imm)&0xF0) ? 109 : (char)(imm) + ((char)(imm)>0x2 ? 93 : 45), \
	((char)(imm)&0xF0) ? 110 : (char)(imm) + ((char)(imm)>0x1 ? 94 : 46), \
	((char)(imm)&0xF0) ? 111 : (char)(imm) + ((char)(imm)>0x0 ? 95 : 47), \
	((char)(imm)&0xF0) ? 112 : (char)(imm) + ((char)(imm)>0xF ? 96 : 48), \
	((char)(imm)&0xF0) ? 113 : (char)(imm) + ((char)(imm)>0xE ? 97 : 49), \
	((char)(imm)&0xF0) ? 114 : (char)(imm) + ((char)(imm)>0xD ? 98 : 50), \
	((char)(imm)&0xF0) ? 115 : (char)(imm) + ((char)(imm)>0xC ? 99 : 51), \
	((char)(imm)&0xF0) ? 116 : (char)(imm) + ((char)(imm)>0xB ? 100 : 52), \
	((char)(imm)&0xF0) ? 117 : (char)(imm) + ((char)(imm)>0xA ? 101 : 53), \
	((char)(imm)&0xF0) ? 118 : (char)(imm) + ((char)(imm)>0x9 ? 102 : 54), \
	((char)(imm)&0xF0) ? 119 : (char)(imm) + ((char)(imm)>0x8 ? 103 : 55), \
	((char)(imm)&0xF0) ? 120 : (char)(imm) + ((char)(imm)>0x7 ? 104 : 56), \
	((char)(imm)&0xF0) ? 121 : (char)(imm) + ((char)(imm)>0x6 ? 105 : 57), \
	((char)(imm)&0xF0) ? 122 : (char)(imm) + ((char)(imm)>0x5 ? 106 : 58), \
	((char)(imm)&0xF0) ? 123 : (char)(imm) + ((char)(imm)>0x4 ? 107 : 59), \
	((char)(imm)&0xF0) ? 124 : (char)(imm) + ((char)(imm)>0x3 ? 108 : 60), \
	((char)(imm)&0xF0) ? 125 : (char)(imm) + ((char)(imm)>0x2 ? 109 : 61), \
	((char)(imm)&0xF0) ? 126 : (char)(imm) + ((char)(imm)>0x1 ? 110 : 62), \
	((char)(imm)&0xF0) ? 127 : (char)(imm) + ((char)(imm)>0x0 ? 111 : 63)); })

	/* Merge-masked move of 16-bit elements: an element of __A is taken where
	 * the matching bit of __U is set, the element of __W otherwise. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
	{
	  __v32hi __src = (__v32hi) __A;
	  __v32hi __passthru = (__v32hi) __W;
	  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
	                                               __src, __passthru);
	}

	/* Zero-masked move of 16-bit elements: an element of __A is taken where
	 * the matching bit of __U is set, zero otherwise. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
	{
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi ();
	  __v32hi __src = (__v32hi) __A;
	  return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
	                                               __src, __zero);
	}

	/* Merge-masked move of 8-bit elements: an element of __A is taken where
	 * the matching bit of __U is set, the element of __W otherwise. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
	{
	  __v64qi __src = (__v64qi) __A;
	  __v64qi __passthru = (__v64qi) __W;
	  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
	                                               __src, __passthru);
	}

	/* Zero-masked move of 8-bit elements: an element of __A is taken where
	 * the matching bit of __U is set, zero otherwise. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
	{
	  /* The zero vector comes from the 16-bit helper and is reinterpreted
	   * as bytes; an all-zero pattern is the same under either view. */
	  __v64qi __zero = (__v64qi) _mm512_setzero_hi ();
	  __v64qi __src = (__v64qi) __A;
	  return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
	                                               __src, __zero);
	}

	/* Merge-masked byte broadcast: lanes selected by __M receive __A, the
	 * remaining lanes keep the corresponding byte of __O. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
	{
	  __v64qi __splat = (__v64qi)_mm512_set1_epi8(__A);
	  __v64qi __passthru = (__v64qi) __O;
	  return (__m512i) __builtin_ia32_selectb_512(__M, __splat, __passthru);
	}

	/* Zero-masked byte broadcast: lanes selected by __M receive __A, the
	 * remaining lanes are zero. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
	{
	  __v64qi __splat = (__v64qi) _mm512_set1_epi8(__A);
	  __v64qi __zero = (__v64qi) _mm512_setzero_si512();
	  return (__m512i) __builtin_ia32_selectb_512(__M, __splat, __zero);
	}

	/* Builds a 64-bit mask from two mask operands. The removed ("-") line
	 * computed (__A & 0xFFFFFFFF) | (__B << 32) inline; the added ("+") lines
	 * delegate the combination to the kunpckdi builtin instead. */
	static __inline__ __mmask64 __DEFAULT_FN_ATTRS
	_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
	{
	- return (__mmask64) (( __A & 0xFFFFFFFF) \| ( __B << 32));
	+ return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
	+ (__mmask64) __B);
	}

	/* Builds a 32-bit mask from two mask operands. The removed ("-") line
	 * computed (__A & 0xFFFF) | (__B << 16) inline; the added ("+") lines
	 * delegate the combination to the kunpcksi builtin instead. */
	static __inline__ __mmask32 __DEFAULT_FN_ATTRS
	_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
	{
	-return (__mmask32) (( __A & 0xFFFF) \| ( __B << 16));
	+ return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
	+ (__mmask32) __B);
	}

	/* Merge-masked load of 16-bit elements from __P (no alignment cast is
	 * applied here); __W and __U are forwarded as pass-through and mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
	{
	  __v32hi *__mem = (__v32hi *) __P;
	  __v32hi __passthru = (__v32hi) __W;
	  return (__m512i) __builtin_ia32_loaddquhi512_mask (__mem, __passthru,
	                                                     (__mmask32) __U);
	}

	/* Zero-masked load of 16-bit elements from __P: same builtin as the
	 * merge form, with an all-zero pass-through vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P)
	{
	  __v32hi *__mem = (__v32hi *) __P;
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi ();
	  return (__m512i) __builtin_ia32_loaddquhi512_mask (__mem, __zero,
	                                                     (__mmask32) __U);
	}

	/* Merge-masked load of 8-bit elements from __P; __W and __U are
	 * forwarded as pass-through and mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
	{
	  __v64qi *__mem = (__v64qi *) __P;
	  __v64qi __passthru = (__v64qi) __W;
	  return (__m512i) __builtin_ia32_loaddquqi512_mask (__mem, __passthru,
	                                                     (__mmask64) __U);
	}

	/* Zero-masked load of 8-bit elements from __P: same builtin as the
	 * merge form, with an all-zero pass-through vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P)
	{
	  __v64qi *__mem = (__v64qi *) __P;
	  /* Zero comes from the 16-bit helper, reinterpreted as bytes. */
	  __v64qi __zero = (__v64qi) _mm512_setzero_hi ();
	  return (__m512i) __builtin_ia32_loaddquqi512_mask (__mem, __zero,
	                                                     (__mmask64) __U);
	}
	/* Masked store of 16-bit elements of __A to __P under mask __U. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A)
	{
	  __v32hi *__dst = (__v32hi *) __P;
	  __v32hi __val = (__v32hi) __A;
	  __builtin_ia32_storedquhi512_mask (__dst, __val, (__mmask32) __U);
	}

	/* Masked store of 8-bit elements of __A to __P under mask __U. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
	{
	  __v64qi *__dst = (__v64qi *) __P;
	  __v64qi __val = (__v64qi) __A;
	  __builtin_ia32_storedquqi512_mask (__dst, __val, (__mmask64) __U);
	}

	/* Per-byte test: the result of comparing (__A & __B) for inequality
	 * with zero, byte by byte. */
	static __inline__ __mmask64 __DEFAULT_FN_ATTRS
	_mm512_test_epi8_mask (__m512i __A, __m512i __B)
	{
	  __m512i __anded = _mm512_and_epi32 (__A, __B);
	  return _mm512_cmpneq_epi8_mask (__anded, _mm512_setzero_qi());
	}

	/* Per-byte test of (__A & __B) against zero (not-equal compare), with
	 * __U forwarded to the masked compare. */
	static __inline__ __mmask64 __DEFAULT_FN_ATTRS
	_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
	{
	  __m512i __anded = _mm512_and_epi32 (__A, __B);
	  return _mm512_mask_cmpneq_epi8_mask (__U, __anded, _mm512_setzero_qi());
	}

	/* Per-word test: the result of comparing (__A & __B) for inequality
	 * with zero, 16 bits at a time. */
	static __inline__ __mmask32 __DEFAULT_FN_ATTRS
	_mm512_test_epi16_mask (__m512i __A, __m512i __B)
	{
	  __m512i __anded = _mm512_and_epi32 (__A, __B);
	  return _mm512_cmpneq_epi16_mask (__anded, _mm512_setzero_qi());
	}

	/* Per-word test of (__A & __B) against zero (not-equal compare), with
	 * __U forwarded to the masked compare. */
	static __inline__ __mmask32 __DEFAULT_FN_ATTRS
	_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
	{
	  __m512i __anded = _mm512_and_epi32 (__A, __B);
	  return _mm512_mask_cmpneq_epi16_mask (__U, __anded, _mm512_setzero_qi());
	}

	/* Negated per-byte test: the result of comparing (__A & __B) for
	 * equality with zero, byte by byte. */
	static __inline__ __mmask64 __DEFAULT_FN_ATTRS
	_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
	{
	  __m512i __anded = _mm512_and_epi32 (__A, __B);
	  return _mm512_cmpeq_epi8_mask (__anded, _mm512_setzero_qi());
	}

	/* Negated per-byte test of (__A & __B) against zero (equal compare),
	 * with __U forwarded to the masked compare. */
	static __inline__ __mmask64 __DEFAULT_FN_ATTRS
	_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
	{
	  __m512i __anded = _mm512_and_epi32 (__A, __B);
	  return _mm512_mask_cmpeq_epi8_mask (__U, __anded, _mm512_setzero_qi());
	}

	/* Negated per-word test: the result of comparing (__A & __B) for
	 * equality with zero, 16 bits at a time. */
	static __inline__ __mmask32 __DEFAULT_FN_ATTRS
	_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
	{
	  __m512i __anded = _mm512_and_epi32 (__A, __B);
	  return _mm512_cmpeq_epi16_mask (__anded, _mm512_setzero_qi());
	}

	/* Negated per-word test of (__A & __B) against zero (equal compare),
	 * with __U forwarded to the masked compare. */
	static __inline__ __mmask32 __DEFAULT_FN_ATTRS
	_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
	{
	  __m512i __anded = _mm512_and_epi32 (__A, __B);
	  return _mm512_mask_cmpeq_epi16_mask (__U, __anded, _mm512_setzero_qi());
	}

	/* Converts the byte vector __A to a 64-bit mask via the cvtb2mask512
	 * builtin. */
	static __inline__ __mmask64 __DEFAULT_FN_ATTRS
	_mm512_movepi8_mask (__m512i __A)
	{
	  __v64qi __bytes = (__v64qi) __A;
	  return (__mmask64) __builtin_ia32_cvtb2mask512 (__bytes);
	}

	/* Converts the 16-bit element vector __A to a 32-bit mask via the
	 * cvtw2mask512 builtin. */
	static __inline__ __mmask32 __DEFAULT_FN_ATTRS
	_mm512_movepi16_mask (__m512i __A)
	{
	  __v32hi __words = (__v32hi) __A;
	  return (__mmask32) __builtin_ia32_cvtw2mask512 (__words);
	}

	/* Expands the 64-bit mask __A into a byte vector via the cvtmask2b512
	 * builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_movm_epi8 (__mmask64 __A)
	{
	  __m512i __expanded = (__m512i) __builtin_ia32_cvtmask2b512 (__A);
	  return __expanded;
	}

	/* Expands the 32-bit mask __A into a vector of 16-bit elements via the
	 * cvtmask2w512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_movm_epi16 (__mmask32 __A)
	{
	  __m512i __expanded = (__m512i) __builtin_ia32_cvtmask2w512 (__A);
	  return __expanded;
	}

	/* Broadcasts byte 0 of __A to all 64 byte lanes: every shuffle index is
	 * 0, so the second (undefined) operand is never read. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_broadcastb_epi8 (__m128i __A)
	{
	  __v16qi __src = (__v16qi) __A;
	  __v16qi __unused = (__v16qi)_mm_undefined_si128();
	  return (__m512i)__builtin_shufflevector(__src, __unused,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0);
	}

	/* Merge-masked byte broadcast of __A's low byte: lanes selected by __M
	 * get the broadcast value, the others keep the byte from __O. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
	{
	  __v64qi __splat = (__v64qi) _mm512_broadcastb_epi8(__A);
	  __v64qi __passthru = (__v64qi) __O;
	  return (__m512i)__builtin_ia32_selectb_512(__M, __splat, __passthru);
	}

	/* Zero-masked byte broadcast of __A's low byte: lanes selected by __M
	 * get the broadcast value, the others are zero. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
	{
	  __v64qi __splat = (__v64qi) _mm512_broadcastb_epi8(__A);
	  __v64qi __zero = (__v64qi) _mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectb_512(__M, __splat, __zero);
	}

	/* Merge-masked 16-bit broadcast: lanes selected by __M receive __A, the
	 * remaining lanes keep the corresponding element of __O. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
	{
	  __v32hi __splat = (__v32hi) _mm512_set1_epi16(__A);
	  __v32hi __passthru = (__v32hi) __O;
	  return (__m512i) __builtin_ia32_selectw_512(__M, __splat, __passthru);
	}

	/* Zero-masked 16-bit broadcast: lanes selected by __M receive __A, the
	 * remaining lanes are zero. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
	{
	  __v32hi __splat = (__v32hi) _mm512_set1_epi16(__A);
	  __v32hi __zero = (__v32hi) _mm512_setzero_si512();
	  return (__m512i) __builtin_ia32_selectw_512(__M, __splat, __zero);
	}

	/* Broadcasts 16-bit element 0 of __A to all 32 lanes: every shuffle
	 * index is 0, so the second (undefined) operand is never read. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_broadcastw_epi16 (__m128i __A)
	{
	  __v8hi __src = (__v8hi) __A;
	  __v8hi __unused = (__v8hi)_mm_undefined_si128();
	  return (__m512i)__builtin_shufflevector(__src, __unused,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0);
	}

	/* Merge-masked broadcast of __A's low 16-bit element: lanes selected by
	 * __M get the broadcast value, the others keep the element from __O. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
	{
	  __v32hi __splat = (__v32hi) _mm512_broadcastw_epi16(__A);
	  __v32hi __passthru = (__v32hi) __O;
	  return (__m512i)__builtin_ia32_selectw_512(__M, __splat, __passthru);
	}

	/* Zero-masked broadcast of __A's low 16-bit element: lanes selected by
	 * __M get the broadcast value, the others are zero. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
	{
	  __v32hi __splat = (__v32hi) _mm512_broadcastw_epi16(__A);
	  __v32hi __zero = (__v32hi) _mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectw_512(__M, __splat, __zero);
	}

	/* Unmasked 16-bit permute. Note the operand order handed to the
	 * builtin: __B first, then __A; the pass-through is undefined and the
	 * mask is all ones. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
	{
	  __v32hi __first = (__v32hi) __B;
	  __v32hi __second = (__v32hi) __A;
	  __v32hi __passthru = (__v32hi) _mm512_undefined_epi32 ();
	  return (__m512i) __builtin_ia32_permvarhi512_mask (__first, __second,
	                                                     __passthru,
	                                                     (__mmask32) -1);
	}

	/* Zero-masked 16-bit permute: same builtin and operand order (__B, then
	 * __A) as the unmasked form, with a zero pass-through and mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
	                                __m512i __B)
	{
	  __v32hi __first = (__v32hi) __B;
	  __v32hi __second = (__v32hi) __A;
	  __v32hi __zero = (__v32hi) _mm512_setzero_hi();
	  return (__m512i) __builtin_ia32_permvarhi512_mask (__first, __second,
	                                                     __zero,
	                                                     (__mmask32) __M);
	}

	/* Merge-masked 16-bit permute: same builtin and operand order (__B,
	 * then __A) as the unmasked form, with __W as pass-through and mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
	                               __m512i __B)
	{
	  __v32hi __first = (__v32hi) __B;
	  __v32hi __second = (__v32hi) __A;
	  __v32hi __passthru = (__v32hi) __W;
	  return (__m512i) __builtin_ia32_permvarhi512_mask (__first, __second,
	                                                     __passthru,
	                                                     (__mmask32) __M);
	}

	/* Unmasked byte align-right of A and B by N. Implemented as a macro so
	 * that N reaches the builtin as written; the pass-through operand is an
	 * undefined vector and the mask is all ones. */
	#define _mm512_alignr_epi8(A, B, N) __extension__ ({\
	(__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
	(__v64qi)(__m512i)(B), (int)(N), \
	(__v64qi)_mm512_undefined_pd(), \
	(__mmask64)-1); })

	/* Merge-masked byte align-right of A and B by N: same builtin as the
	 * unmasked form, with W as the pass-through operand and U as the mask. */
	#define _mm512_mask_alignr_epi8(W, U, A, B, N) __extension__({\
	(__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
	(__v64qi)(__m512i)(B), (int)(N), \
	(__v64qi)(__m512i)(W), \
	(__mmask64)(U)); })

	/* Zero-masked byte align-right of A and B by N: same builtin as the
	 * unmasked form, with a zero pass-through operand and U as the mask. */
	#define _mm512_maskz_alignr_epi8(U, A, B, N) __extension__({\
	(__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \
	(__v64qi)(__m512i)(B), (int)(N), \
	(__v64qi)_mm512_setzero_si512(), \
	(__mmask64)(U)); })

	/* Unmasked form of the dbpsadbw512 builtin on byte vectors A and B with
	 * immediate imm; the result is viewed as 16-bit elements (the
	 * pass-through is a __v32hi and the mask is a 32-bit all-ones value). */
	#define _mm512_dbsad_epu8(A, B, imm) __extension__ ({\
	(__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
	(__v64qi)(__m512i)(B), (int)(imm), \
	(__v32hi)_mm512_undefined_epi32(), \
	(__mmask32)-1); })

	/* Merge-masked form of the dbpsadbw512 builtin: W is the pass-through
	 * vector of 16-bit elements and U the 32-bit mask.
	 * The statement expression is marked __extension__, like the unmasked
	 * _mm512_dbsad_epu8 and the alignr macros, so that a pedantic build does
	 * not diagnose the GNU ({ ... }) construct at the expansion site. */
	#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({\
	(__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
	(__v64qi)(__m512i)(B), (int)(imm), \
	(__v32hi)(__m512i)(W), \
	(__mmask32)(U)); })

	/* Zero-masked form of the dbpsadbw512 builtin: the pass-through is an
	 * all-zero vector of 16-bit elements and U the 32-bit mask.
	 * The statement expression is marked __extension__, like the unmasked
	 * _mm512_dbsad_epu8 and the alignr macros, so that a pedantic build does
	 * not diagnose the GNU ({ ... }) construct at the expansion site. */
	#define _mm512_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({\
	(__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \
	(__v64qi)(__m512i)(B), (int)(imm), \
	(__v32hi)_mm512_setzero_hi(), \
	(__mmask32)(U)); })

	/* Forwards the byte vectors __A and __B to the psadbw512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sad_epu8 (__m512i __A, __m512i __B)
	{
	  __v64qi __lhs = (__v64qi) __A;
	  __v64qi __rhs = (__v64qi) __B;
	  return (__m512i) __builtin_ia32_psadbw512 (__lhs, __rhs);
	}



	#undef __DEFAULT_FN_ATTRS

	#endif
	Index: head/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h (revision 329410)
	@@ -1,10233 +1,10233 @@
	/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*
	*===-----------------------------------------------------------------------===
	*/
	#ifndef __IMMINTRIN_H
	#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
	#endif

	#ifndef __AVX512FINTRIN_H
	#define __AVX512FINTRIN_H

	/* Internal 64-byte (512-bit) vector views, one per element type; the
	 * name encodes the element count and kind (e.g. __v16si = 16 x int). */
	typedef char __v64qi __attribute__((__vector_size__(64)));
	typedef short __v32hi __attribute__((__vector_size__(64)));
	typedef double __v8df __attribute__((__vector_size__(64)));
	typedef float __v16sf __attribute__((__vector_size__(64)));
	typedef long long __v8di __attribute__((__vector_size__(64)));
	typedef int __v16si __attribute__((__vector_size__(64)));

	/* Unsigned types */
	typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
	typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
	typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
	typedef unsigned int __v16su __attribute__((__vector_size__(64)));

	/* User-facing 512-bit vector types: float, double and integer
	 * (the integer type is a vector of long long). */
	typedef float __m512 __attribute__((__vector_size__(64)));
	typedef double __m512d __attribute__((__vector_size__(64)));
	typedef long long __m512i __attribute__((__vector_size__(64)));

	/* Mask types: plain unsigned integers of 8 and 16 bits. */
	typedef unsigned char __mmask8;
	typedef unsigned short __mmask16;

	/* Rounding mode macros. Values 0x00-0x03 name an explicit rounding
	 * direction; 0x04 presumably means "use the currently configured
	 * rounding mode" (inferred from the name -- confirm against the Intel
	 * documentation). */
	#define _MM_FROUND_TO_NEAREST_INT 0x00
	#define _MM_FROUND_TO_NEG_INF 0x01
	#define _MM_FROUND_TO_POS_INF 0x02
	#define _MM_FROUND_TO_ZERO 0x03
	#define _MM_FROUND_CUR_DIRECTION 0x04

	/* Predicate codes for the integer comparison intrinsics. The numeric
	 * values are spelled out; 3 is a placeholder with no predicate. */
	typedef enum {
	  _MM_CMPINT_EQ = 0,     /* Equal */
	  _MM_CMPINT_LT = 1,     /* Less than */
	  _MM_CMPINT_LE = 2,     /* Less than or Equal */
	  _MM_CMPINT_UNUSED = 3,
	  _MM_CMPINT_NE = 4,     /* Not Equal */
	  _MM_CMPINT_NLT = 5,    /* Not Less than */
	  _MM_CMPINT_NLE = 6     /* Not Less than or Equal */
	} _MM_CMPINT_ENUM;
	/* Aliases expressed as macros over the enumerators above. */
	#define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */
	#define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */

	/* Four-letter permutation selectors. Each name _MM_PERM_wxyz encodes an
	 * 8-bit value made of four 2-bit fields with A = 0, B = 1, C = 2, D = 3;
	 * the first letter is the most significant field and the last letter
	 * the least significant (e.g. _MM_PERM_AAAB = 0x01, _MM_PERM_BAAA = 0x40,
	 * _MM_PERM_DCBA = 0xE4). All 256 combinations are listed in ascending
	 * numeric order. */
	typedef enum
	{
	_MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
	_MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
	_MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
	_MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
	_MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
	_MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
	_MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
	_MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
	_MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
	_MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
	_MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
	_MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
	_MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
	_MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
	_MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
	_MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
	_MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
	_MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
	_MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
	_MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
	_MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
	_MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
	_MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
	_MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
	_MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
	_MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
	_MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
	_MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
	_MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
	_MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
	_MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
	_MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
	_MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
	_MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
	_MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
	_MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
	_MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
	_MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
	_MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
	_MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
	_MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
	_MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
	_MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
	_MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
	_MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
	_MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
	_MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
	_MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
	_MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
	_MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
	_MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
	_MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
	_MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
	_MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
	_MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
	_MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
	_MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
	_MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
	_MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
	_MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
	_MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
	_MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
	_MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
	_MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
	_MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
	_MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
	_MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
	_MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
	_MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
	_MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
	_MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
	_MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
	_MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
	_MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
	_MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
	_MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
	_MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
	_MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
	_MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
	_MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
	_MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
	_MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
	_MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
	_MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
	_MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
	_MM_PERM_DDDD = 0xFF
	} _MM_PERM_ENUM;

	/* Mantissa normalization interval selectors, numbered 0..3. */
	typedef enum {
	  _MM_MANT_NORM_1_2 = 0,    /* interval [1, 2) */
	  _MM_MANT_NORM_p5_2 = 1,   /* interval [0.5, 2) */
	  _MM_MANT_NORM_p5_1 = 2,   /* interval [0.5, 1) */
	  _MM_MANT_NORM_p75_1p5 = 3 /* interval [0.75, 1.5) */
	} _MM_MANTISSA_NORM_ENUM;

	/* Mantissa sign-control selectors, numbered 0..2. */
	typedef enum {
	  _MM_MANT_SIGN_src = 0,  /* sign = sign(SRC) */
	  _MM_MANT_SIGN_zero = 1, /* sign = 0 */
	  _MM_MANT_SIGN_nan = 2   /* DEST = NaN if sign(SRC) = 1 */
	} _MM_MANTISSA_SIGN_ENUM;

	/* Define the default attributes for the functions in this file: each
	 * function is always inlined, carries no debug info, and is compiled
	 * with the "avx512f" target feature enabled regardless of the
	 * command-line target options. */
	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))

	/* Create vectors with repeated elements */

	/* Returns a 512-bit integer vector with every bit cleared. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_setzero_si512(void)
	{
	  const __v8di __zeros = { 0, 0, 0, 0, 0, 0, 0, 0 };
	  return (__m512i) __zeros;
	}

	/* _mm512_setzero_epi32 is an alternate spelling of the same function. */
	#define _mm512_setzero_epi32 _mm512_setzero_si512

	/* Returns the result of the undef512 builtin viewed as [8 x double];
	 * callers must not rely on its contents. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_undefined_pd(void)
	{
	  __m512d __unspecified = (__m512d)__builtin_ia32_undef512();
	  return __unspecified;
	}

	/* Returns the result of the undef512 builtin viewed as [16 x float];
	 * callers must not rely on its contents. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_undefined(void)
	{
	  __m512 __unspecified = (__m512)__builtin_ia32_undef512();
	  return __unspecified;
	}

	/* Same as _mm512_undefined: the undef512 builtin viewed as
	 * [16 x float]. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_undefined_ps(void)
	{
	  __m512 __unspecified = (__m512)__builtin_ia32_undef512();
	  return __unspecified;
	}

	/* Returns the result of the undef512 builtin viewed as an integer
	 * vector; callers must not rely on its contents. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_undefined_epi32(void)
	{
	  __m512i __unspecified = (__m512i)__builtin_ia32_undef512();
	  return __unspecified;
	}

	/* Broadcasts 32-bit element 0 of __A to all 16 lanes: every shuffle
	 * index is 0, so the second (undefined) operand is never read. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_broadcastd_epi32 (__m128i __A)
	{
	  __v4si __src = (__v4si) __A;
	  __v4si __unused = (__v4si)_mm_undefined_si128();
	  return (__m512i)__builtin_shufflevector(__src, __unused,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0);
	}

	/* Merge-masked broadcast of __A's low 32-bit element: lanes selected by
	 * __M get the broadcast value, the others keep the element from __O. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
	{
	  __v16si __splat = (__v16si) _mm512_broadcastd_epi32(__A);
	  __v16si __passthru = (__v16si) __O;
	  return (__m512i)__builtin_ia32_selectd_512(__M, __splat, __passthru);
	}

	/* Zero-masked broadcast of __A's low 32-bit element: lanes selected by
	 * __M get the broadcast value, the others are zero. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
	{
	  __v16si __splat = (__v16si) _mm512_broadcastd_epi32(__A);
	  __v16si __zero = (__v16si) _mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512(__M, __splat, __zero);
	}

	/* Broadcasts 64-bit element 0 of __A to all 8 lanes: every shuffle
	 * index is 0, so the second (undefined) operand is never read. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_broadcastq_epi64 (__m128i __A)
	{
	  __v2di __src = (__v2di) __A;
	  __v2di __unused = (__v2di) _mm_undefined_si128();
	  return (__m512i)__builtin_shufflevector(__src, __unused,
	                                          0, 0, 0, 0, 0, 0, 0, 0);
	}

	/* Merge-masked broadcast of __A's low 64-bit element: lanes selected by
	 * __M get the broadcast value, the others keep the element from __O. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
	{
	  __v8di __splat = (__v8di) _mm512_broadcastq_epi64(__A);
	  __v8di __passthru = (__v8di) __O;
	  return (__m512i)__builtin_ia32_selectq_512(__M, __splat, __passthru);
	}

	/* Zero-masked broadcast of __A's low 64-bit element: lanes selected by
	 * __M get the broadcast value, the others are zero. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
	{
	  __v8di __splat = (__v8di) _mm512_broadcastq_epi64(__A);
	  __v8di __zero = (__v8di) _mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512(__M, __splat, __zero);
	}


	/* Returns a [16 x float] vector with every element equal to 0.0. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_setzero_ps(void)
	{
	  const __m512 __zeros = { 0.0f, 0.0f, 0.0f, 0.0f,
	                           0.0f, 0.0f, 0.0f, 0.0f,
	                           0.0f, 0.0f, 0.0f, 0.0f,
	                           0.0f, 0.0f, 0.0f, 0.0f };
	  return __zeros;
	}

	/* _mm512_setzero is an alternate spelling of _mm512_setzero_ps. */
	#define _mm512_setzero _mm512_setzero_ps

	/* Returns an [8 x double] vector with every element equal to 0.0. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_setzero_pd(void)
	{
	  const __m512d __zeros = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
	  return __zeros;
	}

	/* Returns a [16 x float] vector with every element equal to __w. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_set1_ps(float __w)
	{
	  __m512 __splat = { __w, __w, __w, __w,
	                     __w, __w, __w, __w,
	                     __w, __w, __w, __w,
	                     __w, __w, __w, __w };
	  return __splat;
	}

	/* Returns an [8 x double] vector with every element equal to __w. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_set1_pd(double __w)
	{
	  __m512d __splat = { __w, __w, __w, __w, __w, __w, __w, __w };
	  return __splat;
	}

	/* Returns an integer vector whose 64 byte lanes all hold __w. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set1_epi8(char __w)
	{
	  __v64qi __splat = {
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w
	  };
	  return (__m512i) __splat;
	}

	/* Returns an integer vector whose 32 16-bit lanes all hold __w. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set1_epi16(short __w)
	{
	  __v32hi __splat = {
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w
	  };
	  return (__m512i) __splat;
	}

	/* Returns an integer vector whose 16 32-bit lanes all hold __s. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set1_epi32(int __s)
	{
	  __v16si __splat = {
	    __s, __s, __s, __s, __s, __s, __s, __s,
	    __s, __s, __s, __s, __s, __s, __s, __s
	  };
	  return (__m512i) __splat;
	}

	/* Zero-masked 32-bit broadcast: lanes selected by __M receive __A, the
	 * remaining lanes are zero. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
	{
	  __v16si __splat = (__v16si)_mm512_set1_epi32(__A);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512(__M, __splat, __zero);
	}

	/* Returns an integer vector whose 8 64-bit lanes all hold __d. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set1_epi64(long long __d)
	{
	  __v8di __splat = { __d, __d, __d, __d, __d, __d, __d, __d };
	  return (__m512i) __splat;
	}

	#ifdef __x86_64__
	/* Zero-masked 64-bit broadcast (only declared for x86-64 builds): lanes
	 * selected by __M receive __A, the remaining lanes are zero. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
	{
	  __v8di __splat = (__v8di)_mm512_set1_epi64(__A);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512(__M, __splat, __zero);
	}
	#endif

	/* Broadcasts float element 0 of __A to all 16 lanes: every shuffle
	 * index is 0, so the second (undefined) operand is never read. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_broadcastss_ps(__m128 __A)
	{
	  __v4sf __src = (__v4sf) __A;
	  __v4sf __unused = (__v4sf)_mm_undefined_ps();
	  return (__m512)__builtin_shufflevector(__src, __unused,
	                                         0, 0, 0, 0, 0, 0, 0, 0,
	                                         0, 0, 0, 0, 0, 0, 0, 0);
	}

	/* Fills the vector with four repetitions of the 32-bit group
	 * { __D, __C, __B, __A } (element 0 is __D). */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
	{
	  __v16si __pattern = { __D, __C, __B, __A,
	                        __D, __C, __B, __A,
	                        __D, __C, __B, __A,
	                        __D, __C, __B, __A };
	  return (__m512i) __pattern;
	}

	/* Fills the vector with two repetitions of the 64-bit group
	 * { __D, __C, __B, __A } (element 0 is __D). */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set4_epi64 (long long __A, long long __B, long long __C,
	                   long long __D)
	{
	  __v8di __pattern = { __D, __C, __B, __A,
	                       __D, __C, __B, __A };
	  return (__m512i) __pattern;
	}

	/* Fills the vector with two repetitions of the double group
	 * { __D, __C, __B, __A } (element 0 is __D). */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_set4_pd (double __A, double __B, double __C, double __D)
	{
	  __m512d __pattern = { __D, __C, __B, __A,
	                        __D, __C, __B, __A };
	  return __pattern;
	}

	/* Fills the vector with four repetitions of the float group
	 * { __D, __C, __B, __A } (element 0 is __D). */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_set4_ps (float __A, float __B, float __C, float __D)
	{
	  __m512 __pattern = { __D, __C, __B, __A,
	                       __D, __C, __B, __A,
	                       __D, __C, __B, __A,
	                       __D, __C, __B, __A };
	  return __pattern;
	}

	/* Reversed-argument forms of the _mm512_set4_* functions: the four
	 * arguments are forwarded in the opposite order, so e0 is passed as the
	 * last parameter of the corresponding _mm512_set4_* call. */
	#define _mm512_setr4_epi32(e0,e1,e2,e3) \
	_mm512_set4_epi32((e3),(e2),(e1),(e0))

	#define _mm512_setr4_epi64(e0,e1,e2,e3) \
	_mm512_set4_epi64((e3),(e2),(e1),(e0))

	#define _mm512_setr4_pd(e0,e1,e2,e3) \
	_mm512_set4_pd((e3),(e2),(e1),(e0))

	#define _mm512_setr4_ps(e0,e1,e2,e3) \
	_mm512_set4_ps((e3),(e2),(e1),(e0))

	/* Broadcasts double element 0 of __A to all 8 lanes: every shuffle
	 * index is 0, so the second (undefined) operand is never read. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_broadcastsd_pd(__m128d __A)
	{
	  __v2df __src = (__v2df) __A;
	  __v2df __unused = (__v2df) _mm_undefined_pd();
	  return (__m512d)__builtin_shufflevector(__src, __unused,
	                                          0, 0, 0, 0, 0, 0, 0, 0);
	}

	/* Cast between vector types */

	/* Widens [4 x double] to [8 x double]: elements 0-3 are copied from
	 * __a, the upper four use index -1 and are therefore unspecified. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_castpd256_pd512(__m256d __a)
	{
	  __m512d __wide = __builtin_shufflevector(__a, __a,
	                                           0, 1, 2, 3,
	                                           -1, -1, -1, -1);
	  return __wide;
	}

	/* Widens [8 x float] to [16 x float]: elements 0-7 are copied from
	 * __a, the upper eight use index -1 and are therefore unspecified. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_castps256_ps512(__m256 __a)
	{
	  __m512 __wide = __builtin_shufflevector(__a, __a,
	                                          0, 1, 2, 3, 4, 5, 6, 7,
	                                          -1, -1, -1, -1,
	                                          -1, -1, -1, -1);
	  return __wide;
	}

	/* Returns the two lowest doubles of __a as a 128-bit vector. */
	static __inline __m128d __DEFAULT_FN_ATTRS
	_mm512_castpd512_pd128(__m512d __a)
	{
	  __m128d __low = __builtin_shufflevector(__a, __a, 0, 1);
	  return __low;
	}

	/* Returns the four lowest doubles of __A as a 256-bit vector. */
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm512_castpd512_pd256 (__m512d __A)
	{
	  __m256d __low = __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
	  return __low;
	}

	/* Returns the four lowest floats of __a as a 128-bit vector. */
	static __inline __m128 __DEFAULT_FN_ATTRS
	_mm512_castps512_ps128(__m512 __a)
	{
	  __m128 __low = __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
	  return __low;
	}

	/* Returns the eight lowest floats of __A as a 256-bit vector. */
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm512_castps512_ps256 (__m512 __A)
	{
	  __m256 __low = __builtin_shufflevector(__A, __A,
	                                         0, 1, 2, 3, 4, 5, 6, 7);
	  return __low;
	}

	/* Reinterprets the bits of a [8 x double] vector as [16 x float]. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_castpd_ps (__m512d __A)
	{
	  __m512 __as_ps = (__m512) __A;
	  return __as_ps;
	}

	/* Reinterprets the bits of a [8 x double] vector as an integer vector. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_castpd_si512 (__m512d __A)
	{
	  __m512i __as_int = (__m512i) __A;
	  return __as_int;
	}

	/* Widens [2 x double] to [8 x double]: elements 0-1 are copied from
	 * __A, the upper six use index -1 and are therefore unspecified. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_castpd128_pd512 (__m128d __A)
	{
	  __m512d __wide = __builtin_shufflevector(__A, __A,
	                                           0, 1,
	                                           -1, -1, -1, -1, -1, -1);
	  return __wide;
	}

	/* Reinterprets the bits of a [16 x float] vector as [8 x double]. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_castps_pd (__m512 __A)
	{
	  __m512d __as_pd = (__m512d) __A;
	  return __as_pd;
	}

	/* Reinterprets the bits of a [16 x float] vector as an integer vector. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_castps_si512 (__m512 __A)
	{
	  __m512i __as_int = (__m512i) __A;
	  return __as_int;
	}

	/* Widens [4 x float] to [16 x float]: elements 0-3 are copied from
	 * __A, the upper twelve use index -1 and are therefore unspecified. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_castps128_ps512 (__m128 __A)
	{
	  __m512 __wide = __builtin_shufflevector(__A, __A,
	                                          0, 1, 2, 3,
	                                          -1, -1, -1, -1,
	                                          -1, -1, -1, -1,
	                                          -1, -1, -1, -1);
	  return __wide;
	}

	/* Widens a 128-bit integer vector to 512 bits: 64-bit elements 0-1 are
	 * copied from __A, the upper six use index -1 and are unspecified. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_castsi128_si512 (__m128i __A)
	{
	  __m512i __wide = __builtin_shufflevector(__A, __A,
	                                           0, 1,
	                                           -1, -1, -1, -1, -1, -1);
	  return __wide;
	}

	/* Widens a 256-bit integer vector to 512 bits: 64-bit elements 0-3 are
	 * copied from __A, the upper four use index -1 and are unspecified. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_castsi256_si512 (__m256i __A)
	{
	  __m512i __wide = __builtin_shufflevector(__A, __A,
	                                           0, 1, 2, 3,
	                                           -1, -1, -1, -1);
	  return __wide;
	}

	/* Reinterprets the bits of an integer vector as [16 x float]. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_castsi512_ps (__m512i __A)
	{
	  __m512 __as_ps = (__m512) __A;
	  return __as_ps;
	}

	/* Reinterprets the bits of an integer vector as [8 x double]. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_castsi512_pd (__m512i __A)
	{
	  __m512d __as_pd = (__m512d) __A;
	  return __as_pd;
	}

	/* Returns the two lowest 64-bit elements of __A as a 128-bit vector. */
	static __inline __m128i __DEFAULT_FN_ATTRS
	_mm512_castsi512_si128 (__m512i __A)
	{
	  __m128i __low = (__m128i)__builtin_shufflevector(__A, __A, 0, 1);
	  return __low;
	}

	/* Returns the four lowest 64-bit elements of __A as a 256-bit vector. */
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm512_castsi512_si256 (__m512i __A)
	{
	  __m256i __low = (__m256i)__builtin_shufflevector(__A, __A, 0, 1, 2, 3);
	  return __low;
	}

	/* Converts an int to a 16-bit mask by a plain cast (the value is
	 * truncated to the width of __mmask16). */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_int2mask(int __a)
	{
	  __mmask16 __mask = (__mmask16)__a;
	  return __mask;
	}

	/* Converts a 16-bit mask to int by a plain cast. */
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask2int(__mmask16 __a)
	{
	  int __value = (int)__a;
	  return __value;
	}

	/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a
	/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
	/// contain the value of the source vector. The upper 384 bits are set
	/// to zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 128-bit vector of [2 x double].
	/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
	/// contain the value of the parameter. The upper 384 bits are set to zero.
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_zextpd128_pd512(__m128d __a)
	{
	  __v2df __low = (__v2df)__a;
	  __v2df __zeros = (__v2df)_mm_setzero_pd();
	  /* Indices 0-1 select __low; indices 2-3 select the zero vector and
	   * are repeated to fill the remaining six elements. */
	  return __builtin_shufflevector(__low, __zeros,
	                                 0, 1, 2, 3, 2, 3, 2, 3);
	}

	/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a
	/// 256-bit floating-point vector of [4 x double]. The lower 256 bits
	/// contain the value of the source vector. The upper 256 bits are set
	/// to zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double].
	/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
	/// contain the value of the parameter. The upper 256 bits are set to zero.
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_zextpd256_pd512(__m256d __a)
	{
	  __v4df __low = (__v4df)__a;
	  __v4df __zeros = (__v4df)_mm256_setzero_pd();
	  /* Indices 0-3 select __low; indices 4-7 select the zero vector. */
	  return __builtin_shufflevector(__low, __zeros,
	                                 0, 1, 2, 3, 4, 5, 6, 7);
	}

	/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a
	/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
	/// the value of the source vector. The upper 384 bits are set to zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 128-bit vector of [4 x float].
	/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
	/// contain the value of the parameter. The upper 384 bits are set to zero.
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_zextps128_ps512(__m128 __a)
	{
	  __v4sf __low = (__v4sf)__a;
	  __v4sf __zeros = (__v4sf)_mm_setzero_ps();
	  /* Indices 0-3 select __low; indices 4-7 select the zero vector and
	   * are repeated to fill the remaining twelve elements. */
	  return __builtin_shufflevector(__low, __zeros,
	                                 0, 1, 2, 3,
	                                 4, 5, 6, 7,
	                                 4, 5, 6, 7,
	                                 4, 5, 6, 7);
	}

	/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a
	/// 256-bit floating-point vector of [8 x float]. The lower 256 bits contain
	/// the value of the source vector. The upper 256 bits are set to zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float].
	/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
	/// contain the value of the parameter. The upper 256 bits are set to zero.
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_zextps256_ps512(__m256 __a)
	{
	  __v8sf __low = (__v8sf)__a;
	  __v8sf __zeros = (__v8sf)_mm256_setzero_ps();
	  /* Indices 0-7 select __low; indices 8-15 select the zero vector. */
	  return __builtin_shufflevector(__low, __zeros,
	                                 0, 1, 2, 3, 4, 5, 6, 7,
	                                 8, 9, 10, 11, 12, 13, 14, 15);
	}

	/// \brief Widens a 128-bit integer vector to a 512-bit integer vector.
	/// The low 128 bits copy the source vector and the upper 384 bits are
	/// zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// The 128-bit integer vector to widen.
	/// \returns A 512-bit integer vector whose low 128 bits hold the parameter
	/// and whose upper 384 bits are zero.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_zextsi128_si512(__m128i __a)
	{
	  /* Indices 0-1 select from __a; indices 2-3 (used three times) select
	     from the zero vector, filling the six upper lanes. */
	  __v2di __zero = (__v2di)_mm_setzero_si128();
	  return __builtin_shufflevector((__v2di)__a, __zero,
	                                 0, 1, 2, 3, 2, 3, 2, 3);
	}

	/// \brief Widens a 256-bit integer vector to a 512-bit integer vector.
	/// The low 256 bits copy the source vector and the upper 256 bits are
	/// zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// The 256-bit integer vector to widen.
	/// \returns A 512-bit integer vector whose low 256 bits hold the parameter
	/// and whose upper 256 bits are zero.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_zextsi256_si512(__m256i __a)
	{
	  /* Indices 0-3 select from __a; 4-7 select from the zero vector. */
	  __v4di __zero = (__v4di)_mm256_setzero_si256();
	  return __builtin_shufflevector((__v4di)__a, __zero, 0, 1, 2, 3, 4, 5, 6, 7);
	}

	/* Bitwise operators */
	/// Bitwise AND of __a and __b, evaluated on their __v16su views.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_and_epi32(__m512i __a, __m512i __b)
	{
	  __v16su __lhs = (__v16su)__a;
	  __v16su __rhs = (__v16su)__b;
	  return (__m512i)(__lhs & __rhs);
	}

	/// Passes _mm512_and_epi32(__a, __b) and __src to
	/// __builtin_ia32_selectd_512 under the mask __k.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
	{
	  __v16si __anded = (__v16si)_mm512_and_epi32(__a, __b);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, __anded,
	                                             (__v16si)__src);
	}

	/// _mm512_mask_and_epi32 with _mm512_setzero_si512() as the __src operand.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_and_epi32(__zero, __k, __a, __b);
	}

	/// Bitwise AND of __a and __b, evaluated on their __v8du views.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_and_epi64(__m512i __a, __m512i __b)
	{
	  __v8du __lhs = (__v8du)__a;
	  __v8du __rhs = (__v8du)__b;
	  return (__m512i)(__lhs & __rhs);
	}

	/// Passes _mm512_and_epi64(__a, __b) and __src to
	/// __builtin_ia32_selectq_512 under the mask __k.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
	{
	  __v8di __anded = (__v8di)_mm512_and_epi64(__a, __b);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, __anded,
	                                             (__v8di)__src);
	}

	/// _mm512_mask_and_epi64 with _mm512_setzero_si512() as the __src operand.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_and_epi64(__zero, __k, __a, __b);
	}

	/// Computes (~__A) & __B on the __v8du views of the operands.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_andnot_si512(__m512i __A, __m512i __B)
	{
	  __v8du __inverted = ~(__v8du)(__A);
	  return (__m512i)(__inverted & (__v8du)__B);
	}

	/// Computes (~__A) & __B on the __v16su views of the operands.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_andnot_epi32(__m512i __A, __m512i __B)
	{
	  __v16su __inverted = ~(__v16su)(__A);
	  return (__m512i)(__inverted & (__v16su)__B);
	}

	/// Passes _mm512_andnot_epi32(__A, __B) and __W to
	/// __builtin_ia32_selectd_512 under the mask __U.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __res = (__v16si)_mm512_andnot_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __res,
	                                             (__v16si)__W);
	}

	/// _mm512_mask_andnot_epi32 with _mm512_setzero_si512() as the __W operand.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_andnot_epi32(__zero, __U, __A, __B);
	}

	/// Computes (~__A) & __B on the __v8du views of the operands.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_andnot_epi64(__m512i __A, __m512i __B)
	{
	  __v8du __inverted = ~(__v8du)(__A);
	  return (__m512i)(__inverted & (__v8du)__B);
	}

	/// Passes _mm512_andnot_epi64(__A, __B) and __W to
	/// __builtin_ia32_selectq_512 under the mask __U.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __res = (__v8di)_mm512_andnot_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __res,
	                                             (__v8di)__W);
	}

	/// _mm512_mask_andnot_epi64 with _mm512_setzero_si512() as the __W operand.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_andnot_epi64(__zero, __U, __A, __B);
	}

	/// Bitwise OR of __a and __b, evaluated on their __v16su views.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_or_epi32(__m512i __a, __m512i __b)
	{
	  return (__m512i)((__v16su)__a | (__v16su)__b);
	}

	/// Passes _mm512_or_epi32(__a, __b) and __src to
	/// __builtin_ia32_selectd_512 under the mask __k.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
	{
	  __v16si __ored = (__v16si)_mm512_or_epi32(__a, __b);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, __ored,
	                                             (__v16si)__src);
	}

	/// _mm512_mask_or_epi32 with _mm512_setzero_si512() as the __src operand.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_or_epi32(__zero, __k, __a, __b);
	}

	/// Bitwise OR of __a and __b, evaluated on their __v8du views.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_or_epi64(__m512i __a, __m512i __b)
	{
	  return (__m512i)((__v8du)__a | (__v8du)__b);
	}

	/// Passes _mm512_or_epi64(__a, __b) and __src to
	/// __builtin_ia32_selectq_512 under the mask __k.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
	{
	  __v8di __ored = (__v8di)_mm512_or_epi64(__a, __b);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, __ored,
	                                             (__v8di)__src);
	}

	/// _mm512_mask_or_epi64 with _mm512_setzero_si512() as the __src operand.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_or_epi64(__zero, __k, __a, __b);
	}

	/// Bitwise XOR of __a and __b, evaluated on their __v16su views.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_xor_epi32(__m512i __a, __m512i __b)
	{
	  __v16su __lhs = (__v16su)__a;
	  __v16su __rhs = (__v16su)__b;
	  return (__m512i)(__lhs ^ __rhs);
	}

	/// Passes _mm512_xor_epi32(__a, __b) and __src to
	/// __builtin_ia32_selectd_512 under the mask __k.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
	{
	  __v16si __xored = (__v16si)_mm512_xor_epi32(__a, __b);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, __xored,
	                                             (__v16si)__src);
	}

	/// _mm512_mask_xor_epi32 with _mm512_setzero_si512() as the __src operand.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_xor_epi32(__zero, __k, __a, __b);
	}

	/// Bitwise XOR of __a and __b, evaluated on their __v8du views.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_xor_epi64(__m512i __a, __m512i __b)
	{
	  __v8du __lhs = (__v8du)__a;
	  __v8du __rhs = (__v8du)__b;
	  return (__m512i)(__lhs ^ __rhs);
	}

	/// Passes _mm512_xor_epi64(__a, __b) and __src to
	/// __builtin_ia32_selectq_512 under the mask __k.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
	{
	  __v8di __xored = (__v8di)_mm512_xor_epi64(__a, __b);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, __xored,
	                                             (__v8di)__src);
	}

	/// _mm512_mask_xor_epi64 with _mm512_setzero_si512() as the __src operand.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_xor_epi64(__zero, __k, __a, __b);
	}

	/// Bitwise AND of __a and __b, evaluated on their __v8du views.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_and_si512(__m512i __a, __m512i __b)
	{
	  __v8du __lhs = (__v8du)__a;
	  __v8du __rhs = (__v8du)__b;
	  return (__m512i)(__lhs & __rhs);
	}

	/// Bitwise OR of __a and __b, evaluated on their __v8du views.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_or_si512(__m512i __a, __m512i __b)
	{
	  return (__m512i)((__v8du)__a | (__v8du)__b);
	}

	/// Bitwise XOR of __a and __b, evaluated on their __v8du views.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_xor_si512(__m512i __a, __m512i __b)
	{
	  __v8du __lhs = (__v8du)__a;
	  __v8du __rhs = (__v8du)__b;
	  return (__m512i)(__lhs ^ __rhs);
	}

	/* Arithmetic */

	/// Vector sum __a + __b computed on the __v8df views of the operands.
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_add_pd(__m512d __a, __m512d __b)
	{
	  __v8df __sum = (__v8df)__a + (__v8df)__b;
	  return (__m512d)__sum;
	}

	/// Vector sum __a + __b computed on the __v16sf views of the operands.
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_add_ps(__m512 __a, __m512 __b)
	{
	  __v16sf __sum = (__v16sf)__a + (__v16sf)__b;
	  return (__m512)__sum;
	}

	/// Vector product __a * __b computed on the __v8df views of the operands.
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_mul_pd(__m512d __a, __m512d __b)
	{
	  __v8df __prod = (__v8df)__a * (__v8df)__b;
	  return (__m512d)__prod;
	}

	/// Vector product __a * __b computed on the __v16sf views of the operands.
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_mul_ps(__m512 __a, __m512 __b)
	{
	  __v16sf __prod = (__v16sf)__a * (__v16sf)__b;
	  return (__m512)__prod;
	}

	/// Vector difference __a - __b computed on the __v8df views of the operands.
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_sub_pd(__m512d __a, __m512d __b)
	{
	  __v8df __diff = (__v8df)__a - (__v8df)__b;
	  return (__m512d)__diff;
	}

	/// Vector difference __a - __b computed on the __v16sf views of the operands.
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_sub_ps(__m512 __a, __m512 __b)
	{
	  __v16sf __diff = (__v16sf)__a - (__v16sf)__b;
	  return (__m512)__diff;
	}

	/// Vector sum __A + __B computed on the __v8du views of the operands.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_add_epi64(__m512i __A, __m512i __B)
	{
	  __v8du __sum = (__v8du)__A + (__v8du)__B;
	  return (__m512i)__sum;
	}

	/// Passes _mm512_add_epi64(__A, __B) and __W to
	/// __builtin_ia32_selectq_512 under the mask __U.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __sum = (__v8di)_mm512_add_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __sum,
	                                             (__v8di)__W);
	}

	/// Passes _mm512_add_epi64(__A, __B) and _mm512_setzero_si512() to
	/// __builtin_ia32_selectq_512 under the mask __U.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __sum = (__v8di)_mm512_add_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __sum,
	                                             (__v8di)_mm512_setzero_si512());
	}

	/// Vector difference __A - __B computed on the __v8du views of the operands.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sub_epi64(__m512i __A, __m512i __B)
	{
	  __v8du __diff = (__v8du)__A - (__v8du)__B;
	  return (__m512i)__diff;
	}

	/// Passes _mm512_sub_epi64(__A, __B) and __W to
	/// __builtin_ia32_selectq_512 under the mask __U.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __diff = (__v8di)_mm512_sub_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __diff,
	                                             (__v8di)__W);
	}

	/// Passes _mm512_sub_epi64(__A, __B) and _mm512_setzero_si512() to
	/// __builtin_ia32_selectq_512 under the mask __U.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __diff = (__v8di)_mm512_sub_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __diff,
	                                             (__v8di)_mm512_setzero_si512());
	}

	/// Vector sum __A + __B computed on the __v16su views of the operands.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_add_epi32(__m512i __A, __m512i __B)
	{
	  __v16su __sum = (__v16su)__A + (__v16su)__B;
	  return (__m512i)__sum;
	}

	/// Passes _mm512_add_epi32(__A, __B) and __W to
	/// __builtin_ia32_selectd_512 under the mask __U.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __sum = (__v16si)_mm512_add_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __sum,
	                                             (__v16si)__W);
	}

	/// Passes _mm512_add_epi32(__A, __B) and _mm512_setzero_si512() to
	/// __builtin_ia32_selectd_512 under the mask __U.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_add_epi32(__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __sum = (__v16si)_mm512_add_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __sum,
	                                             (__v16si)_mm512_setzero_si512());
	}

	/// Vector difference __A - __B computed on the __v16su views of the operands.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sub_epi32(__m512i __A, __m512i __B)
	{
	  __v16su __diff = (__v16su)__A - (__v16su)__B;
	  return (__m512i)__diff;
	}

	/// Passes _mm512_sub_epi32(__A, __B) and __W to
	/// __builtin_ia32_selectd_512 under the mask __U.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __diff = (__v16si)_mm512_sub_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __diff,
	                                             (__v16si)__W);
	}

	/// Passes _mm512_sub_epi32(__A, __B) and _mm512_setzero_si512() to
	/// __builtin_ia32_selectd_512 under the mask __U.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __diff = (__v16si)_mm512_sub_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __diff,
	                                             (__v16si)_mm512_setzero_si512());
	}

	/* __builtin_ia32_maxpd512_mask with caller-supplied third operand W,
	   mask U and rounding argument R. */
	#define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_maxpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)); })

	/* Same builtin with _mm512_setzero_pd() as the third operand. */
	#define _mm512_maskz_max_round_pd(U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_maxpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/* Same builtin with _mm512_undefined_pd() as the third operand and a
	   (__mmask8)-1 mask. */
	#define _mm512_max_round_pd(A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_maxpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R)); })

	/// __builtin_ia32_maxpd512_mask on __A and __B with a zero third operand,
	/// a (__mmask8)-1 mask and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_max_pd(__m512d __A, __m512d __B)
	{
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B,
	                                               __zero, (__mmask8)-1,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with __W as the third operand and __U as the mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_max_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B,
	                                               (__v8df)__W, (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with a zero third operand and __U as the mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_max_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B,
	                                               __zero, (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/* __builtin_ia32_maxps512_mask with caller-supplied third operand W,
	   mask U and rounding argument R. */
	#define _mm512_mask_max_round_ps(W, U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_maxps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)(__m512)(W), (__mmask16)(U), (int)(R)); })

	/* Same builtin with _mm512_setzero_ps() as the third operand. */
	#define _mm512_maskz_max_round_ps(U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_maxps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)); })

	/* Same builtin with _mm512_undefined_ps() as the third operand and a
	   (__mmask16)-1 mask. */
	#define _mm512_max_round_ps(A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_maxps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R)); })

	/// __builtin_ia32_maxps512_mask on __A and __B with a zero third operand,
	/// a (__mmask16)-1 mask and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_max_ps(__m512 __A, __m512 __B)
	{
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              __zero, (__mmask16)-1,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with __W as the third operand and __U as the mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_max_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              (__v16sf)__W, (__mmask16)__U,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with a zero third operand and __U as the mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_max_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              __zero, (__mmask16)__U,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	/// __builtin_ia32_maxss_round_mask on __A and __B with __W as the third
	/// operand, __U as the mask and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_max_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  return (__m128)__builtin_ia32_maxss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 (__v4sf)__W, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with _mm_setzero_ps() as the third operand.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_max_ss(__mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __zero = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_maxss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __zero, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* __builtin_ia32_maxss_round_mask with _mm_setzero_ps() as the third
	   operand, a (__mmask8)-1 mask and rounding argument R. */
	#define _mm_max_round_ss(A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_maxss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)); })

	/* Same builtin with W as the third operand and U as the mask. */
	#define _mm_mask_max_round_ss(W, U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_maxss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)); })

	/* Same builtin with _mm_setzero_ps() as the third operand and U as the
	   mask. */
	#define _mm_maskz_max_round_ss(U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_maxss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)); })

	/// __builtin_ia32_maxsd_round_mask on __A and __B with __W as the third
	/// operand, __U as the mask and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_max_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  return (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  (__v2df)__W, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with _mm_setzero_pd() as the third operand.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_max_sd(__mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __zero = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  __zero, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* __builtin_ia32_maxsd_round_mask with _mm_setzero_pd() as the third
	   operand, a (__mmask8)-1 mask and rounding argument R. */
	#define _mm_max_round_sd(A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_maxsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)); })

	/* Same builtin with W as the third operand and U as the mask. */
	#define _mm_mask_max_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_maxsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)); })

	/* Same builtin with _mm_setzero_pd() as the third operand and U as the
	   mask. */
	#define _mm_maskz_max_round_sd(U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_maxsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/// __builtin_ia32_pmaxsd512_mask on __A and __B with a zero third operand
	/// and a (__mmask16)-1 mask.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epi32(__m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxsd512_mask((__v16si)__A, (__v16si)__B,
	                                                __zero, (__mmask16)-1);
	}

	/// Same builtin with __W as the third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
	{
	  return (__m512i)__builtin_ia32_pmaxsd512_mask((__v16si)__A, (__v16si)__B,
	                                                (__v16si)__W, __M);
	}

	/// Same builtin with a zero third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epi32(__mmask16 __M, __m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxsd512_mask((__v16si)__A, (__v16si)__B,
	                                                __zero, __M);
	}

	/// __builtin_ia32_pmaxud512_mask on __A and __B with a zero third operand
	/// and a (__mmask16)-1 mask.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epu32(__m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxud512_mask((__v16si)__A, (__v16si)__B,
	                                                __zero, (__mmask16)-1);
	}

	/// Same builtin with __W as the third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epu32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
	{
	  return (__m512i)__builtin_ia32_pmaxud512_mask((__v16si)__A, (__v16si)__B,
	                                                (__v16si)__W, __M);
	}

	/// Same builtin with a zero third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epu32(__mmask16 __M, __m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxud512_mask((__v16si)__A, (__v16si)__B,
	                                                __zero, __M);
	}

	/// __builtin_ia32_pmaxsq512_mask on __A and __B with a zero third operand
	/// and a (__mmask8)-1 mask.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epi64(__m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxsq512_mask((__v8di)__A, (__v8di)__B,
	                                                __zero, (__mmask8)-1);
	}

	/// Same builtin with __W as the third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epi64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
	{
	  return (__m512i)__builtin_ia32_pmaxsq512_mask((__v8di)__A, (__v8di)__B,
	                                                (__v8di)__W, __M);
	}

	/// Same builtin with a zero third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epi64(__mmask8 __M, __m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxsq512_mask((__v8di)__A, (__v8di)__B,
	                                                __zero, __M);
	}

	/// __builtin_ia32_pmaxuq512_mask on __A and __B with a zero third operand
	/// and a (__mmask8)-1 mask.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epu64(__m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxuq512_mask((__v8di)__A, (__v8di)__B,
	                                                __zero, (__mmask8)-1);
	}

	/// Same builtin with __W as the third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epu64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
	{
	  return (__m512i)__builtin_ia32_pmaxuq512_mask((__v8di)__A, (__v8di)__B,
	                                                (__v8di)__W, __M);
	}

	/// Same builtin with a zero third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epu64(__mmask8 __M, __m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxuq512_mask((__v8di)__A, (__v8di)__B,
	                                                __zero, __M);
	}

	/* __builtin_ia32_minpd512_mask with caller-supplied third operand W,
	   mask U and rounding argument R. */
	#define _mm512_mask_min_round_pd(W, U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_minpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)); })

	/* Same builtin with _mm512_setzero_pd() as the third operand. */
	#define _mm512_maskz_min_round_pd(U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_minpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/* Same builtin with _mm512_undefined_pd() as the third operand and a
	   (__mmask8)-1 mask. */
	#define _mm512_min_round_pd(A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_minpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, (int)(R)); })

	/// __builtin_ia32_minpd512_mask on __A and __B with a zero third operand,
	/// a (__mmask8)-1 mask and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_min_pd(__m512d __A, __m512d __B)
	{
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B,
	                                               __zero, (__mmask8)-1,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with __W as the third operand and __U as the mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_min_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B,
	                                               (__v8df)__W, (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/* __builtin_ia32_minps512_mask with caller-supplied third operand W,
	   mask U and rounding argument R. */
	#define _mm512_mask_min_round_ps(W, U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_minps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)(__m512)(W), (__mmask16)(U), (int)(R)); })

	/* Same builtin with _mm512_setzero_ps() as the third operand. */
	#define _mm512_maskz_min_round_ps(U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_minps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)); })

	/* Same builtin with _mm512_undefined_ps() as the third operand and a
	   (__mmask16)-1 mask. */
	#define _mm512_min_round_ps(A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_minps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, (int)(R)); })

	/// __builtin_ia32_minpd512_mask on __A and __B with a zero third operand,
	/// __U as the mask and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_min_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B,
	                                               __zero, (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/// __builtin_ia32_minps512_mask on __A and __B with a zero third operand,
	/// a (__mmask16)-1 mask and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_min_ps(__m512 __A, __m512 __B)
	{
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              __zero, (__mmask16)-1,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with __W as the third operand and __U as the mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_min_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              (__v16sf)__W, (__mmask16)__U,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with a zero third operand and __U as the mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_min_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              __zero, (__mmask16)__U,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	/// __builtin_ia32_minss_round_mask on __A and __B with __W as the third
	/// operand, __U as the mask and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_min_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  return (__m128)__builtin_ia32_minss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 (__v4sf)__W, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with _mm_setzero_ps() as the third operand.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_min_ss(__mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __zero = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_minss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __zero, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* __builtin_ia32_minss_round_mask with _mm_setzero_ps() as the third
	   operand, a (__mmask8)-1 mask and rounding argument R. */
	#define _mm_min_round_ss(A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_minss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)); })

	/* Same builtin with W as the third operand and U as the mask. */
	#define _mm_mask_min_round_ss(W, U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_minss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)); })

	/* Same builtin with _mm_setzero_ps() as the third operand and U as the
	   mask. */
	#define _mm_maskz_min_round_ss(U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_minss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)); })

	/// __builtin_ia32_minsd_round_mask on __A and __B with __W as the third
	/// operand, __U as the mask and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_min_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  return (__m128d)__builtin_ia32_minsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  (__v2df)__W, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with _mm_setzero_pd() as the third operand.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_min_sd(__mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __zero = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_minsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  __zero, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* __builtin_ia32_minsd_round_mask with _mm_setzero_pd() as the third
	   operand, a (__mmask8)-1 mask and rounding argument R. */
	#define _mm_min_round_sd(A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_minsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)); })

	/* Same builtin with W as the third operand and U as the mask. */
	#define _mm_mask_min_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_minsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)); })

	/* Same builtin with _mm_setzero_pd() as the third operand and U as the
	   mask. */
	#define _mm_maskz_min_round_sd(U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_minsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/// __builtin_ia32_pminsd512_mask on __A and __B with a zero third operand
	/// and a (__mmask16)-1 mask.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epi32(__m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminsd512_mask((__v16si)__A, (__v16si)__B,
	                                                __zero, (__mmask16)-1);
	}

	/// Same builtin with __W as the third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
	{
	  return (__m512i)__builtin_ia32_pminsd512_mask((__v16si)__A, (__v16si)__B,
	                                                (__v16si)__W, __M);
	}

	/// Same builtin with a zero third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epi32(__mmask16 __M, __m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminsd512_mask((__v16si)__A, (__v16si)__B,
	                                                __zero, __M);
	}

	/// __builtin_ia32_pminud512_mask on __A and __B with a zero third operand
	/// and a (__mmask16)-1 mask.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epu32(__m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminud512_mask((__v16si)__A, (__v16si)__B,
	                                                __zero, (__mmask16)-1);
	}

	/// Same builtin with __W as the third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epu32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
	{
	  return (__m512i)__builtin_ia32_pminud512_mask((__v16si)__A, (__v16si)__B,
	                                                (__v16si)__W, __M);
	}

	/// Same builtin with a zero third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epu32(__mmask16 __M, __m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminud512_mask((__v16si)__A, (__v16si)__B,
	                                                __zero, __M);
	}

	/// __builtin_ia32_pminsq512_mask on __A and __B with a zero third operand
	/// and a (__mmask8)-1 mask.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epi64(__m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminsq512_mask((__v8di)__A, (__v8di)__B,
	                                                __zero, (__mmask8)-1);
	}

	/// Same builtin with __W as the third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epi64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
	{
	  return (__m512i)__builtin_ia32_pminsq512_mask((__v8di)__A, (__v8di)__B,
	                                                (__v8di)__W, __M);
	}

	/// Same builtin with a zero third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epi64(__mmask8 __M, __m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminsq512_mask((__v8di)__A, (__v8di)__B,
	                                                __zero, __M);
	}

	/// __builtin_ia32_pminuq512_mask on __A and __B with a zero third operand
	/// and a (__mmask8)-1 mask.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epu64(__m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminuq512_mask((__v8di)__A, (__v8di)__B,
	                                                __zero, (__mmask8)-1);
	}

	/// Same builtin with __W as the third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epu64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
	{
	  return (__m512i)__builtin_ia32_pminuq512_mask((__v8di)__A, (__v8di)__B,
	                                                (__v8di)__W, __M);
	}

	/// Same builtin with a zero third operand and __M as the mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epu64(__mmask8 __M, __m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminuq512_mask((__v8di)__A, (__v8di)__B,
	                                                __zero, __M);
	}

	/// Forwards the __v16si views of __X and __Y to __builtin_ia32_pmuldq512.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mul_epi32(__m512i __X, __m512i __Y)
	{
	  __v16si __lhs = (__v16si)__X;
	  __v16si __rhs = (__v16si)__Y;
	  return (__m512i)__builtin_ia32_pmuldq512(__lhs, __rhs);
	}

	/// Passes _mm512_mul_epi32(__X, __Y) and __W to
	/// __builtin_ia32_selectq_512 under the mask __M.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
	{
	  __v8di __prod = (__v8di)_mm512_mul_epi32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __prod,
	                                             (__v8di)__W);
	}

	/// Passes _mm512_mul_epi32(__X, __Y) and _mm512_setzero_si512() to
	/// __builtin_ia32_selectq_512 under the mask __M.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
	{
	  __v8di __prod = (__v8di)_mm512_mul_epi32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __prod,
	                                             (__v8di)_mm512_setzero_si512());
	}

	/// Forwards the __v16si views of __X and __Y to __builtin_ia32_pmuludq512.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mul_epu32(__m512i __X, __m512i __Y)
	{
	  __v16si __lhs = (__v16si)__X;
	  __v16si __rhs = (__v16si)__Y;
	  return (__m512i)__builtin_ia32_pmuludq512(__lhs, __rhs);
	}

	/// Passes _mm512_mul_epu32(__X, __Y) and __W to
	/// __builtin_ia32_selectq_512 under the mask __M.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
	{
	  __v8di __prod = (__v8di)_mm512_mul_epu32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __prod,
	                                             (__v8di)__W);
	}

	/// Passes _mm512_mul_epu32(__X, __Y) and _mm512_setzero_si512() to
	/// __builtin_ia32_selectq_512 under the mask __M.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
	{
	  __v8di __prod = (__v8di)_mm512_mul_epu32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __prod,
	                                             (__v8di)_mm512_setzero_si512());
	}

	/// Vector product __A * __B computed on the __v16su views of the operands.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mullo_epi32(__m512i __A, __m512i __B)
	{
	  __v16su __prod = (__v16su)__A * (__v16su)__B;
	  return (__m512i)__prod;
	}

	/// Passes _mm512_mullo_epi32(__A, __B) and _mm512_setzero_si512() to
	/// __builtin_ia32_selectd_512 under the mask __M.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
	{
	  __v16si __prod = (__v16si)_mm512_mullo_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, __prod,
	                                             (__v16si)_mm512_setzero_si512());
	}

	/// Passes _mm512_mullo_epi32(__A, __B) and __W to
	/// __builtin_ia32_selectd_512 under the mask __M.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
	{
	  __v16si __prod = (__v16si)_mm512_mullo_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, __prod,
	                                             (__v16si)__W);
	}

	/* __builtin_ia32_sqrtpd512_mask with caller-supplied second operand W,
	   mask U and rounding argument R. */
	#define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_sqrtpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(W), \
	      (__mmask8)(U), (int)(R)); })

	/* Same builtin with _mm512_setzero_pd() as the second operand. */
	#define _mm512_maskz_sqrt_round_pd(U, A, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_sqrtpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)_mm512_setzero_pd(), \
	      (__mmask8)(U), (int)(R)); })

	/* Same builtin with _mm512_undefined_pd() as the second operand and a
	   (__mmask8)-1 mask. */
	#define _mm512_sqrt_round_pd(A, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_sqrtpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)_mm512_undefined_pd(), \
	      (__mmask8)-1, (int)(R)); })

	/// __builtin_ia32_sqrtpd512_mask on __a with a zero second operand,
	/// a (__mmask8)-1 mask and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_sqrt_pd(__m512d __a)
	{
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__a, __zero,
	                                                (__mmask8)-1,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with __W as the second operand and __U as the mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_sqrt_pd(__m512d __W, __mmask8 __U, __m512d __A)
	{
	  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__A, (__v8df)__W,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with a zero second operand and __U as the mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_sqrt_pd(__mmask8 __U, __m512d __A)
	{
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__A, __zero,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* __builtin_ia32_sqrtps512_mask with caller-supplied second operand W,
	   mask U and rounding argument R. */
	#define _mm512_mask_sqrt_round_ps(W, U, A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_sqrtps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(W), \
	      (__mmask16)(U), (int)(R)); })

	/* Same builtin with _mm512_setzero_ps() as the second operand. */
	#define _mm512_maskz_sqrt_round_ps(U, A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_sqrtps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)_mm512_setzero_ps(), \
	      (__mmask16)(U), (int)(R)); })

	/* Same builtin with _mm512_undefined_ps() as the second operand and a
	   (__mmask16)-1 mask. */
	#define _mm512_sqrt_round_ps(A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_sqrtps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)_mm512_undefined_ps(), \
	      (__mmask16)-1, (int)(R)); })

	/// __builtin_ia32_sqrtps512_mask on __a with a zero second operand,
	/// a (__mmask16)-1 mask and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_sqrt_ps(__m512 __a)
	{
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__a, __zero,
	                                               (__mmask16)-1,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with __W as the second operand and __U as the mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
	{
	  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A, (__v16sf)__W,
	                                               (__mmask16)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/// Same builtin with a zero second operand and __U as the mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_sqrt_ps(__mmask16 __U, __m512 __A)
	{
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A, __zero,
	                                               (__mmask16)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/// __builtin_ia32_rsqrt14pd512_mask on __A with a zero second operand and
	/// a (__mmask8)-1 mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_rsqrt14_pd(__m512d __A)
	{
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_rsqrt14pd512_mask((__v8df)__A, __zero,
	                                                   (__mmask8)-1);
	}

	/// Same builtin with __W as the second operand and __U as the mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_rsqrt14_pd(__m512d __W, __mmask8 __U, __m512d __A)
	{
	  return (__m512d)__builtin_ia32_rsqrt14pd512_mask((__v8df)__A, (__v8df)__W,
	                                                   (__mmask8)__U);
	}

	/// Same builtin with a zero second operand and __U as the mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_rsqrt14_pd(__mmask8 __U, __m512d __A)
	{
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_rsqrt14pd512_mask((__v8df)__A, __zero,
	                                                   (__mmask8)__U);
	}

	/// __builtin_ia32_rsqrt14ps512_mask on __A with a zero second operand and
	/// a (__mmask16)-1 mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_rsqrt14_ps(__m512 __A)
	{
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_rsqrt14ps512_mask((__v16sf)__A, __zero,
	                                                  (__mmask16)-1);
	}

	/// Same builtin with __W as the second operand and __U as the mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_rsqrt14_ps(__m512 __W, __mmask16 __U, __m512 __A)
	{
	  return (__m512)__builtin_ia32_rsqrt14ps512_mask((__v16sf)__A, (__v16sf)__W,
	                                                  (__mmask16)__U);
	}

	/// Same builtin with a zero second operand and __U as the mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_rsqrt14_ps(__mmask16 __U, __m512 __A)
	{
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_rsqrt14ps512_mask((__v16sf)__A, __zero,
	                                                  (__mmask16)__U);
	}

	/// __builtin_ia32_rsqrt14ss_mask on __A and __B with a zero third operand
	/// and a (__mmask8)-1 mask.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_rsqrt14_ss(__m128 __A, __m128 __B)
	{
	  __v4sf __zero = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_rsqrt14ss_mask((__v4sf)__A, (__v4sf)__B,
	                                               __zero, (__mmask8)-1);
	}

	/// Same builtin with __W as the third operand and __U as the mask.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_rsqrt14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  return (__m128)__builtin_ia32_rsqrt14ss_mask((__v4sf)__A, (__v4sf)__B,
	                                               (__v4sf)__W, (__mmask8)__U);
	}

	/// Same builtin with a zero third operand and __U as the mask.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_rsqrt14_ss(__mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __zero = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_rsqrt14ss_mask((__v4sf)__A, (__v4sf)__B,
	                                               __zero, (__mmask8)__U);
	}

	/* Calls __builtin_ia32_rsqrt14sd_mask on __A and __B with a zero vector as
	 * third operand and an all-ones mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_rsqrt14_sd(__m128d __A, __m128d __B)
	{
	  __v2df __src = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_rsqrt14sd_mask((__v2df)__A, (__v2df)__B,
	                                                __src, (__mmask8)-1);
	}

	/* Masked form: forwards __A, __B, the __W vector and the __U mask to
	 * __builtin_ia32_rsqrt14sd_mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __lhs = (__v2df)__A;
	  __v2df __rhs = (__v2df)__B;
	  return (__m128d)__builtin_ia32_rsqrt14sd_mask(__lhs, __rhs, (__v2df)__W,
	                                                (__mmask8)__U);
	}

	/* maskz form: a zero vector is supplied as the third operand alongside the
	 * caller's __U mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __src = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_rsqrt14sd_mask((__v2df)__A, (__v2df)__B,
	                                                __src, (__mmask8)__U);
	}

	/* Calls __builtin_ia32_rcp14pd512_mask on __A with a zeroed second operand
	 * and an all-ones mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_rcp14_pd(__m512d __A)
	{
	  const __mmask8 __msk = (__mmask8)-1;
	  __v8df __src = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_rcp14pd512_mask((__v8df)__A, __src, __msk);
	}

	/* Masked form: forwards __A, the __W vector and the __U mask to
	 * __builtin_ia32_rcp14pd512_mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __in = (__v8df)__A;
	  __v8df __src = (__v8df)__W;
	  return (__m512d)__builtin_ia32_rcp14pd512_mask(__in, __src, (__mmask8)__U);
	}

	/* maskz form: passes a zero vector as the second operand together with the
	 * caller's __U mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_rcp14pd512_mask((__v8df)__A, __src,
	                                                 (__mmask8)__U);
	}

	/* Calls __builtin_ia32_rcp14ps512_mask on __A with a zeroed second operand
	 * and an all-ones 16-bit mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_rcp14_ps(__m512 __A)
	{
	  const __mmask16 __msk = (__mmask16)-1;
	  __v16sf __src = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_rcp14ps512_mask((__v16sf)__A, __src, __msk);
	}

	/* Masked form: forwards __A, the __W vector and the __U mask to
	 * __builtin_ia32_rcp14ps512_mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __in = (__v16sf)__A;
	  __v16sf __src = (__v16sf)__W;
	  return (__m512)__builtin_ia32_rcp14ps512_mask(__in, __src, (__mmask16)__U);
	}

	/* maskz form: passes a zero vector as the second operand together with the
	 * caller's __U mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_rcp14ps512_mask((__v16sf)__A, __src,
	                                                (__mmask16)__U);
	}

	/* Calls __builtin_ia32_rcp14ss_mask on __A and __B with a zero vector as
	 * third operand and an all-ones mask. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_rcp14_ss(__m128 __A, __m128 __B)
	{
	  __v4sf __src = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_rcp14ss_mask((__v4sf)__A, (__v4sf)__B,
	                                             __src, (__mmask8)-1);
	}

	/* Masked form: forwards __A, __B, the __W vector and the __U mask to
	 * __builtin_ia32_rcp14ss_mask. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __lhs = (__v4sf)__A;
	  __v4sf __rhs = (__v4sf)__B;
	  return (__m128)__builtin_ia32_rcp14ss_mask(__lhs, __rhs, (__v4sf)__W,
	                                             (__mmask8)__U);
	}

	/* maskz form: a zero vector is supplied as the third operand alongside the
	 * caller's __U mask. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __src = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_rcp14ss_mask((__v4sf)__A, (__v4sf)__B,
	                                             __src, (__mmask8)__U);
	}

	/* Calls __builtin_ia32_rcp14sd_mask on __A and __B with a zero vector as
	 * third operand and an all-ones mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_rcp14_sd(__m128d __A, __m128d __B)
	{
	  __v2df __src = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_rcp14sd_mask((__v2df)__A, (__v2df)__B,
	                                              __src, (__mmask8)-1);
	}

	/* Masked form: forwards __A, __B, the __W vector and the __U mask to
	 * __builtin_ia32_rcp14sd_mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __lhs = (__v2df)__A;
	  __v2df __rhs = (__v2df)__B;
	  return (__m128d)__builtin_ia32_rcp14sd_mask(__lhs, __rhs, (__v2df)__W,
	                                              (__mmask8)__U);
	}

	/* maskz form: a zero vector is supplied as the third operand alongside the
	 * caller's __U mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __src = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_rcp14sd_mask((__v2df)__A, (__v2df)__B,
	                                              __src, (__mmask8)__U);
	}

	/* Calls __builtin_ia32_rndscaleps_mask on __A with _MM_FROUND_FLOOR as the
	 * immediate, __A again as the third operand, a -1 mask and
	 * _MM_FROUND_CUR_DIRECTION. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_floor_ps(__m512 __A)
	{
	  __v16sf __in = (__v16sf)__A;
	  return (__m512)__builtin_ia32_rndscaleps_mask(__in, _MM_FROUND_FLOOR, __in,
	                                                -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: same builtin and _MM_FROUND_FLOOR immediate, with __W as the
	 * third operand and __U as the mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf)__W;
	  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_FLOOR,
	                                                __src, __U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_rndscalepd_mask on __A with _MM_FROUND_FLOOR as the
	 * immediate, __A again as the third operand, a -1 mask and
	 * _MM_FROUND_CUR_DIRECTION. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_floor_pd(__m512d __A)
	{
	  __v8df __in = (__v8df)__A;
	  return (__m512d)__builtin_ia32_rndscalepd_mask(__in, _MM_FROUND_FLOOR, __in,
	                                                 -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: same builtin and _MM_FROUND_FLOOR immediate, with __W as the
	 * third operand and __U as the mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df)__W;
	  return (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)__A, _MM_FROUND_FLOOR,
	                                                 __src, __U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: __builtin_ia32_rndscaleps_mask with the _MM_FROUND_CEIL
	 * immediate, __W as the third operand and __U as the mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf)__W;
	  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A, _MM_FROUND_CEIL,
	                                                __src, __U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_rndscaleps_mask on __A with _MM_FROUND_CEIL as the
	 * immediate, __A again as the third operand, a -1 mask and
	 * _MM_FROUND_CUR_DIRECTION. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_ceil_ps(__m512 __A)
	{
	  __v16sf __in = (__v16sf)__A;
	  return (__m512)__builtin_ia32_rndscaleps_mask(__in, _MM_FROUND_CEIL, __in,
	                                                -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_rndscalepd_mask on __A with _MM_FROUND_CEIL as the
	 * immediate, __A again as the third operand, a -1 mask and
	 * _MM_FROUND_CUR_DIRECTION. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_ceil_pd(__m512d __A)
	{
	  __v8df __in = (__v8df)__A;
	  return (__m512d)__builtin_ia32_rndscalepd_mask(__in, _MM_FROUND_CEIL, __in,
	                                                 -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: __builtin_ia32_rndscalepd_mask with the _MM_FROUND_CEIL
	 * immediate, __W as the third operand and __U as the mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df)__W;
	  return (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)__A, _MM_FROUND_CEIL,
	                                                 __src, __U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_pabsq512_mask on __A with a zeroed second operand
	 * and an all-ones mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_abs_epi64(__m512i __A)
	{
	  const __mmask8 __msk = (__mmask8)-1;
	  __v8di __src = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pabsq512_mask((__v8di)__A, __src, __msk);
	}

	/* Masked form: forwards __A, the __W vector and the __U mask to
	 * __builtin_ia32_pabsq512_mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
	{
	  __v8di __in = (__v8di)__A;
	  __v8di __src = (__v8di)__W;
	  return (__m512i)__builtin_ia32_pabsq512_mask(__in, __src, (__mmask8)__U);
	}

	/* maskz form: passes a zero vector as the second operand together with the
	 * caller's __U mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
	{
	  __v8di __src = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pabsq512_mask((__v8di)__A, __src,
	                                               (__mmask8)__U);
	}

	/* Calls __builtin_ia32_pabsd512_mask on __A with a zeroed second operand
	 * and an all-ones 16-bit mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_abs_epi32(__m512i __A)
	{
	  const __mmask16 __msk = (__mmask16)-1;
	  __v16si __src = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pabsd512_mask((__v16si)__A, __src, __msk);
	}

	/* Masked form: forwards __A, the __W vector and the __U mask to
	 * __builtin_ia32_pabsd512_mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
	{
	  __v16si __in = (__v16si)__A;
	  __v16si __src = (__v16si)__W;
	  return (__m512i)__builtin_ia32_pabsd512_mask(__in, __src, (__mmask16)__U);
	}

	/* maskz form: passes a zero vector as the second operand together with the
	 * caller's __U mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
	{
	  __v16si __src = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pabsd512_mask((__v16si)__A, __src,
	                                               (__mmask16)__U);
	}

	/* Masked form: forwards __A, __B, the __W vector and the __U mask to
	 * __builtin_ia32_addss_round_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B)
	{
	  __v4sf __lhs = (__v4sf)__A;
	  __v4sf __rhs = (__v4sf)__B;
	  return (__m128)__builtin_ia32_addss_round_mask(__lhs, __rhs, (__v4sf)__W,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* maskz form: a zero vector is the third operand of
	 * __builtin_ia32_addss_round_mask; __U is the mask. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B)
	{
	  __v4sf __src = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_addss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __src, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to __builtin_ia32_addss_round_mask on A and B with a zero vector
	 * as third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m128. */
	#define _mm_add_round_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_addss_round_mask on A and B with W
	 * as third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm_mask_add_round_ss(W, U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), (__mmask8)(U), \
	(int)(R)); })

	/* maskz form: expands to __builtin_ia32_addss_round_mask on A and B with a
	 * zero vector as third operand, U as the mask and R as the final int
	 * argument. */
	#define _mm_maskz_add_round_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	/* Masked form: forwards __A, __B, the __W vector and the __U mask to
	 * __builtin_ia32_addsd_round_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B)
	{
	  __v2df __lhs = (__v2df)__A;
	  __v2df __rhs = (__v2df)__B;
	  return (__m128d)__builtin_ia32_addsd_round_mask(__lhs, __rhs, (__v2df)__W,
	                                                  (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* maskz form: a zero vector is the third operand of
	 * __builtin_ia32_addsd_round_mask; __U is the mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B)
	{
	  __v2df __src = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_addsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  __src, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}
	/* Expands to __builtin_ia32_addsd_round_mask on A and B with a zero vector
	 * as third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m128d. */
	#define _mm_add_round_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_addsd_round_mask on A and B with W
	 * as third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm_mask_add_round_sd(W, U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* maskz form: expands to __builtin_ia32_addsd_round_mask on A and B with a
	 * zero vector as third operand, U as the mask and R as the final int
	 * argument. */
	#define _mm_maskz_add_round_sd(U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Computes _mm512_add_pd(__A, __B) and hands it, together with __W and the
	 * __U mask, to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __sum = (__v8df)_mm512_add_pd(__A, __B);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __sum,
	                                              (__v8df)__W);
	}

	/* Computes _mm512_add_pd(__A, __B) and hands it, together with a zero
	 * vector and the __U mask, to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __sum = (__v8df)_mm512_add_pd(__A, __B);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __sum, __zero);
	}

	/* Computes _mm512_add_ps(__A, __B) and hands it, together with __W and the
	 * __U mask, to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __sum = (__v16sf)_mm512_add_ps(__A, __B);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __sum,
	                                             (__v16sf)__W);
	}

	/* Computes _mm512_add_ps(__A, __B) and hands it, together with a zero
	 * vector and the __U mask, to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __sum = (__v16sf)_mm512_add_ps(__A, __B);
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __sum, __zero);
	}

	/* Expands to __builtin_ia32_addpd512_mask on A and B with a zero vector as
	 * third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m512d. */
	#define _mm512_add_round_pd(A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_addpd512_mask on A and B with W as
	 * third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm512_mask_add_round_pd(W, U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(W), (__mmask8)(U), \
	(int)(R)); })

	/* maskz form: expands to __builtin_ia32_addpd512_mask on A and B with a zero
	 * vector as third operand, U as the mask and R as the final int argument. */
	#define _mm512_maskz_add_round_pd(U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Expands to __builtin_ia32_addps512_mask on A and B with a zero vector as
	 * third operand and an all-ones 16-bit mask; R is forwarded as the final int
	 * argument. The result is cast to __m512. */
	#define _mm512_add_round_ps(A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_addps512_mask on A and B with W as
	 * third operand, U as the __mmask16 mask and R as the final int argument. */
	#define _mm512_mask_add_round_ps(W, U, A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(W), (__mmask16)(U), \
	(int)(R)); })

	/* maskz form: expands to __builtin_ia32_addps512_mask on A and B with a zero
	 * vector as third operand, U as the mask and R as the final int argument. */
	#define _mm512_maskz_add_round_ps(U, A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), (int)(R)); })

	/* Masked form: forwards __A, __B, the __W vector and the __U mask to
	 * __builtin_ia32_subss_round_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B)
	{
	  __v4sf __lhs = (__v4sf)__A;
	  __v4sf __rhs = (__v4sf)__B;
	  return (__m128)__builtin_ia32_subss_round_mask(__lhs, __rhs, (__v4sf)__W,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* maskz form: a zero vector is the third operand of
	 * __builtin_ia32_subss_round_mask; __U is the mask. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B)
	{
	  __v4sf __src = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_subss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __src, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}
	/* Expands to __builtin_ia32_subss_round_mask on A and B with a zero vector
	 * as third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m128. */
	#define _mm_sub_round_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_subss_round_mask on A and B with W
	 * as third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm_mask_sub_round_ss(W, U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), (__mmask8)(U), \
	(int)(R)); })

	/* maskz form: expands to __builtin_ia32_subss_round_mask on A and B with a
	 * zero vector as third operand, U as the mask and R as the final int
	 * argument. */
	#define _mm_maskz_sub_round_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	/* Masked form: forwards __A, __B, the __W vector and the __U mask to
	 * __builtin_ia32_subsd_round_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B)
	{
	  __v2df __lhs = (__v2df)__A;
	  __v2df __rhs = (__v2df)__B;
	  return (__m128d)__builtin_ia32_subsd_round_mask(__lhs, __rhs, (__v2df)__W,
	                                                  (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* maskz form: a zero vector is the third operand of
	 * __builtin_ia32_subsd_round_mask; __U is the mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B)
	{
	  __v2df __src = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_subsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  __src, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to __builtin_ia32_subsd_round_mask on A and B with a zero vector
	 * as third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m128d. */
	#define _mm_sub_round_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_subsd_round_mask on A and B with W
	 * as third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm_mask_sub_round_sd(W, U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* maskz form: expands to __builtin_ia32_subsd_round_mask on A and B with a
	 * zero vector as third operand, U as the mask and R as the final int
	 * argument. */
	#define _mm_maskz_sub_round_sd(U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Computes _mm512_sub_pd(__A, __B) and hands it, together with __W and the
	 * __U mask, to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __diff = (__v8df)_mm512_sub_pd(__A, __B);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __diff,
	                                              (__v8df)__W);
	}

	/* Computes _mm512_sub_pd(__A, __B) and hands it, together with a zero
	 * vector and the __U mask, to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __diff = (__v8df)_mm512_sub_pd(__A, __B);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __diff, __zero);
	}

	/* Computes _mm512_sub_ps(__A, __B) and hands it, together with __W and the
	 * __U mask, to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __diff = (__v16sf)_mm512_sub_ps(__A, __B);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __diff,
	                                             (__v16sf)__W);
	}

	/* Computes _mm512_sub_ps(__A, __B) and hands it, together with a zero
	 * vector and the __U mask, to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __diff = (__v16sf)_mm512_sub_ps(__A, __B);
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __diff, __zero);
	}

	/* Expands to __builtin_ia32_subpd512_mask on A and B with a zero vector as
	 * third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m512d. */
	#define _mm512_sub_round_pd(A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_subpd512_mask on A and B with W as
	 * third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm512_mask_sub_round_pd(W, U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(W), (__mmask8)(U), \
	(int)(R)); })

	/* maskz form: expands to __builtin_ia32_subpd512_mask on A and B with a zero
	 * vector as third operand, U as the mask and R as the final int argument. */
	#define _mm512_maskz_sub_round_pd(U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Expands to __builtin_ia32_subps512_mask on A and B with a zero vector as
	 * third operand and an all-ones 16-bit mask; R is forwarded as the final int
	 * argument. The result is cast to __m512. */
	#define _mm512_sub_round_ps(A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_subps512_mask on A and B with W as
	 * third operand, U as the __mmask16 mask and R as the final int argument.
	 * The expansion ends at `})` with no trailing semicolon so that, like the
	 * sibling *_round_* macros, it is a plain expression usable as a function
	 * argument, initializer or operand. */
	#define _mm512_mask_sub_round_ps(W, U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
	                                       (int)(R)); })

	/* maskz form: expands to __builtin_ia32_subps512_mask on A and B with a zero
	 * vector as third operand, U as the mask and R as the final int argument.
	 * The expansion ends at `})` with no trailing semicolon so that, like the
	 * sibling *_round_* macros, it is a plain expression usable as a function
	 * argument, initializer or operand. */
	#define _mm512_maskz_sub_round_ps(U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)_mm512_setzero_ps(), \
	                                       (__mmask16)(U), (int)(R)); })

	/* Masked form: forwards __A, __B, the __W vector and the __U mask to
	 * __builtin_ia32_mulss_round_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B)
	{
	  __v4sf __lhs = (__v4sf)__A;
	  __v4sf __rhs = (__v4sf)__B;
	  return (__m128)__builtin_ia32_mulss_round_mask(__lhs, __rhs, (__v4sf)__W,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* maskz form: a zero vector is the third operand of
	 * __builtin_ia32_mulss_round_mask; __U is the mask. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B)
	{
	  __v4sf __src = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_mulss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __src, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}
	/* Expands to __builtin_ia32_mulss_round_mask on A and B with a zero vector
	 * as third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m128. */
	#define _mm_mul_round_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_mulss_round_mask on A and B with W
	 * as third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm_mask_mul_round_ss(W, U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), (__mmask8)(U), \
	(int)(R)); })

	/* maskz form: expands to __builtin_ia32_mulss_round_mask on A and B with a
	 * zero vector as third operand, U as the mask and R as the final int
	 * argument. */
	#define _mm_maskz_mul_round_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	/* Masked form: forwards __A, __B, the __W vector and the __U mask to
	 * __builtin_ia32_mulsd_round_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B)
	{
	  __v2df __lhs = (__v2df)__A;
	  __v2df __rhs = (__v2df)__B;
	  return (__m128d)__builtin_ia32_mulsd_round_mask(__lhs, __rhs, (__v2df)__W,
	                                                  (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* maskz form: a zero vector is the third operand of
	 * __builtin_ia32_mulsd_round_mask; __U is the mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B)
	{
	  __v2df __src = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  __src, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to __builtin_ia32_mulsd_round_mask on A and B with a zero vector
	 * as third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m128d. */
	#define _mm_mul_round_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_mulsd_round_mask on A and B with W
	 * as third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm_mask_mul_round_sd(W, U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* maskz form: expands to __builtin_ia32_mulsd_round_mask on A and B with a
	 * zero vector as third operand, U as the mask and R as the final int
	 * argument. */
	#define _mm_maskz_mul_round_sd(U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Computes _mm512_mul_pd(__A, __B) and hands it, together with __W and the
	 * __U mask, to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __prod = (__v8df)_mm512_mul_pd(__A, __B);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __prod,
	                                              (__v8df)__W);
	}

	/* Computes _mm512_mul_pd(__A, __B) and hands it, together with a zero
	 * vector and the __U mask, to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __prod = (__v8df)_mm512_mul_pd(__A, __B);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __prod, __zero);
	}

	/* Computes _mm512_mul_ps(__A, __B) and hands it, together with __W and the
	 * __U mask, to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __prod = (__v16sf)_mm512_mul_ps(__A, __B);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __prod,
	                                             (__v16sf)__W);
	}

	/* Computes _mm512_mul_ps(__A, __B) and hands it, together with a zero
	 * vector and the __U mask, to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __prod = (__v16sf)_mm512_mul_ps(__A, __B);
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __prod, __zero);
	}

	/* Expands to __builtin_ia32_mulpd512_mask on A and B with a zero vector as
	 * third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m512d. */
	#define _mm512_mul_round_pd(A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_mulpd512_mask on A and B with W as
	 * third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm512_mask_mul_round_pd(W, U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(W), (__mmask8)(U), \
	(int)(R)); })

	/* maskz form: expands to __builtin_ia32_mulpd512_mask on A and B with a zero
	 * vector as third operand, U as the mask and R as the final int argument. */
	#define _mm512_maskz_mul_round_pd(U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Expands to __builtin_ia32_mulps512_mask on A and B with a zero vector as
	 * third operand and an all-ones 16-bit mask; R is forwarded as the final int
	 * argument. The result is cast to __m512. */
	#define _mm512_mul_round_ps(A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_mulps512_mask on A and B with W as
	 * third operand, U as the __mmask16 mask and R as the final int argument.
	 * The expansion ends at `})` with no trailing semicolon so that, like the
	 * sibling *_round_* macros, it is a plain expression usable as a function
	 * argument, initializer or operand. */
	#define _mm512_mask_mul_round_ps(W, U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
	                                       (int)(R)); })

	/* maskz form: expands to __builtin_ia32_mulps512_mask on A and B with a zero
	 * vector as third operand, U as the mask and R as the final int argument.
	 * The expansion ends at `})` with no trailing semicolon so that, like the
	 * sibling *_round_* macros, it is a plain expression usable as a function
	 * argument, initializer or operand. */
	#define _mm512_maskz_mul_round_ps(U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)_mm512_setzero_ps(), \
	                                       (__mmask16)(U), (int)(R)); })

	/* Masked form: forwards __A, __B, the __W vector and the __U mask to
	 * __builtin_ia32_divss_round_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B)
	{
	  __v4sf __lhs = (__v4sf)__A;
	  __v4sf __rhs = (__v4sf)__B;
	  return (__m128)__builtin_ia32_divss_round_mask(__lhs, __rhs, (__v4sf)__W,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* maskz form: a zero vector is the third operand of
	 * __builtin_ia32_divss_round_mask; __U is the mask. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B)
	{
	  __v4sf __src = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_divss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __src, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to __builtin_ia32_divss_round_mask on A and B with a zero vector
	 * as third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m128. */
	#define _mm_div_round_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_divss_round_mask on A and B with W
	 * as third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm_mask_div_round_ss(W, U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), (__mmask8)(U), \
	(int)(R)); })

	/* maskz form: expands to __builtin_ia32_divss_round_mask on A and B with a
	 * zero vector as third operand, U as the mask and R as the final int
	 * argument. */
	#define _mm_maskz_div_round_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	/* Masked form: forwards __A, __B, the __W vector and the __U mask to
	 * __builtin_ia32_divsd_round_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B)
	{
	  __v2df __lhs = (__v2df)__A;
	  __v2df __rhs = (__v2df)__B;
	  return (__m128d)__builtin_ia32_divsd_round_mask(__lhs, __rhs, (__v2df)__W,
	                                                  (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* maskz form: a zero vector is the third operand of
	 * __builtin_ia32_divsd_round_mask; __U is the mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B)
	{
	  __v2df __src = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_divsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  __src, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to __builtin_ia32_divsd_round_mask on A and B with a zero vector
	 * as third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m128d. */
	#define _mm_div_round_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_divsd_round_mask on A and B with W
	 * as third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm_mask_div_round_sd(W, U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* maskz form: expands to __builtin_ia32_divsd_round_mask on A and B with a
	 * zero vector as third operand, U as the mask and R as the final int
	 * argument. */
	#define _mm_maskz_div_round_sd(U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Element-wise division of __a by __b, done with the vector `/` operator on
	 * the operands viewed as __v8df. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_div_pd(__m512d __a, __m512d __b)
	{
	  __v8df __num = (__v8df)__a;
	  __v8df __den = (__v8df)__b;
	  return (__m512d)(__num / __den);
	}

	/* Computes _mm512_div_pd(__A, __B) and hands it, together with __W and the
	 * __U mask, to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __quot = (__v8df)_mm512_div_pd(__A, __B);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __quot,
	                                              (__v8df)__W);
	}

	/* Computes _mm512_div_pd(__A, __B) and hands it, together with a zero
	 * vector and the __U mask, to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __quot = (__v8df)_mm512_div_pd(__A, __B);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __quot, __zero);
	}

	/* Element-wise division of __a by __b, done with the vector `/` operator on
	 * the operands viewed as __v16sf. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_div_ps(__m512 __a, __m512 __b)
	{
	  __v16sf __num = (__v16sf)__a;
	  __v16sf __den = (__v16sf)__b;
	  return (__m512)(__num / __den);
	}

	/* Computes _mm512_div_ps(__A, __B) and hands it, together with __W and the
	 * __U mask, to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __quot = (__v16sf)_mm512_div_ps(__A, __B);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __quot,
	                                             (__v16sf)__W);
	}

	/* Computes _mm512_div_ps(__A, __B) and hands it, together with a zero
	 * vector and the __U mask, to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __quot = (__v16sf)_mm512_div_ps(__A, __B);
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __quot, __zero);
	}

	/* Expands to __builtin_ia32_divpd512_mask on A and B with a zero vector as
	 * third operand and an all-ones mask; R is forwarded as the final int
	 * argument. The result is cast to __m512d. */
	#define _mm512_div_round_pd(A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_divpd512_mask on A and B with W as
	 * third operand, U as the __mmask8 mask and R as the final int argument. */
	#define _mm512_mask_div_round_pd(W, U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(W), (__mmask8)(U), \
	(int)(R)); })

	/* maskz form: expands to __builtin_ia32_divpd512_mask on A and B with a zero
	 * vector as third operand, U as the mask and R as the final int argument. */
	#define _mm512_maskz_div_round_pd(U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Expands to __builtin_ia32_divps512_mask on A and B with a zero vector as
	 * third operand and an all-ones 16-bit mask; R is forwarded as the final int
	 * argument. The result is cast to __m512. */
	#define _mm512_div_round_ps(A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)-1, (int)(R)); })

	/* Masked form: expands to __builtin_ia32_divps512_mask on A and B with W as
	 * third operand, U as the __mmask16 mask and R as the final int argument.
	 * The expansion ends at `})` with no trailing semicolon so that, like the
	 * sibling *_round_* macros, it is a plain expression usable as a function
	 * argument, initializer or operand. */
	#define _mm512_mask_div_round_ps(W, U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
	                                       (int)(R)); })

	/* maskz form: expands to __builtin_ia32_divps512_mask on A and B with a zero
	 * vector as third operand, U as the mask and R as the final int argument.
	 * The expansion ends at `})` with no trailing semicolon so that, like the
	 * sibling *_round_* macros, it is a plain expression usable as a function
	 * argument, initializer or operand. */
	#define _mm512_maskz_div_round_ps(U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)_mm512_setzero_ps(), \
	                                       (__mmask16)(U), (int)(R)); })

	/* Expands to __builtin_ia32_rndscaleps_mask with A as the input vector, B as
	 * the int immediate, A again as the third operand, an all-ones mask and
	 * _MM_FROUND_CUR_DIRECTION. Note that A appears twice in the expansion. */
	#define _mm512_roundscale_ps(A, B) __extension__ ({ \
	(__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
	(__v16sf)(__m512)(A), (__mmask16)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked form. Parameter roles as used in the expansion: C is the input
	 * vector, imm the int immediate, A the third builtin operand and B the
	 * __mmask16 mask; _MM_FROUND_CUR_DIRECTION is the final argument. */
	#define _mm512_mask_roundscale_ps(A, B, C, imm) __extension__ ({\
	(__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
	(__v16sf)(__m512)(A), (__mmask16)(B), \
	_MM_FROUND_CUR_DIRECTION); })

	/* maskz form. Parameter roles as used in the expansion: A is the __mmask16
	 * mask, B the input vector and imm the int immediate; a zero vector is the
	 * third builtin operand and _MM_FROUND_CUR_DIRECTION the final argument. */
	#define _mm512_maskz_roundscale_ps(A, B, imm) __extension__ ({\
	(__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(A), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked form with caller-supplied R. Parameter roles as used in the
	 * expansion: C is the input vector, imm the int immediate, A the third
	 * builtin operand, B the __mmask16 mask and R the final int argument. */
	#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) __extension__ ({ \
	(__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
	(__v16sf)(__m512)(A), (__mmask16)(B), \
	(int)(R)); })

	/* maskz form with caller-supplied R. Parameter roles as used in the
	 * expansion: A is the __mmask16 mask, B the input vector, imm the int
	 * immediate and R the final int argument; a zero vector is the third builtin
	 * operand. */
	#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) __extension__ ({ \
	(__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(A), (int)(R)); })

	/* Unmasked form with caller-supplied R: A is the input vector, imm the int
	 * immediate; the third builtin operand is _mm512_undefined_ps() and the mask
	 * is all-ones. */
	#define _mm512_roundscale_round_ps(A, imm, R) __extension__ ({ \
	(__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
	(__v16sf)_mm512_undefined_ps(), \
	(__mmask16)-1, (int)(R)); })

	/* Expands to __builtin_ia32_rndscalepd_mask with A as the input vector, B as
	 * the int immediate, A again as the third operand, an all-ones mask and
	 * _MM_FROUND_CUR_DIRECTION. Note that A appears twice in the expansion. */
	#define _mm512_roundscale_pd(A, B) __extension__ ({ \
	(__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
	(__v8df)(__m512d)(A), (__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked form. Parameter roles as used in the expansion: C is the input
	 * vector, imm the int immediate, A the third builtin operand and B the
	 * __mmask8 mask; _MM_FROUND_CUR_DIRECTION is the final argument. */
	#define _mm512_mask_roundscale_pd(A, B, C, imm) __extension__ ({\
	(__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
	(__v8df)(__m512d)(A), (__mmask8)(B), \
	_MM_FROUND_CUR_DIRECTION); })

	/* maskz form. Parameter roles as used in the expansion: A is the __mmask8
	 * mask, B the input vector and imm the int immediate; a zero vector is the
	 * third builtin operand and _MM_FROUND_CUR_DIRECTION the final argument. */
	#define _mm512_maskz_roundscale_pd(A, B, imm) __extension__ ({\
	(__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(A), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked form with caller-supplied R. Parameter roles as used in the
	 * expansion: C is the input vector, imm the int immediate, A the third
	 * builtin operand, B the __mmask8 mask and R the final int argument. */
	#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) __extension__ ({ \
	(__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
	(__v8df)(__m512d)(A), (__mmask8)(B), \
	(int)(R)); })

	/* maskz form with caller-supplied R. Parameter roles as used in the
	 * expansion: A is the __mmask8 mask, B the input vector, imm the int
	 * immediate and R the final int argument; a zero vector is the third builtin
	 * operand. */
	#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) __extension__ ({ \
	(__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(A), (int)(R)); })

	/* Unmasked form with caller-supplied R: A is the input vector, imm the int
	 * immediate; the third builtin operand is _mm512_undefined_pd() and the mask
	 * is all-ones. */
	#define _mm512_roundscale_round_pd(A, imm, R) __extension__ ({ \
	(__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
	(__v8df)_mm512_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Expands to __builtin_ia32_vfmaddpd512_mask on A, B and C with an all-ones
	 * mask; R is forwarded as the final int argument. */
	#define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), (__mmask8)-1, \
	(int)(R)); })


	/* Masked form: expands to __builtin_ia32_vfmaddpd512_mask on A, B and C with
	 * U as the __mmask8 mask and R as the final int argument. */
	#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* mask3 form: expands to __builtin_ia32_vfmaddpd512_mask3 on A, B and C with
	 * U as the __mmask8 mask and R as the final int argument. */
	#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* maskz form: expands to __builtin_ia32_vfmaddpd512_maskz on A, B and C with
	 * U as the __mmask8 mask and R as the final int argument. */
	#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* Built on the fmadd builtin: expands to __builtin_ia32_vfmaddpd512_mask on
	 * A, B and the negated C, with an all-ones mask and R as the final int
	 * argument. */
	#define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	-(__v8df)(__m512d)(C), \
	(__mmask8)-1, (int)(R)); })


	/* Masked form built on the fmadd builtin: __builtin_ia32_vfmaddpd512_mask on
	 * A, B and the negated C, with U as the mask and R as the final int
	 * argument. */
	#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	-(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* maskz form built on the fmadd builtin: __builtin_ia32_vfmaddpd512_maskz on
	 * A, B and the negated C, with U as the mask and R as the final int
	 * argument. */
	#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	-(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* Built on the fmadd builtin: expands to __builtin_ia32_vfmaddpd512_mask on
	 * the negated A, B and C, with an all-ones mask and R as the final int
	 * argument. */
	#define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), (__mmask8)-1, \
	(int)(R)); })


	/* mask3 form built on the fmadd builtin: __builtin_ia32_vfmaddpd512_mask3 on
	 * the negated A, B and C, with U as the mask and R as the final int
	 * argument. */
	#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* maskz form built on the fmadd builtin: __builtin_ia32_vfmaddpd512_maskz on
	 * the negated A, B and C, with U as the mask and R as the final int
	 * argument. */
	#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* Built on the fmadd builtin: expands to __builtin_ia32_vfmaddpd512_mask on
	 * the negated A, B and the negated C, with an all-ones mask and R as the
	 * final int argument. */
	#define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	-(__v8df)(__m512d)(C), \
	(__mmask8)-1, (int)(R)); })


	/* maskz form built on the fmadd builtin: __builtin_ia32_vfmaddpd512_maskz on
	 * the negated A, B and the negated C, with U as the mask and R as the final
	 * int argument. */
	#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	-(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* Calls __builtin_ia32_vfmaddpd512_mask on __A, __B and __C with an all-ones
	 * mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	  __v8df __x = (__v8df)__A;
	  __v8df __y = (__v8df)__B;
	  __v8df __z = (__v8df)__C;
	  return (__m512d)__builtin_ia32_vfmaddpd512_mask(__x, __y, __z, (__mmask8)-1,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: __builtin_ia32_vfmaddpd512_mask on __A, __B and __C with __U
	 * as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	  __v8df __x = (__v8df)__A;
	  __v8df __y = (__v8df)__B;
	  __v8df __z = (__v8df)__C;
	  return (__m512d)__builtin_ia32_vfmaddpd512_mask(__x, __y, __z, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* mask3 form: __builtin_ia32_vfmaddpd512_mask3 on __A, __B and __C with __U
	 * as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	  __v8df __x = (__v8df)__A;
	  __v8df __y = (__v8df)__B;
	  __v8df __z = (__v8df)__C;
	  return (__m512d)__builtin_ia32_vfmaddpd512_mask3(__x, __y, __z,
	                                                   (__mmask8)__U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* maskz form: __builtin_ia32_vfmaddpd512_maskz on __A, __B and __C with __U
	 * as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	  __v8df __x = (__v8df)__A;
	  __v8df __y = (__v8df)__B;
	  __v8df __z = (__v8df)__C;
	  return (__m512d)__builtin_ia32_vfmaddpd512_maskz(__x, __y, __z,
	                                                   (__mmask8)__U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Built on the fmadd builtin: __C is negated before being passed to
	 * __builtin_ia32_vfmaddpd512_mask with an all-ones mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	  __v8df __negc = -(__v8df)__C;
	  return (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)__A, (__v8df)__B,
	                                                  __negc, (__mmask8)-1,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form built on the fmadd builtin: __C is negated before being
	 * passed to __builtin_ia32_vfmaddpd512_mask with __U as the mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	  __v8df __negc = -(__v8df)__C;
	  return (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)__A, (__v8df)__B,
	                                                  __negc, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* maskz form built on the fmadd builtin: __C is negated before being passed
	 * to __builtin_ia32_vfmaddpd512_maskz with __U as the mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	  __v8df __negc = -(__v8df)__C;
	  return (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)__A, (__v8df)__B,
	                                                   __negc, (__mmask8)__U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Built on the fmadd builtin: __A is negated before being passed to
	 * __builtin_ia32_vfmaddpd512_mask with an all-ones mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	  __v8df __nega = -(__v8df)__A;
	  return (__m512d)__builtin_ia32_vfmaddpd512_mask(__nega, (__v8df)__B,
	                                                  (__v8df)__C, (__mmask8)-1,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* mask3 form built on the fmadd builtin: __A is negated before being passed
	 * to __builtin_ia32_vfmaddpd512_mask3 with __U as the mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	  __v8df __nega = -(__v8df)__A;
	  return (__m512d)__builtin_ia32_vfmaddpd512_mask3(__nega, (__v8df)__B,
	                                                   (__v8df)__C, (__mmask8)__U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
	(__v8df) __B,
	(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A,
	(__v8df) __B,
	-(__v8df) __C,
	(__mmask8) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
	(__v8df) __B,
	-(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * Packed-float FMA macros with a caller-supplied rounding argument R.
	 *
	 * Each macro expands to a GNU statement expression that calls one of the
	 * __builtin_ia32_vfmaddps512_{mask,mask3,maskz} builtins.  Vector
	 * arguments are cast to __m512 and then to __v16sf; R is cast to int and
	 * passed last.  Unmasked forms pass (__mmask16)-1; masked forms pass U.
	 * The fmsub / fnmadd / fnmsub macros negate C and/or A instead of using
	 * a different builtin.
	 *
	 * NOTE(review): these are presumably macros rather than functions so
	 * that R reaches the builtin as a constant expression -- confirm.
	 */

	/* fmadd, all-ones mask. */
	#define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), (__mmask16)-1, \
	(int)(R)); })


	/* fmadd, mask U, _mask builtin. */
	#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* fmadd, mask U, _mask3 builtin. */
	#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* fmadd, mask U, _maskz builtin. */
	#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* fmsub: C negated, all-ones mask. */
	#define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	-(__v16sf)(__m512)(C), \
	(__mmask16)-1, (int)(R)); })


	/* fmsub: C negated, mask U, _mask builtin. */
	#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	-(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* fmsub: C negated, mask U, _maskz builtin. */
	#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	-(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* fnmadd: A negated, all-ones mask. */
	#define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), (__mmask16)-1, \
	(int)(R)); })


	/* fnmadd: A negated, mask U, _mask3 builtin. */
	#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* fnmadd: A negated, mask U, _maskz builtin. */
	#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* fnmsub: A and C negated, all-ones mask. */
	#define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	-(__v16sf)(__m512)(C), \
	(__mmask16)-1, (int)(R)); })


	/* fnmsub: A and C negated, mask U, _maskz builtin. */
	#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	-(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/*
	 * Packed-float FMA wrappers that use the current rounding mode.
	 *
	 * Every function in this group forwards to one of the
	 * __builtin_ia32_vfmaddps512_{mask,mask3,maskz} builtins with
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument.  fmsub, fnmadd and
	 * fnmsub are expressed by negating __C and/or __A before the call.
	 * Unmasked forms pass an all-ones mask, (__mmask16) -1.
	 *
	 * NOTE(review): the treatment of lanes whose mask bit is clear depends
	 * on which builtin (_mask, _mask3, _maskz) is called and is not visible
	 * in this file -- confirm against the Intel intrinsics documentation.
	 */

	/* fmadd, all-ones mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmadd, mask __U, _mask builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmadd, mask __U, _mask3 builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmadd, mask __U, _maskz builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmsub: __C negated, all-ones mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
	(__v16sf) __B,
	-(__v16sf) __C,
	(__mmask16) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmsub: __C negated, mask __U, _mask builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
	(__v16sf) __B,
	-(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmsub: __C negated, mask __U, _maskz builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
	(__v16sf) __B,
	-(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fnmadd: __A negated, all-ones mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fnmadd: __A negated, mask __U, _mask3 builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fnmadd: __A negated, mask __U, _maskz builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fnmsub: __A and __C negated, all-ones mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A,
	(__v16sf) __B,
	-(__v16sf) __C,
	(__mmask16) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fnmsub: __A and __C negated, mask __U, _maskz builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
	(__v16sf) __B,
	-(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * Packed-double fmaddsub / fmsubadd macros with a caller-supplied
	 * rounding argument R.
	 *
	 * Each macro expands to a statement expression calling one of the
	 * __builtin_ia32_vfmaddsubpd512_{mask,mask3,maskz} builtins.  Vector
	 * arguments are cast to __m512d and then __v8df; R is cast to int.
	 * The fmsubadd macros call the same fmaddsub builtins with C negated.
	 * Unmasked forms pass (__mmask8)-1.
	 */

	/* fmaddsub, all-ones mask. */
	#define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)-1, (int)(R)); })


	/* fmaddsub, mask U, _mask builtin. */
	#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* fmaddsub, mask U, _mask3 builtin. */
	#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* fmaddsub, mask U, _maskz builtin. */
	#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* fmsubadd: C negated, all-ones mask. */
	#define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	-(__v8df)(__m512d)(C), \
	(__mmask8)-1, (int)(R)); })


	/* fmsubadd: C negated, mask U, _mask builtin. */
	#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	-(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* fmsubadd: C negated, mask U, _maskz builtin. */
	#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	-(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/*
	 * Packed-double fmaddsub / fmsubadd wrappers, current rounding mode.
	 *
	 * All of these call a __builtin_ia32_vfmaddsubpd512_{mask,mask3,maskz}
	 * builtin with _MM_FROUND_CUR_DIRECTION.  The fmsubadd functions reuse
	 * the fmaddsub builtins and negate __C.  Unmasked forms pass
	 * (__mmask8) -1.
	 */

	/* fmaddsub, all-ones mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
	(__v8df) __B,
	(__v8df) __C,
	(__mmask8) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, mask __U, _mask builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
	(__v8df) __B,
	(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, mask __U, _mask3 builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
	(__v8df) __B,
	(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, mask __U, _maskz builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
	(__v8df) __B,
	(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd: __C negated, all-ones mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
	(__v8df) __B,
	-(__v8df) __C,
	(__mmask8) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd: __C negated, mask __U, _mask builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
	(__v8df) __B,
	-(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd: __C negated, mask __U, _maskz builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
	(__v8df) __B,
	-(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * Packed-float fmaddsub / fmsubadd macros with a caller-supplied
	 * rounding argument R.
	 *
	 * Each macro expands to a statement expression calling one of the
	 * __builtin_ia32_vfmaddsubps512_{mask,mask3,maskz} builtins.  Vector
	 * arguments are cast to __m512 and then __v16sf; R is cast to int.
	 * The fmsubadd macros call the same fmaddsub builtins with C negated.
	 * Unmasked forms pass (__mmask16)-1.
	 */

	/* fmaddsub, all-ones mask. */
	#define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)-1, (int)(R)); })


	/* fmaddsub, mask U, _mask builtin. */
	#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* fmaddsub, mask U, _mask3 builtin. */
	#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* fmaddsub, mask U, _maskz builtin. */
	#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* fmsubadd: C negated, all-ones mask. */
	#define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	-(__v16sf)(__m512)(C), \
	(__mmask16)-1, (int)(R)); })


	/* fmsubadd: C negated, mask U, _mask builtin. */
	#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	-(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* fmsubadd: C negated, mask U, _maskz builtin. */
	#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	-(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/*
	 * Packed-float fmaddsub / fmsubadd wrappers, current rounding mode.
	 *
	 * All of these call a __builtin_ia32_vfmaddsubps512_{mask,mask3,maskz}
	 * builtin with _MM_FROUND_CUR_DIRECTION.  The fmsubadd functions reuse
	 * the fmaddsub builtins and negate __C.  Unmasked forms pass
	 * (__mmask16) -1.
	 */

	/* fmaddsub, all-ones mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, mask __U, _mask builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, mask __U, _mask3 builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, mask __U, _maskz builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd: __C negated, all-ones mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
	(__v16sf) __B,
	-(__v16sf) __C,
	(__mmask16) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd: __C negated, mask __U, _mask builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
	(__v16sf) __B,
	-(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd: __C negated, mask __U, _maskz builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
	(__v16sf) __B,
	-(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * mask3 forms of fmsub and fmsubadd (pd and ps).
	 *
	 * These call dedicated __builtin_ia32_vfmsub*512_mask3 and
	 * __builtin_ia32_vfmsubadd*512_mask3 builtins and pass A, B and C
	 * through without negation.  The _round_ macros forward the caller's R
	 * (cast to int); the functions pass _MM_FROUND_CUR_DIRECTION.
	 *
	 * NOTE(review): presumably a dedicated builtin is needed because the
	 * mask3 form keeps C in masked-off lanes, so C cannot simply be negated
	 * up front -- confirm against the Intel intrinsics documentation.
	 */

	/* mask3 fmsub, double, explicit rounding R. */
	#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* mask3 fmsub, double, current rounding mode. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
	(__v8df) __B,
	(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* mask3 fmsub, float, explicit rounding R. */
	#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* mask3 fmsub, float, current rounding mode. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* mask3 fmsubadd, double, explicit rounding R. */
	#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* mask3 fmsubadd, double, current rounding mode. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
	(__v8df) __B,
	(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* mask3 fmsubadd, float, explicit rounding R. */
	#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* mask3 fmsubadd, float, current rounding mode. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * mask forms of fnmadd, and mask / mask3 forms of fnmsub (pd and ps).
	 *
	 * These call dedicated __builtin_ia32_vfnmadd*512_mask,
	 * __builtin_ia32_vfnmsub*512_mask and __builtin_ia32_vfnmsub*512_mask3
	 * builtins; A, B and C are passed through without negation.  The
	 * _round_ macros forward the caller's R (cast to int); the functions
	 * pass _MM_FROUND_CUR_DIRECTION.
	 *
	 * NOTE(review): presumably dedicated builtins are used because a
	 * negated operand could not also serve as the value kept in masked-off
	 * lanes -- confirm against the Intel intrinsics documentation.
	 */

	/* mask fnmadd, double, explicit rounding R. */
	#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* mask fnmadd, double, current rounding mode. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
	(__v8df) __B,
	(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* mask fnmadd, float, explicit rounding R. */
	#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* mask fnmadd, float, current rounding mode. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* mask fnmsub, double, explicit rounding R. */
	#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* mask3 fnmsub, double, explicit rounding R. */
	#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \
	(__m512d)__builtin_ia32_vfnmsubpd512_mask3((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(C), \
	(__mmask8)(U), (int)(R)); })


	/* mask fnmsub, double, current rounding mode. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
	(__v8df) __B,
	(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* mask3 fnmsub, double, current rounding mode. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
	(__v8df) __B,
	(__v8df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* mask fnmsub, float, explicit rounding R. */
	#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* mask3 fnmsub, float, explicit rounding R. */
	#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \
	(__m512)__builtin_ia32_vfnmsubps512_mask3((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(C), \
	(__mmask16)(U), (int)(R)); })


	/* mask fnmsub, float, current rounding mode. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* mask3 fnmsub, float, current rounding mode. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
	(__v16sf) __B,
	(__v16sf) __C,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}



	/* Vector permutations */

	/*
	 * Two-source permutes driven by an index vector.
	 *
	 * Note the argument reordering: the intrinsics take (A, I, B) but the
	 * __builtin_ia32_vpermt2var{d,q}512_{mask,maskz} builtins are called
	 * with the index vector __I first, followed by __A and __B.  The epi32
	 * forms view the operands as __v16si with a __mmask16; the epi64 forms
	 * use __v8di with a __mmask8.  Unmasked forms pass an all-ones mask.
	 *
	 * NOTE(review): how indices select between __A and __B, and what the
	 * _mask builtin keeps in masked-off lanes, is defined by the builtin --
	 * confirm against the Intel intrinsics documentation.
	 */

	/* 32-bit elements, all-ones mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
	{
	return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
	/* idx */ ,
	(__v16si) __A,
	(__v16si) __B,
	(__mmask16) -1);
	}

	/* 32-bit elements, mask __U, _mask builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U,
	__m512i __I, __m512i __B)
	{
	return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
	/* idx */ ,
	(__v16si) __A,
	(__v16si) __B,
	(__mmask16) __U);
	}

	/* 32-bit elements, mask __U, _maskz builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A,
	__m512i __I, __m512i __B)
	{
	return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I
	/* idx */ ,
	(__v16si) __A,
	(__v16si) __B,
	(__mmask16) __U);
	}

	/* 64-bit elements, all-ones mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
	{
	return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
	/* idx */ ,
	(__v8di) __A,
	(__v8di) __B,
	(__mmask8) -1);
	}

	/* 64-bit elements, mask __U, _mask builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I,
	__m512i __B)
	{
	return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
	/* idx */ ,
	(__v8di) __A,
	(__v8di) __B,
	(__mmask8) __U);
	}


	/* 64-bit elements, mask __U, _maskz builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
	__m512i __I, __m512i __B)
	{
	return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I
	/* idx */ ,
	(__v8di) __A,
	(__v8di) __B,
	(__mmask8) __U);
	}

	/*
	 * Element-wise "align right" macros, built on __builtin_shufflevector.
	 *
	 * The shuffle is given B as its first vector and A as its second, so
	 * shuffle indices count through B's elements first and then A's.
	 * Result element k uses index (I masked to the element count) + k:
	 * the mask is 0x7 for the eight 64-bit elements and 0xf for the
	 * sixteen 32-bit elements.  I appears once per result element in the
	 * expansion, so it is evaluated textually that many times.
	 *
	 * The mask_/maskz_ variants feed the unmasked result into
	 * __builtin_ia32_select{q,d}_512 together with U and either W or
	 * _mm512_setzero_si512().
	 * NOTE(review): presumably the select builtin takes the shuffled value
	 * where the mask bit is set and the third argument otherwise -- confirm.
	 */

	/* 64-bit elements: indices (I & 7) + 0 .. (I & 7) + 7 over (B, A). */
	#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v8di)(__m512i)(B), \
	(__v8di)(__m512i)(A), \
	((int)(I) & 0x7) + 0, \
	((int)(I) & 0x7) + 1, \
	((int)(I) & 0x7) + 2, \
	((int)(I) & 0x7) + 3, \
	((int)(I) & 0x7) + 4, \
	((int)(I) & 0x7) + 5, \
	((int)(I) & 0x7) + 6, \
	((int)(I) & 0x7) + 7); })

	/* 64-bit elements: select between the alignr result and W using U. */
	#define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__({\
	(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	(__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
	(__v8di)(__m512i)(W)); })

	/* 64-bit elements: select between the alignr result and zero using U. */
	#define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__({\
	(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	(__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
	(__v8di)_mm512_setzero_si512()); })

	/* 32-bit elements: indices (I & 15) + 0 .. (I & 15) + 15 over (B, A). */
	#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v16si)(__m512i)(B), \
	(__v16si)(__m512i)(A), \
	((int)(I) & 0xf) + 0, \
	((int)(I) & 0xf) + 1, \
	((int)(I) & 0xf) + 2, \
	((int)(I) & 0xf) + 3, \
	((int)(I) & 0xf) + 4, \
	((int)(I) & 0xf) + 5, \
	((int)(I) & 0xf) + 6, \
	((int)(I) & 0xf) + 7, \
	((int)(I) & 0xf) + 8, \
	((int)(I) & 0xf) + 9, \
	((int)(I) & 0xf) + 10, \
	((int)(I) & 0xf) + 11, \
	((int)(I) & 0xf) + 12, \
	((int)(I) & 0xf) + 13, \
	((int)(I) & 0xf) + 14, \
	((int)(I) & 0xf) + 15); })

	/* 32-bit elements: select between the alignr result and W using U. */
	#define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({\
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
	(__v16si)(__m512i)(W)); })

	/* 32-bit elements: select between the alignr result and zero using U. */
	#define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__({\
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
	(__v16si)_mm512_setzero_si512()); })
	/* Vector Extract */

	/*
	 * Sub-vector extraction macros, built on __builtin_shufflevector.
	 *
	 * The second shuffle operand is an "undefined" vector; every index
	 * generated below stays inside the first operand (A), so it is never
	 * actually read.  I is used textually once per result element.
	 *
	 * The mask_/maskz_ variants pass the extracted value to a
	 * __builtin_ia32_select{pd_256,ps_128} call together with U and either
	 * W or a zero vector.
	 * NOTE(review): presumably the select builtin takes the extracted
	 * value where the mask bit is set and the third argument otherwise.
	 */

	/* Four doubles: elements 4..7 of A when bit 0 of I is set, else 0..3. */
	#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \
	(__m256d)__builtin_shufflevector((__v8df)(__m512d)(A), \
	(__v8df)_mm512_undefined_pd(), \
	((I) & 1) ? 4 : 0, \
	((I) & 1) ? 5 : 1, \
	((I) & 1) ? 6 : 2, \
	((I) & 1) ? 7 : 3); })

	/* Select between the extracted doubles and W using U. */
	#define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({\
	(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
	(__v4df)_mm512_extractf64x4_pd((A), (imm)), \
	(__v4df)(W)); })

	/* Select between the extracted doubles and zero using U. */
	#define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({\
	(__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
	(__v4df)_mm512_extractf64x4_pd((A), (imm)), \
	(__v4df)_mm256_setzero_pd()); })

	/* Four floats starting at element ((I) & 3) * 4 of A. */
	#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \
	(__m128)__builtin_shufflevector((__v16sf)(__m512)(A), \
	(__v16sf)_mm512_undefined_ps(), \
	0 + ((I) & 0x3) * 4, \
	1 + ((I) & 0x3) * 4, \
	2 + ((I) & 0x3) * 4, \
	3 + ((I) & 0x3) * 4); })

	/* Select between the extracted floats and W using U. */
	#define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({\
	(__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
	(__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
	(__v4sf)(W)); })

	/* Select between the extracted floats and zero using U. */
	#define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({\
	(__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
	(__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
	(__v4sf)_mm_setzero_ps()); })

	/* Vector Blend */

	/*
	 * Mask-controlled blends of two vectors.
	 *
	 * Each function calls the matching __builtin_ia32_select*_512 builtin
	 * with the mask first, then __W, then __A -- i.e. the two vector
	 * parameters are passed in the opposite order from the signature.
	 * NOTE(review): presumably the select builtin yields its second
	 * argument (__W) where the mask bit is set and its third (__A)
	 * otherwise -- confirm against the builtin's definition.
	 */

	/* Eight doubles, 8-bit mask. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
	{
	return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
	(__v8df) __W,
	(__v8df) __A);
	}

	/* Sixteen floats, 16-bit mask. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
	{
	return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
	(__v16sf) __W,
	(__v16sf) __A);
	}

	/* Eight 64-bit integers, 8-bit mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
	{
	return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
	(__v8di) __W,
	(__v8di) __A);
	}

	/* Sixteen 32-bit integers, 16-bit mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
	{
	return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
	(__v16si) __W,
	(__v16si) __A);
	}

	/* Compare */

	/*
	 * Packed-float comparisons producing a __mmask16.
	 *
	 * The two _round_ macros are the primitives: they call
	 * __builtin_ia32_cmpps512_mask with predicate P, a mask ((__mmask16)-1
	 * or U) and the rounding/exception argument R.  Everything else in this
	 * group is a plain text alias layered on top of them:
	 *   _mm512_[mask_]cmp_ps_mask      -> R = _MM_FROUND_CUR_DIRECTION
	 *   _mm512_[mask_]cmp<op>_ps_mask  -> P fixed to the _CMP_* constant
	 *                                     shown in each definition.
	 */
	#define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \
	(__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), (int)(P), \
	(__mmask16)-1, (int)(R)); })

	/* As above, but with the caller's mask U instead of all-ones. */
	#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \
	(__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), (int)(P), \
	(__mmask16)(U), (int)(R)); })

	/* Generic predicate, current rounding direction. */
	#define _mm512_cmp_ps_mask(A, B, P) \
	_mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
	#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
	_mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

	/* Predicate _CMP_EQ_OQ. */
	#define _mm512_cmpeq_ps_mask(A, B) \
	_mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
	#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
	_mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)

	/* Predicate _CMP_LT_OS. */
	#define _mm512_cmplt_ps_mask(A, B) \
	_mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
	#define _mm512_mask_cmplt_ps_mask(k, A, B) \
	_mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)

	/* Predicate _CMP_LE_OS. */
	#define _mm512_cmple_ps_mask(A, B) \
	_mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
	#define _mm512_mask_cmple_ps_mask(k, A, B) \
	_mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)

	/* Predicate _CMP_UNORD_Q. */
	#define _mm512_cmpunord_ps_mask(A, B) \
	_mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
	#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
	_mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)

	/* Predicate _CMP_NEQ_UQ. */
	#define _mm512_cmpneq_ps_mask(A, B) \
	_mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
	#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
	_mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)

	/* Predicate _CMP_NLT_US. */
	#define _mm512_cmpnlt_ps_mask(A, B) \
	_mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
	#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
	_mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)

	/* Predicate _CMP_NLE_US. */
	#define _mm512_cmpnle_ps_mask(A, B) \
	_mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
	#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
	_mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)

	/* Predicate _CMP_ORD_Q. */
	#define _mm512_cmpord_ps_mask(A, B) \
	_mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
	#define _mm512_mask_cmpord_ps_mask(k, A, B) \
	_mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)

	/*
	 * Packed-double comparisons producing a __mmask8.
	 *
	 * The two _round_ macros are the primitives: they call
	 * __builtin_ia32_cmppd512_mask with predicate P, a mask ((__mmask8)-1
	 * or U) and the rounding/exception argument R.  Everything else in this
	 * group is a plain text alias layered on top of them:
	 *   _mm512_[mask_]cmp_pd_mask      -> R = _MM_FROUND_CUR_DIRECTION
	 *   _mm512_[mask_]cmp<op>_pd_mask  -> P fixed to the _CMP_* constant
	 *                                     shown in each definition.
	 */
	#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), (int)(P), \
	(__mmask8)-1, (int)(R)); })

	/* As above, but with the caller's mask U instead of all-ones. */
	#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), (int)(P), \
	(__mmask8)(U), (int)(R)); })

	/* Generic predicate, current rounding direction. */
	#define _mm512_cmp_pd_mask(A, B, P) \
	_mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
	#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
	_mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

	/* Predicate _CMP_EQ_OQ. */
	#define _mm512_cmpeq_pd_mask(A, B) \
	_mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
	#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
	_mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)

	/* Predicate _CMP_LT_OS. */
	#define _mm512_cmplt_pd_mask(A, B) \
	_mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
	#define _mm512_mask_cmplt_pd_mask(k, A, B) \
	_mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)

	/* Predicate _CMP_LE_OS. */
	#define _mm512_cmple_pd_mask(A, B) \
	_mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
	#define _mm512_mask_cmple_pd_mask(k, A, B) \
	_mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)

	/* Predicate _CMP_UNORD_Q. */
	#define _mm512_cmpunord_pd_mask(A, B) \
	_mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
	#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
	_mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)

	/* Predicate _CMP_NEQ_UQ. */
	#define _mm512_cmpneq_pd_mask(A, B) \
	_mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
	#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
	_mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)

	/* Predicate _CMP_NLT_US. */
	#define _mm512_cmpnlt_pd_mask(A, B) \
	_mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
	#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
	_mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)

	/* Predicate _CMP_NLE_US. */
	#define _mm512_cmpnle_pd_mask(A, B) \
	_mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
	#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
	_mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)

	/* Predicate _CMP_ORD_Q. */
	#define _mm512_cmpord_pd_mask(A, B) \
	_mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
	#define _mm512_mask_cmpord_pd_mask(k, A, B) \
	_mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)

	/* Conversion */

	/*
	 * cvtt float -> epu32 macros with a caller-supplied R.
	 *
	 * All three call __builtin_ia32_cvttps2udq512_mask(source, passthrough,
	 * mask, R) and differ only in the passthrough vector and mask:
	 *   plain  : _mm512_undefined_epi32(), (__mmask16)-1
	 *   mask_  : W, U
	 *   maskz_ : _mm512_setzero_si512(), U
	 * NOTE(review): the conversion semantics (presumably truncation to
	 * unsigned 32-bit) live in the builtin -- confirm against Intel docs.
	 */
	#define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_undefined_epi32(), \
	(__mmask16)-1, (int)(R)); })

	#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U), (int)(R)); })

	#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U), (int)(R)); })


	/*
	 * cvtt float -> epu32 functions, current rounding direction.
	 *
	 * All three call __builtin_ia32_cvttps2udq512_mask with
	 * _MM_FROUND_CUR_DIRECTION and differ only in passthrough and mask.
	 */

	/* Unmasked: all-ones mask, zero vector as the passthrough operand. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_cvttps_epu32(__m512 __A)
	{
	return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
	(__v16si)
	_mm512_setzero_si512 (),
	(__mmask16) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Masked: __W is the passthrough operand, __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
	{
	return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
	(__v16si) __W,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked: a zero vector is the passthrough operand, __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
	{
	return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
	(__v16si) _mm512_setzero_si512 (),
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * 32-bit integer -> float conversion macros with a caller-supplied R.
	 *
	 * The epi32 macros call __builtin_ia32_cvtdq2ps512_mask and the epu32
	 * macros call __builtin_ia32_cvtudq2ps512_mask, each as
	 * (source, passthrough, mask, R).  Plain and maskz_ forms both pass
	 * _mm512_setzero_ps() as the passthrough (with mask -1 and U
	 * respectively); mask_ forms pass W and U.
	 */
	#define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \
	(__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)-1, (int)(R)); })

	#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) __extension__ ({ \
	(__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
	(__v16sf)(__m512)(W), \
	(__mmask16)(U), (int)(R)); })

	#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) __extension__ ({ \
	(__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), (int)(R)); })

	/* Same three forms using the cvtudq2ps builtin. */
	#define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \
	(__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)-1, (int)(R)); })

	#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) __extension__ ({ \
	(__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
	(__v16sf)(__m512)(W), \
	(__mmask16)(U), (int)(R)); })

	#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) __extension__ ({ \
	(__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), (int)(R)); })

	/*
	 * epu32 -> float conversions, current rounding direction.
	 *
	 * All three call __builtin_ia32_cvtudq2ps512_mask with
	 * _MM_FROUND_CUR_DIRECTION and differ only in passthrough and mask.
	 */

	/* Unmasked: all-ones mask, undefined vector as passthrough. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_cvtepu32_ps (__m512i __A)
	{
	return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
	(__v16sf) _mm512_undefined_ps (),
	(__mmask16) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Masked: __W is the passthrough operand, __U the mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
	{
	return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
	(__v16sf) __W,
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked: a zero vector is the passthrough operand, __U the mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
	{
	return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
	(__v16sf) _mm512_setzero_ps (),
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * Element-wise conversion of __A, viewed as a __v8si vector, to a
	 * __v8df vector via __builtin_convertvector.
	 */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_cvtepi32_pd(__m256i __A)
	{
	  __v8si __ints = (__v8si)__A;
	  __v8df __doubles = __builtin_convertvector(__ints, __v8df);

	  return (__m512d)__doubles;
	}

	/*
	 * Masked form of _mm512_cvtepi32_pd: the converted vector and __W are
	 * handed to __builtin_ia32_selectpd_512 together with mask __U.
	 * NOTE(review): presumably lanes with a set mask bit take the converted
	 * value and the others keep __W -- confirm.
	 */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
	{
	  __v8df __converted = (__v8df)_mm512_cvtepi32_pd(__A);
	  __v8df __passthru = (__v8df)__W;

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
	                                              __converted, __passthru);
	}

	/*
	 * Zero-masked form of _mm512_cvtepi32_pd: the converted vector and a
	 * zero vector are handed to __builtin_ia32_selectpd_512 with mask __U.
	 * NOTE(review): presumably lanes with a clear mask bit become zero --
	 * confirm.
	 */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
	{
	  __v8df __converted = (__v8df)_mm512_cvtepi32_pd(__A);
	  __v8df __zeros = (__v8df)_mm512_setzero_pd();

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
	                                              __converted, __zeros);
	}

	/* Narrows __A with _mm512_castsi512_si256 and converts the result with
	   _mm512_cvtepi32_pd. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_cvtepi32lo_pd(__m512i __A)
	{
	  __m256i __narrowed = _mm512_castsi512_si256(__A);
	  return (__m512d) _mm512_cvtepi32_pd(__narrowed);
	}

	/* Narrows __A with _mm512_castsi512_si256 and forwards it, with __W and
	   __U, to _mm512_mask_cvtepi32_pd. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
	{
	  __m256i __narrowed = _mm512_castsi512_si256(__A);
	  return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, __narrowed);
	}

	/* Calls the cvtdq2ps512 builtin on __A with an undefined second operand,
	   an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_cvtepi32_ps (__m512i __A)
	{
	  __v16sf __second = (__v16sf) _mm512_undefined_ps ();
	  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, __second,
	                                                  (__mmask16) -1,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtdq2ps512 builtin on __A with __W as the second operand,
	   __U as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
	{
	  __v16si __ints = (__v16si) __A;
	  return (__m512) __builtin_ia32_cvtdq2ps512_mask (__ints, (__v16sf) __W,
	                                                  (__mmask16) __U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtdq2ps512 builtin on __A with a zero vector as the second
	   operand, __U as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
	{
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps ();
	  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, __zero,
	                                                  (__mmask16) __U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Converts __A, viewed as eight unsigned ints (__v8su), to a vector of
	   eight doubles with __builtin_convertvector. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_cvtepu32_pd(__m256i __A)
	{
	  __v8su __uints = (__v8su)__A;
	  return (__m512d)__builtin_convertvector(__uints, __v8df);
	}

	/* Masked form: passes __U, the result of _mm512_cvtepu32_pd(__A) and __W
	   to the selectpd_512 builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
	{
	  __v8df __converted = (__v8df)_mm512_cvtepu32_pd(__A);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, __converted,
	                                              (__v8df)__W);
	}

	/* Zero-masked form: passes __U, the result of _mm512_cvtepu32_pd(__A) and
	   a zero vector to the selectpd_512 builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
	{
	  __v8df __converted = (__v8df)_mm512_cvtepu32_pd(__A);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, __converted,
	                                              __zero);
	}

	/* Narrows __A with _mm512_castsi512_si256 and converts the result with
	   _mm512_cvtepu32_pd. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_cvtepu32lo_pd(__m512i __A)
	{
	  __m256i __narrowed = _mm512_castsi512_si256(__A);
	  return (__m512d) _mm512_cvtepu32_pd(__narrowed);
	}

	/* Narrows __A with _mm512_castsi512_si256 and forwards it, with __W and
	   __U, to _mm512_mask_cvtepu32_pd. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
	{
	  __m256i __narrowed = _mm512_castsi512_si256(__A);
	  return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, __narrowed);
	}

	/* Unmasked form: forwards A (as __v8df) to __builtin_ia32_cvtpd2ps512_mask
	   with a zero __v8sf second operand, an all-ones __mmask8 and R as the
	   int argument (presumably the rounding mode); yields a __m256. */
	#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
	(__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
	(__v8sf)_mm256_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	/* Same builtin, with W as the second operand and U as the mask. */
	#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) __extension__ ({ \
	(__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
	(__v8sf)(__m256)(W), (__mmask8)(U), \
	(int)(R)); })

	/* Same builtin, with a zero second operand and U as the mask. */
	#define _mm512_maskz_cvt_roundpd_ps(U, A, R) __extension__ ({ \
	(__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
	(__v8sf)_mm256_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	/* Calls the cvtpd2ps512 builtin on __A with an undefined second operand,
	   an all-ones mask and _MM_FROUND_CUR_DIRECTION; returns a __m256. */
	static __inline__ __m256 __DEFAULT_FN_ATTRS
	_mm512_cvtpd_ps (__m512d __A)
	{
	  __v8sf __second = (__v8sf) _mm256_undefined_ps ();
	  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, __second,
	                                                  (__mmask8) -1,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtpd2ps512 builtin on __A with __W as the second operand,
	   __U as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __doubles = (__v8df) __A;
	  return (__m256) __builtin_ia32_cvtpd2ps512_mask (__doubles, (__v8sf) __W,
	                                                  (__mmask8) __U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtpd2ps512 builtin on __A with a zero vector as the second
	   operand, __U as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256 __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
	{
	  __v8sf __zero = (__v8sf) _mm256_setzero_ps ();
	  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, __zero,
	                                                  (__mmask8) __U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Builds a 16-float result by concatenating _mm512_cvtpd_ps(__A)
	   (elements 0-7) with a zero __v8sf (elements 8-15). */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_cvtpd_pslo (__m512d __A)
	{
	  __v8sf __low = (__v8sf) _mm512_cvtpd_ps(__A);
	  __v8sf __high = (__v8sf) _mm256_setzero_ps ();
	  return (__m512) __builtin_shufflevector(__low, __high,
	                                          0, 1, 2, 3, 4, 5, 6, 7,
	                                          8, 9, 10, 11, 12, 13, 14, 15);
	}

	/* Builds a 16-float result by concatenating
	   _mm512_mask_cvtpd_ps(_mm512_castps512_ps256(__W), __U, __A)
	   (elements 0-7) with a zero __v8sf (elements 8-15). */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
	{
	  __v8sf __low = (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
	                                               __U, __A);
	  __v8sf __high = (__v8sf) _mm256_setzero_ps ();
	  return (__m512) __builtin_shufflevector (__low, __high,
	                                           0, 1, 2, 3, 4, 5, 6, 7,
	                                           8, 9, 10, 11, 12, 13, 14, 15);
	}

	/* Unmasked form: forwards A (as __v16sf) and the int I to
	   __builtin_ia32_vcvtps2ph512_mask with an undefined __v16hi third operand
	   and an all-ones mask; yields a __m256i. */
	#define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
	                                            (__v16hi)_mm256_undefined_si256(), \
	                                            (__mmask16)-1); })

	/* Same builtin, with W as the third operand and U as the mask.  The
	   parameters are named W (vector) and U (mask) like the other masked
	   macros in this file; argument order for callers is unchanged. */
	#define _mm512_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
	                                            (__v16hi)(__m256i)(W), \
	                                            (__mmask16)(U)); })

	/* Same builtin, with a zero third operand and U as the mask. */
	#define _mm512_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
	                                            (__v16hi)_mm256_setzero_si256(), \
	                                            (__mmask16)(U)); })

	/* Unmasked form: forwards A (as __v16sf) and the int I to
	   __builtin_ia32_vcvtps2ph512_mask with a zero __v16hi third operand and
	   an all-ones mask; yields a __m256i. */
	#define _mm512_cvtps_ph(A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
	                                            (__v16hi)_mm256_setzero_si256(), \
	                                            (__mmask16)-1); })

	/* Same builtin, with W as the third operand and U as the mask.  The
	   parameters are named W (vector) and U (mask) like the other masked
	   macros in this file; argument order for callers is unchanged. */
	#define _mm512_mask_cvtps_ph(W, U, A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
	                                            (__v16hi)(__m256i)(W), \
	                                            (__mmask16)(U)); })

	/* Same builtin, with a zero third operand and U as the mask. */
	#define _mm512_maskz_cvtps_ph(U, A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
	                                            (__v16hi)_mm256_setzero_si256(), \
	                                            (__mmask16)(U)); })

	/* Unmasked form: forwards A (as __v16hi) to
	   __builtin_ia32_vcvtph2ps512_mask with an undefined __v16sf second
	   operand, an all-ones mask and R as the int argument (presumably the
	   rounding mode); yields a __m512. */
	#define _mm512_cvt_roundph_ps(A, R) __extension__ ({ \
	(__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
	(__v16sf)_mm512_undefined_ps(), \
	(__mmask16)-1, (int)(R)); })

	/* Same builtin, with W as the second operand and U as the mask. */
	#define _mm512_mask_cvt_roundph_ps(W, U, A, R) __extension__ ({ \
	(__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
	(__v16sf)(__m512)(W), \
	(__mmask16)(U), (int)(R)); })

	/* Same builtin, with a zero second operand and U as the mask. */
	#define _mm512_maskz_cvt_roundph_ps(U, A, R) __extension__ ({ \
	(__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), (int)(R)); })


	/* Calls the vcvtph2ps512 builtin on __A (as __v16hi) with a zero second
	   operand, an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_cvtph_ps(__m256i __A)
	{
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps ();
	  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, __zero,
	                                                   (__mmask16) -1,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the vcvtph2ps512 builtin on __A with __W as the second operand,
	   __U as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
	{
	  __v16hi __halves = (__v16hi) __A;
	  return (__m512) __builtin_ia32_vcvtph2ps512_mask (__halves, (__v16sf) __W,
	                                                   (__mmask16) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the vcvtph2ps512 builtin on __A with a zero second operand, __U
	   as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
	{
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps ();
	  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, __zero,
	                                                   (__mmask16) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Unmasked form: forwards A (as __v8df) to
	   __builtin_ia32_cvttpd2dq512_mask with a zero __v8si second operand, an
	   all-ones mask and R as the int argument; yields a __m256i. */
	#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_setzero_si256(), \
	(__mmask8)-1, (int)(R)); })

	/* Same builtin, with W as the second operand and U as the mask. */
	#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
	(__v8si)(__m256i)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Same builtin, with a zero second operand and U as the mask. */
	#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_setzero_si256(), \
	(__mmask8)(U), (int)(R)); })

	/* Calls the cvttpd2dq512 builtin on __a with a zero second operand, an
	   all-ones mask and _MM_FROUND_CUR_DIRECTION; returns a __m256i. */
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm512_cvttpd_epi32(__m512d __a)
	{
	  __v8si __zero = (__v8si)_mm256_setzero_si256();
	  return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a, __zero,
	                                                   (__mmask8) -1,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvttpd2dq512 builtin on __A with __W as the second operand,
	   __U as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __doubles = (__v8df) __A;
	  return (__m256i) __builtin_ia32_cvttpd2dq512_mask (__doubles, (__v8si) __W,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvttpd2dq512 builtin on __A with a zero second operand, __U
	   as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
	{
	  __v8si __zero = (__v8si) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, __zero,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Unmasked form: forwards A (as __v16sf) to
	   __builtin_ia32_cvttps2dq512_mask with a zero __v16si second operand, an
	   all-ones mask and R as the int argument; yields a __m512i. */
	#define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)-1, (int)(R)); })

	/* Same builtin, with W as the second operand and U as the mask. */
	#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U), (int)(R)); })

	/* Same builtin, with a zero second operand and U as the mask. */
	#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U), (int)(R)); })

	/* Calls the cvttps2dq512 builtin on __a with a zero second operand, an
	   all-ones mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_cvttps_epi32(__m512 __a)
	{
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_cvttps2dq512_mask((__v16sf) __a, __zero,
	                                                    (__mmask16) -1,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvttps2dq512 builtin on __A with __W as the second operand,
	   __U as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __floats = (__v16sf) __A;
	  return (__m512i) __builtin_ia32_cvttps2dq512_mask (__floats, (__v16si) __W,
	                                                    (__mmask16) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvttps2dq512 builtin on __A with a zero second operand, __U
	   as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
	{
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, __zero,
	                                                    (__mmask16) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Unmasked form: forwards A (as __v16sf) to
	   __builtin_ia32_cvtps2dq512_mask with a zero __v16si second operand, an
	   all-ones mask and R as the int argument (presumably the rounding
	   mode); yields a __m512i. */
	#define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)-1, (int)(R)); })

	/* Same builtin, with W as the second operand and U as the mask. */
	#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U), (int)(R)); })

	/* Same builtin, with a zero second operand and U as the mask. */
	#define _mm512_maskz_cvt_roundps_epi32(U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U), (int)(R)); })

	/* Calls the cvtps2dq512 builtin on __A with an undefined second operand,
	   an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtps_epi32 (__m512 __A)
	{
	  __v16si __second = (__v16si) _mm512_undefined_epi32 ();
	  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, __second,
	                                                   (__mmask16) -1,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtps2dq512 builtin on __A with __W as the second operand,
	   __U as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __floats = (__v16sf) __A;
	  return (__m512i) __builtin_ia32_cvtps2dq512_mask (__floats, (__v16si) __W,
	                                                   (__mmask16) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtps2dq512 builtin on __A with a zero second operand, __U as
	   the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
	{
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, __zero,
	                                                   (__mmask16) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Unmasked form: forwards A (as __v8df) to __builtin_ia32_cvtpd2dq512_mask
	   with a zero __v8si second operand, an all-ones mask and R as the int
	   argument (presumably the rounding mode); yields a __m256i. */
	#define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_setzero_si256(), \
	(__mmask8)-1, (int)(R)); })

	/* Same builtin, with W as the second operand and U as the mask. */
	#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
	(__v8si)(__m256i)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Same builtin, with a zero second operand and U as the mask. */
	#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_setzero_si256(), \
	(__mmask8)(U), (int)(R)); })

	/* Calls the cvtpd2dq512 builtin on __A with an undefined second operand,
	   an all-ones mask and _MM_FROUND_CUR_DIRECTION; returns a __m256i. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtpd_epi32 (__m512d __A)
	{
	  __v8si __second = (__v8si) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, __second,
	                                                   (__mmask8) -1,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtpd2dq512 builtin on __A with __W as the second operand,
	   __U as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __doubles = (__v8df) __A;
	  return (__m256i) __builtin_ia32_cvtpd2dq512_mask (__doubles, (__v8si) __W,
	                                                   (__mmask8) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtpd2dq512 builtin on __A with a zero second operand, __U as
	   the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
	{
	  __v8si __zero = (__v8si) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, __zero,
	                                                   (__mmask8) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Unmasked form: forwards A (as __v16sf) to
	   __builtin_ia32_cvtps2udq512_mask with a zero __v16si second operand, an
	   all-ones mask and R as the int argument (presumably the rounding
	   mode); yields a __m512i. */
	#define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)-1, (int)(R)); })

	/* Same builtin, with W as the second operand and U as the mask. */
	#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U), (int)(R)); })

	/* Same builtin, with a zero second operand and U as the mask. */
	#define _mm512_maskz_cvt_roundps_epu32(U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U), (int)(R)); })

	/* Calls the cvtps2udq512 builtin on __A with an undefined second operand,
	   an all-ones mask and _MM_FROUND_CUR_DIRECTION.  This is an ordinary
	   function, not a macro, so no line-continuation backslashes are used. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtps_epu32 ( __m512 __A)
	{
	  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
	                                                    (__v16si)
	                                                    _mm512_undefined_epi32 (),
	                                                    (__mmask16) -1,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtps2udq512 builtin on __A with __W as the second operand,
	   __U as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __floats = (__v16sf) __A;
	  return (__m512i) __builtin_ia32_cvtps2udq512_mask (__floats, (__v16si) __W,
	                                                    (__mmask16) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtps2udq512 builtin on __A with a zero second operand, __U
	   as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
	{
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, __zero,
	                                                    (__mmask16) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Unmasked form: forwards A (as __v8df) to
	   __builtin_ia32_cvtpd2udq512_mask with a zero __v8si second operand, an
	   all-ones mask and R as the int argument (presumably the rounding
	   mode); yields a __m256i. */
	#define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \
	  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
	                                            (__v8si)_mm256_setzero_si256(), \
	                                            (__mmask8)-1, (int)(R)); })

	/* Same builtin, with W as the second operand and U as the mask.  W is
	   first cast to __m256i, as _mm512_mask_cvt_roundpd_epi32 does, so an
	   argument of the wrong vector type is not silently reinterpreted. */
	#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) __extension__ ({ \
	  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
	                                            (__v8si)(__m256i)(W), \
	                                            (__mmask8)(U), (int)(R)); })

	/* Same builtin, with a zero second operand and U as the mask. */
	#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) __extension__ ({ \
	  (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
	                                            (__v8si)_mm256_setzero_si256(), \
	                                            (__mmask8)(U), (int)(R)); })

	/* Calls the cvtpd2udq512 builtin on __A with an undefined second operand,
	   an all-ones mask and _MM_FROUND_CUR_DIRECTION; returns a __m256i. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtpd_epu32 (__m512d __A)
	{
	  __v8si __second = (__v8si) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, __second,
	                                                    (__mmask8) -1,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtpd2udq512 builtin on __A with __W as the second operand,
	   __U as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __doubles = (__v8df) __A;
	  return (__m256i) __builtin_ia32_cvtpd2udq512_mask (__doubles, (__v8si) __W,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls the cvtpd2udq512 builtin on __A with a zero second operand, __U
	   as the mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
	{
	  __v8si __zero = (__v8si) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, __zero,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Returns element 0 of __a as a double. */
	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_cvtsd_f64(__m512d __a)
	{
	  double __first = __a[0];
	  return __first;
	}

	/* Returns element 0 of __a as a float. */
	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_cvtss_f32(__m512 __a)
	{
	  float __first = __a[0];
	  return __first;
	}

	/* Unpack and Interleave */

	/* Interleaves the odd-indexed doubles of __a and __b: result elements are
	   a[1], b[1], a[3], b[3], a[5], b[5], a[7], b[7] (indices 8-15 of the
	   shuffle select from __b). */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_unpackhi_pd(__m512d __a, __m512d __b)
	{
	  __v8df __lhs = (__v8df)__a;
	  __v8df __rhs = (__v8df)__b;
	  return (__m512d)__builtin_shufflevector(__lhs, __rhs,
	                                          1,   9,
	                                          1+2, 9+2,
	                                          1+4, 9+4,
	                                          1+6, 9+6);
	}

	/* Masked form: passes __U, _mm512_unpackhi_pd(__A, __B) and __W to the
	   selectpd_512 builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __unpacked = (__v8df)_mm512_unpackhi_pd(__A, __B);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, __unpacked,
	                                              (__v8df)__W);
	}

	/* Zero-masked form: passes __U, _mm512_unpackhi_pd(__A, __B) and a zero
	   vector to the selectpd_512 builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __unpacked = (__v8df)_mm512_unpackhi_pd(__A, __B);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, __unpacked,
	                                              __zero);
	}

	/* Interleaves the even-indexed doubles of __a and __b: result elements
	   are a[0], b[0], a[2], b[2], a[4], b[4], a[6], b[6] (indices 8-15 of the
	   shuffle select from __b). */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_unpacklo_pd(__m512d __a, __m512d __b)
	{
	  __v8df __lhs = (__v8df)__a;
	  __v8df __rhs = (__v8df)__b;
	  return (__m512d)__builtin_shufflevector(__lhs, __rhs,
	                                          0,   8,
	                                          0+2, 8+2,
	                                          0+4, 8+4,
	                                          0+6, 8+6);
	}

	/* Masked form: passes __U, _mm512_unpacklo_pd(__A, __B) and __W to the
	   selectpd_512 builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __unpacked = (__v8df)_mm512_unpacklo_pd(__A, __B);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, __unpacked,
	                                              (__v8df)__W);
	}

	/* Zero-masked form: passes __U, _mm512_unpacklo_pd(__A, __B) and a zero
	   vector to the selectpd_512 builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __unpacked = (__v8df)_mm512_unpacklo_pd(__A, __B);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, __unpacked,
	                                              __zero);
	}

	/* For each group of four floats (offsets 0, 4, 8, 12), interleaves
	   elements 2 and 3 of __a with elements 2 and 3 of __b; shuffle indices
	   16-31 select from __b. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_unpackhi_ps(__m512 __a, __m512 __b)
	{
	  __v16sf __lhs = (__v16sf)__a;
	  __v16sf __rhs = (__v16sf)__b;
	  return (__m512)__builtin_shufflevector(__lhs, __rhs,
	                                         2,    18,    3,    19,
	                                         2+4,  18+4,  3+4,  19+4,
	                                         2+8,  18+8,  3+8,  19+8,
	                                         2+12, 18+12, 3+12, 19+12);
	}

	/* Masked form: passes __U, _mm512_unpackhi_ps(__A, __B) and __W to the
	   selectps_512 builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __unpacked = (__v16sf)_mm512_unpackhi_ps(__A, __B);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, __unpacked,
	                                             (__v16sf)__W);
	}

	/* Zero-masked form: passes __U, _mm512_unpackhi_ps(__A, __B) and a zero
	   vector to the selectps_512 builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __unpacked = (__v16sf)_mm512_unpackhi_ps(__A, __B);
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, __unpacked,
	                                             __zero);
	}

	/* For each group of four floats (offsets 0, 4, 8, 12), interleaves
	   elements 0 and 1 of __a with elements 0 and 1 of __b; shuffle indices
	   16-31 select from __b. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_unpacklo_ps(__m512 __a, __m512 __b)
	{
	  __v16sf __lhs = (__v16sf)__a;
	  __v16sf __rhs = (__v16sf)__b;
	  return (__m512)__builtin_shufflevector(__lhs, __rhs,
	                                         0,    16,    1,    17,
	                                         0+4,  16+4,  1+4,  17+4,
	                                         0+8,  16+8,  1+8,  17+8,
	                                         0+12, 16+12, 1+12, 17+12);
	}

	/* Masked form: passes __U, _mm512_unpacklo_ps(__A, __B) and __W to the
	   selectps_512 builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __unpacked = (__v16sf)_mm512_unpacklo_ps(__A, __B);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, __unpacked,
	                                             (__v16sf)__W);
	}

	/* Zero-masked form: passes __U, _mm512_unpacklo_ps(__A, __B) and a zero
	   vector to the selectps_512 builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __unpacked = (__v16sf)_mm512_unpacklo_ps(__A, __B);
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, __unpacked,
	                                             __zero);
	}

	/* For each group of four 32-bit elements (offsets 0, 4, 8, 12),
	   interleaves elements 2 and 3 of __A with elements 2 and 3 of __B;
	   shuffle indices 16-31 select from __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
	{
	  __v16si __lhs = (__v16si)__A;
	  __v16si __rhs = (__v16si)__B;
	  return (__m512i)__builtin_shufflevector(__lhs, __rhs,
	                                          2,    18,    3,    19,
	                                          2+4,  18+4,  3+4,  19+4,
	                                          2+8,  18+8,  3+8,  19+8,
	                                          2+12, 18+12, 3+12, 19+12);
	}

	/* Masked form: passes __U, _mm512_unpackhi_epi32(__A, __B) and __W to the
	   selectd_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __unpacked = (__v16si)_mm512_unpackhi_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, __unpacked,
	                                             (__v16si)__W);
	}

	/* Zero-masked form: passes __U, _mm512_unpackhi_epi32(__A, __B) and a
	   zero vector to the selectd_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __unpacked = (__v16si)_mm512_unpackhi_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, __unpacked,
	                                             __zero);
	}

	/* For each group of four 32-bit elements (offsets 0, 4, 8, 12),
	   interleaves elements 0 and 1 of __A with elements 0 and 1 of __B;
	   shuffle indices 16-31 select from __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
	{
	  __v16si __lhs = (__v16si)__A;
	  __v16si __rhs = (__v16si)__B;
	  return (__m512i)__builtin_shufflevector(__lhs, __rhs,
	                                          0,    16,    1,    17,
	                                          0+4,  16+4,  1+4,  17+4,
	                                          0+8,  16+8,  1+8,  17+8,
	                                          0+12, 16+12, 1+12, 17+12);
	}

	/* Masked form: passes __U, _mm512_unpacklo_epi32(__A, __B) and __W to the
	   selectd_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __unpacked = (__v16si)_mm512_unpacklo_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, __unpacked,
	                                             (__v16si)__W);
	}

	/* Zero-masked form: passes __U, _mm512_unpacklo_epi32(__A, __B) and a
	   zero vector to the selectd_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __unpacked = (__v16si)_mm512_unpacklo_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, __unpacked,
	                                             __zero);
	}

	/* Interleaves the odd-indexed 64-bit elements of __A and __B: result
	   elements are A[1], B[1], A[3], B[3], A[5], B[5], A[7], B[7] (indices
	   8-15 of the shuffle select from __B). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
	{
	  __v8di __lhs = (__v8di)__A;
	  __v8di __rhs = (__v8di)__B;
	  return (__m512i)__builtin_shufflevector(__lhs, __rhs,
	                                          1,   9,
	                                          1+2, 9+2,
	                                          1+4, 9+4,
	                                          1+6, 9+6);
	}

	/* Masked form: passes __U, _mm512_unpackhi_epi64(__A, __B) and __W to the
	   selectq_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __unpacked = (__v8di)_mm512_unpackhi_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, __unpacked,
	                                             (__v8di)__W);
	}

	/* Zero-masked form: passes __U, _mm512_unpackhi_epi64(__A, __B) and a
	   zero vector to the selectq_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __unpacked = (__v8di)_mm512_unpackhi_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, __unpacked,
	                                             __zero);
	}

	/* Interleaves the even-indexed 64-bit elements of __A and __B: result
	   elements are A[0], B[0], A[2], B[2], A[4], B[4], A[6], B[6] (indices
	   8-15 of the shuffle select from __B). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
	{
	  __v8di __lhs = (__v8di)__A;
	  __v8di __rhs = (__v8di)__B;
	  return (__m512i)__builtin_shufflevector(__lhs, __rhs,
	                                          0,   8,
	                                          0+2, 8+2,
	                                          0+4, 8+4,
	                                          0+6, 8+6);
	}

	/* Masked form: passes __U, _mm512_unpacklo_epi64(__A, __B) and __W to the
	   selectq_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __unpacked = (__v8di)_mm512_unpacklo_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, __unpacked,
	                                             (__v8di)__W);
	}

	/* Zero-masked form: passes __U, _mm512_unpacklo_epi64(__A, __B) and a
	   zero vector to the selectq_512 builtin. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __unpacked = (__v8di)_mm512_unpacklo_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, __unpacked,
	                                             __zero);
	}


	/* SIMD load ops */

	/* Calls the loaddqusi512 builtin on __P (as const int *) with a zero
	   second operand and an all-ones mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_loadu_si512 (void const *__P)
	{
	  const int *__src = (const int *) __P;
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_loaddqusi512_mask (__src, __zero,
	                                                    (__mmask16) -1);
	}

	/* Calls the loaddqusi512 builtin on __P (as const int *) with __W as the
	   second operand and __U as the mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
	{
	  const int *__src = (const int *) __P;
	  return (__m512i) __builtin_ia32_loaddqusi512_mask (__src, (__v16si) __W,
	                                                    (__mmask16) __U);
	}


	/* Calls the loaddqusi512 builtin on __P (as const int *) with a zero
	   second operand and __U as the mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
	{
	  const int *__src = (const int *)__P;
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_loaddqusi512_mask (__src, __zero,
	                                                    (__mmask16) __U);
	}

	/* Calls the loaddqudi512 builtin on __P (as const long long *) with __W
	   as the second operand and __U as the mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
	{
	  const long long *__src = (const long long *) __P;
	  return (__m512i) __builtin_ia32_loaddqudi512_mask (__src, (__v8di) __W,
	                                                    (__mmask8) __U);
	}

	/* Calls the loaddqudi512 builtin on __P (as const long long *) with a
	   zero second operand and __U as the mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
	{
	  const long long *__src = (const long long *)__P;
	  __v8di __zero = (__v8di) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_loaddqudi512_mask (__src, __zero,
	                                                    (__mmask8) __U);
	}

	/* Calls the loadups512 builtin on __P (as const float *) with __W as the
	   second operand and __U as the mask. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
	{
	  const float *__src = (const float *) __P;
	  return (__m512) __builtin_ia32_loadups512_mask (__src, (__v16sf) __W,
	                                                 (__mmask16) __U);
	}

	/* Calls the loadups512 builtin on __P (as const float *) with a zero
	   second operand and __U as the mask. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
	{
	  const float *__src = (const float *)__P;
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps ();
	  return (__m512) __builtin_ia32_loadups512_mask (__src, __zero,
	                                                 (__mmask16) __U);
	}

	/* Calls the loadupd512 builtin on __P (as const double *) with __W as the
	   second operand and __U as the mask. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
	{
	  const double *__src = (const double *) __P;
	  return (__m512d) __builtin_ia32_loadupd512_mask (__src, (__v8df) __W,
	                                                  (__mmask8) __U);
	}

	/* Calls the loadupd512 builtin on __P (as const double *) with a zero
	   second operand and __U as the mask. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
	{
	  const double *__src = (const double *)__P;
	  __v8df __zero = (__v8df) _mm512_setzero_pd ();
	  return (__m512d) __builtin_ia32_loadupd512_mask (__src, __zero,
	                                                  (__mmask8) __U);
	}

	/* Reads a __m512d from __p through a local wrapper struct declared
	   __packed__ and __may_alias__, so the access does not rely on the
	   natural alignment or effective type of the pointed-to storage.  The
	   pointer keeps its const qualifier; nothing is written through it. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_loadu_pd(void const *__p)
	{
	  struct __loadu_pd {
	    __m512d __v;
	  } __attribute__((__packed__, __may_alias__));
	  return ((const struct __loadu_pd*)__p)->__v;
	}

	/* Reads a __m512 from __p through a local wrapper struct declared
	   __packed__ and __may_alias__, so the access does not rely on the
	   natural alignment or effective type of the pointed-to storage.  The
	   pointer keeps its const qualifier; nothing is written through it. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_loadu_ps(void const *__p)
	{
	  struct __loadu_ps {
	    __m512 __v;
	  } __attribute__((__packed__, __may_alias__));
	  return ((const struct __loadu_ps*)__p)->__v;
	}

	/* Calls the loadaps512 builtin on __p (as const __v16sf *) with a zero
	   second operand and an all-ones mask. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_load_ps(void const *__p)
	{
	  const __v16sf *__src = (const __v16sf *)__p;
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps ();
	  return (__m512) __builtin_ia32_loadaps512_mask (__src, __zero,
	                                                 (__mmask16) -1);
	}

	/* Calls the loadaps512 builtin on __P (as const __v16sf *) with __W as
	   the second operand and __U as the mask. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
	{
	  const __v16sf *__src = (const __v16sf *) __P;
	  return (__m512) __builtin_ia32_loadaps512_mask (__src, (__v16sf) __W,
	                                                 (__mmask16) __U);
	}

	/* Calls the loadaps512 builtin on __P (as const __v16sf *) with a zero
	   second operand and __U as the mask. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
	{
	  const __v16sf *__src = (const __v16sf *)__P;
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps ();
	  return (__m512) __builtin_ia32_loadaps512_mask (__src, __zero,
	                                                 (__mmask16) __U);
	}

	/* Calls the loadapd512 builtin on __p (as const __v8df *) with a zero
	   second operand and an all-ones mask. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_load_pd(void const *__p)
	{
	  const __v8df *__src = (const __v8df *)__p;
	  __v8df __zero = (__v8df) _mm512_setzero_pd ();
	  return (__m512d) __builtin_ia32_loadapd512_mask (__src, __zero,
	                                                  (__mmask8) -1);
	}

	/* Calls the loadapd512 builtin on __P (as const __v8df *) with __W as the
	   second operand and __U as the mask. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
	{
	  const __v8df *__src = (const __v8df *) __P;
	  return (__m512d) __builtin_ia32_loadapd512_mask (__src, (__v8df) __W,
	                                                  (__mmask8) __U);
	}

	/* Calls the loadapd512 builtin on __P (as const __v8df *) with a zero
	   second operand and __U as the mask. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
	{
	  const __v8df *__src = (const __v8df *)__P;
	  __v8df __zero = (__v8df) _mm512_setzero_pd ();
	  return (__m512d) __builtin_ia32_loadapd512_mask (__src, __zero,
	                                                  (__mmask8) __U);
	}

	/* Loads a __m512i from the memory __P points to by dereferencing it as a
	   __m512i pointer.  The pointer value itself must not be cast to the
	   vector type.  NOTE(review): the plain dereference presumably requires
	   __P to satisfy the alignment of __m512i -- confirm against the type's
	   declaration. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_load_si512 (void const *__P)
	{
	  return *(const __m512i *) __P;
	}

	/* Loads a __m512i from the memory __P points to by dereferencing it as a
	   __m512i pointer.  The pointer value itself must not be cast to the
	   vector type.  NOTE(review): the plain dereference presumably requires
	   __P to satisfy the alignment of __m512i -- confirm against the type's
	   declaration. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_load_epi32 (void const *__P)
	{
	  return *(const __m512i *) __P;
	}

	/* Loads a __m512i from the memory __P points to by dereferencing it as a
	   __m512i pointer.  The pointer value itself must not be cast to the
	   vector type.  NOTE(review): the plain dereference presumably requires
	   __P to satisfy the alignment of __m512i -- confirm against the type's
	   declaration. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_load_epi64 (void const *__P)
	{
	  return *(const __m512i *) __P;
	}

	/* SIMD store ops */

	/* Calls the storedqudi512 builtin with __P (as long long *), __A and mask
	   __U. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
	{
	  long long *__dst = (long long *)__P;
	  __builtin_ia32_storedqudi512_mask (__dst, (__v8di) __A, (__mmask8) __U);
	}

	/* Calls the storedqusi512 builtin with __P (as int *), __A and an
	   all-ones mask. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_storeu_si512 (void *__P, __m512i __A)
	{
	  int *__dst = (int *) __P;
	  __builtin_ia32_storedqusi512_mask (__dst, (__v16si) __A, (__mmask16) -1);
	}

	/* Calls the storedqusi512 builtin with __P (as int *), __A and mask
	   __U. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
	{
	  int *__dst = (int *)__P;
	  __builtin_ia32_storedqusi512_mask (__dst, (__v16si) __A, (__mmask16) __U);
	}

	/* Calls the storeupd512 builtin with __P (as double *), __A and mask
	   __U. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
	{
	  double *__dst = (double *)__P;
	  __builtin_ia32_storeupd512_mask (__dst, (__v8df) __A, (__mmask8) __U);
	}

	/* Calls the storeupd512 builtin with __P (as double *), __A and an
	   all-ones mask. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_storeu_pd(void *__P, __m512d __A)
	{
	  double *__dst = (double *)__P;
	  __builtin_ia32_storeupd512_mask(__dst, (__v8df)__A, (__mmask8)-1);
	}

	/* Calls the storeups512 builtin with __P (as float *), __A and mask
	   __U. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
	{
	  float *__dst = (float *)__P;
	  __builtin_ia32_storeups512_mask (__dst, (__v16sf) __A, (__mmask16) __U);
	}

	/* Calls the storeups512 builtin with __P (as float *), __A and an
	   all-ones mask. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_storeu_ps(void *__P, __m512 __A)
	{
	  float *__dst = (float *)__P;
	  __builtin_ia32_storeups512_mask(__dst, (__v16sf)__A, (__mmask16)-1);
	}

	/* Calls the storeapd512 builtin with __P (as __v8df *), __A and mask
	   __U. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
	{
	  __v8df *__dst = (__v8df *)__P;
	  __builtin_ia32_storeapd512_mask (__dst, (__v8df) __A, (__mmask8) __U);
	}

	/* Stores __A to the memory __P points to by writing through it as a
	   __m512d pointer.  A cast of the pointer value itself is not an lvalue
	   and would not write memory.  NOTE(review): the plain store presumably
	   requires __P to satisfy the alignment of __m512d -- confirm against
	   the type's declaration. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_store_pd(void *__P, __m512d __A)
	{
	  *(__m512d *)__P = __A;
	}

	/* Calls the storeaps512 builtin with __P (as __v16sf *), __A and mask
	   __U. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
	{
	  __v16sf *__dst = (__v16sf *)__P;
	  __builtin_ia32_storeaps512_mask (__dst, (__v16sf) __A, (__mmask16) __U);
	}

	/* Stores __A to the memory __P points to by writing through it as a
	   __m512 pointer.  A cast of the pointer value itself is not an lvalue
	   and would not write memory.  NOTE(review): the plain store presumably
	   requires __P to satisfy the alignment of __m512 -- confirm against the
	   type's declaration. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_store_ps(void *__P, __m512 __A)
	{
	  *(__m512 *)__P = __A;
	}

	/* Stores __A to the memory __P points to by writing through it as a
	   __m512i pointer.  A cast of the pointer value itself is not an lvalue
	   and would not write memory.  NOTE(review): the plain store presumably
	   requires __P to satisfy the alignment of __m512i -- confirm against
	   the type's declaration. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_store_si512 (void *__P, __m512i __A)
	{
	  *(__m512i *) __P = __A;
	}

	/* Stores __A to the memory __P points to by writing through it as a
	   __m512i pointer.  A cast of the pointer value itself is not an lvalue
	   and would not write memory.  NOTE(review): the plain store presumably
	   requires __P to satisfy the alignment of __m512i -- confirm against
	   the type's declaration. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_store_epi32 (void *__P, __m512i __A)
	{
	  *(__m512i *) __P = __A;
	}

	/* Stores __A to the memory __P points to by writing through it as a
	   __m512i pointer.  A cast of the pointer value itself is not an lvalue
	   and would not write memory.  NOTE(review): the plain store presumably
	   requires __P to satisfy the alignment of __m512i -- confirm against
	   the type's declaration. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_store_epi64 (void *__P, __m512i __A)
	{
	  *(__m512i *) __P = __A;
	}

	/* Mask ops */

	/* Returns the result of the knothi builtin applied to mask __M. */
	static __inline __mmask16 __DEFAULT_FN_ATTRS
	_mm512_knot(__mmask16 __M)
	{
	  __mmask16 __result = __builtin_ia32_knothi(__M);
	  return __result;
	}

	/* Integer compare */

	/* Convenience wrappers around _mm512_cmp_epi32_mask and
	   _mm512_mask_cmp_epi32_mask: each macro fixes the final predicate
	   argument to one of _MM_CMPINT_EQ, _GE, _GT, _LE, _LT or _NE.  The
	   "mask_" forms additionally forward k as the first argument. */
	#define _mm512_cmpeq_epi32_mask(A, B) \
	_mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
	#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
	_mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
	#define _mm512_cmpge_epi32_mask(A, B) \
	_mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
	#define _mm512_mask_cmpge_epi32_mask(k, A, B) \
	_mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
	#define _mm512_cmpgt_epi32_mask(A, B) \
	_mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
	#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
	_mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
	#define _mm512_cmple_epi32_mask(A, B) \
	_mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
	#define _mm512_mask_cmple_epi32_mask(k, A, B) \
	_mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
	#define _mm512_cmplt_epi32_mask(A, B) \
	_mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
	#define _mm512_mask_cmplt_epi32_mask(k, A, B) \
	_mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
	#define _mm512_cmpneq_epi32_mask(A, B) \
	_mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
	#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
	_mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)

	/* Convenience wrappers around _mm512_cmp_epu32_mask and
	   _mm512_mask_cmp_epu32_mask: each macro fixes the final predicate
	   argument to one of _MM_CMPINT_EQ, _GE, _GT, _LE, _LT or _NE.  The
	   "mask_" forms additionally forward k as the first argument. */
	#define _mm512_cmpeq_epu32_mask(A, B) \
	_mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
	#define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
	_mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
	#define _mm512_cmpge_epu32_mask(A, B) \
	_mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
	#define _mm512_mask_cmpge_epu32_mask(k, A, B) \
	_mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
	#define _mm512_cmpgt_epu32_mask(A, B) \
	_mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
	#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
	_mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
	#define _mm512_cmple_epu32_mask(A, B) \
	_mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
	#define _mm512_mask_cmple_epu32_mask(k, A, B) \
	_mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
	#define _mm512_cmplt_epu32_mask(A, B) \
	_mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
	#define _mm512_mask_cmplt_epu32_mask(k, A, B) \
	_mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
	#define _mm512_cmpneq_epu32_mask(A, B) \
	_mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
	#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
	_mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)

	/* Convenience wrappers around _mm512_cmp_epi64_mask and
	   _mm512_mask_cmp_epi64_mask: each macro fixes the final predicate
	   argument to one of _MM_CMPINT_EQ, _GE, _GT, _LE, _LT or _NE.  The
	   "mask_" forms additionally forward k as the first argument. */
	#define _mm512_cmpeq_epi64_mask(A, B) \
	_mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
	#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
	_mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
	#define _mm512_cmpge_epi64_mask(A, B) \
	_mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
	#define _mm512_mask_cmpge_epi64_mask(k, A, B) \
	_mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
	#define _mm512_cmpgt_epi64_mask(A, B) \
	_mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
	#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
	_mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
	#define _mm512_cmple_epi64_mask(A, B) \
	_mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
	#define _mm512_mask_cmple_epi64_mask(k, A, B) \
	_mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
	#define _mm512_cmplt_epi64_mask(A, B) \
	_mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
	#define _mm512_mask_cmplt_epi64_mask(k, A, B) \
	_mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
	#define _mm512_cmpneq_epi64_mask(A, B) \
	_mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
	#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
	_mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)

	/* Convenience wrappers around _mm512_cmp_epu64_mask and
	   _mm512_mask_cmp_epu64_mask: each macro fixes the final predicate
	   argument to one of _MM_CMPINT_EQ, _GE, _GT, _LE, _LT or _NE.  The
	   "mask_" forms additionally forward k as the first argument. */
	#define _mm512_cmpeq_epu64_mask(A, B) \
	_mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
	#define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
	_mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
	#define _mm512_cmpge_epu64_mask(A, B) \
	_mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
	#define _mm512_mask_cmpge_epu64_mask(k, A, B) \
	_mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
	#define _mm512_cmpgt_epu64_mask(A, B) \
	_mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
	#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
	_mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
	#define _mm512_cmple_epu64_mask(A, B) \
	_mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
	#define _mm512_mask_cmple_epu64_mask(k, A, B) \
	_mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
	#define _mm512_cmplt_epu64_mask(A, B) \
	_mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
	#define _mm512_mask_cmplt_epu64_mask(k, A, B) \
	_mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
	#define _mm512_cmpneq_epu64_mask(A, B) \
	_mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
	#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
	_mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)

	/* Widens the sixteen bytes of __A to sixteen 32-bit ints with
	   __builtin_convertvector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepi8_epi32(__m128i __A)
	{
	  /* The extension must always be signed.  __v16qi is built on plain char,
	     whose signedness varies, so the source is viewed as __v16qs. */
	  __v16qs __bytes = (__v16qs)__A;
	  return (__m512i)__builtin_convertvector(__bytes, __v16si);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
	{
	return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
	(__v16si)_mm512_cvtepi8_epi32(__A),
	(__v16si)__W);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
	{
	return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
	(__v16si)_mm512_cvtepi8_epi32(__A),
	(__v16si)_mm512_setzero_si512());
	}

	/* Widen the first 8 bytes of __A (elements 0..7) to 8 64-bit lanes with
	   sign extension. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepi8_epi64(__m128i __A)
	{
	  /* __v16qi is built on plain char, whose signedness is not fixed; view the
	     input as __v16qs so the widening is always a signed extension. */
	  __v16qs __bytes = (__v16qs)__A;
	  return (__m512i)__builtin_convertvector(
	      __builtin_shufflevector(__bytes, __bytes, 0, 1, 2, 3, 4, 5, 6, 7),
	      __v8di);
	}

	/* Masked form: lanes are selected under __U between the conversion result
	   and __W by __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepi8_epi64(__A);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __passthru);
	}

	/* Zero-masked form: the alternative operand is an all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepi8_epi64(__A);
	  __v8di __zero = (__v8di)_mm512_setzero_si512 ();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __zero);
	}

	/* Convert __X, viewed as __v8si, element-wise to __v8di. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepi32_epi64(__m256i __X)
	{
	  __v8si __dwords = (__v8si)__X;
	  __v8di __wide = __builtin_convertvector(__dwords, __v8di);
	  return (__m512i)__wide;
	}

	/* Masked form: selection under __U between the conversion result and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepi32_epi64(__X);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the conversion result and
	   an all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepi32_epi64(__X);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __zero);
	}

	/* Convert __A, viewed as __v16hi, element-wise to __v16si. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepi16_epi32(__m256i __A)
	{
	  __v16hi __words = (__v16hi)__A;
	  __v16si __wide = __builtin_convertvector(__words, __v16si);
	  return (__m512i)__wide;
	}

	/* Masked form: selection under __U between the conversion result and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
	{
	  __v16si __converted = (__v16si)_mm512_cvtepi16_epi32(__A);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __converted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the conversion result and
	   an all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
	{
	  __v16si __converted = (__v16si)_mm512_cvtepi16_epi32(__A);
	  __v16si __zero = (__v16si)_mm512_setzero_si512 ();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __converted,
	                                             __zero);
	}

	/* Convert __A, viewed as __v8hi, element-wise to __v8di. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepi16_epi64(__m128i __A)
	{
	  __v8hi __words = (__v8hi)__A;
	  __v8di __wide = __builtin_convertvector(__words, __v8di);
	  return (__m512i)__wide;
	}

	/* Masked form: selection under __U between the conversion result and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepi16_epi64(__A);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the conversion result and
	   an all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepi16_epi64(__A);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __zero);
	}

	/* Convert __A, viewed as __v16qu, element-wise to __v16si. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepu8_epi32(__m128i __A)
	{
	  __v16qu __bytes = (__v16qu)__A;
	  __v16si __wide = __builtin_convertvector(__bytes, __v16si);
	  return (__m512i)__wide;
	}

	/* Masked form: selection under __U between the conversion result and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
	{
	  __v16si __converted = (__v16si)_mm512_cvtepu8_epi32(__A);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __converted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the conversion result and
	   an all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
	{
	  __v16si __converted = (__v16si)_mm512_cvtepu8_epi32(__A);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __converted,
	                                             __zero);
	}

	/* Convert the first 8 bytes of __A (elements 0..7), viewed as __v16qu,
	   element-wise to __v8di. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepu8_epi64(__m128i __A)
	{
	  __v16qu __bytes = (__v16qu)__A;
	  return (__m512i)__builtin_convertvector(
	      __builtin_shufflevector(__bytes, __bytes, 0, 1, 2, 3, 4, 5, 6, 7),
	      __v8di);
	}

	/* Masked form: selection under __U between the conversion result and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepu8_epi64(__A);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the conversion result and
	   an all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepu8_epi64(__A);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __zero);
	}

	/* Convert __X, viewed as __v8su, element-wise to __v8di. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepu32_epi64(__m256i __X)
	{
	  __v8su __dwords = (__v8su)__X;
	  __v8di __wide = __builtin_convertvector(__dwords, __v8di);
	  return (__m512i)__wide;
	}

	/* Masked form: selection under __U between the conversion result and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepu32_epi64(__X);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the conversion result and
	   an all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepu32_epi64(__X);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __zero);
	}

	/* Convert __A, viewed as __v16hu, element-wise to __v16si. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepu16_epi32(__m256i __A)
	{
	  __v16hu __words = (__v16hu)__A;
	  __v16si __wide = __builtin_convertvector(__words, __v16si);
	  return (__m512i)__wide;
	}

	/* Masked form: selection under __U between the conversion result and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
	{
	  __v16si __converted = (__v16si)_mm512_cvtepu16_epi32(__A);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __converted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the conversion result and
	   an all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
	{
	  __v16si __converted = (__v16si)_mm512_cvtepu16_epi32(__A);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __converted,
	                                             __zero);
	}

	/* Convert __A, viewed as __v8hu, element-wise to __v8di. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepu16_epi64(__m128i __A)
	{
	  __v8hu __words = (__v8hu)__A;
	  __v8di __wide = __builtin_convertvector(__words, __v8di);
	  return (__m512i)__wide;
	}

	/* Masked form: selection under __U between the conversion result and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepu16_epi64(__A);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the conversion result and
	   an all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
	{
	  __v8di __converted = (__v8di)_mm512_cvtepu16_epi64(__A);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __converted,
	                                             __zero);
	}

	/* Unmasked form: calls __builtin_ia32_prorvd512_mask on __A and __B with a
	   zero vector as the third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_rorv_epi32 (__m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  __mmask16 __all_lanes = (__mmask16) -1;
	  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
	                                                  (__v16si) __B,
	                                                  __zero, __all_lanes);
	}

	/* Masked form: __W is the third operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __passthru = (__v16si) __W;
	  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
	                                                  (__v16si) __B,
	                                                  __passthru,
	                                                  (__mmask16) __U);
	}

	/* Zero-masked form: a zero vector is the third operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
	                                                  (__v16si) __B,
	                                                  __zero,
	                                                  (__mmask16) __U);
	}

	/* Unmasked form: calls __builtin_ia32_prorvq512_mask on __A and __B with a
	   zero vector as the third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_rorv_epi64 (__m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di) _mm512_setzero_si512 ();
	  __mmask8 __all_lanes = (__mmask8) -1;
	  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
	                                                  (__v8di) __B,
	                                                  __zero, __all_lanes);
	}

	/* Masked form: __W is the third operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __passthru = (__v8di) __W;
	  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
	                                                  (__v8di) __B,
	                                                  __passthru,
	                                                  (__mmask8) __U);
	}

	/* Zero-masked form: a zero vector is the third operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
	                                                  (__v8di) __B,
	                                                  __zero,
	                                                  (__mmask8) __U);
	}



	/* Generic integer comparison macros.
	   a, b: operands, cast to __m512i and then to __v16si (32-bit forms) or
	         __v8di (64-bit forms).
	   p:    predicate, cast to int and passed straight to the builtin.
	   The epi forms call the cmpd/cmpq builtins and the epu forms call the
	   ucmpd/ucmpq builtins; both use the same operand casts.
	   The unmasked forms pass an all-ones mask as the last builtin operand. */
	#define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \
	(__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
	(__v16si)(__m512i)(b), (int)(p), \
	(__mmask16)-1); })

	#define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \
	(__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
	(__v16si)(__m512i)(b), (int)(p), \
	(__mmask16)-1); })

	#define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
	(__v8di)(__m512i)(b), (int)(p), \
	(__mmask8)-1); })

	#define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \
	(__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
	(__v8di)(__m512i)(b), (int)(p), \
	(__mmask8)-1); })

	/* Masked forms: identical to the macros above except that the caller's
	   mask m is passed as the last builtin operand. */
	#define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
	(__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
	(__v16si)(__m512i)(b), (int)(p), \
	(__mmask16)(m)); })

	#define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
	(__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
	(__v16si)(__m512i)(b), (int)(p), \
	(__mmask16)(m)); })

	#define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
	(__v8di)(__m512i)(b), (int)(p), \
	(__mmask8)(m)); })

	#define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
	(__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
	(__v8di)(__m512i)(b), (int)(p), \
	(__mmask8)(m)); })

	/* Wrappers around the prold512/prolq512 mask builtins.
	   a: source vector; b: cast to int and passed as the builtin's second
	   operand (presumably the rotate count -- confirm against the builtin).
	   Unmasked forms pass a zero vector and an all-ones mask; the mask forms
	   pass W and U; the maskz forms pass a zero vector and U. */
	#define _mm512_rol_epi32(a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)-1); })

	#define _mm512_mask_rol_epi32(W, U, a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U)); })

	#define _mm512_maskz_rol_epi32(U, a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U)); })

	/* 64-bit element variants of the three macros above. */
	#define _mm512_rol_epi64(a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
	(__v8di)_mm512_setzero_si512(), \
	(__mmask8)-1); })

	#define _mm512_mask_rol_epi64(W, U, a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
	(__v8di)(__m512i)(W), (__mmask8)(U)); })

	#define _mm512_maskz_rol_epi64(U, a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
	(__v8di)_mm512_setzero_si512(), \
	(__mmask8)(U)); })
	/* Unmasked form: calls __builtin_ia32_prolvd512_mask on __A and __B with a
	   zero vector as the third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_rolv_epi32 (__m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  __mmask16 __all_lanes = (__mmask16) -1;
	  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
	                                                  (__v16si) __B,
	                                                  __zero, __all_lanes);
	}

	/* Masked form: __W is the third operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __passthru = (__v16si) __W;
	  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
	                                                  (__v16si) __B,
	                                                  __passthru,
	                                                  (__mmask16) __U);
	}

	/* Zero-masked form: a zero vector is the third operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
	                                                  (__v16si) __B,
	                                                  __zero,
	                                                  (__mmask16) __U);
	}

	/* Unmasked form: calls __builtin_ia32_prolvq512_mask on __A and __B with a
	   zero vector as the third operand and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_rolv_epi64 (__m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di) _mm512_setzero_si512 ();
	  __mmask8 __all_lanes = (__mmask8) -1;
	  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
	                                                  (__v8di) __B,
	                                                  __zero, __all_lanes);
	}

	/* Masked form: __W is the third operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __passthru = (__v8di) __W;
	  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
	                                                  (__v8di) __B,
	                                                  __passthru,
	                                                  (__mmask8) __U);
	}

	/* Zero-masked form: a zero vector is the third operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __zero = (__v8di) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
	                                                  (__v8di) __B,
	                                                  __zero,
	                                                  (__mmask8) __U);
	}

	/* Wrappers around the prord512/prorq512 mask builtins.
	   A: source vector; B: cast to int and passed as the builtin's second
	   operand (presumably the rotate count -- confirm against the builtin).
	   Unmasked forms pass a zero vector and an all-ones mask; the mask forms
	   pass W and U; the maskz forms pass a zero vector and U. */
	#define _mm512_ror_epi32(A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)-1); })

	#define _mm512_mask_ror_epi32(W, U, A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U)); })

	#define _mm512_maskz_ror_epi32(U, A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U)); })

	/* 64-bit element variants of the three macros above. */
	#define _mm512_ror_epi64(A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
	(__v8di)_mm512_setzero_si512(), \
	(__mmask8)-1); })

	#define _mm512_mask_ror_epi64(W, U, A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
	(__v8di)(__m512i)(W), (__mmask8)(U)); })

	#define _mm512_maskz_ror_epi64(U, A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
	(__v8di)_mm512_setzero_si512(), \
	(__mmask8)(U)); })

	/* Apply __builtin_ia32_pslldi512 to __A, viewed as __v16si, with count __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_slli_epi32(__m512i __A, int __B)
	{
	  __v16si __lanes = (__v16si)__A;
	  return (__m512i)__builtin_ia32_pslldi512(__lanes, __B);
	}

	/* Masked form: selection under __U between the shifted lanes and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
	{
	  __v16si __shifted = (__v16si)_mm512_slli_epi32(__A, __B);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the shifted lanes and an
	   all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B)
	{
	  __v16si __shifted = (__v16si)_mm512_slli_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __zero);
	}

	/* Apply __builtin_ia32_psllqi512 to __A, viewed as __v8di, with count __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_slli_epi64(__m512i __A, int __B)
	{
	  __v8di __lanes = (__v8di)__A;
	  return (__m512i)__builtin_ia32_psllqi512(__lanes, __B);
	}

	/* Masked form: selection under __U between the shifted lanes and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
	{
	  __v8di __shifted = (__v8di)_mm512_slli_epi64(__A, __B);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the shifted lanes and an
	   all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
	{
	  __v8di __shifted = (__v8di)_mm512_slli_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __zero);
	}

	/* Apply __builtin_ia32_psrldi512 to __A, viewed as __v16si, with count __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srli_epi32(__m512i __A, int __B)
	{
	  __v16si __lanes = (__v16si)__A;
	  return (__m512i)__builtin_ia32_psrldi512(__lanes, __B);
	}

	/* Masked form: selection under __U between the shifted lanes and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
	{
	  __v16si __shifted = (__v16si)_mm512_srli_epi32(__A, __B);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the shifted lanes and an
	   all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B)
	{
	  __v16si __shifted = (__v16si)_mm512_srli_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __zero);
	}

	/* Apply __builtin_ia32_psrlqi512 to __A, viewed as __v8di, with count __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srli_epi64(__m512i __A, int __B)
	{
	  __v8di __lanes = (__v8di)__A;
	  return (__m512i)__builtin_ia32_psrlqi512(__lanes, __B);
	}

	/* Masked form: selection under __U between the shifted lanes and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
	{
	  __v8di __shifted = (__v8di)_mm512_srli_epi64(__A, __B);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the shifted lanes and an
	   all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B)
	{
	  __v8di __shifted = (__v8di)_mm512_srli_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __zero);
	}

	/* Masked load through __builtin_ia32_movdqa32load512_mask: reads from __P
	   (treated as a pointer to __v16si) with __W as the second operand and __U
	   as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
	{
	  const __v16si *__src = (const __v16si *) __P;
	  __v16si __passthru = (__v16si) __W;
	  return (__m512i) __builtin_ia32_movdqa32load512_mask (__src, __passthru,
	                                                        (__mmask16) __U);
	}

	/* Zero-masked load: same builtin with a zero vector as the second operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
	{
	  const __v16si *__src = (const __v16si *) __P;
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_movdqa32load512_mask (__src, __zero,
	                                                        (__mmask16) __U);
	}

	/* Masked store through __builtin_ia32_movdqa32store512_mask: writes __A to
	   __P (treated as a pointer to __v16si) under mask __U. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
	{
	  __v16si *__dst = (__v16si *) __P;
	  __v16si __value = (__v16si) __A;
	  __builtin_ia32_movdqa32store512_mask (__dst, __value, (__mmask16) __U);
	}

	/* Selection under __U between the 32-bit lanes of __A and those of __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
	{
	  __v16si __source = (__v16si) __A;
	  __v16si __passthru = (__v16si) __W;
	  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, __source,
	                                               __passthru);
	}

	/* Selection under __U between the 32-bit lanes of __A and zero. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
	{
	  __v16si __source = (__v16si) __A;
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, __source,
	                                               __zero);
	}

	/* Selection under __U between the 64-bit lanes of __A and those of __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
	{
	  __v8di __source = (__v8di) __A;
	  __v8di __passthru = (__v8di) __W;
	  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, __source,
	                                               __passthru);
	}

	/* Selection under __U between the 64-bit lanes of __A and zero. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
	{
	  __v8di __source = (__v8di) __A;
	  __v8di __zero = (__v8di) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, __source,
	                                               __zero);
	}

	/* Masked load through __builtin_ia32_movdqa64load512_mask: reads from __P
	   (treated as a pointer to __v8di) with __W as the second operand and __U
	   as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
	{
	  const __v8di *__src = (const __v8di *) __P;
	  __v8di __passthru = (__v8di) __W;
	  return (__m512i) __builtin_ia32_movdqa64load512_mask (__src, __passthru,
	                                                        (__mmask8) __U);
	}

	/* Zero-masked load: same builtin with a zero vector as the second operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
	{
	  const __v8di *__src = (const __v8di *) __P;
	  __v8di __zero = (__v8di) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_movdqa64load512_mask (__src, __zero,
	                                                        (__mmask8) __U);
	}

	/* Masked store through __builtin_ia32_movdqa64store512_mask: writes __A to
	   __P (treated as a pointer to __v8di) under mask __U. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
	{
	  __v8di *__dst = (__v8di *) __P;
	  __v8di __value = (__v8di) __A;
	  __builtin_ia32_movdqa64store512_mask (__dst, __value, (__mmask8) __U);
	}

	/* Duplicate the even-indexed doubles of __A: result lanes are
	   {a0, a0, a2, a2, a4, a4, a6, a6}. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_movedup_pd (__m512d __A)
	{
	  __v8df __src = (__v8df)__A;
	  return (__m512d)__builtin_shufflevector(__src, __src,
	                                          0, 0, 2, 2, 4, 4, 6, 6);
	}

	/* Masked form: selection under __U between the duplicated lanes and __W. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __dup = (__v8df)_mm512_movedup_pd(__A);
	  __v8df __passthru = (__v8df)__W;
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __dup,
	                                              __passthru);
	}

	/* Zero-masked form: selection under __U between the duplicated lanes and
	   an all-zero vector. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
	{
	  __v8df __dup = (__v8df)_mm512_movedup_pd(__A);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __dup,
	                                              __zero);
	}

	/* 512-bit double-precision fixupimm macros.
	   A, B: __m512d operands (cast to __v8df); C: __m512i operand (cast to
	   __v8di); imm and R are cast to int and forwarded to the builtin.
	   - Forms without a U parameter pass an all-ones mask.
	   - Forms without an R parameter pass _MM_FROUND_CUR_DIRECTION.
	   - The maskz forms call the separate ..._maskz builtin; all other forms
	     call the ..._mask builtin. */
	#define _mm512_fixupimm_round_pd(A, B, C, imm, R) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)-1, (int)(R)); })

	#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)(U), (int)(R)); })

	#define _mm512_fixupimm_pd(A, B, C, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), \
	(int)(imm), (__mmask8)(U), \
	(int)(R)); })

	#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), \
	(int)(imm), (__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* 512-bit single-precision fixupimm macros.
	   A, B: __m512 operands (cast to __v16sf); C: __m512i operand (cast to
	   __v16si); imm and R are cast to int and forwarded to the builtin.
	   - Forms without a U parameter pass an all-ones mask.
	   - Forms without an R parameter pass _MM_FROUND_CUR_DIRECTION.
	   - The maskz forms call the separate ..._maskz builtin; all other forms
	     call the ..._mask builtin. */
	#define _mm512_fixupimm_round_ps(A, B, C, imm, R) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)-1, (int)(R)); })

	#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)(U), (int)(R)); })

	#define _mm512_fixupimm_ps(A, B, C, imm) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), \
	(int)(imm), (__mmask16)(U), \
	(int)(R)); })

	#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), \
	(int)(imm), (__mmask16)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* 128-bit fixupimm macros built on the fixupimmsd builtins.
	   A, B: __m128d operands (cast to __v2df); C: __m128i operand (cast to
	   __v2di); imm and R are cast to int and forwarded to the builtin.
	   - Forms without a U parameter pass an all-ones mask.
	   - Forms without an R parameter pass _MM_FROUND_CUR_DIRECTION.
	   - The maskz forms call the separate ..._maskz builtin; all other forms
	     call the ..._mask builtin. */
	#define _mm_fixupimm_round_sd(A, B, C, imm, R) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)-1, (int)(R)); })

	#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), (int)(R)); })

	#define _mm_fixupimm_sd(A, B, C, imm) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_fixupimm_sd(A, U, B, C, imm) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), (int)(R)); })

	#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* 128-bit fixupimm macros built on the fixupimmss builtins.
	   A, B: __m128 operands (cast to __v4sf); C: __m128i operand (cast to
	   __v4si); imm and R are cast to int and forwarded to the builtin.
	   - Forms without a U parameter pass an all-ones mask.
	   - Forms without an R parameter pass _MM_FROUND_CUR_DIRECTION.
	   - The maskz forms call the separate ..._maskz builtin; all other forms
	     call the ..._mask builtin. */
	#define _mm_fixupimm_round_ss(A, B, C, imm, R) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)-1, (int)(R)); })

	#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), (int)(R)); })

	#define _mm_fixupimm_ss(A, B, C, imm) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_fixupimm_ss(A, U, B, C, imm) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), (int)(R)); })

	#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* getexp_sd family, all built on __builtin_ia32_getexpsd128_round_mask.
	   The *_round_* macros forward the caller's R; the inline functions pass
	   _MM_FROUND_CUR_DIRECTION.  Unmasked forms use a zero third operand and an
	   all-ones mask, mask forms use W/U, maskz forms use a zero third operand
	   and U. */
	#define _mm_getexp_round_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_getexp_sd (__m128d __A, __m128d __B)
	{
	  __v2df __zero = (__v2df) _mm_setzero_pd();
	  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
	                                                          (__v2df) __B,
	                                                          __zero,
	                                                          (__mmask8) -1,
	                                                          _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __passthru = (__v2df) __W;
	  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
	                                                          (__v2df) __B,
	                                                          __passthru,
	                                                          (__mmask8) __U,
	                                                          _MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_getexp_round_sd(W, U, A, B, R) __extension__ ({\
	(__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __zero = (__v2df) _mm_setzero_pd ();
	  return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
	                                                          (__v2df) __B,
	                                                          __zero,
	                                                          (__mmask8) __U,
	                                                          _MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_maskz_getexp_round_sd(U, A, B, R) __extension__ ({\
	(__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* getexp_ss family, all built on __builtin_ia32_getexpss128_round_mask.
	   The *_round_* macros forward the caller's R; the inline functions pass
	   _MM_FROUND_CUR_DIRECTION.  Unmasked forms use a zero third operand and an
	   all-ones mask; mask forms use W/U. */
	#define _mm_getexp_round_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_getexp_ss (__m128 __A, __m128 __B)
	{
	  __v4sf __zero = (__v4sf) _mm_setzero_ps();
	  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
	                                                         (__v4sf) __B,
	                                                         __zero,
	                                                         (__mmask8) -1,
	                                                         _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __passthru = (__v4sf) __W;
	  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
	                                                         (__v4sf) __B,
	                                                         __passthru,
	                                                         (__mmask8) __U,
	                                                         _MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_getexp_round_ss(W, U, A, B, R) __extension__ ({\
	(__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Zero-masked getexp_ss: calls __builtin_ia32_getexpss128_round_mask with a
	   zero third operand, mask __U and _MM_FROUND_CUR_DIRECTION.
	   The zero vector is built with _mm_setzero_ps(), the single-precision
	   constructor, to match the __v4sf operand type and the sibling
	   _mm_getexp_ss / _mm_maskz_getexp_round_ss definitions (the previous
	   _mm_setzero_pd() produced a __m128d that was merely bit-cast). */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
	{
	return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
	(__v4sf) __B,
	(__v4sf) _mm_setzero_ps (),
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked getexp_ss with explicit R: zero third operand, mask U, and R
	   cast to int as the last builtin operand. */
	#define _mm_maskz_getexp_round_ss(U, A, B, R) __extension__ ({\
	(__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	/* getmant_sd macros built on __builtin_ia32_getmantsd_round_mask.
	   C and D are folded into a single int immediate as ((D) << 2) | (C).
	   Unmasked forms pass a zero fourth operand and an all-ones mask; the mask
	   form passes W and U.  Forms without R pass _MM_FROUND_CUR_DIRECTION. */
	#define _mm_getmant_round_sd(A, B, C, D, R) __extension__ ({ \
	(__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	#define _mm_getmant_sd(A, B, C, D) __extension__ ({ \
	(__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_getmant_sd(W, U, A, B, C, D) __extension__ ({\
	(__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked getmant_sd with explicit R: W is the fourth builtin operand, U the
	   mask, and C/D are folded into one immediate as ((D) << 2) | (C).
	   The statement expression is prefixed with __extension__ like every other
	   macro in this family, so that using it does not trigger pedantic
	   diagnostics about the GNU statement-expression extension. */
	#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) __extension__ ({\
	(__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Zero-masked getmant_sd macros: a zero vector is the fourth builtin
	   operand and U the mask; C/D are folded into ((D) << 2) | (C).  The first
	   passes _MM_FROUND_CUR_DIRECTION, the second forwards R. */
	#define _mm_maskz_getmant_sd(U, A, B, C, D) __extension__ ({\
	(__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) __extension__ ({\
	(__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* getmant_ss macros built on __builtin_ia32_getmantss_round_mask.
	   C and D are folded into a single int immediate as ((D) << 2) | (C).
	   Unmasked forms pass a zero fourth operand and an all-ones mask; the mask
	   form passes W and U.  Forms without R pass _MM_FROUND_CUR_DIRECTION. */
	#define _mm_getmant_round_ss(A, B, C, D, R) __extension__ ({ \
	(__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	#define _mm_getmant_ss(A, B, C, D) __extension__ ({ \
	(__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_getmant_ss(W, U, A, B, C, D) __extension__ ({\
	(__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v4sf)(__m128)(W), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked getmant_ss with explicit R: W is the fourth builtin operand, U the
	   mask, and C/D are folded into one immediate as ((D) << 2) | (C).
	   The statement expression is prefixed with __extension__ like every other
	   macro in this family, so that using it does not trigger pedantic
	   diagnostics about the GNU statement-expression extension. */
	#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) __extension__ ({\
	(__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v4sf)(__m128)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Zero-masked getmant_ss: a zero vector is the fourth builtin operand, U
	   the mask, C/D are folded into ((D) << 2) | (C), and
	   _MM_FROUND_CUR_DIRECTION is passed last.
	   The zero vector comes from _mm_setzero_ps(), the single-precision
	   constructor, matching the __v4sf operand type and the sibling
	   _mm_maskz_getmant_round_ss macro (the previous _mm_setzero_pd() produced
	   a __m128d that was merely bit-cast). */
	#define _mm_maskz_getmant_ss(U, A, B, C, D) __extension__ ({\
	(__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Zero-masked getmant_ss with explicit R: zero fourth operand, mask U,
	   C/D folded into ((D) << 2) | (C), and R cast to int as the last operand. */
	#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) __extension__ ({\
	(__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(int)(((D)<<2) \| (C)), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	/* Identity on a 16-bit mask: returns __A unchanged. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kmov (__mmask16 __A)
	{
	  __mmask16 __copy = __A;
	  return __copy;
	}

	/* Wrappers around the vcomisd / vcomiss builtins.
	   A, B: operands (cast to __v2df or __v4sf); P and R are cast to int and
	   forwarded as the third and fourth builtin operands.  The builtin result
	   is cast to int. */
	#define _mm_comi_round_sd(A, B, P, R) __extension__ ({\
	(int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	(int)(P), (int)(R)); })

	#define _mm_comi_round_ss(A, B, P, R) __extension__ ({\
	(int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	(int)(P), (int)(R)); })

	/* Only defined when __x86_64__ is defined.  Calls
	   __builtin_ia32_vcvtsd2si64 on A (cast to __v2df) with R cast to int, and
	   casts the result to long long. */
	#ifdef __x86_64__
	#define _mm_cvt_roundsd_si64(A, R) __extension__ ({ \
	(long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
	#endif

	/* Calls __builtin_ia32_vpermi2vard512_mask with __A, the index vector __I,
	   __B and mask __U, in that order. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
	                                 __mmask16 __U, __m512i __B)
	{
	  __v16si __first = (__v16si) __A;
	  __v16si __idx = (__v16si) __I;
	  __v16si __second = (__v16si) __B;
	  return (__m512i) __builtin_ia32_vpermi2vard512_mask (__first, __idx,
	                                                       __second,
	                                                       (__mmask16) __U);
	}

	/* Apply __builtin_ia32_pslld512 to __A (as __v16si) with the count vector
	   __B (as __v4si). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sll_epi32(__m512i __A, __m128i __B)
	{
	  __v16si __lanes = (__v16si) __A;
	  __v4si __count = (__v4si)__B;
	  return (__m512i)__builtin_ia32_pslld512(__lanes, __count);
	}

	/* Masked form: selection under __U between the shifted lanes and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
	{
	  __v16si __shifted = (__v16si)_mm512_sll_epi32(__A, __B);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the shifted lanes and an
	   all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
	{
	  __v16si __shifted = (__v16si)_mm512_sll_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __zero);
	}

	/* Apply __builtin_ia32_psllq512 to __A (as __v8di) with the count vector
	   __B (as __v2di). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sll_epi64(__m512i __A, __m128i __B)
	{
	  __v8di __lanes = (__v8di)__A;
	  __v2di __count = (__v2di)__B;
	  return (__m512i)__builtin_ia32_psllq512(__lanes, __count);
	}

	/* Masked form: selection under __U between the shifted lanes and __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
	{
	  __v8di __shifted = (__v8di)_mm512_sll_epi64(__A, __B);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: selection under __U between the shifted lanes and an
	   all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
	{
	  __v8di __shifted = (__v8di)_mm512_sll_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __zero);
	}

	/* Forwards __X and __Y (both as __v16si) to __builtin_ia32_psllv16si. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sllv_epi32(__m512i __X, __m512i __Y)
	{
	  __v16si __src = (__v16si)__X;
	  __v16si __cnt = (__v16si)__Y;
	  return (__m512i)__builtin_ia32_psllv16si(__src, __cnt);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectd_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_sllv_epi32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_sllv_epi32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)_mm512_setzero_si512());
	}

	/* Forwards __X and __Y (both as __v8di) to __builtin_ia32_psllv8di. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sllv_epi64(__m512i __X, __m512i __Y)
	{
	  __v8di __src = (__v8di)__X;
	  __v8di __cnt = (__v8di)__Y;
	  return (__m512i)__builtin_ia32_psllv8di(__src, __cnt);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectq_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_sllv_epi64(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_sllv_epi64(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)_mm512_setzero_si512());
	}

	/* Forwards __A (as __v16si) and __B (as __v4si) to __builtin_ia32_psrad512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sra_epi32(__m512i __A, __m128i __B)
	{
	  __v16si __src = (__v16si)__A;
	  __v4si __cnt = (__v4si)__B;
	  return (__m512i)__builtin_ia32_psrad512(__src, __cnt);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectd_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
	{
	  __m512i __res = _mm512_sra_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
	{
	  __m512i __res = _mm512_sra_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)_mm512_setzero_si512());
	}

	/* Forwards __A (as __v8di) and __B (as __v2di) to __builtin_ia32_psraq512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sra_epi64(__m512i __A, __m128i __B)
	{
	  __v8di __src = (__v8di)__A;
	  __v2di __cnt = (__v2di)__B;
	  return (__m512i)__builtin_ia32_psraq512(__src, __cnt);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectq_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
	{
	  __m512i __res = _mm512_sra_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
	{
	  __m512i __res = _mm512_sra_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)_mm512_setzero_si512());
	}

	/* Forwards __X and __Y (both as __v16si) to __builtin_ia32_psrav16si. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srav_epi32(__m512i __X, __m512i __Y)
	{
	  __v16si __src = (__v16si)__X;
	  __v16si __cnt = (__v16si)__Y;
	  return (__m512i)__builtin_ia32_psrav16si(__src, __cnt);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectd_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_srav_epi32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_srav_epi32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)_mm512_setzero_si512());
	}

	/* Forwards __X and __Y (both as __v8di) to __builtin_ia32_psrav8di. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srav_epi64(__m512i __X, __m512i __Y)
	{
	  __v8di __src = (__v8di)__X;
	  __v8di __cnt = (__v8di)__Y;
	  return (__m512i)__builtin_ia32_psrav8di(__src, __cnt);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectq_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_srav_epi64(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_srav_epi64(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)_mm512_setzero_si512());
	}

	/* Forwards __A (as __v16si) and __B (as __v4si) to __builtin_ia32_psrld512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srl_epi32(__m512i __A, __m128i __B)
	{
	  __v16si __src = (__v16si)__A;
	  __v4si __cnt = (__v4si)__B;
	  return (__m512i)__builtin_ia32_psrld512(__src, __cnt);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectd_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
	{
	  __m512i __res = _mm512_srl_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
	{
	  __m512i __res = _mm512_srl_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)_mm512_setzero_si512());
	}

	/* Forwards __A (as __v8di) and __B (as __v2di) to __builtin_ia32_psrlq512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srl_epi64(__m512i __A, __m128i __B)
	{
	  __v8di __src = (__v8di)__A;
	  __v2di __cnt = (__v2di)__B;
	  return (__m512i)__builtin_ia32_psrlq512(__src, __cnt);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectq_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
	{
	  __m512i __res = _mm512_srl_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
	{
	  __m512i __res = _mm512_srl_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)_mm512_setzero_si512());
	}

	/* Forwards __X and __Y (both as __v16si) to __builtin_ia32_psrlv16si. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srlv_epi32(__m512i __X, __m512i __Y)
	{
	  __v16si __src = (__v16si)__X;
	  __v16si __cnt = (__v16si)__Y;
	  return (__m512i)__builtin_ia32_psrlv16si(__src, __cnt);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectd_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_srlv_epi32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_srlv_epi32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)_mm512_setzero_si512());
	}

	/* Forwards __X and __Y (both as __v8di) to __builtin_ia32_psrlv8di. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srlv_epi64(__m512i __X, __m512i __Y)
	{
	  __v8di __src = (__v8di)__X;
	  __v8di __cnt = (__v8di)__Y;
	  return (__m512i)__builtin_ia32_psrlv8di(__src, __cnt);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectq_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_srlv_epi64(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __m512i __res = _mm512_srlv_epi64(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)_mm512_setzero_si512());
	}

	/* Expands to __builtin_ia32_pternlogd512_mask on A, B and C (each cast to
	 * __v16si) with imm passed as int and an all-ones __mmask16. */
	#define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
	(__v16si)(__m512i)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)-1); })

	/* Same builtin as _mm512_ternarylogic_epi32, with the caller's mask U in
	 * place of the all-ones mask. */
	#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
	(__v16si)(__m512i)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)(U)); })

	/* Uses the separate __builtin_ia32_pternlogd512_maskz builtin with the
	 * same operand order and the caller's mask U. */
	#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \
	(__v16si)(__m512i)(B), \
	(__v16si)(__m512i)(C), \
	(int)(imm), (__mmask16)(U)); })

	/* Expands to __builtin_ia32_pternlogq512_mask on A, B and C (each cast to
	 * __v8di) with imm passed as int and an all-ones __mmask8. */
	#define _mm512_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
	(__v8di)(__m512i)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)-1); })

	/* Same builtin as _mm512_ternarylogic_epi64, with the caller's mask U in
	 * place of the all-ones mask. */
	#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
	(__v8di)(__m512i)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)(U)); })

	/* Uses the separate __builtin_ia32_pternlogq512_maskz builtin with the
	 * same operand order and the caller's mask U. */
	#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \
	(__v8di)(__m512i)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)(U)); })

	#ifdef __x86_64__
	/* Only defined when __x86_64__ is defined.  Expands to
	 * __builtin_ia32_vcvtsd2si64 on A (cast to __v2df) with R passed as int;
	 * the result is cast to long long.  The expansion is the same as that of
	 * _mm_cvt_roundsd_si64. */
	#define _mm_cvt_roundsd_i64(A, R) __extension__ ({ \
	(long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
	#endif

	/* Expands to __builtin_ia32_vcvtsd2si32 on A (cast to __v2df) with R passed
	 * as int; the result is cast to int. */
	#define _mm_cvt_roundsd_si32(A, R) __extension__ ({ \
	(int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })

	/* Second spelling with the same expansion as _mm_cvt_roundsd_si32. */
	#define _mm_cvt_roundsd_i32(A, R) __extension__ ({ \
	(int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })

	/* Expands to __builtin_ia32_vcvtsd2usi32 on A (cast to __v2df) with R
	 * passed as int; the result is cast to unsigned int. */
	#define _mm_cvt_roundsd_u32(A, R) __extension__ ({ \
	(unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)); })

	/* Calls __builtin_ia32_vcvtsd2usi32 on __A (as __v2df) with
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument; returns the result
	 * cast to unsigned. */
	static __inline__ unsigned __DEFAULT_FN_ATTRS
	_mm_cvtsd_u32(__m128d __A)
	{
	  __v2df __src = (__v2df)__A;
	  return (unsigned)__builtin_ia32_vcvtsd2usi32(__src, _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	/* Expands to __builtin_ia32_vcvtsd2usi64 on A (cast to __v2df) with R
	 * passed as int; the result is cast to unsigned long long. */
	#define _mm_cvt_roundsd_u64(A, R) __extension__ ({ \
	(unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
	(int)(R)); })

	/* Function form of the conversion above, with _MM_FROUND_CUR_DIRECTION as
	 * the fixed rounding argument. */
	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm_cvtsd_u64(__m128d __A)
	{
	  __v2df __src = (__v2df)__A;
	  return (unsigned long long)__builtin_ia32_vcvtsd2usi64(
	      __src, _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	/* Expands to __builtin_ia32_vcvtss2si32 on A (cast to __v4sf) with R passed
	 * as int; the result is cast to int. */
	#define _mm_cvt_roundss_si32(A, R) __extension__ ({ \
	(int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })

	/* Second spelling with the same expansion as _mm_cvt_roundss_si32. */
	#define _mm_cvt_roundss_i32(A, R) __extension__ ({ \
	(int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })

	#ifdef __x86_64__
	/* Only defined when __x86_64__ is defined.  Expands to
	 * __builtin_ia32_vcvtss2si64 on A (cast to __v4sf) with R passed as int;
	 * the result is cast to long long. */
	#define _mm_cvt_roundss_si64(A, R) __extension__ ({ \
	(long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })

	/* Second spelling with the same expansion as _mm_cvt_roundss_si64. */
	#define _mm_cvt_roundss_i64(A, R) __extension__ ({ \
	(long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
	#endif

	/* Expands to __builtin_ia32_vcvtss2usi32 on A (cast to __v4sf) with R
	 * passed as int; the result is cast to unsigned int. */
	#define _mm_cvt_roundss_u32(A, R) __extension__ ({ \
	(unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); })

	/* Calls __builtin_ia32_vcvtss2usi32 on __A (as __v4sf) with
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument; returns the result
	 * cast to unsigned. */
	static __inline__ unsigned __DEFAULT_FN_ATTRS
	_mm_cvtss_u32(__m128 __A)
	{
	  __v4sf __src = (__v4sf)__A;
	  return (unsigned)__builtin_ia32_vcvtss2usi32(__src, _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	/* Expands to __builtin_ia32_vcvtss2usi64 on A (cast to __v4sf) with R
	 * passed as int; the result is cast to unsigned long long. */
	#define _mm_cvt_roundss_u64(A, R) __extension__ ({ \
	(unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
	(int)(R)); })

	/* Function form of the conversion above, with _MM_FROUND_CUR_DIRECTION as
	 * the fixed rounding argument. */
	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm_cvtss_u64(__m128 __A)
	{
	  __v4sf __src = (__v4sf)__A;
	  return (unsigned long long)__builtin_ia32_vcvtss2usi64(
	      __src, _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	/* Expands to __builtin_ia32_vcvttsd2si32 on A (cast to __v2df) with R
	 * passed as int; the result is cast to int. */
	#define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ \
	(int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })

	/* Second spelling with the same expansion as _mm_cvtt_roundsd_i32. */
	#define _mm_cvtt_roundsd_si32(A, R) __extension__ ({ \
	(int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })

	/* Calls __builtin_ia32_vcvttsd2si32 on __A (as __v2df) with
	 * _MM_FROUND_CUR_DIRECTION as the second argument; returns the result
	 * cast to int. */
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm_cvttsd_i32(__m128d __A)
	{
	  __v2df __src = (__v2df)__A;
	  return (int)__builtin_ia32_vcvttsd2si32(__src, _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	/* Expands to __builtin_ia32_vcvttsd2si64 on A (cast to __v2df) with R
	 * passed as int; the result is cast to long long. */
	#define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ \
	(long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })

	/* Second spelling with the same expansion as _mm_cvtt_roundsd_si64. */
	#define _mm_cvtt_roundsd_i64(A, R) __extension__ ({ \
	(long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })

	/* Function form of the conversion above, with _MM_FROUND_CUR_DIRECTION as
	 * the fixed second argument. */
	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm_cvttsd_i64(__m128d __A)
	{
	  __v2df __src = (__v2df)__A;
	  return (long long)__builtin_ia32_vcvttsd2si64(__src,
	                                                _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	/* Expands to __builtin_ia32_vcvttsd2usi32 on A (cast to __v2df) with R
	 * passed as int; the result is cast to unsigned int. */
	#define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ \
	(unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); })

	/* Calls __builtin_ia32_vcvttsd2usi32 on __A (as __v2df) with
	 * _MM_FROUND_CUR_DIRECTION as the second argument; returns the result
	 * cast to unsigned. */
	static __inline__ unsigned __DEFAULT_FN_ATTRS
	_mm_cvttsd_u32(__m128d __A)
	{
	  __v2df __src = (__v2df)__A;
	  return (unsigned)__builtin_ia32_vcvttsd2usi32(__src,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	/* Expands to __builtin_ia32_vcvttsd2usi64 on A (cast to __v2df) with R
	 * passed as int; the result is cast to unsigned long long. */
	#define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ \
	(unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
	(int)(R)); })

	/* Function form of the conversion above, with _MM_FROUND_CUR_DIRECTION as
	 * the fixed second argument. */
	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm_cvttsd_u64(__m128d __A)
	{
	  __v2df __src = (__v2df)__A;
	  return (unsigned long long)__builtin_ia32_vcvttsd2usi64(
	      __src, _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	/* Expands to __builtin_ia32_vcvttss2si32 on A (cast to __v4sf) with R
	 * passed as int; the result is cast to int. */
	#define _mm_cvtt_roundss_i32(A, R) __extension__ ({ \
	(int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })

	/* Second spelling with the same expansion as _mm_cvtt_roundss_i32. */
	#define _mm_cvtt_roundss_si32(A, R) __extension__ ({ \
	(int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })

	/* Calls __builtin_ia32_vcvttss2si32 on __A (as __v4sf) with
	 * _MM_FROUND_CUR_DIRECTION as the second argument; returns the result
	 * cast to int. */
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm_cvttss_i32(__m128 __A)
	{
	  __v4sf __src = (__v4sf)__A;
	  return (int)__builtin_ia32_vcvttss2si32(__src, _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	/* Expands to __builtin_ia32_vcvttss2si64 on A (cast to __v4sf) with R
	 * passed as int; the result is cast to long long. */
	#define _mm_cvtt_roundss_i64(A, R) __extension__ ({ \
	(long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })

	/* Second spelling with the same expansion as _mm_cvtt_roundss_i64. */
	#define _mm_cvtt_roundss_si64(A, R) __extension__ ({ \
	(long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })

	/* Function form of the conversion above, with _MM_FROUND_CUR_DIRECTION as
	 * the fixed second argument. */
	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm_cvttss_i64(__m128 __A)
	{
	  __v4sf __src = (__v4sf)__A;
	  return (long long)__builtin_ia32_vcvttss2si64(__src,
	                                                _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	/* Expands to __builtin_ia32_vcvttss2usi32 on A (cast to __v4sf) with R
	 * passed as int; the result is cast to unsigned int. */
	#define _mm_cvtt_roundss_u32(A, R) __extension__ ({ \
	(unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); })

	/* Calls __builtin_ia32_vcvttss2usi32 on __A (as __v4sf) with
	 * _MM_FROUND_CUR_DIRECTION as the second argument; returns the result
	 * cast to unsigned. */
	static __inline__ unsigned __DEFAULT_FN_ATTRS
	_mm_cvttss_u32(__m128 __A)
	{
	  __v4sf __src = (__v4sf)__A;
	  return (unsigned)__builtin_ia32_vcvttss2usi32(__src,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	/* Expands to __builtin_ia32_vcvttss2usi64 on A (cast to __v4sf) with R
	 * passed as int; the result is cast to unsigned long long. */
	#define _mm_cvtt_roundss_u64(A, R) __extension__ ({ \
	(unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
	(int)(R)); })

	/* Function form of the conversion above, with _MM_FROUND_CUR_DIRECTION as
	 * the fixed second argument. */
	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm_cvttss_u64(__m128 __A)
	{
	  __v4sf __src = (__v4sf)__A;
	  return (unsigned long long)__builtin_ia32_vcvttss2usi64(
	      __src, _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	/* Forwards __A (as __v8df), the index vector __I (as __v8di), __B (as
	 * __v8df) and mask __U to __builtin_ia32_vpermi2varpd512_mask, in that
	 * operand order. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
	                             __m512d __B)
	{
	  __v8df __first = (__v8df)__A;
	  __v8di __idx = (__v8di)__I;
	  __v8df __second = (__v8df)__B;
	  return (__m512d)__builtin_ia32_vpermi2varpd512_mask(__first, __idx, __second,
	                                                      (__mmask8)__U);
	}

	/* Forwards __A (as __v16sf), the index vector __I (as __v16si), __B (as
	 * __v16sf) and mask __U to __builtin_ia32_vpermi2varps512_mask, in that
	 * operand order. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U,
	                             __m512 __B)
	{
	  __v16sf __first = (__v16sf)__A;
	  __v16si __idx = (__v16si)__I;
	  __v16sf __second = (__v16sf)__B;
	  return (__m512)__builtin_ia32_vpermi2varps512_mask(__first, __idx, __second,
	                                                     (__mmask16)__U);
	}

	/* Forwards __A, the index vector __I and __B (all as __v8di) plus mask __U
	 * to __builtin_ia32_vpermi2varq512_mask, in that operand order. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
	                                __m512i __B)
	{
	  __v8di __first = (__v8di)__A;
	  __v8di __idx = (__v8di)__I;
	  __v8di __second = (__v8di)__B;
	  return (__m512i)__builtin_ia32_vpermi2varq512_mask(__first, __idx, __second,
	                                                     (__mmask8)__U);
	}

	/* Expands to __builtin_shufflevector over X (as __v8df) and
	 * _mm512_undefined_pd().  Result element i uses index (i & ~1) plus bit i
	 * of C, so all eight indices stay in 0..7; the second operand only fills
	 * the builtin's two-vector signature.  C is evaluated eight times. */
	#define _mm512_permute_pd(X, C) __extension__ ({ \
	(__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
	(__v8df)_mm512_undefined_pd(), \
	0 + (((C) >> 0) & 0x1), \
	0 + (((C) >> 1) & 0x1), \
	2 + (((C) >> 2) & 0x1), \
	2 + (((C) >> 3) & 0x1), \
	4 + (((C) >> 4) & 0x1), \
	4 + (((C) >> 5) & 0x1), \
	6 + (((C) >> 6) & 0x1), \
	6 + (((C) >> 7) & 0x1)); })

	/* Masked form: the _mm512_permute_pd result and the pass-through W go
	 * through __builtin_ia32_selectpd_512 under mask U. */
	#define _mm512_mask_permute_pd(W, U, X, C) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_permute_pd((X), (C)), \
	(__v8df)(__m512d)(W)); })

	/* Zeroing form: same select with _mm512_setzero_pd() as the second
	 * vector. */
	#define _mm512_maskz_permute_pd(U, X, C) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_permute_pd((X), (C)), \
	(__v8df)_mm512_setzero_pd()); })

	/* Expands to __builtin_shufflevector over X (as __v16sf) and
	 * _mm512_undefined_ps().  The four 2-bit fields of C (shifts 0, 2, 4, 6)
	 * are reused for each group of four result elements, offset by 0, 4, 8 and
	 * 12; every index therefore stays in 0..15.  C is evaluated sixteen
	 * times. */
	#define _mm512_permute_ps(X, C) __extension__ ({ \
	(__m512)__builtin_shufflevector((__v16sf)(__m512)(X), \
	(__v16sf)_mm512_undefined_ps(), \
	0 + (((C) >> 0) & 0x3), \
	0 + (((C) >> 2) & 0x3), \
	0 + (((C) >> 4) & 0x3), \
	0 + (((C) >> 6) & 0x3), \
	4 + (((C) >> 0) & 0x3), \
	4 + (((C) >> 2) & 0x3), \
	4 + (((C) >> 4) & 0x3), \
	4 + (((C) >> 6) & 0x3), \
	8 + (((C) >> 0) & 0x3), \
	8 + (((C) >> 2) & 0x3), \
	8 + (((C) >> 4) & 0x3), \
	8 + (((C) >> 6) & 0x3), \
	12 + (((C) >> 0) & 0x3), \
	12 + (((C) >> 2) & 0x3), \
	12 + (((C) >> 4) & 0x3), \
	12 + (((C) >> 6) & 0x3)); })

	/* Masked form: the _mm512_permute_ps result and the pass-through W go
	 * through __builtin_ia32_selectps_512 under mask U. */
	#define _mm512_mask_permute_ps(W, U, X, C) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_permute_ps((X), (C)), \
	(__v16sf)(__m512)(W)); })

	/* Zeroing form: same select with _mm512_setzero_ps() as the second
	 * vector. */
	#define _mm512_maskz_permute_ps(U, X, C) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_permute_ps((X), (C)), \
	(__v16sf)_mm512_setzero_ps()); })

	/* Forwards __A (as __v8df) and the control vector __C (as __v8di) to
	 * __builtin_ia32_vpermilvarpd512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_permutevar_pd(__m512d __A, __m512i __C)
	{
	  __v8df __src = (__v8df)__A;
	  __v8di __ctl = (__v8di)__C;
	  return (__m512d)__builtin_ia32_vpermilvarpd512(__src, __ctl);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectpd_512 under mask __U. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
	{
	  __m512d __res = _mm512_permutevar_pd(__A, __C);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)__res,
	                                              (__v8df)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_pd() as the second
	 * vector. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
	{
	  __m512d __res = _mm512_permutevar_pd(__A, __C);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)__res,
	                                              (__v8df)_mm512_setzero_pd());
	}

	/* Forwards __A (as __v16sf) and the control vector __C (as __v16si) to
	 * __builtin_ia32_vpermilvarps512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_permutevar_ps(__m512 __A, __m512i __C)
	{
	  __v16sf __src = (__v16sf)__A;
	  __v16si __ctl = (__v16si)__C;
	  return (__m512)__builtin_ia32_vpermilvarps512(__src, __ctl);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectps_512 under mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
	{
	  __m512 __res = _mm512_permutevar_ps(__A, __C);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)__res,
	                                             (__v16sf)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_ps() as the second
	 * vector. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
	{
	  __m512 __res = _mm512_permutevar_ps(__A, __C);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)__res,
	                                             (__v16sf)_mm512_setzero_ps());
	}

	/* Passes the index vector __I (as __v8di) first, then __A and __B (as
	 * __v8df), to __builtin_ia32_vpermt2varpd512_mask with an all-ones mask. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
	{
	  __v8di __idx = (__v8di)__I;
	  return (__m512d)__builtin_ia32_vpermt2varpd512_mask(__idx, (__v8df)__A,
	                                                      (__v8df)__B,
	                                                      (__mmask8)-1);
	}

	/* Same builtin and operand order, with the caller's mask __U. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I,
	                            __m512d __B)
	{
	  __v8di __idx = (__v8di)__I;
	  return (__m512d)__builtin_ia32_vpermt2varpd512_mask(__idx, (__v8df)__A,
	                                                      (__v8df)__B,
	                                                      (__mmask8)__U);
	}

	/* Uses the separate __builtin_ia32_vpermt2varpd512_maskz builtin with the
	 * same operand order and the caller's mask __U. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
	                             __m512d __B)
	{
	  __v8di __idx = (__v8di)__I;
	  return (__m512d)__builtin_ia32_vpermt2varpd512_maskz(__idx, (__v8df)__A,
	                                                       (__v8df)__B,
	                                                       (__mmask8)__U);
	}

	/* Passes the index vector __I (as __v16si) first, then __A and __B (as
	 * __v16sf), to __builtin_ia32_vpermt2varps512_mask with an all-ones mask. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
	{
	  __v16si __idx = (__v16si)__I;
	  return (__m512)__builtin_ia32_vpermt2varps512_mask(__idx, (__v16sf)__A,
	                                                     (__v16sf)__B,
	                                                     (__mmask16)-1);
	}

	/* Same builtin and operand order, with the caller's mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I,
	                            __m512 __B)
	{
	  __v16si __idx = (__v16si)__I;
	  return (__m512)__builtin_ia32_vpermt2varps512_mask(__idx, (__v16sf)__A,
	                                                     (__v16sf)__B,
	                                                     (__mmask16)__U);
	}

	/* Uses the separate __builtin_ia32_vpermt2varps512_maskz builtin with the
	 * same operand order and the caller's mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I,
	                             __m512 __B)
	{
	  __v16si __idx = (__v16si)__I;
	  return (__m512)__builtin_ia32_vpermt2varps512_maskz(__idx, (__v16sf)__A,
	                                                      (__v16sf)__B,
	                                                      (__mmask16)__U);
	}


	/* Expands to __builtin_ia32_cvttpd2udq512_mask on A (as __v8df) with
	 * _mm256_undefined_si256() as the second operand, an all-ones __mmask8 and
	 * R passed as int; the result is cast to __m256i. */
	#define _mm512_cvtt_roundpd_epu32(A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_undefined_si256(), \
	(__mmask8)-1, (int)(R)); })

	/* Same builtin, with W as the second operand and the caller's mask U. */
	#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
	(__v8si)(__m256i)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Same builtin, with _mm256_setzero_si256() as the second operand and the
	 * caller's mask U. */
	#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_setzero_si256(), \
	(__mmask8)(U), (int)(R)); })

	/* Calls __builtin_ia32_cvttpd2udq512_mask on __A (as __v8df) with
	 * _mm256_undefined_si256() as the second operand, an all-ones mask and
	 * _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvttpd_epu32(__m512d __A)
	{
	  __v8si __passthru = (__v8si)_mm256_undefined_si256();
	  return (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)__A, __passthru,
	                                                    (__mmask8)-1,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Same builtin, with __W as the second operand and the caller's mask __U. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvttpd_epu32(__m256i __W, __mmask8 __U, __m512d __A)
	{
	  __v8si __passthru = (__v8si)__W;
	  return (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)__A, __passthru,
	                                                    (__mmask8)__U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Same builtin, with _mm256_setzero_si256() as the second operand and the
	 * caller's mask __U. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvttpd_epu32(__mmask8 __U, __m512d __A)
	{
	  __v8si __passthru = (__v8si)_mm256_setzero_si256();
	  return (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)__A, __passthru,
	                                                    (__mmask8)__U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* All six macros in this group expand to
	 * __builtin_ia32_rndscalesd_round_mask(A, B, <third operand>, <mask>,
	 * <imm>, <rounding>), with A and B cast to __v2df.  They differ only in the
	 * third operand (_mm_setzero_pd() or W), the mask (all-ones or U) and the
	 * last argument (R or _MM_FROUND_CUR_DIRECTION). */

	/* Unmasked, explicit R: zero third operand, all-ones mask. */
	#define _mm_roundscale_round_sd(A, B, imm, R) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(imm), \
	(int)(R)); })

	/* Unmasked, _MM_FROUND_CUR_DIRECTION: zero third operand, all-ones mask. */
	#define _mm_roundscale_sd(A, B, imm) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(imm), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked with W as third operand, _MM_FROUND_CUR_DIRECTION. */
	#define _mm_mask_roundscale_sd(W, U, A, B, imm) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(imm), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked with W as third operand, explicit R. */
	#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(I), \
	(int)(R)); })

	/* Masked with zero third operand, _MM_FROUND_CUR_DIRECTION. */
	#define _mm_maskz_roundscale_sd(U, A, B, I) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(I), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked with zero third operand, explicit R. */
	#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(I), \
	(int)(R)); })

	/* All six macros in this group expand to
	 * __builtin_ia32_rndscaless_round_mask(A, B, <third operand>, <mask>,
	 * <imm>, <rounding>), with A and B cast to __v4sf.  They differ only in the
	 * third operand (_mm_setzero_ps() or W), the mask (all-ones or U) and the
	 * last argument (R or _MM_FROUND_CUR_DIRECTION). */

	/* Unmasked, explicit R: zero third operand, all-ones mask. */
	#define _mm_roundscale_round_ss(A, B, imm, R) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(imm), \
	(int)(R)); })

	/* Unmasked, _MM_FROUND_CUR_DIRECTION: zero third operand, all-ones mask. */
	#define _mm_roundscale_ss(A, B, imm) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(imm), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked with W as third operand, _MM_FROUND_CUR_DIRECTION. */
	#define _mm_mask_roundscale_ss(W, U, A, B, I) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), \
	(__mmask8)(U), (int)(I), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked with W as third operand, explicit R. */
	#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), \
	(__mmask8)(U), (int)(I), \
	(int)(R)); })

	/* Masked with zero third operand, _MM_FROUND_CUR_DIRECTION. */
	#define _mm_maskz_roundscale_ss(U, A, B, I) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(I), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Masked with zero third operand, explicit R. */
	#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(I), \
	(int)(R)); })

	/* Expands to __builtin_ia32_scalefpd512_mask on A and B (as __v8df) with
	 * _mm512_undefined_pd() as the third operand, an all-ones __mmask8 and R
	 * passed as int. */
	#define _mm512_scalef_round_pd(A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Same builtin, with W as the third operand and the caller's mask U. */
	#define _mm512_mask_scalef_round_pd(W, U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Same builtin, with _mm512_setzero_pd() as the third operand and the
	 * caller's mask U. */
	#define _mm512_maskz_scalef_round_pd(U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Calls __builtin_ia32_scalefpd512_mask on __A and __B (as __v8df) with
	 * _mm512_undefined_pd() as the third operand, an all-ones mask and
	 * _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_scalef_pd(__m512d __A, __m512d __B)
	{
	  __v8df __passthru = (__v8df)_mm512_undefined_pd();
	  return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B,
	                                                  __passthru, (__mmask8)-1,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Same builtin, with __W as the third operand and the caller's mask __U. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_scalef_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __passthru = (__v8df)__W;
	  return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B,
	                                                  __passthru, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Same builtin, with _mm512_setzero_pd() as the third operand and the
	 * caller's mask __U. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_scalef_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __passthru = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)__A, (__v8df)__B,
	                                                  __passthru, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to __builtin_ia32_scalefps512_mask on A and B (as __v16sf) with
	 * _mm512_undefined_ps() as the third operand, an all-ones __mmask16 and R
	 * passed as int. */
	#define _mm512_scalef_round_ps(A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_undefined_ps(), \
	(__mmask16)-1, (int)(R)); })

	/* Same builtin, with W as the third operand and the caller's mask U. */
	#define _mm512_mask_scalef_round_ps(W, U, A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(W), \
	(__mmask16)(U), (int)(R)); })

	/* Same builtin, with _mm512_setzero_ps() as the third operand and the
	 * caller's mask U. */
	#define _mm512_maskz_scalef_round_ps(U, A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), (int)(R)); })

	/* Calls __builtin_ia32_scalefps512_mask on __A and __B (as __v16sf) with
	 * _mm512_undefined_ps() as the third operand, an all-ones mask and
	 * _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_scalef_ps(__m512 __A, __m512 __B)
	{
	  __v16sf __passthru = (__v16sf)_mm512_undefined_ps();
	  return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B,
	                                                 __passthru, (__mmask16)-1,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Same builtin, with __W as the third operand and the caller's mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_scalef_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __passthru = (__v16sf)__W;
	  return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B,
	                                                 __passthru, (__mmask16)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Same builtin, with _mm512_setzero_ps() as the third operand and the
	 * caller's mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_scalef_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __passthru = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_scalefps512_mask((__v16sf)__A, (__v16sf)__B,
	                                                 __passthru, (__mmask16)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Every entry in this group calls __builtin_ia32_scalefsd_round_mask(A, B,
	 * <third operand>, <mask>, <rounding>) with A and B as __v2df.  The macros
	 * forward the caller's R; the functions pass _MM_FROUND_CUR_DIRECTION. */

	/* Unmasked, explicit R: zero third operand, all-ones mask. */
	#define _mm_scalef_round_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Unmasked function form: zero third operand, all-ones mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_scalef_sd(__m128d __A, __m128d __B)
	{
	  __v2df __passthru = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)__A, (__v2df)(__B),
	                                                     __passthru, (__mmask8)-1,
	                                                     _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked function form: __W as third operand, caller's mask __U. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_scalef_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __passthru = (__v2df)__W;
	  return (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                     __passthru, (__mmask8)__U,
	                                                     _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked, explicit R: W as third operand, caller's mask U. */
	#define _mm_mask_scalef_round_sd(W, U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Zeroing function form: zero third operand, caller's mask __U. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_scalef_sd(__mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __passthru = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                     __passthru, (__mmask8)__U,
	                                                     _MM_FROUND_CUR_DIRECTION);
	}

	/* Zeroing, explicit R: zero third operand, caller's mask U. */
	#define _mm_maskz_scalef_round_sd(U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Every entry in this group calls __builtin_ia32_scalefss_round_mask(A, B,
	 * <third operand>, <mask>, <rounding>) with A and B as __v4sf.  The macros
	 * forward the caller's R; the functions pass _MM_FROUND_CUR_DIRECTION. */

	/* Unmasked, explicit R: zero third operand, all-ones mask. */
	#define _mm_scalef_round_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	/* Unmasked function form: zero third operand, all-ones mask. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_scalef_ss(__m128 __A, __m128 __B)
	{
	  __v4sf __passthru = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)__A, (__v4sf)(__B),
	                                                    __passthru, (__mmask8)-1,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked function form: __W as third operand, caller's mask __U. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_scalef_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __passthru = (__v4sf)__W;
	  return (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                    __passthru, (__mmask8)__U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked, explicit R: W as third operand, caller's mask U. */
	#define _mm_mask_scalef_round_ss(W, U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Zeroing function form: zero third operand, caller's mask __U. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_scalef_ss(__mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __passthru = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                    __passthru, (__mmask8)__U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Zeroing variant of _mm_scalef_round_ss: expands to
	 * __builtin_ia32_scalefss_round_mask on A and B (as __v4sf) with
	 * _mm_setzero_ps() as the third operand and the caller's mask U.
	 * The caller's R is forwarded as the rounding argument rather than being
	 * replaced by _MM_FROUND_CUR_DIRECTION, so this macro honours R in the same
	 * way as _mm_scalef_round_ss, _mm_mask_scalef_round_ss and
	 * _mm_maskz_scalef_round_sd. */
	#define _mm_maskz_scalef_round_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), \
	(int)(R)); })

	/* Forwards __A (as __v16si) and the integer count __B to
	 * __builtin_ia32_psradi512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srai_epi32(__m512i __A, int __B)
	{
	  __v16si __src = (__v16si)__A;
	  return (__m512i)__builtin_ia32_psradi512(__src, __B);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectd_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
	{
	  __m512i __res = _mm512_srai_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B)
	{
	  __m512i __res = _mm512_srai_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)__res,
	                                             (__v16si)_mm512_setzero_si512());
	}

	/* Forwards __A (as __v8di) and the integer count __B to
	 * __builtin_ia32_psraqi512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srai_epi64(__m512i __A, int __B)
	{
	  __v8di __src = (__v8di)__A;
	  return (__m512i)__builtin_ia32_psraqi512(__src, __B);
	}

	/* Masked form: the unmasked result and the pass-through vector __W are
	 * handed to __builtin_ia32_selectq_512 under mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
	{
	  __m512i __res = _mm512_srai_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)__W);
	}

	/* Zeroing form: same select, with _mm512_setzero_si512() as the second
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B)
	{
	  __m512i __res = _mm512_srai_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, (__v8di)__res,
	                                             (__v8di)_mm512_setzero_si512());
	}

	/* Expands to __builtin_shufflevector over A and B (as __v16sf).  Each 2-bit
	 * field of imm (shifts 0, 2, 4, 6) is multiplied by 4 and selects a run of
	 * four consecutive elements.  The first two runs use base indices 0..3
	 * (range 0..15); the last two use base indices 16..19 (range 16..31).
	 * imm is evaluated sixteen times. */
	#define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \
	(__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	0 + ((((imm) >> 0) & 0x3) * 4), \
	1 + ((((imm) >> 0) & 0x3) * 4), \
	2 + ((((imm) >> 0) & 0x3) * 4), \
	3 + ((((imm) >> 0) & 0x3) * 4), \
	0 + ((((imm) >> 2) & 0x3) * 4), \
	1 + ((((imm) >> 2) & 0x3) * 4), \
	2 + ((((imm) >> 2) & 0x3) * 4), \
	3 + ((((imm) >> 2) & 0x3) * 4), \
	16 + ((((imm) >> 4) & 0x3) * 4), \
	17 + ((((imm) >> 4) & 0x3) * 4), \
	18 + ((((imm) >> 4) & 0x3) * 4), \
	19 + ((((imm) >> 4) & 0x3) * 4), \
	16 + ((((imm) >> 6) & 0x3) * 4), \
	17 + ((((imm) >> 6) & 0x3) * 4), \
	18 + ((((imm) >> 6) & 0x3) * 4), \
	19 + ((((imm) >> 6) & 0x3) * 4)); })

	/* Masked form: the _mm512_shuffle_f32x4 result and the pass-through W go
	 * through __builtin_ia32_selectps_512 under mask U. */
	#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
	(__v16sf)(__m512)(W)); })

	/* Zeroing form: same select with _mm512_setzero_ps() as the second
	 * vector. */
	#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
	(__v16sf)_mm512_setzero_ps()); })

	/* Expands to __builtin_shufflevector over A and B (as __v8df).  Each 2-bit
	 * field of imm (shifts 0, 2, 4, 6) is multiplied by 2 and selects a pair of
	 * consecutive elements.  The first two pairs use base indices 0..1
	 * (range 0..7); the last two use base indices 8..9 (range 8..15).
	 * imm is evaluated eight times. */
	#define _mm512_shuffle_f64x2(A, B, imm) __extension__ ({ \
	(__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	0 + ((((imm) >> 0) & 0x3) * 2), \
	1 + ((((imm) >> 0) & 0x3) * 2), \
	0 + ((((imm) >> 2) & 0x3) * 2), \
	1 + ((((imm) >> 2) & 0x3) * 2), \
	8 + ((((imm) >> 4) & 0x3) * 2), \
	9 + ((((imm) >> 4) & 0x3) * 2), \
	8 + ((((imm) >> 6) & 0x3) * 2), \
	9 + ((((imm) >> 6) & 0x3) * 2)); })

	/* Masked form: the _mm512_shuffle_f64x2 result and the pass-through W go
	 * through __builtin_ia32_selectpd_512 under mask U. */
	#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
	(__v8df)(__m512d)(W)); })

	/* Zeroing form: same select with _mm512_setzero_pd() as the second
	 * vector. */
	#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
	(__v8df)_mm512_setzero_pd()); })

	/* Shuffle 128-bit groups of A and B. The operands are viewed as __v8di, so
	 * each group is moved as a pair of 64-bit elements rather than four 32-bit
	 * ones. Result groups 0 and 1 are taken from A (indices 0-7), chosen by imm
	 * bits [1:0] and [3:2]; groups 2 and 3 from B (indices 8-15), chosen by imm
	 * bits [5:4] and [7:6]. */
	#define _mm512_shuffle_i32x4(A, B, imm) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
	(__v8di)(__m512i)(B), \
	0 + ((((imm) >> 0) & 0x3) * 2), \
	1 + ((((imm) >> 0) & 0x3) * 2), \
	0 + ((((imm) >> 2) & 0x3) * 2), \
	1 + ((((imm) >> 2) & 0x3) * 2), \
	8 + ((((imm) >> 4) & 0x3) * 2), \
	9 + ((((imm) >> 4) & 0x3) * 2), \
	8 + ((((imm) >> 6) & 0x3) * 2), \
	9 + ((((imm) >> 6) & 0x3) * 2)); })

	/* Passes mask U, the _mm512_shuffle_i32x4(A, B, imm) result (viewed as
	 * __v16si) and W to __builtin_ia32_selectd_512. */
	#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
	(__v16si)(__m512i)(W)); })

	/* Passes mask U, the _mm512_shuffle_i32x4(A, B, imm) result (viewed as
	 * __v16si) and an all-zero vector to __builtin_ia32_selectd_512. */
	#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
	(__v16si)_mm512_setzero_si512()); })

	/* Shuffle 2-element groups of 64-bit integers from A and B with
	 * __builtin_shufflevector. Result groups 0 and 1 are taken from A (indices
	 * 0-7), chosen by imm bits [1:0] and [3:2]; groups 2 and 3 from B (indices
	 * 8-15), chosen by imm bits [5:4] and [7:6]. */
	#define _mm512_shuffle_i64x2(A, B, imm) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
	(__v8di)(__m512i)(B), \
	0 + ((((imm) >> 0) & 0x3) * 2), \
	1 + ((((imm) >> 0) & 0x3) * 2), \
	0 + ((((imm) >> 2) & 0x3) * 2), \
	1 + ((((imm) >> 2) & 0x3) * 2), \
	8 + ((((imm) >> 4) & 0x3) * 2), \
	9 + ((((imm) >> 4) & 0x3) * 2), \
	8 + ((((imm) >> 6) & 0x3) * 2), \
	9 + ((((imm) >> 6) & 0x3) * 2)); })

	/* Passes mask U, the _mm512_shuffle_i64x2(A, B, imm) result and W to
	 * __builtin_ia32_selectq_512. */
	#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	(__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
	(__v8di)(__m512i)(W)); })

	/* Passes mask U, the _mm512_shuffle_i64x2(A, B, imm) result and an all-zero
	 * vector to __builtin_ia32_selectq_512. */
	#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	(__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
	(__v8di)_mm512_setzero_si512()); })

	/* Interleaving shuffle of doubles from A and B. For each pair k = 0..3,
	 * result element 2k is A[2k + bit 2k of M] and result element 2k+1 is
	 * B[2k + bit (2k+1) of M] (indices 8-15 address B). */
	#define _mm512_shuffle_pd(A, B, M) __extension__ ({ \
	(__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	0 + (((M) >> 0) & 0x1), \
	8 + (((M) >> 1) & 0x1), \
	2 + (((M) >> 2) & 0x1), \
	10 + (((M) >> 3) & 0x1), \
	4 + (((M) >> 4) & 0x1), \
	12 + (((M) >> 5) & 0x1), \
	6 + (((M) >> 6) & 0x1), \
	14 + (((M) >> 7) & 0x1)); })

	/* Passes mask U, the _mm512_shuffle_pd(A, B, M) result and W to
	 * __builtin_ia32_selectpd_512. */
	#define _mm512_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_shuffle_pd((A), (B), (M)), \
	(__v8df)(__m512d)(W)); })

	/* Passes mask U, the _mm512_shuffle_pd(A, B, M) result and an all-zero
	 * vector to __builtin_ia32_selectpd_512. */
	#define _mm512_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_shuffle_pd((A), (B), (M)), \
	(__v8df)_mm512_setzero_pd()); })

	/* Shuffle floats within each 4-element group of A and B.
	 * For every group g = 0..3 the result holds, in order:
	 *   A[4g + M[1:0]], A[4g + M[3:2]], B[4g + M[5:4]], B[4g + M[7:6]]
	 * (shuffle indices 16-31 address B). The result is a vector of 16 floats,
	 * so it is cast to __m512, matching the __v16sf operands. */
	#define _mm512_shuffle_ps(A, B, M) __extension__ ({ \
	  (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
	                                  (__v16sf)(__m512)(B), \
	                                  0 + (((M) >> 0) & 0x3), \
	                                  0 + (((M) >> 2) & 0x3), \
	                                  16 + (((M) >> 4) & 0x3), \
	                                  16 + (((M) >> 6) & 0x3), \
	                                  4 + (((M) >> 0) & 0x3), \
	                                  4 + (((M) >> 2) & 0x3), \
	                                  20 + (((M) >> 4) & 0x3), \
	                                  20 + (((M) >> 6) & 0x3), \
	                                  8 + (((M) >> 0) & 0x3), \
	                                  8 + (((M) >> 2) & 0x3), \
	                                  24 + (((M) >> 4) & 0x3), \
	                                  24 + (((M) >> 6) & 0x3), \
	                                  12 + (((M) >> 0) & 0x3), \
	                                  12 + (((M) >> 2) & 0x3), \
	                                  28 + (((M) >> 4) & 0x3), \
	                                  28 + (((M) >> 6) & 0x3)); })

	/* Passes mask U, the _mm512_shuffle_ps(A, B, M) result and W to
	 * __builtin_ia32_selectps_512. */
	#define _mm512_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
	(__v16sf)(__m512)(W)); })

	/* Passes mask U, the _mm512_shuffle_ps(A, B, M) result and an all-zero
	 * vector to __builtin_ia32_selectps_512. */
	#define _mm512_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
	(__v16sf)_mm512_setzero_ps()); })

	/* Calls __builtin_ia32_sqrtsd_round_mask on A and B with an all-zero third
	 * operand, an all-ones mask and rounding argument R. */
	#define _mm_sqrt_round_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Calls __builtin_ia32_sqrtsd_round_mask on __A and __B with __W as the
	 * third operand, mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __lhs = (__v2df) __A;
	  __v2df __rhs = (__v2df) __B;
	  __v2df __src = (__v2df) __W;
	  return (__m128d) __builtin_ia32_sqrtsd_round_mask (__lhs, __rhs, __src,
	                                                     (__mmask8) __U,
	                                                     _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_sqrtsd_round_mask on A and B with W as the third
	 * operand, mask U and rounding argument R. */
	#define _mm_mask_sqrt_round_sd(W, U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Calls __builtin_ia32_sqrtsd_round_mask on __A and __B with an all-zero
	 * third operand, mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __lhs = (__v2df) __A;
	  __v2df __rhs = (__v2df) __B;
	  __v2df __zero = (__v2df) _mm_setzero_pd ();
	  return (__m128d) __builtin_ia32_sqrtsd_round_mask (__lhs, __rhs, __zero,
	                                                     (__mmask8) __U,
	                                                     _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_sqrtsd_round_mask on A and B with an all-zero third
	 * operand, mask U and rounding argument R. */
	#define _mm_maskz_sqrt_round_sd(U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Calls __builtin_ia32_sqrtss_round_mask on A and B with an all-zero third
	 * operand, an all-ones mask and rounding argument R. */
	#define _mm_sqrt_round_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	/* Calls __builtin_ia32_sqrtss_round_mask on __A and __B with __W as the
	 * third operand, mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __lhs = (__v4sf) __A;
	  __v4sf __rhs = (__v4sf) __B;
	  __v4sf __src = (__v4sf) __W;
	  return (__m128) __builtin_ia32_sqrtss_round_mask (__lhs, __rhs, __src,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_sqrtss_round_mask on A and B with W as the third
	 * operand, mask U and rounding argument R. */
	#define _mm_mask_sqrt_round_ss(W, U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), (__mmask8)(U), \
	(int)(R)); })

	/* Calls __builtin_ia32_sqrtss_round_mask on __A and __B with an all-zero
	 * third operand, mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __lhs = (__v4sf) __A;
	  __v4sf __rhs = (__v4sf) __B;
	  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
	  return (__m128) __builtin_ia32_sqrtss_round_mask (__lhs, __rhs, __zero,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_sqrtss_round_mask on A and B with an all-zero third
	 * operand, mask U and rounding argument R. */
	#define _mm_maskz_sqrt_round_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	/* Repeats the four floats of __A four times to form a 16-float vector. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_broadcast_f32x4(__m128 __A)
	{
	  __v4sf __lane = (__v4sf)__A;
	  return (__m512)__builtin_shufflevector(__lane, __lane,
	                                         0, 1, 2, 3, 0, 1, 2, 3,
	                                         0, 1, 2, 3, 0, 1, 2, 3);
	}

	/* Passes mask __M, _mm512_broadcast_f32x4(__A) and __O to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
	{
	  __v16sf __bcast = (__v16sf)_mm512_broadcast_f32x4(__A);
	  __v16sf __other = (__v16sf)__O;
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, __bcast, __other);
	}

	/* Passes mask __M, _mm512_broadcast_f32x4(__A) and an all-zero vector to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
	{
	  __v16sf __bcast = (__v16sf)_mm512_broadcast_f32x4(__A);
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, __bcast, __zero);
	}

	/* Repeats the four doubles of __A twice to form an 8-double vector. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_broadcast_f64x4(__m256d __A)
	{
	  __v4df __half = (__v4df)__A;
	  return (__m512d)__builtin_shufflevector(__half, __half,
	                                          0, 1, 2, 3, 0, 1, 2, 3);
	}

	/* Passes mask __M, _mm512_broadcast_f64x4(__A) and __O to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
	{
	  __v8df __bcast = (__v8df)_mm512_broadcast_f64x4(__A);
	  __v8df __other = (__v8df)__O;
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, __bcast, __other);
	}

	/* Passes mask __M, _mm512_broadcast_f64x4(__A) and an all-zero vector to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
	{
	  __v8df __bcast = (__v8df)_mm512_broadcast_f64x4(__A);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, __bcast, __zero);
	}

	/* Repeats the four 32-bit integers of __A four times to form a 16-element
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_broadcast_i32x4(__m128i __A)
	{
	  __v4si __lane = (__v4si)__A;
	  return (__m512i)__builtin_shufflevector(__lane, __lane,
	                                          0, 1, 2, 3, 0, 1, 2, 3,
	                                          0, 1, 2, 3, 0, 1, 2, 3);
	}

	/* Passes mask __M, _mm512_broadcast_i32x4(__A) and __O to
	 * __builtin_ia32_selectd_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
	{
	  __v16si __bcast = (__v16si)_mm512_broadcast_i32x4(__A);
	  __v16si __other = (__v16si)__O;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, __bcast, __other);
	}

	/* Passes mask __M, _mm512_broadcast_i32x4(__A) and an all-zero vector to
	 * __builtin_ia32_selectd_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
	{
	  __v16si __bcast = (__v16si)_mm512_broadcast_i32x4(__A);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, __bcast, __zero);
	}

	/* Repeats the four 64-bit integers of __A twice to form an 8-element
	 * vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_broadcast_i64x4(__m256i __A)
	{
	  __v4di __half = (__v4di)__A;
	  return (__m512i)__builtin_shufflevector(__half, __half,
	                                          0, 1, 2, 3, 0, 1, 2, 3);
	}

	/* Passes mask __M, _mm512_broadcast_i64x4(__A) and __O to
	 * __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
	{
	  __v8di __bcast = (__v8di)_mm512_broadcast_i64x4(__A);
	  __v8di __other = (__v8di)__O;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __bcast, __other);
	}

	/* Passes mask __M, _mm512_broadcast_i64x4(__A) and an all-zero vector to
	 * __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
	{
	  __v8di __bcast = (__v8di)_mm512_broadcast_i64x4(__A);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __bcast, __zero);
	}

	/* Passes mask __M, _mm512_broadcastsd_pd(__A) and __O to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
	{
	  __v8df __bcast = (__v8df) _mm512_broadcastsd_pd(__A);
	  __v8df __other = (__v8df) __O;
	  return (__m512d)__builtin_ia32_selectpd_512(__M, __bcast, __other);
	}

	/* Passes mask __M, _mm512_broadcastsd_pd(__A) and an all-zero vector to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
	{
	  __v8df __bcast = (__v8df) _mm512_broadcastsd_pd(__A);
	  __v8df __zero = (__v8df) _mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512(__M, __bcast, __zero);
	}

	/* Passes mask __M, _mm512_broadcastss_ps(__A) and __O to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
	{
	  __v16sf __bcast = (__v16sf) _mm512_broadcastss_ps(__A);
	  __v16sf __other = (__v16sf) __O;
	  return (__m512)__builtin_ia32_selectps_512(__M, __bcast, __other);
	}

	/* Passes mask __M, _mm512_broadcastss_ps(__A) and an all-zero vector to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
	{
	  __v16sf __bcast = (__v16sf) _mm512_broadcastss_ps(__A);
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps();
	  return (__m512)__builtin_ia32_selectps_512(__M, __bcast, __zero);
	}

	/* Calls __builtin_ia32_pmovsdb512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtsepi32_epi8 (__m512i __A)
	{
	  __v16qi __fill = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, __fill,
	                                                   (__mmask16) -1);
	}

	/* Calls __builtin_ia32_pmovsdb512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16qi __fill = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovsdb512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovsdb512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16qi __zero = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovsdb512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovsdb512mem_mask with destination __P, source __A
	 * and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovsdb512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* Calls __builtin_ia32_pmovsdw512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtsepi32_epi16 (__m512i __A)
	{
	  __v16hi __fill = (__v16hi) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, __fill,
	                                                   (__mmask16) -1);
	}

	/* Calls __builtin_ia32_pmovsdw512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16hi __fill = (__v16hi) __O;
	  return (__m256i) __builtin_ia32_pmovsdw512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovsdw512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16hi __zero = (__v16hi) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovsdw512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovsdw512mem_mask with destination __P, source __A
	 * and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
	{
	  __v16hi *__dst = (__v16hi*) __P;
	  __builtin_ia32_pmovsdw512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* Calls __builtin_ia32_pmovsqb512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtsepi64_epi8 (__m512i __A)
	{
	  __v16qi __fill = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, __fill,
	                                                   (__mmask8) -1);
	}

	/* Calls __builtin_ia32_pmovsqb512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v16qi __fill = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovsqb512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovsqb512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v16qi __zero = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovsqb512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovsqb512mem_mask with destination __P, source __A
	 * and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovsqb512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* Calls __builtin_ia32_pmovsqd512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtsepi64_epi32 (__m512i __A)
	{
	  __v8si __fill = (__v8si) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, __fill,
	                                                   (__mmask8) -1);
	}

	/* Calls __builtin_ia32_pmovsqd512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8si __fill = (__v8si) __O;
	  return (__m256i) __builtin_ia32_pmovsqd512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovsqd512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8si __zero = (__v8si) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovsqd512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovsqd512mem_mask with destination __P, source __A
	 * and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v8si *__dst = (__v8si *) __P;
	  __builtin_ia32_pmovsqd512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* Calls __builtin_ia32_pmovsqw512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtsepi64_epi16 (__m512i __A)
	{
	  __v8hi __fill = (__v8hi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, __fill,
	                                                   (__mmask8) -1);
	}

	/* Calls __builtin_ia32_pmovsqw512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8hi __fill = (__v8hi) __O;
	  return (__m128i) __builtin_ia32_pmovsqw512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovsqw512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8hi __zero = (__v8hi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovsqw512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovsqw512mem_mask with destination __P, source __A
	 * and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
	{
	  __v8hi *__dst = (__v8hi *) __P;
	  __builtin_ia32_pmovsqw512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* Calls __builtin_ia32_pmovusdb512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtusepi32_epi8 (__m512i __A)
	{
	  __v16qi __fill = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, __fill,
	                                                    (__mmask16) -1);
	}

	/* Calls __builtin_ia32_pmovusdb512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16qi __fill = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovusdb512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovusdb512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16qi __zero = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovusdb512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovusdb512mem_mask with destination __P, source
	 * __A and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovusdb512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* Calls __builtin_ia32_pmovusdw512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtusepi32_epi16 (__m512i __A)
	{
	  __v16hi __fill = (__v16hi) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, __fill,
	                                                    (__mmask16) -1);
	}

	/* Calls __builtin_ia32_pmovusdw512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16hi __fill = (__v16hi) __O;
	  return (__m256i) __builtin_ia32_pmovusdw512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovusdw512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16hi __zero = (__v16hi) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovusdw512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovusdw512mem_mask with destination __P, source
	 * __A and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
	{
	  __v16hi *__dst = (__v16hi*) __P;
	  __builtin_ia32_pmovusdw512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* Calls __builtin_ia32_pmovusqb512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtusepi64_epi8 (__m512i __A)
	{
	  __v16qi __fill = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, __fill,
	                                                    (__mmask8) -1);
	}

	/* Calls __builtin_ia32_pmovusqb512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v16qi __fill = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovusqb512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovusqb512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v16qi __zero = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovusqb512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovusqb512mem_mask with destination __P, source
	 * __A and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovusqb512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* Calls __builtin_ia32_pmovusqd512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtusepi64_epi32 (__m512i __A)
	{
	  __v8si __fill = (__v8si) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, __fill,
	                                                    (__mmask8) -1);
	}

	/* Calls __builtin_ia32_pmovusqd512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8si __fill = (__v8si) __O;
	  return (__m256i) __builtin_ia32_pmovusqd512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovusqd512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8si __zero = (__v8si) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovusqd512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovusqd512mem_mask with destination __P, source
	 * __A and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
	{
	  __v8si *__dst = (__v8si*) __P;
	  __builtin_ia32_pmovusqd512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* Calls __builtin_ia32_pmovusqw512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtusepi64_epi16 (__m512i __A)
	{
	  __v8hi __fill = (__v8hi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, __fill,
	                                                    (__mmask8) -1);
	}

	/* Calls __builtin_ia32_pmovusqw512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8hi __fill = (__v8hi) __O;
	  return (__m128i) __builtin_ia32_pmovusqw512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovusqw512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8hi __zero = (__v8hi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovusqw512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovusqw512mem_mask with destination __P, source
	 * __A and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v8hi *__dst = (__v8hi*) __P;
	  __builtin_ia32_pmovusqw512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* Calls __builtin_ia32_pmovdb512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtepi32_epi8 (__m512i __A)
	{
	  __v16qi __fill = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, __fill,
	                                                  (__mmask16) -1);
	}

	/* Calls __builtin_ia32_pmovdb512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16qi __fill = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovdb512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovdb512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16qi __zero = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovdb512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovdb512mem_mask with destination __P, source __A
	 * and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovdb512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* Calls __builtin_ia32_pmovdw512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtepi32_epi16 (__m512i __A)
	{
	  __v16hi __fill = (__v16hi) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, __fill,
	                                                  (__mmask16) -1);
	}

	/* Calls __builtin_ia32_pmovdw512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16hi __fill = (__v16hi) __O;
	  return (__m256i) __builtin_ia32_pmovdw512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovdw512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16hi __zero = (__v16hi) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovdw512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovdw512mem_mask with destination __P, source __A
	 * and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
	{
	  __v16hi *__dst = (__v16hi *) __P;
	  __builtin_ia32_pmovdw512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* Calls __builtin_ia32_pmovqb512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtepi64_epi8 (__m512i __A)
	{
	  __v16qi __fill = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, __fill,
	                                                  (__mmask8) -1);
	}

	/* Calls __builtin_ia32_pmovqb512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v16qi __fill = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovqb512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovqb512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v16qi __zero = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovqb512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovqb512mem_mask with destination __P, source __A
	 * and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovqb512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* Calls __builtin_ia32_pmovqd512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtepi64_epi32 (__m512i __A)
	{
	  __v8si __fill = (__v8si) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, __fill,
	                                                  (__mmask8) -1);
	}

	/* Calls __builtin_ia32_pmovqd512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8si __fill = (__v8si) __O;
	  return (__m256i) __builtin_ia32_pmovqd512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovqd512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8si __zero = (__v8si) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovqd512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovqd512mem_mask with destination __P, source __A
	 * and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
	{
	  __v8si *__dst = (__v8si *) __P;
	  __builtin_ia32_pmovqd512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* Calls __builtin_ia32_pmovqw512_mask on __A with an undefined second
	 * operand and an all-ones mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtepi64_epi16 (__m512i __A)
	{
	  __v8hi __fill = (__v8hi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, __fill,
	                                                  (__mmask8) -1);
	}

	/* Calls __builtin_ia32_pmovqw512_mask on __A with __O as the second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8hi __fill = (__v8hi) __O;
	  return (__m128i) __builtin_ia32_pmovqw512_mask (__src, __fill, __M);
	}

	/* Calls __builtin_ia32_pmovqw512_mask on __A with an all-zero second
	 * operand and mask __M. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8hi __zero = (__v8hi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovqw512_mask (__src, __zero, __M);
	}

	/* Calls __builtin_ia32_pmovqw512mem_mask with destination __P, source __A
	 * and mask __M. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v8hi *__dst = (__v8hi *) __P;
	  __builtin_ia32_pmovqw512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* Extract one group of four 32-bit integers from A: imm bits [1:0] pick
	 * which group (elements 4*k .. 4*k+3). The second shuffle operand is an
	 * undefined vector; all indices stay within 0-15, so only A is read. */
	#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \
	(__m128i)__builtin_shufflevector((__v16si)(__m512i)(A), \
	(__v16si)_mm512_undefined_epi32(), \
	0 + ((imm) & 0x3) * 4, \
	1 + ((imm) & 0x3) * 4, \
	2 + ((imm) & 0x3) * 4, \
	3 + ((imm) & 0x3) * 4); })

	/* Passes mask U, the _mm512_extracti32x4_epi32(A, imm) result and W to
	 * __builtin_ia32_selectd_128. */
	#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
	(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
	(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
	(__v4si)(W)); })

	/* Passes mask U, the _mm512_extracti32x4_epi32(A, imm) result and an
	 * all-zero vector to __builtin_ia32_selectd_128. */
	#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
	(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
	(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
	(__v4si)_mm_setzero_si128()); })

	/* Extract four 64-bit integers from A: elements 4-7 when bit 0 of imm is
	 * set, elements 0-3 otherwise. The second shuffle operand is an undefined
	 * vector; all indices stay within 0-7, so only A is read. */
	#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \
	(__m256i)__builtin_shufflevector((__v8di)(__m512i)(A), \
	(__v8di)_mm512_undefined_epi32(), \
	((imm) & 1) ? 4 : 0, \
	((imm) & 1) ? 5 : 1, \
	((imm) & 1) ? 6 : 2, \
	((imm) & 1) ? 7 : 3); })

	/* Passes mask U, the _mm512_extracti64x4_epi64(A, imm) result and W to
	 * __builtin_ia32_selectq_256. */
	#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
	(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
	(__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
	(__v4di)(W)); })

	/* Passes mask U, the _mm512_extracti64x4_epi64(A, imm) result and an
	 * all-zero vector to __builtin_ia32_selectq_256. */
	#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
	(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
	(__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
	(__v4di)_mm256_setzero_si256()); })

	/* Replace one half of A with the four doubles of B. B is first widened
	 * with _mm512_castpd256_pd512 so it can be the second shuffle operand
	 * (indices 8-11 address its four doubles). When bit 0 of imm is set the
	 * result is A[0..3] followed by B; otherwise it is B followed by
	 * A[4..7]. */
	#define _mm512_insertf64x4(A, B, imm) __extension__ ({ \
	(__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
	(__v8df)_mm512_castpd256_pd512((__m256d)(B)), \
	((imm) & 0x1) ? 0 : 8, \
	((imm) & 0x1) ? 1 : 9, \
	((imm) & 0x1) ? 2 : 10, \
	((imm) & 0x1) ? 3 : 11, \
	((imm) & 0x1) ? 8 : 4, \
	((imm) & 0x1) ? 9 : 5, \
	((imm) & 0x1) ? 10 : 6, \
	((imm) & 0x1) ? 11 : 7); })

	/* Passes mask U, the _mm512_insertf64x4(A, B, imm) result and W to
	 * __builtin_ia32_selectpd_512. */
	#define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_insertf64x4((A), (B), (imm)), \
	(__v8df)(W)); })

	/* Passes mask U, the _mm512_insertf64x4(A, B, imm) result and an all-zero
	 * vector to __builtin_ia32_selectpd_512. */
	#define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_insertf64x4((A), (B), (imm)), \
	(__v8df)_mm512_setzero_pd()); })

	/* Replace one half of A with the four 64-bit integers of B. B is first
	 * widened with _mm512_castsi256_si512 so it can be the second shuffle
	 * operand (indices 8-11 address its four elements). When bit 0 of imm is
	 * set the result is A[0..3] followed by B; otherwise it is B followed by
	 * A[4..7]. */
	#define _mm512_inserti64x4(A, B, imm) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
	(__v8di)_mm512_castsi256_si512((__m256i)(B)), \
	((imm) & 0x1) ? 0 : 8, \
	((imm) & 0x1) ? 1 : 9, \
	((imm) & 0x1) ? 2 : 10, \
	((imm) & 0x1) ? 3 : 11, \
	((imm) & 0x1) ? 8 : 4, \
	((imm) & 0x1) ? 9 : 5, \
	((imm) & 0x1) ? 10 : 6, \
	((imm) & 0x1) ? 11 : 7); })

	/* Passes mask U, the _mm512_inserti64x4(A, B, imm) result and W to
	 * __builtin_ia32_selectq_512. */
	#define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	(__v8di)_mm512_inserti64x4((A), (B), (imm)), \
	(__v8di)(W)); })

	/* Passes mask U, the _mm512_inserti64x4(A, B, imm) result and an all-zero
	 * vector to __builtin_ia32_selectq_512. */
	#define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	(__v8di)_mm512_inserti64x4((A), (B), (imm)), \
	(__v8di)_mm512_setzero_si512()); })

	/* Replace one 4-float group of A with the four floats of B. B is first
	 * widened with _mm512_castps128_ps512 so it can be the second shuffle
	 * operand (indices 16-19 address its four floats). The group whose number
	 * equals imm bits [1:0] takes indices 16-19; every other group keeps A's
	 * own elements. */
	#define _mm512_insertf32x4(A, B, imm) __extension__ ({ \
	(__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
	(__v16sf)_mm512_castps128_ps512((__m128)(B)),\
	(((imm) & 0x3) == 0) ? 16 : 0, \
	(((imm) & 0x3) == 0) ? 17 : 1, \
	(((imm) & 0x3) == 0) ? 18 : 2, \
	(((imm) & 0x3) == 0) ? 19 : 3, \
	(((imm) & 0x3) == 1) ? 16 : 4, \
	(((imm) & 0x3) == 1) ? 17 : 5, \
	(((imm) & 0x3) == 1) ? 18 : 6, \
	(((imm) & 0x3) == 1) ? 19 : 7, \
	(((imm) & 0x3) == 2) ? 16 : 8, \
	(((imm) & 0x3) == 2) ? 17 : 9, \
	(((imm) & 0x3) == 2) ? 18 : 10, \
	(((imm) & 0x3) == 2) ? 19 : 11, \
	(((imm) & 0x3) == 3) ? 16 : 12, \
	(((imm) & 0x3) == 3) ? 17 : 13, \
	(((imm) & 0x3) == 3) ? 18 : 14, \
	(((imm) & 0x3) == 3) ? 19 : 15); })

	/* Passes mask U, the _mm512_insertf32x4(A, B, imm) result and W to
	 * __builtin_ia32_selectps_512. */
	#define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
	(__v16sf)(W)); })

	/* Passes mask U, the _mm512_insertf32x4(A, B, imm) result and an all-zero
	 * vector to __builtin_ia32_selectps_512. */
	#define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
	(__v16sf)_mm512_setzero_ps()); })

	/* Replace one group of four 32-bit integers in A with the four integers of
	 * B. B is first widened with _mm512_castsi128_si512 so it can be the
	 * second shuffle operand (indices 16-19 address its four elements). The
	 * group whose number equals imm bits [1:0] takes indices 16-19; every
	 * other group keeps A's own elements. */
	#define _mm512_inserti32x4(A, B, imm) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
	(__v16si)_mm512_castsi128_si512((__m128i)(B)),\
	(((imm) & 0x3) == 0) ? 16 : 0, \
	(((imm) & 0x3) == 0) ? 17 : 1, \
	(((imm) & 0x3) == 0) ? 18 : 2, \
	(((imm) & 0x3) == 0) ? 19 : 3, \
	(((imm) & 0x3) == 1) ? 16 : 4, \
	(((imm) & 0x3) == 1) ? 17 : 5, \
	(((imm) & 0x3) == 1) ? 18 : 6, \
	(((imm) & 0x3) == 1) ? 19 : 7, \
	(((imm) & 0x3) == 2) ? 16 : 8, \
	(((imm) & 0x3) == 2) ? 17 : 9, \
	(((imm) & 0x3) == 2) ? 18 : 10, \
	(((imm) & 0x3) == 2) ? 19 : 11, \
	(((imm) & 0x3) == 3) ? 16 : 12, \
	(((imm) & 0x3) == 3) ? 17 : 13, \
	(((imm) & 0x3) == 3) ? 18 : 14, \
	(((imm) & 0x3) == 3) ? 19 : 15); })

	/* Passes mask U, the _mm512_inserti32x4(A, B, imm) result and W to
	 * __builtin_ia32_selectd_512. */
	#define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_inserti32x4((A), (B), (imm)), \
	(__v16si)(W)); })

	/* Passes mask U, the _mm512_inserti32x4(A, B, imm) result and an all-zero
	 * vector to __builtin_ia32_selectd_512. */
	#define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_inserti32x4((A), (B), (imm)), \
	(__v16si)_mm512_setzero_si512()); })

	/* Calls __builtin_ia32_getmantpd512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); the third operand is an undefined vector, the mask is
	 * all-ones and R is the rounding argument. */
	#define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	                                            (int)(((C)<<2) | (B)), \
	                                            (__v8df)_mm512_undefined_pd(), \
	                                            (__mmask8)-1, (int)(R)); })

	/* Calls __builtin_ia32_getmantpd512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); W is the third operand, U the mask and R the rounding
	 * argument. */
	#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	                                            (int)(((C)<<2) | (B)), \
	                                            (__v8df)(__m512d)(W), \
	                                            (__mmask8)(U), (int)(R)); })

	/* Calls __builtin_ia32_getmantpd512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); the third operand is an all-zero vector, U is the mask
	 * and R the rounding argument. */
	#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	                                            (int)(((C)<<2) | (B)), \
	                                            (__v8df)_mm512_setzero_pd(), \
	                                            (__mmask8)(U), (int)(R)); })

	/* Calls __builtin_ia32_getmantpd512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); the third operand is an all-zero vector, the mask is
	 * all-ones and the rounding argument is _MM_FROUND_CUR_DIRECTION. */
	#define _mm512_getmant_pd(A, B, C) __extension__ ({ \
	  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	                                            (int)(((C)<<2) | (B)), \
	                                            (__v8df)_mm512_setzero_pd(), \
	                                            (__mmask8)-1, \
	                                            _MM_FROUND_CUR_DIRECTION); })

	/* Calls __builtin_ia32_getmantpd512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); W is the third operand, U the mask and the rounding
	 * argument is _MM_FROUND_CUR_DIRECTION. */
	#define _mm512_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \
	  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	                                            (int)(((C)<<2) | (B)), \
	                                            (__v8df)(__m512d)(W), \
	                                            (__mmask8)(U), \
	                                            _MM_FROUND_CUR_DIRECTION); })

	/* Calls __builtin_ia32_getmantpd512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); the third operand is an all-zero vector, U is the mask
	 * and the rounding argument is _MM_FROUND_CUR_DIRECTION. */
	#define _mm512_maskz_getmant_pd(U, A, B, C) __extension__ ({ \
	  (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	                                            (int)(((C)<<2) | (B)), \
	                                            (__v8df)_mm512_setzero_pd(), \
	                                            (__mmask8)(U), \
	                                            _MM_FROUND_CUR_DIRECTION); })

	/* Calls __builtin_ia32_getmantps512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); the third operand is an undefined vector, the mask is
	 * all-ones and R is the rounding argument. */
	#define _mm512_getmant_round_ps(A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	                                           (int)(((C)<<2) | (B)), \
	                                           (__v16sf)_mm512_undefined_ps(), \
	                                           (__mmask16)-1, (int)(R)); })

	/* Calls __builtin_ia32_getmantps512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); W is the third operand, U the mask and R the rounding
	 * argument. */
	#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	                                           (int)(((C)<<2) | (B)), \
	                                           (__v16sf)(__m512)(W), \
	                                           (__mmask16)(U), (int)(R)); })

	/* Calls __builtin_ia32_getmantps512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); the third operand is an all-zero vector, U is the mask
	 * and R the rounding argument. */
	#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	                                           (int)(((C)<<2) | (B)), \
	                                           (__v16sf)_mm512_setzero_ps(), \
	                                           (__mmask16)(U), (int)(R)); })

	/* Calls __builtin_ia32_getmantps512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); the third operand is an undefined vector, the mask is
	 * all-ones and the rounding argument is _MM_FROUND_CUR_DIRECTION. */
	#define _mm512_getmant_ps(A, B, C) __extension__ ({ \
	  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	                                           (int)(((C)<<2)|(B)), \
	                                           (__v16sf)_mm512_undefined_ps(), \
	                                           (__mmask16)-1, \
	                                           _MM_FROUND_CUR_DIRECTION); })

	/* Calls __builtin_ia32_getmantps512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); W is the third operand, U the mask and the rounding
	 * argument is _MM_FROUND_CUR_DIRECTION. */
	#define _mm512_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
	  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	                                           (int)(((C)<<2)|(B)), \
	                                           (__v16sf)(__m512)(W), \
	                                           (__mmask16)(U), \
	                                           _MM_FROUND_CUR_DIRECTION); })

	/* Calls __builtin_ia32_getmantps512_mask on A. The immediate is built as
	 * ((C) << 2) | (B); the third operand is an all-zero vector, U is the mask
	 * and the rounding argument is _MM_FROUND_CUR_DIRECTION. */
	#define _mm512_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
	  (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	                                           (int)(((C)<<2)|(B)), \
	                                           (__v16sf)_mm512_setzero_ps(), \
	                                           (__mmask16)(U), \
	                                           _MM_FROUND_CUR_DIRECTION); })

	/* Calls __builtin_ia32_getexppd512_mask on A with an undefined second
	 * operand, an all-ones mask and rounding argument R. */
	#define _mm512_getexp_round_pd(A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
	(__v8df)_mm512_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Calls __builtin_ia32_getexppd512_mask on A with W as the second operand,
	 * mask U and rounding argument R. */
	#define _mm512_mask_getexp_round_pd(W, U, A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Calls __builtin_ia32_getexppd512_mask on A with an all-zero second
	 * operand, mask U and rounding argument R. */
	#define _mm512_maskz_getexp_round_pd(U, A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Calls __builtin_ia32_getexppd512_mask on __A with an undefined second
	 * operand, an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_getexp_pd (__m512d __A)
	{
	  __v8df __fill = (__v8df) _mm512_undefined_pd ();
	  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, __fill,
	                                                    (__mmask8) -1,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_getexppd512_mask on __A with __W as the second
	 * operand, mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) __A;
	  __v8df __fill = (__v8df) __W;
	  return (__m512d) __builtin_ia32_getexppd512_mask (__src, __fill,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_getexppd512_mask on __A with an all-zero second
	 * operand, mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) __A;
	  __v8df __zero = (__v8df) _mm512_setzero_pd ();
	  return (__m512d) __builtin_ia32_getexppd512_mask (__src, __zero,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_getexpps512_mask on A with an undefined second
	 * operand, an all-ones mask and rounding argument R. */
	#define _mm512_getexp_round_ps(A, R) __extension__ ({ \
	(__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)_mm512_undefined_ps(), \
	(__mmask16)-1, (int)(R)); })

	/* Calls __builtin_ia32_getexpps512_mask on A with W as the second operand,
	 * mask U and rounding argument R. */
	#define _mm512_mask_getexp_round_ps(W, U, A, R) __extension__ ({ \
	(__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(W), \
	(__mmask16)(U), (int)(R)); })

	/* Calls __builtin_ia32_getexpps512_mask on A with an all-zero second
	 * operand, mask U and rounding argument R. */
	#define _mm512_maskz_getexp_round_ps(U, A, R) __extension__ ({ \
	(__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), (int)(R)); })

	/* Calls __builtin_ia32_getexpps512_mask on __A with an undefined second
	 * operand, an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_getexp_ps (__m512 __A)
	{
	  __v16sf __fill = (__v16sf) _mm512_undefined_ps ();
	  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, __fill,
	                                                   (__mmask16) -1,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Calls __builtin_ia32_getexpps512_mask on __A with __W as the second
	 * operand, mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) __A;
	  __v16sf __fill = (__v16sf) __W;
	  return (__m512) __builtin_ia32_getexpps512_mask (__src, __fill,
	                                                   (__mmask16) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
	{
	return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
	(__v16sf) _mm512_setzero_ps (),
	(__mmask16) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Gather macros.  Each forwards to a __builtin_ia32_gather* builtin with
	 * the operands (pass-through vector, base address, index vector, mask,
	 * scale).  Unmasked forms pass an undefined pass-through vector and an
	 * all-ones mask; _mask_ forms pass v1_old and the caller's mask.
	 * i64 variants take a 512-bit index cast to __v8di; i32 variants take
	 * either a 512-bit index (16 lanes) or a 256-bit index (8 lanes).
	 *
	 * NOTE(review): _mm512_i64gather_epi32 and _mm512_i64gather_epi64 obtain
	 * their integer pass-through by bit-casting a floating-point undefined
	 * vector, and the _mm512_*i32gather_ps forms cast the index through
	 * __m512/__v16sf.  These look inconsistent with their siblings;
	 * presumably they mirror the builtin prototypes -- confirm before
	 * changing. */
	#define _mm512_i64gather_ps(index, addr, scale) __extension__ ({ \
	(__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
	(float const *)(addr), \
	(__v8di)(__m512i)(index), (__mmask8)-1, \
	(int)(scale)); })

	#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__({\
	(__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
	(float const *)(addr), \
	(__v8di)(__m512i)(index), \
	(__mmask8)(mask), (int)(scale)); })

	#define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\
	(__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_ps(), \
	(int const *)(addr), \
	(__v8di)(__m512i)(index), \
	(__mmask8)-1, (int)(scale)); })

	#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
	(__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
	(int const *)(addr), \
	(__v8di)(__m512i)(index), \
	(__mmask8)(mask), (int)(scale)); })

	#define _mm512_i64gather_pd(index, addr, scale) __extension__ ({\
	(__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
	(double const *)(addr), \
	(__v8di)(__m512i)(index), (__mmask8)-1, \
	(int)(scale)); })

	#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
	(__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
	(double const *)(addr), \
	(__v8di)(__m512i)(index), \
	(__mmask8)(mask), (int)(scale)); })

	#define _mm512_i64gather_epi64(index, addr, scale) __extension__ ({\
	(__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_pd(), \
	(long long const *)(addr), \
	(__v8di)(__m512i)(index), (__mmask8)-1, \
	(int)(scale)); })

	#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
	(__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
	(long long const *)(addr), \
	(__v8di)(__m512i)(index), \
	(__mmask8)(mask), (int)(scale)); })

	#define _mm512_i32gather_ps(index, addr, scale) __extension__ ({\
	(__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
	(float const *)(addr), \
	(__v16sf)(__m512)(index), \
	(__mmask16)-1, (int)(scale)); })

	#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\
	(__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
	(float const *)(addr), \
	(__v16sf)(__m512)(index), \
	(__mmask16)(mask), (int)(scale)); })

	#define _mm512_i32gather_epi32(index, addr, scale) __extension__ ({\
	(__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
	(int const *)(addr), \
	(__v16si)(__m512i)(index), \
	(__mmask16)-1, (int)(scale)); })

	#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\
	(__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
	(int const *)(addr), \
	(__v16si)(__m512i)(index), \
	(__mmask16)(mask), (int)(scale)); })

	#define _mm512_i32gather_pd(index, addr, scale) __extension__ ({\
	(__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
	(double const *)(addr), \
	(__v8si)(__m256i)(index), (__mmask8)-1, \
	(int)(scale)); })

	#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\
	(__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
	(double const *)(addr), \
	(__v8si)(__m256i)(index), \
	(__mmask8)(mask), (int)(scale)); })

	#define _mm512_i32gather_epi64(index, addr, scale) __extension__ ({\
	(__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
	(long long const *)(addr), \
	(__v8si)(__m256i)(index), (__mmask8)-1, \
	(int)(scale)); })

	#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
	(__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
	(long long const *)(addr), \
	(__v8si)(__m256i)(index), \
	(__mmask8)(mask), (int)(scale)); })

	/* Scatter macros.  Each forwards to a __builtin_ia32_scatter* builtin with
	 * the operands (base address, mask, index vector, value vector, scale).
	 * Unmasked forms pass an all-ones mask; _mask_ forms pass the caller's
	 * mask.  The macros yield no value (the builtin call is the only
	 * statement).  i64 variants cast the index to __v8di; i32 variants use a
	 * 16-lane (__v16si) or 8-lane (__v8si) index depending on element count. */
	#define _mm512_i64scatter_ps(addr, index, v1, scale) __extension__ ({\
	__builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)-1, \
	(__v8di)(__m512i)(index), \
	(__v8sf)(__m256)(v1), (int)(scale)); })

	#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({\
	__builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)(mask), \
	(__v8di)(__m512i)(index), \
	(__v8sf)(__m256)(v1), (int)(scale)); })

	#define _mm512_i64scatter_epi32(addr, index, v1, scale) __extension__ ({\
	__builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)-1, \
	(__v8di)(__m512i)(index), \
	(__v8si)(__m256i)(v1), (int)(scale)); })

	#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({\
	__builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)(mask), \
	(__v8di)(__m512i)(index), \
	(__v8si)(__m256i)(v1), (int)(scale)); })

	#define _mm512_i64scatter_pd(addr, index, v1, scale) __extension__ ({\
	__builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)-1, \
	(__v8di)(__m512i)(index), \
	(__v8df)(__m512d)(v1), (int)(scale)); })

	#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({\
	__builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)(mask), \
	(__v8di)(__m512i)(index), \
	(__v8df)(__m512d)(v1), (int)(scale)); })

	#define _mm512_i64scatter_epi64(addr, index, v1, scale) __extension__ ({\
	__builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)-1, \
	(__v8di)(__m512i)(index), \
	(__v8di)(__m512i)(v1), (int)(scale)); })

	#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\
	__builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)(mask), \
	(__v8di)(__m512i)(index), \
	(__v8di)(__m512i)(v1), (int)(scale)); })

	#define _mm512_i32scatter_ps(addr, index, v1, scale) __extension__ ({\
	__builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)-1, \
	(__v16si)(__m512i)(index), \
	(__v16sf)(__m512)(v1), (int)(scale)); })

	#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({\
	__builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)(mask), \
	(__v16si)(__m512i)(index), \
	(__v16sf)(__m512)(v1), (int)(scale)); })

	#define _mm512_i32scatter_epi32(addr, index, v1, scale) __extension__ ({\
	__builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)-1, \
	(__v16si)(__m512i)(index), \
	(__v16si)(__m512i)(v1), (int)(scale)); })

	#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({\
	__builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)(mask), \
	(__v16si)(__m512i)(index), \
	(__v16si)(__m512i)(v1), (int)(scale)); })

	#define _mm512_i32scatter_pd(addr, index, v1, scale) __extension__ ({\
	__builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)-1, \
	(__v8si)(__m256i)(index), \
	(__v8df)(__m512d)(v1), (int)(scale)); })

	#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({\
	__builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)(mask), \
	(__v8si)(__m256i)(index), \
	(__v8df)(__m512d)(v1), (int)(scale)); })

	#define _mm512_i32scatter_epi64(addr, index, v1, scale) __extension__ ({\
	__builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)-1, \
	(__v8si)(__m256i)(index), \
	(__v8di)(__m512i)(v1), (int)(scale)); })

	#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\
	__builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)(mask), \
	(__v8si)(__m256i)(index), \
	(__v8di)(__m512i)(v1), (int)(scale)); })

	/* Masked scalar single-precision fused multiply-add.  The _mask builtin
	 * receives (__W, __A, __B), the _maskz builtin receives (__A, __B, __C);
	 * both take the mask and a rounding argument.  Function forms use
	 * _MM_FROUND_CUR_DIRECTION, _round macros forward R. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
	(__v4sf) __A,
	(__v4sf) __B,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\
	(__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
	(__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), (__mmask8)(U), \
	(int)(R)); })

	/* Zero-masking variant: uses the _maskz builtin. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
	{
	return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
	(__v4sf) __B,
	(__v4sf) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked scalar single-precision fused multiply-add with explicit
	 * rounding.  Forwards A, B, C and mask U to
	 * __builtin_ia32_vfmaddss3_maskz.  R must be forwarded as the rounding
	 * operand (not replaced by _MM_FROUND_CUR_DIRECTION), otherwise the
	 * caller's requested rounding mode is silently ignored. */
	#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) __extension__ ({\
	(__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(C), (__mmask8)(U), \
	(int)(R)); })

	/* mask3 scalar fused multiply-add: operands (__W, __X, __Y) go to the
	 * _mask3 builtin together with the mask and rounding argument. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
	{
	return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
	(__v4sf) __X,
	(__v4sf) __Y,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) __extension__ ({\
	(__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
	(__v4sf)(__m128)(X), \
	(__v4sf)(__m128)(Y), (__mmask8)(U), \
	(int)(R)); })

	/* Multiply-subtract is expressed as multiply-add with the addend (__B)
	 * negated. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
	(__v4sf) __A,
	-(__v4sf) __B,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Merge-masked scalar single-precision fused multiply-subtract with
	 * explicit rounding R.  Implemented as a multiply-add with the addend B
	 * negated, matching _mm_mask_fmsub_ss; without the negation this macro
	 * would compute a multiply-add instead. */
	#define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\
	(__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
	(__v4sf)(__m128)(A), \
	-(__v4sf)(__m128)(B), (__mmask8)(U), \
	(int)(R)); })

	/* Zero-masked multiply-subtract: multiply-add with the addend (__C)
	 * negated. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
	{
	return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
	(__v4sf) __B,
	-(__v4sf) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) __extension__ ({\
	(__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	-(__v4sf)(__m128)(C), (__mmask8)(U), \
	(int)(R)); })

	/* mask3 multiply-subtract uses the dedicated vfmsubss3_mask3 builtin
	 * instead of negating an operand. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
	{
	return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
	(__v4sf) __X,
	(__v4sf) __Y,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\
	(__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
	(__v4sf)(__m128)(X), \
	(__v4sf)(__m128)(Y), (__mmask8)(U), \
	(int)(R)); })

	/* Negated multiply-add: the second multiplicand (__A) is negated; __W
	 * stays untouched as the first operand. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
	-(__v4sf) __A,
	(__v4sf) __B,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\
	(__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
	-(__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), (__mmask8)(U), \
	(int)(R)); })

	/* Zero-masked scalar single-precision negated multiply-add:
	 * lower element = -(__A[0] * __B[0]) + __C[0] when bit 0 of __U is set.
	 * The negation is applied to __B, not __A: per the Intel definition the
	 * upper three elements of the result are copied from the first operand,
	 * so negating the whole __A vector would flip the sign of those copied
	 * lanes.  -(a*b) == a*(-b), so the lower element is unaffected. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
	{
	return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
	-(__v4sf) __B,
	(__v4sf) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked scalar single-precision negated multiply-add with explicit
	 * rounding R.  B is negated instead of A because the first operand also
	 * supplies the upper three result elements (per the Intel definition);
	 * negating A would corrupt their sign.  The lower-element product is the
	 * same either way. */
	#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) __extension__ ({\
	(__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
	-(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(C), (__mmask8)(U), \
	(int)(R)); })

	/* mask3 negated multiply-add: the first multiplicand (__W) is negated and
	 * the _mask3 multiply-add builtin is reused. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
	{
	return (__m128) __builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W,
	(__v4sf) __X,
	(__v4sf) __Y,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) __extension__({\
	(__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
	(__v4sf)(__m128)(X), \
	(__v4sf)(__m128)(Y), (__mmask8)(U), \
	(int)(R)); })

	/* Negated multiply-subtract: both the second multiplicand (__A) and the
	 * addend (__B) are negated. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
	-(__v4sf) __A,
	-(__v4sf) __B,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\
	(__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
	-(__v4sf)(__m128)(A), \
	-(__v4sf)(__m128)(B), (__mmask8)(U), \
	(int)(R)); })

	/* Zero-masked scalar single-precision negated multiply-subtract:
	 * lower element = -(__A[0] * __B[0]) - __C[0] when bit 0 of __U is set.
	 * The product is negated through __B rather than __A, because the first
	 * operand also provides the upper three result elements (per the Intel
	 * definition) and must reach the builtin unmodified. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
	{
	return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A,
	-(__v4sf) __B,
	-(__v4sf) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked scalar single-precision negated multiply-subtract with
	 * explicit rounding.  R is forwarded as the rounding operand so the
	 * caller's mode is honoured (it must not be replaced by
	 * _MM_FROUND_CUR_DIRECTION).  The product is negated through B rather
	 * than A, since the first operand also supplies the upper three result
	 * elements per the Intel definition. */
	#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) __extension__ ({\
	(__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
	-(__v4sf)(__m128)(B), \
	-(__v4sf)(__m128)(C), (__mmask8)(U), \
	(int)(R)); })

	/* mask3 negated multiply-subtract uses the dedicated vfnmsubss3_mask3
	 * builtin; no operand is negated here. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
	{
	return (__m128) __builtin_ia32_vfnmsubss3_mask3 ((__v4sf) __W,
	(__v4sf) __X,
	(__v4sf) __Y,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\
	(__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \
	(__v4sf)(__m128)(X), \
	(__v4sf)(__m128)(Y), (__mmask8)(U), \
	(int)(R)); })

	/* Double-precision counterparts start here: same operand layout as the
	 * _ss forms, using __v2df vectors and the vfmaddsd3_* builtins. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
	(__v2df) __A,
	(__v2df) __B,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\
	(__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
	(__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), (__mmask8)(U), \
	(int)(R)); })

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
	{
	return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
	(__v2df) __B,
	(__v2df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked scalar double-precision fused multiply-add with explicit
	 * rounding.  Forwards A, B, C and mask U to
	 * __builtin_ia32_vfmaddsd3_maskz.  R must be forwarded as the rounding
	 * operand (not replaced by _MM_FROUND_CUR_DIRECTION), otherwise the
	 * caller's requested rounding mode is silently ignored. */
	#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) __extension__ ({\
	(__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(C), (__mmask8)(U), \
	(int)(R)); })

	/* Scalar double-precision FMA variants.  Subtract forms negate the addend
	 * and reuse the multiply-add builtin, except the mask3 subtract form which
	 * calls vfmsubsd3_mask3.  Function forms use _MM_FROUND_CUR_DIRECTION;
	 * _round macros forward R. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
	{
	return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
	(__v2df) __X,
	(__v2df) __Y,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) __extension__ ({\
	(__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
	(__v2df)(__m128d)(X), \
	(__v2df)(__m128d)(Y), (__mmask8)(U), \
	(int)(R)); })

	/* Multiply-subtract: addend __B negated. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
	(__v2df) __A,
	-(__v2df) __B,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\
	(__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
	(__v2df)(__m128d)(A), \
	-(__v2df)(__m128d)(B), (__mmask8)(U), \
	(int)(R)); })

	/* Zero-masked multiply-subtract: addend __C negated. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
	{
	return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
	(__v2df) __B,
	-(__v2df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) __extension__ ({\
	(__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	-(__v2df)(__m128d)(C), \
	(__mmask8)(U), (int)(R)); })

	/* mask3 multiply-subtract: dedicated builtin, no negation. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
	{
	return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
	(__v2df) __X,
	(__v2df) __Y,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\
	(__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
	(__v2df)(__m128d)(X), \
	(__v2df)(__m128d)(Y), \
	(__mmask8)(U), (int)(R)); })

	/* Negated multiply-add: second multiplicand __A negated, __W passed
	 * unmodified as the first operand. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
	-(__v2df) __A,
	(__v2df) __B,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\
	(__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
	-(__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), (__mmask8)(U), \
	(int)(R)); })

	/* Zero-masked scalar double-precision negated multiply-add:
	 * lower element = -(__A[0] * __B[0]) + __C[0] when bit 0 of __U is set.
	 * The negation is applied to __B, not __A: per the Intel definition the
	 * upper element of the result is copied from the first operand, so
	 * negating the whole __A vector would flip the sign of that copied lane.
	 * -(a*b) == a*(-b), so the lower element is unaffected. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
	{
	return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
	-(__v2df) __B,
	(__v2df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked scalar double-precision negated multiply-add with explicit
	 * rounding R.  B is negated instead of A because the first operand also
	 * supplies the upper result element (per the Intel definition); negating
	 * A would corrupt its sign.  The lower-element product is the same either
	 * way. */
	#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) __extension__ ({\
	(__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
	-(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(C), (__mmask8)(U), \
	(int)(R)); })

	/* mask3 negated multiply-add: first multiplicand (__W) negated, _mask3
	 * multiply-add builtin reused. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
	{
	return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) __W,
	(__v2df) __X,
	(__v2df) __Y,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) __extension__({\
	(__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
	(__v2df)(__m128d)(X), \
	(__v2df)(__m128d)(Y), (__mmask8)(U), \
	(int)(R)); })

	/* Negated multiply-subtract: both __A and the addend __B are negated. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W,
	-(__v2df) __A,
	-(__v2df) __B,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\
	(__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
	-(__v2df)(__m128d)(A), \
	-(__v2df)(__m128d)(B), (__mmask8)(U), \
	(int)(R)); })

	/* Zero-masked scalar double-precision negated multiply-subtract:
	 * lower element = -(__A[0] * __B[0]) - __C[0] when bit 0 of __U is set.
	 * The product is negated through __B rather than __A, because the first
	 * operand also provides the upper result element (per the Intel
	 * definition) and must reach the builtin unmodified. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
	{
	return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A,
	-(__v2df) __B,
	-(__v2df) __C,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked scalar double-precision negated multiply-subtract with
	 * explicit rounding.  R is forwarded as the rounding operand so the
	 * caller's mode is honoured (it must not be replaced by
	 * _MM_FROUND_CUR_DIRECTION).  The product is negated through B rather
	 * than A, since the first operand also supplies the upper result element
	 * per the Intel definition. */
	#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) __extension__ ({\
	(__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
	-(__v2df)(__m128d)(B), \
	-(__v2df)(__m128d)(C), \
	(__mmask8)(U), \
	(int)(R)); })

	/* mask3 negated multiply-subtract (double): calls the dedicated
	 * vfnmsubsd3_mask3 builtin with operands (__W, __X, __Y); no operand is
	 * negated here. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
	{
	return (__m128d) __builtin_ia32_vfnmsubsd3_mask3 ((__v2df) (__W),
	(__v2df) __X,
	(__v2df) (__Y),
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	/* Same operation with the caller's rounding argument R. */
	#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\
	(__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \
	(__v2df)(__m128d)(X), \
	(__v2df)(__m128d)(Y), \
	(__mmask8)(U), (int)(R)); })

	/* Immediate-controlled 64-bit element permutes.  C is decoded as four
	 * 2-bit selectors (bits 1:0, 3:2, 5:4, 7:6); the same selectors are used
	 * for elements 0-3 (base index 0) and elements 4-7 (base index 4), so each
	 * 256-bit half is permuted independently with the same pattern.  All
	 * shuffle indices are below 8, so only X is ever selected; the second
	 * (undefined) shuffle operand is never read.
	 * mask/maskz forms blend the permuted result with W or zero through the
	 * select builtins. */
	#define _mm512_permutex_pd(X, C) __extension__ ({ \
	(__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
	(__v8df)_mm512_undefined_pd(), \
	0 + (((C) >> 0) & 0x3), \
	0 + (((C) >> 2) & 0x3), \
	0 + (((C) >> 4) & 0x3), \
	0 + (((C) >> 6) & 0x3), \
	4 + (((C) >> 0) & 0x3), \
	4 + (((C) >> 2) & 0x3), \
	4 + (((C) >> 4) & 0x3), \
	4 + (((C) >> 6) & 0x3)); })

	#define _mm512_mask_permutex_pd(W, U, X, C) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_permutex_pd((X), (C)), \
	(__v8df)(__m512d)(W)); })

	#define _mm512_maskz_permutex_pd(U, X, C) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_permutex_pd((X), (C)), \
	(__v8df)_mm512_setzero_pd()); })

	/* Integer (64-bit element) version of the permute above. */
	#define _mm512_permutex_epi64(X, C) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v8di)(__m512i)(X), \
	(__v8di)_mm512_undefined_epi32(), \
	0 + (((C) >> 0) & 0x3), \
	0 + (((C) >> 2) & 0x3), \
	0 + (((C) >> 4) & 0x3), \
	0 + (((C) >> 6) & 0x3), \
	4 + (((C) >> 0) & 0x3), \
	4 + (((C) >> 2) & 0x3), \
	4 + (((C) >> 4) & 0x3), \
	4 + (((C) >> 6) & 0x3)); })

	#define _mm512_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
	(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	(__v8di)_mm512_permutex_epi64((X), (C)), \
	(__v8di)(__m512i)(W)); })

	#define _mm512_maskz_permutex_epi64(U, X, C) __extension__ ({ \
	(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	(__v8di)_mm512_permutex_epi64((X), (C)), \
	(__v8di)_mm512_setzero_si512()); })

	/* Variable-index permutes.  Every wrapper calls a
	 * __builtin_ia32_permvar*512_mask builtin with the operands
	 * (data __Y, index __X, pass-through, mask) -- note that the data vector
	 * is passed first although it is the last function parameter.
	 * Unmasked forms: undefined pass-through with an all-ones mask;
	 * mask forms: __W; maskz forms: an all-zero vector. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
	{
	return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
	(__v8di) __X,
	(__v8df) _mm512_undefined_pd (),
	(__mmask8) -1);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
	{
	return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
	(__v8di) __X,
	(__v8df) __W,
	(__mmask8) __U);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
	{
	return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
	(__v8di) __X,
	(__v8df) _mm512_setzero_pd (),
	(__mmask8) __U);
	}

	/* 64-bit integer element variants. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
	{
	return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
	(__v8di) __X,
	(__v8di) _mm512_setzero_si512 (),
	__M);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
	{
	return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
	(__v8di) __X,
	(__v8di) _mm512_undefined_epi32 (),
	(__mmask8) -1);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
	__m512i __Y)
	{
	return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
	(__v8di) __X,
	(__v8di) __W,
	__M);
	}

	/* Single-precision variants (16 lanes, 16-bit mask). */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
	{
	return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
	(__v16si) __X,
	(__v16sf) _mm512_undefined_ps (),
	(__mmask16) -1);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
	{
	return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
	(__v16si) __X,
	(__v16sf) __W,
	(__mmask16) __U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
	{
	return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
	(__v16si) __X,
	(__v16sf) _mm512_setzero_ps (),
	(__mmask16) __U);
	}

	/* 32-bit integer element variants. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
	{
	return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
	(__v16si) __X,
	(__v16si) _mm512_setzero_si512 (),
	__M);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
	{
	return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
	(__v16si) __X,
	(__v16si) _mm512_undefined_epi32 (),
	(__mmask16) -1);
	}

	/* Alias: the "permutevar" spelling maps to the permutexvar function. */
	#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
	__m512i __Y)
	{
	return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
	(__v16si) __X,
	(__v16si) __W,
	__M);
	}

	/* Alias for the merge-masked form. */
	#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32

	/* 16-bit mask-register operations; each is a direct call to the
	 * corresponding __builtin_ia32_k*hi builtin on two __mmask16 operands. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kand (__mmask16 __A, __mmask16 __B)
	{
	return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
	}

	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kandn (__mmask16 __A, __mmask16 __B)
	{
	return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
	}

	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kor (__mmask16 __A, __mmask16 __B)
	{
	return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
	}

	/* The kortest forms return an int flag rather than a mask. */
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_kortestc (__mmask16 __A, __mmask16 __B)
	{
	return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
	}

	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_kortestz (__mmask16 __A, __mmask16 __B)
	{
	return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
	}

	/* Combines two masks into one 16-bit mask.  This hunk replaces the
	 * open-coded expression (low byte of __A, __B shifted into the high byte)
	 * with a call to __builtin_ia32_kunpckhi on the same two operands. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
	{
	- return (__mmask16) (( __A & 0xFF) \| ( __B << 8));
	+ return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
	}

	/* 16-bit mask XNOR via __builtin_ia32_kxnorhi. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kxnor (__mmask16 __A, __mmask16 __B)
	{
	return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
	}

	/* 16-bit mask XOR via __builtin_ia32_kxorhi. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kxor (__mmask16 __A, __mmask16 __B)
	{
	return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
	}

	/* Non-temporal 512-bit stores and load.  Each function declares a local
	 * 64-byte-aligned vector typedef and accesses __P through it, so the
	 * pointer is treated as 64-byte aligned by the nontemporal builtin. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_stream_si512 (__m512i * __P, __m512i __A)
	{
	typedef __v8di __v8di_aligned __attribute__((aligned(64)));
	__builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
	}

	/* Non-temporal load counterpart of _mm512_stream_si512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_stream_load_si512 (void const *__P)
	{
	typedef __v8di __v8di_aligned __attribute__((aligned(64)));
	return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
	}

	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_stream_pd (double *__P, __m512d __A)
	{
	typedef __v8df __v8df_aligned __attribute__((aligned(64)));
	__builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
	}

	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_stream_ps (float *__P, __m512 __A)
	{
	typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
	__builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
	}

	/* Compress wrappers.  Each calls a __builtin_ia32_compress*512_mask
	 * builtin with (source __A, pass-through, mask).  mask forms pass __W as
	 * the pass-through; maskz forms pass an all-zero vector. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
	(__v8df) __W,
	(__mmask8) __U);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
	{
	return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
	(__v8df)
	_mm512_setzero_pd (),
	(__mmask8) __U);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
	{
	return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
	(__v8di) __W,
	(__mmask8) __U);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
	{
	return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
	(__v8di)
	_mm512_setzero_si512 (),
	(__mmask8) __U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
	(__v16sf) __W,
	(__mmask16) __U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
	{
	return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
	(__v16sf)
	_mm512_setzero_ps (),
	(__mmask16) __U);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
	{
	return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
	(__v16si) __W,
	(__mmask16) __U);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
	{
	return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
	(__v16si)
	_mm512_setzero_si512 (),
	(__mmask16) __U);
	}

	/* Scalar compare-to-mask macros.  Each forwards to
	 * __builtin_ia32_cmpss_mask / __builtin_ia32_cmpsd_mask with
	 * (X, Y, predicate P, input mask, rounding).  Unmasked forms pass an
	 * all-ones input mask, _mask_ forms pass M.  _round forms forward R; the
	 * others pass _MM_FROUND_CUR_DIRECTION.  The result is cast to
	 * __mmask8. */
	#define _mm_cmp_round_ss_mask(X, Y, P, R) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
	(__v4sf)(__m128)(Y), (int)(P), \
	(__mmask8)-1, (int)(R)); })

	#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
	(__v4sf)(__m128)(Y), (int)(P), \
	(__mmask8)(M), (int)(R)); })

	#define _mm_cmp_ss_mask(X, Y, P) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
	(__v4sf)(__m128)(Y), (int)(P), \
	(__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_cmp_ss_mask(M, X, Y, P) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
	(__v4sf)(__m128)(Y), (int)(P), \
	(__mmask8)(M), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Double-precision versions of the four macros above. */
	#define _mm_cmp_round_sd_mask(X, Y, P, R) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
	(__v2df)(__m128d)(Y), (int)(P), \
	(__mmask8)-1, (int)(R)); })

	#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
	(__v2df)(__m128d)(Y), (int)(P), \
	(__mmask8)(M), (int)(R)); })

	#define _mm_cmp_sd_mask(X, Y, P) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
	(__v2df)(__m128d)(Y), (int)(P), \
	(__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_cmp_sd_mask(M, X, Y, P) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
	(__v2df)(__m128d)(Y), (int)(P), \
	(__mmask8)(M), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Bit Test */

	/* Per-32-bit-lane bit test: a result bit is set when (__A & __B) is
	 * non-zero in the corresponding lane. */
	static __inline __mmask16 __DEFAULT_FN_ATTRS
	_mm512_test_epi32_mask (__m512i __A, __m512i __B)
	{
	__m512i __common = _mm512_and_epi32 (__A, __B);
	__m512i __zero = _mm512_setzero_epi32 ();
	return _mm512_cmpneq_epi32_mask (__common, __zero);
	}

	/* Masked per-32-bit-lane bit test: compares (__A & __B) against zero for
	 * inequality under input mask __U. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
	{
	__m512i __common = _mm512_and_epi32 (__A, __B);
	__m512i __zero = _mm512_setzero_epi32 ();
	return _mm512_mask_cmpneq_epi32_mask (__U, __common, __zero);
	}

	/* Per-64-bit-lane bit test: a result bit is set when (__A & __B) is
	 * non-zero in the corresponding lane.  The AND and the zero vector are
	 * built with the epi32 helpers; both are bitwise, so lane width does not
	 * matter there. */
	static __inline __mmask8 __DEFAULT_FN_ATTRS
	_mm512_test_epi64_mask (__m512i __A, __m512i __B)
	{
	__m512i __common = _mm512_and_epi32 (__A, __B);
	__m512i __zero = _mm512_setzero_epi32 ();
	return _mm512_cmpneq_epi64_mask (__common, __zero);
	}

	/* Masked per-64-bit-lane bit test: compares (__A & __B) against zero for
	 * inequality under input mask __U. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
	{
	__m512i __common = _mm512_and_epi32 (__A, __B);
	__m512i __zero = _mm512_setzero_epi32 ();
	return _mm512_mask_cmpneq_epi64_mask (__U, __common, __zero);
	}

	/* Per-32-bit-lane "no common bits" test: a result bit is set when
	 * (__A & __B) equals zero in the corresponding lane. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
	{
	__m512i __common = _mm512_and_epi32 (__A, __B);
	__m512i __zero = _mm512_setzero_epi32 ();
	return _mm512_cmpeq_epi32_mask (__common, __zero);
	}

	/* Masked per-32-bit-lane "no common bits" test: compares (__A & __B)
	 * against zero for equality under input mask __U. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
	{
	__m512i __common = _mm512_and_epi32 (__A, __B);
	__m512i __zero = _mm512_setzero_epi32 ();
	return _mm512_mask_cmpeq_epi32_mask (__U, __common, __zero);
	}

	/* Per-64-bit-lane "no common bits" test: a result bit is set when
	 * (__A & __B) equals zero in the corresponding lane. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
	{
	__m512i __common = _mm512_and_epi32 (__A, __B);
	__m512i __zero = _mm512_setzero_epi32 ();
	return _mm512_cmpeq_epi64_mask (__common, __zero);
	}

	/* Masked per-64-bit-lane "no common bits" test: compares (__A & __B)
	 * against zero for equality under input mask __U. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
	{
	__m512i __common = _mm512_and_epi32 (__A, __B);
	__m512i __zero = _mm512_setzero_epi32 ();
	return _mm512_mask_cmpeq_epi64_mask (__U, __common, __zero);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_movehdup_ps (__m512 __A)
	{
	return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
	1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
	(__v16sf)_mm512_movehdup_ps(__A),
	(__v16sf)__W);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
	{
	return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
	(__v16sf)_mm512_movehdup_ps(__A),
	(__v16sf)_mm512_setzero_ps());
	}

	// Builds a vector whose elements 2k and 2k+1 are both element 2k of __A
	// (shuffle indices 0,0,2,2,...,14,14).
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_moveldup_ps (__m512 __A)
	{
	  __v16sf __src = (__v16sf)__A;
	  return (__m512)__builtin_shufflevector(__src, __src,
	                         0, 0, 2, 2, 4, 4, 6, 6,
	                         8, 8, 10, 10, 12, 12, 14, 14);
	}

	// Merge-masked _mm512_moveldup_ps: selects between the shuffled __A and __W
	// under mask __U via __builtin_ia32_selectps_512.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __dup = (__v16sf)_mm512_moveldup_ps(__A);
	  __v16sf __passthru = (__v16sf)__W;
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __dup, __passthru);
	}

	// Zero-masked _mm512_moveldup_ps: selects between the shuffled __A and an
	// all-zero vector under mask __U via __builtin_ia32_selectps_512.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
	{
	  __v16sf __dup = (__v16sf)_mm512_moveldup_ps(__A);
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __dup, __zero);
	}

	// Returns __A with element 0 replaced: by __B[0] when bit 0 of __U is set,
	// otherwise by __W[0]. Elements 1..3 come from __A.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  __m128 __res = __A;
	  if (__U & 1)
	    __res[0] = __B[0];
	  else
	    __res[0] = __W[0];
	  return __res;
	}

	// Returns __A with element 0 replaced: by __B[0] when bit 0 of __U is set,
	// otherwise by zero. Elements 1..3 come from __A.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
	{
	  __m128 __res = __A;
	  if (__U & 1)
	    __res[0] = __B[0];
	  else
	    __res[0] = 0;
	  return __res;
	}

	// Returns __A with element 0 replaced: by __B[0] when bit 0 of __U is set,
	// otherwise by __W[0]. Element 1 comes from __A.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  __m128d __res = __A;
	  if (__U & 1)
	    __res[0] = __B[0];
	  else
	    __res[0] = __W[0];
	  return __res;
	}

	// Returns __A with element 0 replaced: by __B[0] when bit 0 of __U is set,
	// otherwise by zero. Element 1 comes from __A.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
	{
	  __m128d __res = __A;
	  if (__U & 1)
	    __res[0] = __B[0];
	  else
	    __res[0] = 0;
	  return __res;
	}

	// Masked scalar store: passes __W, __A widened to 512 bits, and bit 0 of __U
	// to __builtin_ia32_storess128_mask.
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
	{
	  __v16sf *__dst = (__v16sf *)__W;
	  __v16sf __wide = (__v16sf) _mm512_castps128_ps512(__A);
	  __builtin_ia32_storess128_mask (__dst, __wide,
	                                  (__mmask16) __U & (__mmask16)1);
	}

	// Masked scalar store: passes __W, __A widened to 512 bits, and bit 0 of __U
	// to __builtin_ia32_storesd128_mask.
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
	{
	  __v8df *__dst = (__v8df *)__W;
	  __v8df __wide = (__v8df) _mm512_castpd128_pd512(__A);
	  __builtin_ia32_storesd128_mask (__dst, __wide, (__mmask8) __U & 1);
	}

	// Masked scalar load. The pass-through operand is built from __W[0] followed
	// by three zeros, widened to 512 bits and handed to
	// __builtin_ia32_loadss128_mask together with bit 0 of __U; the low four
	// elements of the builtin's result are returned.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
	{
	  __m128 __src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
	                                                  (__v4sf) {0.0, 0.0, 0.0, 0.0},
	                                                  0, 4, 4, 4);
	  __m512 __wide = __builtin_ia32_loadss128_mask ((__v16sf *) __A,
	                                  (__v16sf) _mm512_castps128_ps512(__src),
	                                  (__mmask16) __U & 1);

	  return (__m128) __builtin_shufflevector(__wide, _mm512_undefined_ps(),
	                                          0, 1, 2, 3);
	}

	// Zero-masked scalar load: calls __builtin_ia32_loadss128_mask with an
	// all-zero pass-through and bit 0 of __U, then returns the low four elements
	// of the result.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_load_ss (__mmask8 __U, const float* __A)
	{
	  __m512 __wide = __builtin_ia32_loadss128_mask ((__v16sf *) __A,
	                                  (__v16sf) _mm512_setzero_ps(),
	                                  (__mmask16) __U & 1);
	  return (__m128) __builtin_shufflevector(__wide, _mm512_undefined_ps(),
	                                          0, 1, 2, 3);
	}

	// Masked scalar load. The pass-through operand is built from __W[0] followed
	// by a zero, widened to 512 bits and handed to
	// __builtin_ia32_loadsd128_mask together with bit 0 of __U; the low two
	// elements of the builtin's result are returned.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
	{
	  __m128d __src = (__v2df) __builtin_shufflevector((__v2df) __W,
	                                                   (__v2df) {0.0, 0.0}, 0, 2);
	  __m512d __wide = __builtin_ia32_loadsd128_mask ((__v8df *) __A,
	                                   (__v8df) _mm512_castpd128_pd512(__src),
	                                   (__mmask8) __U & 1);

	  return (__m128d) __builtin_shufflevector(__wide, _mm512_undefined_pd(),
	                                           0, 1);
	}

	// Zero-masked scalar load: calls __builtin_ia32_loadsd128_mask with an
	// all-zero pass-through and bit 0 of __U, then returns the low two elements
	// of the result.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_load_sd (__mmask8 __U, const double* __A)
	{
	  __m512d __wide = __builtin_ia32_loadsd128_mask ((__v8df *) __A,
	                                   (__v8df) _mm512_setzero_pd(),
	                                   (__mmask8) __U & 1);
	  return (__m128d) __builtin_shufflevector(__wide, _mm512_undefined_pd(),
	                                           0, 1);
	}

	// Shuffles the 32-bit integers of A within each group of four elements.
	// The four 2-bit fields of I (bits 1:0, 3:2, 5:4, 7:6) choose the source
	// element for result positions 0..3 of a group, and the same selection is
	// repeated with base offsets 0, 4, 8 and 12. I is expanded into
	// __builtin_shufflevector indices, so it has to be a constant expression.
	#define _mm512_shuffle_epi32(A, I) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
	(__v16si)_mm512_undefined_epi32(), \
	0 + (((I) >> 0) & 0x3), \
	0 + (((I) >> 2) & 0x3), \
	0 + (((I) >> 4) & 0x3), \
	0 + (((I) >> 6) & 0x3), \
	4 + (((I) >> 0) & 0x3), \
	4 + (((I) >> 2) & 0x3), \
	4 + (((I) >> 4) & 0x3), \
	4 + (((I) >> 6) & 0x3), \
	8 + (((I) >> 0) & 0x3), \
	8 + (((I) >> 2) & 0x3), \
	8 + (((I) >> 4) & 0x3), \
	8 + (((I) >> 6) & 0x3), \
	12 + (((I) >> 0) & 0x3), \
	12 + (((I) >> 2) & 0x3), \
	12 + (((I) >> 4) & 0x3), \
	12 + (((I) >> 6) & 0x3)); })

	// Merge-masked _mm512_shuffle_epi32: selects between the shuffled A and W
	// under mask U via __builtin_ia32_selectd_512.
	#define _mm512_mask_shuffle_epi32(W, U, A, I) __extension__ ({ \
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_shuffle_epi32((A), (I)), \
	(__v16si)(__m512i)(W)); })

	// Zero-masked _mm512_shuffle_epi32: selects between the shuffled A and an
	// all-zero vector under mask U via __builtin_ia32_selectd_512.
	#define _mm512_maskz_shuffle_epi32(U, A, I) __extension__ ({ \
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_shuffle_epi32((A), (I)), \
	(__v16si)_mm512_setzero_si512()); })

	// Merge-masked expand of doubles: forwards __A, __W and __U (in that order)
	// to __builtin_ia32_expanddf512_mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) __A;
	  __v8df __passthru = (__v8df) __W;
	  return (__m512d) __builtin_ia32_expanddf512_mask (__src, __passthru,
	                                                    (__mmask8) __U);
	}

	// Zero-masked expand of doubles: forwards __A, an all-zero vector and __U
	// to __builtin_ia32_expanddf512_mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) __A;
	  __v8df __zero = (__v8df) _mm512_setzero_pd ();
	  return (__m512d) __builtin_ia32_expanddf512_mask (__src, __zero,
	                                                    (__mmask8) __U);
	}

	// Merge-masked expand of 64-bit integers: forwards __A, __W and __U to
	// __builtin_ia32_expanddi512_mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
	{
	  __v8di __src = (__v8di) __A;
	  __v8di __passthru = (__v8di) __W;
	  return (__m512i) __builtin_ia32_expanddi512_mask (__src, __passthru,
	                                                    (__mmask8) __U);
	}

	// Zero-masked expand of 64-bit integers: forwards __A, an all-zero vector
	// and __U to __builtin_ia32_expanddi512_mask.
	// The zero operand comes from _mm512_setzero_si512() so that an integer
	// vector is cast to __v8di, instead of reinterpreting a vector of doubles.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
	{
	  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
	                                  (__v8di) _mm512_setzero_si512 (),
	                                  (__mmask8) __U);
	}

	// Merge-masked expanding load of doubles: forwards __P, __W and __U to
	// __builtin_ia32_expandloaddf512_mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
	{
	  const __v8df *__mem = (const __v8df *)__P;
	  __v8df __passthru = (__v8df) __W;
	  return (__m512d) __builtin_ia32_expandloaddf512_mask (__mem, __passthru,
	                                                        (__mmask8) __U);
	}

	// Zero-masked expanding load of doubles: forwards __P, an all-zero vector
	// and __U to __builtin_ia32_expandloaddf512_mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
	{
	  const __v8df *__mem = (const __v8df *)__P;
	  __v8df __zero = (__v8df) _mm512_setzero_pd();
	  return (__m512d) __builtin_ia32_expandloaddf512_mask (__mem, __zero,
	                                                        (__mmask8) __U);
	}

	// Merge-masked expanding load of 64-bit integers: forwards __P, __W and __U
	// to __builtin_ia32_expandloaddi512_mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
	{
	  const __v8di *__mem = (const __v8di *)__P;
	  __v8di __passthru = (__v8di) __W;
	  return (__m512i) __builtin_ia32_expandloaddi512_mask (__mem, __passthru,
	                                                        (__mmask8) __U);
	}

	// Zero-masked expanding load of 64-bit integers: forwards __P, an all-zero
	// vector and __U to __builtin_ia32_expandloaddi512_mask.
	// The zero operand comes from _mm512_setzero_si512() so that an integer
	// vector is cast to __v8di, instead of reinterpreting a vector of doubles.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
	{
	  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
	                                  (__v8di) _mm512_setzero_si512(),
	                                  (__mmask8) __U);
	}

	// Merge-masked expanding load of floats: forwards __P, __W and __U to
	// __builtin_ia32_expandloadsf512_mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
	{
	  const __v16sf *__mem = (const __v16sf *)__P;
	  __v16sf __passthru = (__v16sf) __W;
	  return (__m512) __builtin_ia32_expandloadsf512_mask (__mem, __passthru,
	                                                       (__mmask16) __U);
	}

	// Zero-masked expanding load of floats: forwards __P, an all-zero vector
	// and __U to __builtin_ia32_expandloadsf512_mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
	{
	  const __v16sf *__mem = (const __v16sf *)__P;
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps();
	  return (__m512) __builtin_ia32_expandloadsf512_mask (__mem, __zero,
	                                                       (__mmask16) __U);
	}

	// Merge-masked expanding load of 32-bit integers: forwards __P, __W and __U
	// to __builtin_ia32_expandloadsi512_mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
	{
	  const __v16si *__mem = (const __v16si *)__P;
	  __v16si __passthru = (__v16si) __W;
	  return (__m512i) __builtin_ia32_expandloadsi512_mask (__mem, __passthru,
	                                                        (__mmask16) __U);
	}

	// Zero-masked expanding load of 32-bit integers: forwards __P, an all-zero
	// vector and __U to __builtin_ia32_expandloadsi512_mask.
	// The zero operand comes from _mm512_setzero_si512() so that an integer
	// vector is cast to __v16si, instead of reinterpreting a vector of floats.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
	{
	  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
	                                  (__v16si) _mm512_setzero_si512(),
	                                  (__mmask16) __U);
	}

	// Merge-masked expand of floats: forwards __A, __W and __U to
	// __builtin_ia32_expandsf512_mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) __A;
	  __v16sf __passthru = (__v16sf) __W;
	  return (__m512) __builtin_ia32_expandsf512_mask (__src, __passthru,
	                                                   (__mmask16) __U);
	}

	// Zero-masked expand of floats: forwards __A, an all-zero vector and __U to
	// __builtin_ia32_expandsf512_mask.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) __A;
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps();
	  return (__m512) __builtin_ia32_expandsf512_mask (__src, __zero,
	                                                   (__mmask16) __U);
	}

	// Merge-masked expand of 32-bit integers: forwards __A, __W and __U to
	// __builtin_ia32_expandsi512_mask.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
	{
	  __v16si __src = (__v16si) __A;
	  __v16si __passthru = (__v16si) __W;
	  return (__m512i) __builtin_ia32_expandsi512_mask (__src, __passthru,
	                                                    (__mmask16) __U);
	}

	// Zero-masked expand of 32-bit integers: forwards __A, an all-zero vector
	// and __U to __builtin_ia32_expandsi512_mask.
	// The zero operand comes from _mm512_setzero_si512() so that an integer
	// vector is cast to __v16si, instead of reinterpreting a vector of floats.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
	{
	  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
	                                  (__v16si) _mm512_setzero_si512(),
	                                  (__mmask16) __U);
	}

	// Converts the eight floats of A to doubles through
	// __builtin_ia32_cvtps2pd512_mask with an all-ones mask, an undefined
	// pass-through vector and R as the final (int) operand.
	#define _mm512_cvt_roundps_pd(A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
	(__v8df)_mm512_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	// Merge-masked form of _mm512_cvt_roundps_pd: W is the pass-through vector
	// and U the mask given to __builtin_ia32_cvtps2pd512_mask.
	#define _mm512_mask_cvt_roundps_pd(W, U, A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
	(__v8df)(__m512d)(W), \
	(__mmask8)(U), (int)(R)); })

	// Zero-masked form of _mm512_cvt_roundps_pd: an all-zero vector is the
	// pass-through and U the mask given to __builtin_ia32_cvtps2pd512_mask.
	#define _mm512_maskz_cvt_roundps_pd(U, A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	// Converts the eight floats of __A to doubles through
	// __builtin_ia32_cvtps2pd512_mask with an all-ones mask, an undefined
	// pass-through vector and _MM_FROUND_CUR_DIRECTION.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_cvtps_pd (__m256 __A)
	{
	  __v8sf __src = (__v8sf) __A;
	  __v8df __passthru = (__v8df) _mm512_undefined_pd ();
	  return (__m512d) __builtin_ia32_cvtps2pd512_mask (__src, __passthru,
	                                                    (__mmask8) -1,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	// Merge-masked float-to-double conversion: forwards __A, pass-through __W,
	// mask __U and _MM_FROUND_CUR_DIRECTION to __builtin_ia32_cvtps2pd512_mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
	{
	  __v8sf __src = (__v8sf) __A;
	  __v8df __passthru = (__v8df) __W;
	  return (__m512d) __builtin_ia32_cvtps2pd512_mask (__src, __passthru,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	// Zero-masked float-to-double conversion: forwards __A, an all-zero
	// pass-through, mask __U and _MM_FROUND_CUR_DIRECTION to
	// __builtin_ia32_cvtps2pd512_mask.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
	{
	  __v8sf __src = (__v8sf) __A;
	  __v8df __zero = (__v8df) _mm512_setzero_pd ();
	  return (__m512d) __builtin_ia32_cvtps2pd512_mask (__src, __zero,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_cvtpslo_pd (__m512 __A)
	{
	return (__m512) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
	{
	return (__m512) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
	}

	// Selects between __A and __W under mask __U via
	// __builtin_ia32_selectpd_512.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) __A;
	  __v8df __passthru = (__v8df) __W;
	  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, __src,
	                                                __passthru);
	}

	// Selects between __A and an all-zero vector under mask __U via
	// __builtin_ia32_selectpd_512.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) __A;
	  __v8df __zero = (__v8df) _mm512_setzero_pd ();
	  return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, __src,
	                                                __zero);
	}

	// Selects between __A and __W under mask __U via
	// __builtin_ia32_selectps_512.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) __A;
	  __v16sf __passthru = (__v16sf) __W;
	  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, __src,
	                                               __passthru);
	}

	// Selects between __A and an all-zero vector under mask __U via
	// __builtin_ia32_selectps_512.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) __A;
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps ();
	  return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, __src,
	                                               __zero);
	}

	// Compress-store of doubles: forwards destination __P, source __A and mask
	// __U to __builtin_ia32_compressstoredf512_mask.
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
	{
	  __v8df *__dst = (__v8df *) __P;
	  __v8df __src = (__v8df) __A;
	  __builtin_ia32_compressstoredf512_mask (__dst, __src, (__mmask8) __U);
	}

	// Compress-store of 64-bit integers: forwards destination __P, source __A
	// and mask __U to __builtin_ia32_compressstoredi512_mask.
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
	{
	  __v8di *__dst = (__v8di *) __P;
	  __v8di __src = (__v8di) __A;
	  __builtin_ia32_compressstoredi512_mask (__dst, __src, (__mmask8) __U);
	}

	// Compress-store of floats: forwards destination __P, source __A and mask
	// __U to __builtin_ia32_compressstoresf512_mask.
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
	{
	  __v16sf *__dst = (__v16sf *) __P;
	  __v16sf __src = (__v16sf) __A;
	  __builtin_ia32_compressstoresf512_mask (__dst, __src, (__mmask16) __U);
	}

	// Compress-store of 32-bit integers: forwards destination __P, source __A
	// and mask __U to __builtin_ia32_compressstoresi512_mask.
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
	{
	  __v16si *__dst = (__v16si *) __P;
	  __v16si __src = (__v16si) __A;
	  __builtin_ia32_compressstoresi512_mask (__dst, __src, (__mmask16) __U);
	}

	// Scalar double-to-float conversion: passes A, B, an undefined pass-through,
	// an all-ones mask and R to __builtin_ia32_cvtsd2ss_round_mask.
	#define _mm_cvt_roundsd_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
	(__v2df)(__m128d)(B), \
	(__v4sf)_mm_undefined_ps(), \
	(__mmask8)-1, (int)(R)); })

	// Merge-masked form of _mm_cvt_roundsd_ss: W is the pass-through vector and
	// U the mask given to __builtin_ia32_cvtsd2ss_round_mask.
	#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
	(__v2df)(__m128d)(B), \
	(__v4sf)(__m128)(W), \
	(__mmask8)(U), (int)(R)); })

	// Zero-masked form of _mm_cvt_roundsd_ss: an all-zero vector is the
	// pass-through and U the mask given to __builtin_ia32_cvtsd2ss_round_mask.
	#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
	(__v2df)(__m128d)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	// Merge-masked scalar double-to-float conversion: forwards __A, __B,
	// pass-through __W, mask __U and _MM_FROUND_CUR_DIRECTION to
	// __builtin_ia32_cvtsd2ss_round_mask.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
	{
	  __v4sf __upper = (__v4sf)(__A);
	  __v2df __src = (__v2df)(__B);
	  __v4sf __passthru = (__v4sf)(__W);
	  return __builtin_ia32_cvtsd2ss_round_mask (__upper, __src, __passthru,
	                                             (__mmask8)(__U),
	                                             _MM_FROUND_CUR_DIRECTION);
	}

	// Zero-masked scalar double-to-float conversion: forwards __A, __B, an
	// all-zero pass-through, mask __U and _MM_FROUND_CUR_DIRECTION to
	// __builtin_ia32_cvtsd2ss_round_mask.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
	{
	  __v4sf __upper = (__v4sf)(__A);
	  __v2df __src = (__v2df)(__B);
	  __v4sf __zero = (__v4sf)_mm_setzero_ps();
	  return __builtin_ia32_cvtsd2ss_round_mask (__upper, __src, __zero,
	                                             (__mmask8)(__U),
	                                             _MM_FROUND_CUR_DIRECTION);
	}

	// Alternate spellings: each *_i32 / *_i64 name is defined as an alias of the
	// corresponding *_si32 / *_si64 intrinsic. The 64-bit aliases are only
	// defined when __x86_64__ is defined.
	#define _mm_cvtss_i32 _mm_cvtss_si32
	#define _mm_cvtsd_i32 _mm_cvtsd_si32
	#define _mm_cvti32_sd _mm_cvtsi32_sd
	#define _mm_cvti32_ss _mm_cvtsi32_ss
	#ifdef __x86_64__
	#define _mm_cvtss_i64 _mm_cvtss_si64
	#define _mm_cvtsd_i64 _mm_cvtsd_si64
	#define _mm_cvti64_sd _mm_cvtsi64_sd
	#define _mm_cvti64_ss _mm_cvtsi64_ss
	#endif

	// 64-bit only. Both spellings expand to the same call: A, B cast to
	// long long, and R cast to int are passed to __builtin_ia32_cvtsi2sd64.
	#ifdef __x86_64__
	#define _mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
	(int)(R)); })

	#define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
	(int)(R)); })
	#endif

	// Passes A, B cast to int, and R cast to int to __builtin_ia32_cvtsi2ss32.
	#define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })

	// Alternate spelling with the same expansion as _mm_cvt_roundsi32_ss.
	#define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })

	// 64-bit only. Both spellings expand to the same call: A, B cast to
	// long long, and R cast to int are passed to __builtin_ia32_cvtsi2ss64.
	#ifdef __x86_64__
	#define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
	(int)(R)); })

	#define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
	(int)(R)); })
	#endif

	// Scalar float-to-double conversion: passes A, B, an undefined pass-through,
	// an all-ones mask and R to __builtin_ia32_cvtss2sd_round_mask.
	#define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
	(__v4sf)(__m128)(B), \
	(__v2df)_mm_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	// Merge-masked form of _mm_cvt_roundss_sd: W is the pass-through vector and
	// U the mask given to __builtin_ia32_cvtss2sd_round_mask.
	#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
	(__v4sf)(__m128)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	// Zero-masked form of _mm_cvt_roundss_sd: an all-zero vector is the
	// pass-through and U the mask given to __builtin_ia32_cvtss2sd_round_mask.
	#define _mm_maskz_cvt_roundss_sd(U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
	(__v4sf)(__m128)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	// Merge-masked scalar float-to-double conversion: forwards __A, __B,
	// pass-through __W, mask __U and _MM_FROUND_CUR_DIRECTION to
	// __builtin_ia32_cvtss2sd_round_mask.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
	{
	  __v2df __upper = (__v2df)(__A);
	  __v4sf __src = (__v4sf)(__B);
	  __v2df __passthru = (__v2df)(__W);
	  return __builtin_ia32_cvtss2sd_round_mask(__upper, __src, __passthru,
	                                            (__mmask8)(__U),
	                                            _MM_FROUND_CUR_DIRECTION);
	}

	// Zero-masked scalar float-to-double conversion: forwards __A, __B, an
	// all-zero pass-through, mask __U and _MM_FROUND_CUR_DIRECTION to
	// __builtin_ia32_cvtss2sd_round_mask.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
	{
	  __v2df __upper = (__v2df)(__A);
	  __v4sf __src = (__v4sf)(__B);
	  __v2df __zero = (__v2df)_mm_setzero_pd();
	  return __builtin_ia32_cvtss2sd_round_mask(__upper, __src, __zero,
	                                            (__mmask8)(__U),
	                                            _MM_FROUND_CUR_DIRECTION);
	}

	// Unsigned 32-bit integer to double conversion: forwards __A and __B to
	// __builtin_ia32_cvtusi2sd32.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_cvtu32_sd (__m128d __A, unsigned __B)
	{
	  __v2df __base = (__v2df) __A;
	  return (__m128d) __builtin_ia32_cvtusi2sd32 (__base, __B);
	}

	#ifdef __x86_64__
	// 64-bit only. Passes A, B cast to unsigned long long, and R cast to int to
	// __builtin_ia32_cvtusi2sd64.
	#define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
	(unsigned long long)(B), (int)(R)); })

	// Function form of the conversion above, using _MM_FROUND_CUR_DIRECTION as
	// the final operand.
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
	{
	  __v2df __base = (__v2df) __A;
	  return (__m128d) __builtin_ia32_cvtusi2sd64 (__base, __B,
	                                               _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	// Passes A, B cast to unsigned int, and R cast to int to
	// __builtin_ia32_cvtusi2ss32.
	#define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
	(int)(R)); })

	// Unsigned 32-bit integer to float conversion: forwards __A, __B and
	// _MM_FROUND_CUR_DIRECTION to __builtin_ia32_cvtusi2ss32.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_cvtu32_ss (__m128 __A, unsigned __B)
	{
	  __v4sf __base = (__v4sf) __A;
	  return (__m128) __builtin_ia32_cvtusi2ss32 (__base, __B,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	// 64-bit only. Passes A, B cast to unsigned long long, and R cast to int to
	// __builtin_ia32_cvtusi2ss64.
	#define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
	(unsigned long long)(B), (int)(R)); })

	// Function form of the conversion above, using _MM_FROUND_CUR_DIRECTION as
	// the final operand.
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
	{
	  __v4sf __base = (__v4sf) __A;
	  return (__m128) __builtin_ia32_cvtusi2ss64 (__base, __B,
	                                              _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	// Selects, under mask __M, between a vector with every 32-bit element set to
	// __A and the vector __O, via __builtin_ia32_selectd_512.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
	{
	  __v16si __splat = (__v16si) _mm512_set1_epi32(__A);
	  __v16si __passthru = (__v16si) __O;
	  return (__m512i) __builtin_ia32_selectd_512(__M, __splat, __passthru);
	}

	#ifdef __x86_64__
	// 64-bit only. Selects, under mask __M, between a vector with every 64-bit
	// element set to __A and the vector __O, via __builtin_ia32_selectq_512.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
	{
	  __v8di __splat = (__v8di) _mm512_set1_epi64(__A);
	  __v8di __passthru = (__v8di) __O;
	  return (__m512i) __builtin_ia32_selectq_512(__M, __splat, __passthru);
	}
	#endif

	// Builds a 512-bit vector of 64 chars. Arguments are listed from the highest
	// element (__e63) down to the lowest (__e0); __e0 becomes element 0.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
	char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
	char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
	char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
	char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
	char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
	char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
	char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
	char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
	char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
	char __e4, char __e3, char __e2, char __e1, char __e0) {
	  __v64qi __elems = {
	    __e0,  __e1,  __e2,  __e3,  __e4,  __e5,  __e6,  __e7,
	    __e8,  __e9,  __e10, __e11, __e12, __e13, __e14, __e15,
	    __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
	    __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
	    __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
	    __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
	    __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
	    __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63
	  };
	  return (__m512i)__elems;
	}

	// Builds a 512-bit vector of 32 shorts. Arguments are listed from the
	// highest element (__e31) down to the lowest (__e0); __e0 becomes element 0.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
	short __e27, short __e26, short __e25, short __e24, short __e23,
	short __e22, short __e21, short __e20, short __e19, short __e18,
	short __e17, short __e16, short __e15, short __e14, short __e13,
	short __e12, short __e11, short __e10, short __e9, short __e8,
	short __e7, short __e6, short __e5, short __e4, short __e3,
	short __e2, short __e1, short __e0) {
	  __v32hi __elems = {
	    __e0,  __e1,  __e2,  __e3,  __e4,  __e5,  __e6,  __e7,
	    __e8,  __e9,  __e10, __e11, __e12, __e13, __e14, __e15,
	    __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
	    __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31
	  };
	  return (__m512i)__elems;
	}

	// Builds a 512-bit vector of 16 ints. The last argument (__P) becomes
	// element 0 and the first argument (__A) becomes element 15.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set_epi32 (int __A, int __B, int __C, int __D,
	int __E, int __F, int __G, int __H,
	int __I, int __J, int __K, int __L,
	int __M, int __N, int __O, int __P)
	{
	  __v16si __elems = {
	    __P, __O, __N, __M, __L, __K, __J, __I,
	    __H, __G, __F, __E, __D, __C, __B, __A
	  };
	  return (__m512i)__elems;
	}

	// Reversed-order form of _mm512_set_epi32: forwards its sixteen arguments to
	// _mm512_set_epi32 in the opposite order (e15 first, e0 last).
	#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \
	e8,e9,e10,e11,e12,e13,e14,e15) \
	_mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
	(e5),(e4),(e3),(e2),(e1),(e0))

	// Builds a 512-bit vector of 8 long longs. The last argument (__H) becomes
	// element 0 and the first argument (__A) becomes element 7.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_set_epi64 (long long __A, long long __B, long long __C,
	long long __D, long long __E, long long __F,
	long long __G, long long __H)
	{
	  __v8di __elems = { __H, __G, __F, __E, __D, __C, __B, __A };
	  return (__m512i) __elems;
	}

	// Reversed-order form of _mm512_set_epi64: forwards its eight arguments to
	// _mm512_set_epi64 in the opposite order (e7 first, e0 last).
	#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \
	_mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

	// Builds a 512-bit vector of 8 doubles. The last argument (__H) becomes
	// element 0 and the first argument (__A) becomes element 7.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_set_pd (double __A, double __B, double __C, double __D,
	double __E, double __F, double __G, double __H)
	{
	  __m512d __elems = { __H, __G, __F, __E, __D, __C, __B, __A };
	  return __elems;
	}

	// Reversed-order form of _mm512_set_pd: forwards its eight arguments to
	// _mm512_set_pd in the opposite order (e7 first, e0 last).
	#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \
	_mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

	// Builds a 512-bit vector of 16 floats. The last argument (__P) becomes
	// element 0 and the first argument (__A) becomes element 15.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_set_ps (float __A, float __B, float __C, float __D,
	float __E, float __F, float __G, float __H,
	float __I, float __J, float __K, float __L,
	float __M, float __N, float __O, float __P)
	{
	  __m512 __elems = {
	    __P, __O, __N, __M, __L, __K, __J, __I,
	    __H, __G, __F, __E, __D, __C, __B, __A
	  };
	  return __elems;
	}

	// Reversed-order form of _mm512_set_ps: forwards its sixteen arguments to
	// _mm512_set_ps in the opposite order (e15 first, e0 last).
	#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
	_mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
	(e4),(e3),(e2),(e1),(e0))

	// Absolute value of 16 floats: ANDs the bit pattern of __A with 0x7FFFFFFF
	// in every 32-bit lane, which clears each lane's top (sign) bit.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_abs_ps(__m512 __A)
	{
	  __m512i __keep = _mm512_set1_epi32(0x7FFFFFFF);
	  return (__m512)_mm512_and_epi32(__keep, (__m512i)__A);
	}

	// Merge-masked _mm512_abs_ps: calls _mm512_mask_and_epi32 with pass-through
	// __W, mask __K, the constant 0x7FFFFFFF in every lane, and the bits of __A.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
	{
	  __m512i __keep = _mm512_set1_epi32(0x7FFFFFFF);
	  return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, __keep,
	                                       (__m512i)__A);
	}

	// Absolute value of 8 doubles: ANDs the bit pattern of __A with
	// 0x7FFFFFFFFFFFFFFF in every 64-bit lane, which clears each lane's top
	// (sign) bit.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_abs_pd(__m512d __A)
	{
	  __m512i __keep = _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF);
	  return (__m512d)_mm512_and_epi64(__keep, (__v8di)__A);
	}

	// Merge-masked _mm512_abs_pd: calls _mm512_mask_and_epi64 with pass-through
	// __W, mask __K, the constant 0x7FFFFFFFFFFFFFFF in every lane, and the bits
	// of __A.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
	{
	  __m512i __keep = _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF);
	  return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, __keep,
	                                        (__v8di)__A);
	}

	// Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
	// outputs. This class of vector operation forms the basis of many scientific
	// computations. In vector-reduction arithmetic, the evaluation of the reduction
	// is independent of the order of the input elements of the vector V.

	// The bisection method is used: at each step, the vector from the previous
	// step is split in half and the operation is performed on its two halves.
	// This takes log2(n) steps, where n is the number of elements in the vector.

	// Vec512 - Vector with size 512.
	// Operator - Can be one of the following: +,*,&,|
	// T2 - Can get 'i' for int and 'f' for float.
	// T1 - Can get 'i' for int and 'd' for double.

	// Reduces the eight 64-bit elements of Vec512 with Operator in three halving
	// steps: elements 0..3 are combined with 4..7 into Vec256, then 0..1 with
	// 2..3 into Vec128, then element 0 with element 1. The expansion ends with
	// `return Vec128[0];`, so the macro is only usable as the body of a function
	// that returns the reduced scalar. Vec512 is expanded several times.
	#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1) \
	__extension__({ \
	__m256##T1 Vec256 = __builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	0, 1, 2, 3) \
	Operator \
	__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	4, 5, 6, 7); \
	__m128##T1 Vec128 = __builtin_shufflevector( \
	(__v4d##T2)Vec256, \
	(__v4d##T2)Vec256, \
	0, 1) \
	Operator \
	__builtin_shufflevector( \
	(__v4d##T2)Vec256, \
	(__v4d##T2)Vec256, \
	2, 3); \
	Vec128 = __builtin_shufflevector((__v2d##T2)Vec128, \
	(__v2d##T2)Vec128, 0, -1) \
	Operator \
	__builtin_shufflevector((__v2d##T2)Vec128, \
	(__v2d##T2)Vec128, 1, -1); \
	return Vec128[0]; \
	})

	// Returns the sum of the eight 64-bit integer elements of __W. The return
	// statement is supplied by the _mm512_reduce_operator_64bit expansion.
	static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) {
	_mm512_reduce_operator_64bit(__W, +, i, i);
	}

	// Returns the product of the eight 64-bit integer elements of __W. The
	// return statement is supplied by the _mm512_reduce_operator_64bit expansion.
	static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) {
	_mm512_reduce_operator_64bit(__W, *, i, i);
	}

	// Returns the bitwise AND of the eight 64-bit integer elements of __W. The
	// return statement is supplied by the _mm512_reduce_operator_64bit expansion.
	static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) {
	_mm512_reduce_operator_64bit(__W, &, i, i);
	}

	// Returns the bitwise OR of the eight 64-bit integer elements of __W. The
	// return statement is supplied by the _mm512_reduce_operator_64bit expansion.
	// The operator argument is a plain `|`; a backslash-escaped pipe is not a
	// valid C token.
	static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) {
	_mm512_reduce_operator_64bit(__W, |, i, i);
	}

	// Returns the sum of the eight double elements of __W, added pairwise by
	// halves as laid out in _mm512_reduce_operator_64bit (which also supplies
	// the return statement).
	static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) {
	_mm512_reduce_operator_64bit(__W, +, f, d);
	}

	// Returns the product of the eight double elements of __W, multiplied
	// pairwise by halves as laid out in _mm512_reduce_operator_64bit (which also
	// supplies the return statement).
	static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
	_mm512_reduce_operator_64bit(__W, *, f, d);
	}

	// Vec512 - Vector with size 512.
	// Vec512Neutral - All vector elements set to the identity element.
	// Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0}
	// Operator - Can be one of the following: +,*,&,|
	// Mask - Intrinsic Mask
	// T2 - Can get 'i' for int and 'f' for float.
	// T1 - Can get 'i' for int and 'd' for packed double-precision.
	// T3 - Can be 'pd' for packed double or 'q' for q-word.

	// Masked 64-bit reduction. First overwrites Vec512 with the result of
	// __builtin_ia32_select<T3>_512(Mask, Vec512, Vec512Neutral), then expands
	// _mm512_reduce_operator_64bit, which ends in a return statement. Vec512
	// must therefore be a modifiable lvalue, and the macro is only usable as
	// the body of a function that returns the reduced scalar.
	#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator, \
	Mask, T2, T1, T3) \
	__extension__({ \
	Vec512 = __builtin_ia32_select##T3##_512( \
	(__mmask8)Mask, \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512Neutral); \
	_mm512_reduce_operator_64bit(Vec512, Operator, T2, T1); \
	})

	// Masked sum of the 64-bit integer elements of __W: the select under __M
	// uses an all-0 vector (the additive identity) as its alternative operand
	// before the reduction. The macro expansion supplies the return statement.
	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
	_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q);
	}

	// Masked product of the 64-bit integer elements of __W: the select under __M
	// uses an all-1 vector (the multiplicative identity) as its alternative
	// operand before the reduction. The macro expansion supplies the return
	// statement.
	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
	_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q);
	}

	// Masked bitwise AND of the 64-bit integer elements of __W: the select under
	// __M uses an all-ones vector (0xFFFFFFFFFFFFFFFF per element, the AND
	// identity) as its alternative operand before the reduction. The macro
	// expansion supplies the return statement.
	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
	_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
	&, __M, i, i, q);
	}

	// Masked bitwise OR of the 64-bit integer elements of __W: the select under
	// __M uses an all-0 vector (the OR identity) as its alternative operand
	// before the reduction. The macro expansion supplies the return statement.
	// The operator argument is a plain `|`; a backslash-escaped pipe is not a
	// valid C token.
	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
	_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M,
	i, i, q);
	}

	// Masked sum of the double elements of __W: the select under __M uses an
	// all-0 vector (the additive identity) as its alternative operand before
	// the reduction. The macro expansion supplies the return statement.
	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
	_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
	f, d, pd);
	}

	// Masked product of the double elements of __W: the select under __M uses an
	// all-1 vector (the multiplicative identity) as its alternative operand
	// before the reduction. The macro expansion supplies the return statement.
	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
	_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M,
	f, d, pd);
	}

	// Vec512 - Vector with size 512.
	// Operator - Can be one of the following: +,*,&,|
	// T2 - Can get 'i' for int and 'f' for float. [__v16s{i|f}]
	// T1 - Can get 'i' for int and ' ' (empty) for packed single. [__m256{i|}]

	// Reduces the sixteen 32-bit elements of Vec512 with Operator in four
	// halving steps: elements 0..7 with 8..15 into Vec256, 0..3 with 4..7 into
	// Vec128, then 0..1 with 2..3, then element 0 with element 1. The expansion
	// ends with `return Vec128[0];`, so the macro is only usable as the body of
	// a function that returns the reduced scalar. Vec512 is expanded several
	// times.
	#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \
	__m256##T1 Vec256 = \
	(__m256##T1)(__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	0, 1, 2, 3, 4, 5, 6, 7) \
	Operator \
	__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	8, 9, 10, 11, 12, 13, 14, 15)); \
	__m128##T1 Vec128 = \
	(__m128##T1)(__builtin_shufflevector( \
	(__v8s##T2)Vec256, \
	(__v8s##T2)Vec256, \
	0, 1, 2, 3) \
	Operator \
	__builtin_shufflevector( \
	(__v8s##T2)Vec256, \
	(__v8s##T2)Vec256, \
	4, 5, 6, 7)); \
	Vec128 = (__m128##T1)(__builtin_shufflevector( \
	(__v4s##T2)Vec128, \
	(__v4s##T2)Vec128, \
	0, 1, -1, -1) \
	Operator \
	__builtin_shufflevector( \
	(__v4s##T2)Vec128, \
	(__v4s##T2)Vec128, \
	2, 3, -1, -1)); \
	Vec128 = (__m128##T1)(__builtin_shufflevector( \
	(__v4s##T2)Vec128, \
	(__v4s##T2)Vec128, \
	0, -1, -1, -1) \
	Operator \
	__builtin_shufflevector( \
	(__v4s##T2)Vec128, \
	(__v4s##T2)Vec128, \
	1, -1, -1, -1)); \
	return Vec128[0]; \
	})

	// Returns the sum of the sixteen 32-bit integer elements of __W. The return
	// statement is supplied by the _mm512_reduce_operator_32bit expansion.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_reduce_add_epi32(__m512i __W) {
	_mm512_reduce_operator_32bit(__W, +, i, i);
	}

	// Returns the product of the sixteen 32-bit integer elements of __W. The
	// return statement is supplied by the _mm512_reduce_operator_32bit expansion.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_reduce_mul_epi32(__m512i __W) {
	_mm512_reduce_operator_32bit(__W, *, i, i);
	}

	// Returns the bitwise AND of the sixteen 32-bit integer elements of __W. The
	// return statement is supplied by the _mm512_reduce_operator_32bit expansion.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_reduce_and_epi32(__m512i __W) {
	_mm512_reduce_operator_32bit(__W, &, i, i);
	}

	// Returns the bitwise OR of the sixteen 32-bit integer elements of __W. The
	// return statement is supplied by the _mm512_reduce_operator_32bit expansion.
	// The operator argument is a plain `|`; a backslash-escaped pipe is not a
	// valid C token.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_reduce_or_epi32(__m512i __W) {
	_mm512_reduce_operator_32bit(__W, |, i, i);
	}

	// Returns the sum of the sixteen float elements of __W, added pairwise by
	// halves as laid out in _mm512_reduce_operator_32bit (which also supplies
	// the return statement). The empty last macro argument selects the
	// __m256 / __m128 float vector types.
	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_reduce_add_ps(__m512 __W) {
	_mm512_reduce_operator_32bit(__W, +, f, );
	}

	// Returns the product of the sixteen float elements of __W, multiplied
	// pairwise by halves as laid out in _mm512_reduce_operator_32bit (which also
	// supplies the return statement). The empty last macro argument selects the
	// __m256 / __m128 float vector types.
	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_reduce_mul_ps(__m512 __W) {
	_mm512_reduce_operator_32bit(__W, *, f, );
	}

	// Vec512 - Vector with size 512.
	// Vec512Neutral - All vector elements set to the identity element.
	// Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0}
	// Operator - Can be one of the following: +,*,&,|
	// Mask - Intrinsic Mask
	// T2 - Can get 'i' for int and 'f' for float.
	// T1 - Can get 'i' for int and ' ' (empty) for packed single.
	// T3 - Can be 'ps' for packed single or 'd' for d-word.

	// Masked 32-bit reduction. First overwrites Vec512 with the result of
	// __builtin_ia32_select<T3>_512(Mask, Vec512, Vec512Neutral), then expands
	// _mm512_reduce_operator_32bit, which ends in a return statement. Vec512
	// must therefore be a modifiable lvalue, and the macro is only usable as
	// the body of a function that returns the reduced scalar.
	#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator, \
	Mask, T2, T1, T3) \
	__extension__({ \
	Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
	(__mmask16)Mask, \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512Neutral); \
	_mm512_reduce_operator_32bit(Vec512, Operator, T2, T1); \
	})

	// Masked sum of the 32-bit integer elements of __W: the select under __M
	// uses an all-0 vector (the additive identity) as its alternative operand
	// before the reduction. The macro expansion supplies the return statement.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
	_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d);
	}

	// Masked product of the 32-bit integer elements of __W: the select under __M
	// uses an all-1 vector (the multiplicative identity) as its alternative
	// operand before the reduction. The macro expansion supplies the return
	// statement.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
	_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d);
	}

	// Masked bitwise AND of the 32-bit integer elements of __W: the select under
	// __M uses a vector built from 0xFFFFFFFF per element (the AND identity) as
	// its alternative operand before the reduction. The macro expansion supplies
	// the return statement.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
	_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
	i, i, d);
	}

	// Masked bitwise OR of the 32-bit integer elements of __W: the select under
	// __M uses an all-0 vector (the OR identity) as its alternative operand
	// before the reduction. The macro expansion supplies the return statement.
	// The operator argument is a plain `|`; a backslash-escaped pipe is not a
	// valid C token.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
	_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d);
	}

	// Masked sum of the float elements of __W: the select under __M uses an
	// all-0 vector (the additive identity) as its alternative operand before
	// the reduction. The macro expansion supplies the return statement.
	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
	_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps);
	}

	// Masked product of the float elements of __W: the select under __M uses an
	// all-1 vector (the multiplicative identity) as its alternative operand
	// before the reduction. The macro expansion supplies the return statement.
	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
	_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps);
	}

	// The bisection method is used: at each step, the vector from the previous
	// step is split in half and the operation is performed on its two halves.
	// This takes log2(n) steps, where n is the number of elements in the vector.
	// This macro uses only intrinsics from the AVX512F feature.

	// Vec512 - Vector with size of 512.
	// IntrinName - Can be one of the following: {max|min}_{epi64|epu64|pd}, for
	// example: _mm512_max_epi64
	// T1 - Can get 'i' for int and 'd' for double. [__m512{i|d}]
	// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]

	// Max/min reduction over the eight 64-bit elements of Vec512 using
	// _mm512_<IntrinName>. In three steps Vec512 is overwritten with the
	// intrinsic applied to two shuffles of itself: elements 0..3 against 4..7,
	// then 0..1 against 2..3, then 0 against 1 (the remaining shuffle indices
	// are -1). The expansion ends with `return Vec512[0];`, so Vec512 must be a
	// modifiable lvalue and the macro is only usable as the body of a function
	// that returns the reduced scalar.
	#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	0, 1, 2, 3, -1, -1, -1, -1), \
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	4, 5, 6, 7, -1, -1, -1, -1)); \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	0, 1, -1, -1, -1, -1, -1, -1),\
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	2, 3, -1, -1, -1, -1, -1, \
	-1)); \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	0, -1, -1, -1, -1, -1, -1, -1),\
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	1, -1, -1, -1, -1, -1, -1, -1))\
	; \
	return Vec512[0]; \
	})

	// Reduces the eight 64-bit elements of __V with _mm512_max_epi64 and returns
	// element 0 of the result. The return statement is supplied by the
	// _mm512_reduce_maxMin_64bit expansion.
	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_reduce_max_epi64(__m512i __V) {
	_mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
	}

	// Reduces the eight 64-bit elements of __V with _mm512_max_epu64 and returns
	// element 0 of the result as unsigned long long. The return statement is
	// supplied by the _mm512_reduce_maxMin_64bit expansion.
	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm512_reduce_max_epu64(__m512i __V) {
	_mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
	}

	// Reduces the eight double elements of __V with _mm512_max_pd and returns
	// element 0 of the result. The return statement is supplied by the
	// _mm512_reduce_maxMin_64bit expansion.
	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_reduce_max_pd(__m512d __V) {
	_mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
	}

	// Reduces the eight 64-bit elements of __V with _mm512_min_epi64 and returns
	// element 0 of the result. The return statement is supplied by the
	// _mm512_reduce_maxMin_64bit expansion.
	static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64
	(__m512i __V) {
	_mm512_reduce_maxMin_64bit(__V, min_epi64, i, i);
	}

	// Reduces the eight 64-bit elements of __V with _mm512_min_epu64 and returns
	// element 0 of the result as unsigned long long. The return statement is
	// supplied by the _mm512_reduce_maxMin_64bit expansion.
	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm512_reduce_min_epu64(__m512i __V) {
	_mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
	}

	// Reduces the eight double elements of __V with _mm512_min_pd and returns
	// element 0 of the result. The return statement is supplied by the
	// _mm512_reduce_maxMin_64bit expansion.
	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_reduce_min_pd(__m512d __V) {
	_mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
	}

	// Vec512 - Vector with size 512.
	// Vec512Neutral - A 512 length vector with elements set to the identity element
	// Identity element: {max_epi,0x8000000000000000}
	// {max_epu,0x0000000000000000}
	// {max_pd, 0xFFF0000000000000}
	// {min_epi,0x7FFFFFFFFFFFFFFF}
	// {min_epu,0xFFFFFFFFFFFFFFFF}
	// {min_pd, 0x7FF0000000000000}
	//
	// IntrinName - Can be one of the following: {max|min}_{epi64|epu64|pd}, for
	// example: _mm512_max_epi64
	// T1 - Can get 'i' for int and 'd' for double. [__m512{i|d}]
	// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]
	// T3 - Can get 'q' for q-word and 'pd' for packed double.
	// [__builtin_ia32_select{q|pd}_512]
	// Mask - Intrinsic Mask

	// Masked 64-bit max/min reduction. First overwrites Vec512 with the result
	// of __builtin_ia32_select<T3>_512(Mask, Vec512, Vec512Neutral), then
	// expands _mm512_reduce_maxMin_64bit, which ends in a return statement.
	// Vec512 must therefore be a modifiable lvalue, and the macro is only
	// usable as the body of a function that returns the reduced scalar.
	#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \
	T2, T3, Mask) \
	__extension__({ \
	Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
	(__mmask8)Mask, \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512Neutral); \
	_mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \
	})

	// Masked variant: __V is blended with a vector of 0x8000000000000000 under
	// mask __M (via __builtin_ia32_selectq_512) and then reduced with
	// _mm512_max_epi64. The return statement comes from the macro expansion.
	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000),
	max_epi64, i, i, q, __M);
	}

	// Masked variant: __V is blended with an all-zero vector under mask __M
	// (via __builtin_ia32_selectq_512) and then reduced with _mm512_max_epu64.
	// The return statement comes from the macro expansion.
	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000),
	max_epu64, i, i, q, __M);
	}

	// Masked variant: __V is blended with a vector of negative infinity under
	// mask __M (via __builtin_ia32_selectpd_512) and then reduced with
	// _mm512_max_pd. The return statement comes from the macro expansion.
	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, -_mm512_set1_pd(__builtin_inf()),
	max_pd, d, f, pd, __M);
	}

	// Masked variant: __V is blended with a vector of 0x7FFFFFFFFFFFFFFF under
	// mask __M (via __builtin_ia32_selectq_512) and then reduced with
	// _mm512_min_epi64. The return statement comes from the macro expansion.
	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
	min_epi64, i, i, q, __M);
	}

	// Masked variant: __V is blended with an all-ones vector under mask __M
	// (via __builtin_ia32_selectq_512) and then reduced with _mm512_min_epu64.
	// The return statement comes from the macro expansion.
	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
	min_epu64, i, i, q, __M);
	}

	// Masked variant: __V is blended with a vector of positive infinity under
	// mask __M (via __builtin_ia32_selectpd_512) and then reduced with
	// _mm512_min_pd. The return statement comes from the macro expansion.
	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()),
	min_pd, d, f, pd, __M);
	}

	// 32-bit max/min reduction helper.
	//
	// Vec512 - Vector with size 512. It is assigned to inside the macro, so it
	// must be a modifiable lvalue.
	// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
	// _mm512_max_epi32
	// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
	// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]
	//
	// The macro folds the vector in four steps: lanes 0-7 against lanes 8-15,
	// then 0-3 against 4-7, then 0-1 against 2-3, then 0 against 1. Shuffle
	// indices of -1 leave the remaining result lanes unspecified; only lane 0 is
	// read at the end. The expansion ends with `return Vec512[0];`, so it must be
	// the last statement of a function returning the element type.

	#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	0, 1, 2, 3, 4, 5, 6, 7, \
	-1, -1, -1, -1, -1, -1, -1, -1), \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	8, 9, 10, 11, 12, 13, 14, 15, \
	-1, -1, -1, -1, -1, -1, -1, -1)); \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	0, 1, 2, 3, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1), \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	4, 5, 6, 7, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1)); \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	0, 1, -1, -1, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1), \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	2, 3, -1, -1, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1)); \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	0, -1, -1, -1, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1), \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	1, -1, -1, -1, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1)); \
	return Vec512[0]; \
	})

	// Expands _mm512_reduce_maxMin_32bit with _mm512_max_epi32 as the combining
	// step; the `return Vec512[0];` comes from the macro expansion.
	static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i a) {
	_mm512_reduce_maxMin_32bit(a, max_epi32, i, i);
	}

	// Expands _mm512_reduce_maxMin_32bit with _mm512_max_epu32 as the combining
	// step; the `return Vec512[0];` comes from the macro expansion.
	static __inline__ unsigned int __DEFAULT_FN_ATTRS
	_mm512_reduce_max_epu32(__m512i a) {
	_mm512_reduce_maxMin_32bit(a, max_epu32, i, i);
	}

	// Expands _mm512_reduce_maxMin_32bit with _mm512_max_ps as the combining
	// step. T1 is intentionally empty so the cast type is plain __m512; T2 = f
	// selects __v16sf. The macro supplies the return.
	static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 a) {
	_mm512_reduce_maxMin_32bit(a, max_ps, , f);
	}

	// Expands _mm512_reduce_maxMin_32bit with _mm512_min_epi32 as the combining
	// step; the `return Vec512[0];` comes from the macro expansion.
	static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i a) {
	_mm512_reduce_maxMin_32bit(a, min_epi32, i, i);
	}

	// Expands _mm512_reduce_maxMin_32bit with _mm512_min_epu32 as the combining
	// step; the `return Vec512[0];` comes from the macro expansion.
	static __inline__ unsigned int __DEFAULT_FN_ATTRS
	_mm512_reduce_min_epu32(__m512i a) {
	_mm512_reduce_maxMin_32bit(a, min_epu32, i, i);
	}

	// Expands _mm512_reduce_maxMin_32bit with _mm512_min_ps as the combining
	// step. T1 is intentionally empty so the cast type is plain __m512; T2 = f
	// selects __v16sf. The macro supplies the return.
	static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 a) {
	_mm512_reduce_maxMin_32bit(a, min_ps, , f);
	}

	// Masked 32-bit max/min reduction helper.
	//
	// Vec512 - Vector with size 512. It is assigned to inside the macro, so it
	// must be a modifiable lvalue.
	// Vec512Neutral - A 512 length vector with elements set to the identity element
	// Identity element: {max_epi,0x80000000}
	// {max_epu,0x00000000}
	// {max_ps, 0xFF800000}
	// {min_epi,0x7FFFFFFF}
	// {min_epu,0xFFFFFFFF}
	// {min_ps, 0x7F800000}
	//
	// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
	// _mm512_max_epi32
	// T1 - Can get 'i' for int and ' ' .[__m512{i|}]
	// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]
	// T3 - Can get 'd' for double word and 'ps' for packed single.
	// [__builtin_ia32_select{d|ps}_512]
	// Mask - Intrinsic Mask
	//
	// The macro first blends Vec512 with Vec512Neutral through the select builtin
	// (presumably keeping Vec512 lanes whose mask bit is set -- confirm against
	// the builtin's definition) and then expands _mm512_reduce_maxMin_32bit,
	// which contains the `return` of the enclosing function.

	#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \
	T2, T3, Mask) \
	__extension__({ \
	Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
	(__mmask16)Mask, \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512Neutral); \
	_mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \
	})

	// Masked variant: __V is blended with a vector of 0x80000000 under mask __M
	// (via __builtin_ia32_selectd_512) and then reduced with _mm512_max_epi32.
	// The return statement comes from the macro expansion.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32,
	i, i, d, __M);
	}

	// Masked variant: __V is blended with an all-zero vector under mask __M
	// (via __builtin_ia32_selectd_512) and then reduced with _mm512_max_epu32.
	// The return statement comes from the macro expansion.
	static __inline__ unsigned int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32,
	i, i, d, __M);
	}

	// Masked variant: __V is blended with a vector of negative infinity under
	// mask __M (via __builtin_ia32_selectps_512) and then reduced with
	// _mm512_max_ps. The return statement comes from the macro expansion.
	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
	_mm512_mask_reduce_maxMin_32bit(__V,-_mm512_set1_ps(__builtin_inff()), max_ps, , f,
	ps, __M);
	}

	// Masked variant: __V is blended with a vector of 0x7FFFFFFF under mask __M
	// (via __builtin_ia32_selectd_512) and then reduced with _mm512_min_epi32.
	// The return statement comes from the macro expansion.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32,
	i, i, d, __M);
	}

	// Masked variant: __V is blended with an all-ones vector under mask __M
	// (via __builtin_ia32_selectd_512) and then reduced with _mm512_min_epu32.
	// The return statement comes from the macro expansion.
	static __inline__ unsigned int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32,
	i, i, d, __M);
	}

	// Masked variant: __V is blended with a vector of positive infinity under
	// mask __M (via __builtin_ia32_selectps_512) and then reduced with
	// _mm512_min_ps. The return statement comes from the macro expansion.
	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
	_mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()), min_ps, , f,
	ps, __M);
	}

	#undef __DEFAULT_FN_ATTRS

	#endif // __AVX512FINTRIN_H
	Index: head/contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp (revision 329410)
	@@ -1,1728 +1,1734 @@
	//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the NumericLiteralParser, CharLiteralParser, and
	// StringLiteralParser interfaces.
	//
	//===----------------------------------------------------------------------===//

	#include "clang/Lex/LiteralSupport.h"
	#include "clang/Basic/CharInfo.h"
	#include "clang/Basic/LangOptions.h"
	#include "clang/Basic/SourceLocation.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Lex/LexDiagnostic.h"
	#include "clang/Lex/Lexer.h"
	#include "clang/Lex/Preprocessor.h"
	#include "clang/Lex/Token.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Support/ConvertUTF.h"
	#include "llvm/Support/ErrorHandling.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <cstring>
	#include <string>

	using namespace clang;

	/// Map a character- or string-literal token kind to the bit width of one
	/// code unit of that literal on \p Target.
	///
	/// Any token kind that is not one of the ten literal kinds below is a
	/// programming error and hits llvm_unreachable.
	static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
	  // Ordinary and UTF-8 literals share the plain `char` width.
	  if (kind == tok::char_constant || kind == tok::string_literal ||
	      kind == tok::utf8_char_constant || kind == tok::utf8_string_literal)
	    return Target.getCharWidth();

	  // L'x' and L"..." use wchar_t.
	  if (kind == tok::wide_char_constant || kind == tok::wide_string_literal)
	    return Target.getWCharWidth();

	  // u'x' and u"..." use char16_t.
	  if (kind == tok::utf16_char_constant || kind == tok::utf16_string_literal)
	    return Target.getChar16Width();

	  // U'x' and U"..." use char32_t.
	  if (kind == tok::utf32_char_constant || kind == tok::utf32_string_literal)
	    return Target.getChar32Width();

	  llvm_unreachable("Unknown token type!");
	}

	/// Build the character range [TokRangeBegin, TokRangeEnd) as source
	/// locations, where both pointers index into a spelling buffer that starts at
	/// \p TokBegin for the token located at \p TokLoc.
	static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
	                                           FullSourceLoc TokLoc,
	                                           const char *TokBegin,
	                                           const char *TokRangeBegin,
	                                           const char *TokRangeEnd) {
	  const auto &SrcMgr = TokLoc.getManager();

	  // Offset of the range start from the token start, then the range length.
	  const auto StartOffset = TokRangeBegin - TokBegin;
	  const auto RangeLength = TokRangeEnd - TokRangeBegin;

	  SourceLocation RangeStart =
	      Lexer::AdvanceToTokenCharacter(TokLoc, StartOffset, SrcMgr, Features);
	  // The end is found by advancing from the already-computed start.
	  SourceLocation RangeStop =
	      Lexer::AdvanceToTokenCharacter(RangeStart, RangeLength, SrcMgr, Features);

	  return CharSourceRange::getCharRange(RangeStart, RangeStop);
	}

	/// \brief Produce a diagnostic highlighting some portion of a literal.
	///
	/// Emits the diagnostic \p DiagID, highlighting the range of characters from
	/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
	/// a substring of a spelling buffer for the token beginning at \p TokBegin.
	///
	/// \param Diags  Engine the diagnostic is reported through. It is dereferenced
	///               unconditionally, so callers must check for null first.
	/// \param TokLoc Location of the first character of the token.
	/// \returns the in-flight DiagnosticBuilder so callers can stream arguments.
	static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
	                              const LangOptions &Features, FullSourceLoc TokLoc,
	                              const char *TokBegin, const char *TokRangeBegin,
	                              const char *TokRangeEnd, unsigned DiagID) {
	  // The diagnostic itself is anchored at the first highlighted character.
	  SourceLocation Begin =
	      Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
	                                     TokLoc.getManager(), Features);
	  return Diags->Report(Begin, DiagID) <<
	      MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
	}

	/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
	/// either a character or a string literal.
	///
	/// On entry \p ThisTokBuf points at the backslash; on return it points just
	/// past the last character consumed by the escape.
	///
	/// \param HadError  Set to true when a hex escape has no digits. Other
	///                  problems (overflow, unknown escapes) only emit
	///                  diagnostics and leave it untouched.
	/// \param CharWidth Width in bits of the destination character; values that
	///                  do not fit are truncated to this width.
	/// \param Diags     May be null, in which case no diagnostics are emitted.
	/// \returns the value of the escaped character.
	static unsigned ProcessCharEscape(const char *ThisTokBegin,
	                                  const char *&ThisTokBuf,
	                                  const char *ThisTokEnd, bool &HadError,
	                                  FullSourceLoc Loc, unsigned CharWidth,
	                                  DiagnosticsEngine *Diags,
	                                  const LangOptions &Features) {
	  const char *EscapeBegin = ThisTokBuf;

	  // Skip the '\' char.
	  ++ThisTokBuf;

	  // We know that this character can't be off the end of the buffer, because
	  // that would have been \", which would not have been the end of string.
	  unsigned ResultChar = *ThisTokBuf++;
	  switch (ResultChar) {
	  // These map to themselves.
	  case '\\': case '\'': case '"': case '?': break;

	    // These have fixed mappings.
	  case 'a':
	    // TODO: K&R: the meaning of '\\a' is different in traditional C
	    ResultChar = 7;
	    break;
	  case 'b':
	    ResultChar = 8;
	    break;
	  case 'e':
	    if (Diags)
	      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
	           diag::ext_nonstandard_escape) << "e";
	    ResultChar = 27;
	    break;
	  case 'E':
	    if (Diags)
	      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
	           diag::ext_nonstandard_escape) << "E";
	    ResultChar = 27;
	    break;
	  case 'f':
	    ResultChar = 12;
	    break;
	  case 'n':
	    ResultChar = 10;
	    break;
	  case 'r':
	    ResultChar = 13;
	    break;
	  case 't':
	    ResultChar = 9;
	    break;
	  case 'v':
	    ResultChar = 11;
	    break;
	  case 'x': { // Hex escape.
	    ResultChar = 0;
	    if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
	      if (Diags)
	        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
	             diag::err_hex_escape_no_digits) << "x";
	      HadError = true;
	      break;
	    }

	    // Hex escapes are a maximal series of hex digits.
	    bool Overflow = false;
	    for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
	      int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
	      if (CharVal == -1) break;
	      // About to shift out a digit?
	      if (ResultChar & 0xF0000000)
	        Overflow = true;
	      ResultChar <<= 4;
	      ResultChar |= CharVal;
	    }

	    // See if any bits will be truncated when evaluated as a character.
	    // The CharWidth != 32 guard also avoids shifting a 32-bit value by 32.
	    if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
	      Overflow = true;
	      ResultChar &= ~0U >> (32-CharWidth);
	    }

	    // Check for overflow.
	    if (Overflow && Diags)   // Too many digits to fit in
	      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
	           diag::err_escape_too_large) << 0;
	    break;
	  }
	  case '0': case '1': case '2': case '3':
	  case '4': case '5': case '6': case '7': {
	    // Octal escapes.
	    // Back up so the loop below also consumes the first digit.
	    --ThisTokBuf;
	    ResultChar = 0;

	    // Octal escapes are a series of octal digits with maximum length 3.
	    // "\0123" is a two digit sequence equal to "\012" "3".
	    unsigned NumDigits = 0;
	    do {
	      ResultChar <<= 3;
	      ResultChar |= *ThisTokBuf++ - '0';
	      ++NumDigits;
	    } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
	             ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');

	    // Check for overflow.  Reject '\777', but not L'\777'.
	    if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
	      if (Diags)
	        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
	             diag::err_escape_too_large) << 1;
	      ResultChar &= ~0U >> (32-CharWidth);
	    }
	    break;
	  }

	    // Otherwise, these are not valid escapes.
	  case '(': case '{': case '[': case '%':
	    // GCC accepts these as extensions.  We warn about them as such though.
	    if (Diags)
	      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
	           diag::ext_nonstandard_escape)
	        << std::string(1, ResultChar);
	    break;
	  default:
	    if (!Diags)
	      break;

	    // Unknown escape: the character is kept as-is; non-printable characters
	    // are shown in hex in the diagnostic.
	    if (isPrintable(ResultChar))
	      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
	           diag::ext_unknown_escape)
	        << std::string(1, ResultChar);
	    else
	      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
	           diag::ext_unknown_escape)
	        << "x" + llvm::utohexstr(ResultChar);
	    break;
	  }

	  return ResultChar;
	}

	/// Append the UTF-8 encoding of \p Codepoint to \p Str.
	///
	/// The conversion is expected to succeed; failure is only checked by an
	/// assertion.
	static void appendCodePoint(unsigned Codepoint,
	                            llvm::SmallVectorImpl<char> &Str) {
	  char Utf8Bytes[4];
	  // ConvertCodePointToUTF8 advances this cursor past the bytes it wrote.
	  char *Cursor = Utf8Bytes;
	  bool Converted = llvm::ConvertCodePointToUTF8(Codepoint, Cursor);
	  (void)Converted; // Only read by the assert below.
	  assert(Converted && "Unexpected conversion failure");
	  Str.append(Utf8Bytes, Cursor);
	}

	/// Copy \p Input into \p Buf, replacing every \\uXXXX / \\UXXXXXXXX escape
	/// with the UTF-8 encoding of the code point it names.
	///
	/// The input is assumed to be well formed: every backslash must introduce a
	/// 'u' or 'U' escape followed by the full number of hex digits. Violations
	/// are only caught by assertions.
	void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
	  for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
	    if (*I != '\\') {
	      Buf.push_back(*I);
	      continue;
	    }

	    ++I;
	    assert(*I == 'u' || *I == 'U');

	    unsigned NumHexDigits;
	    if (*I == 'u')
	      NumHexDigits = 4;
	    else
	      NumHexDigits = 8;

	    assert(I + NumHexDigits <= E);

	    uint32_t CodePoint = 0;
	    for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
	      unsigned Value = llvm::hexDigitValue(*I);
	      assert(Value != -1U);

	      CodePoint <<= 4;
	      CodePoint += Value;
	    }

	    appendCodePoint(CodePoint, Buf);
	    // Step back onto the last hex digit; the outer loop's ++I moves past it.
	    --I;
	  }
	}

	/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
	/// return the UTF32.
	///
	/// On entry \p ThisTokBuf points at the backslash of the \\u / \\U escape; it
	/// is advanced past the characters consumed.
	///
	/// \param UcnVal Receives the code point. Digits are shifted into the
	///               existing value, so callers pass it in as 0.
	/// \param UcnLen Receives the expected digit count (4 for 'u', 8 for 'U').
	/// \param Diags  May be null, in which case no diagnostics are emitted.
	/// \param in_char_string_literal Whether the UCN appears in a character or
	///               string literal; together with C++11 this downgrades the
	///               control-character / basic-character errors to warnings.
	/// \returns true if the UCN is valid, false otherwise.
	static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
	                             const char *ThisTokEnd,
	                             uint32_t &UcnVal, unsigned short &UcnLen,
	                             FullSourceLoc Loc, DiagnosticsEngine *Diags,
	                             const LangOptions &Features,
	                             bool in_char_string_literal = false) {
	  const char *UcnBegin = ThisTokBuf;

	  // Skip the '\u' char's.
	  ThisTokBuf += 2;

	  if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
	    if (Diags)
	      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
	           diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
	    return false;
	  }
	  // ThisTokBuf[-1] is the 'u' or 'U' that selected the escape form.
	  UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
	  unsigned short UcnLenSave = UcnLen;
	  for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
	    int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
	    if (CharVal == -1) break;
	    UcnVal <<= 4;
	    UcnVal |= CharVal;
	  }
	  // If we didn't consume the proper number of digits, there is a problem.
	  if (UcnLenSave) {
	    if (Diags)
	      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
	           diag::err_ucn_escape_incomplete);
	    return false;
	  }

	  // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
	  if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
	      UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
	    if (Diags)
	      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
	           diag::err_ucn_escape_invalid);
	    return false;
	  }

	  // C++11 allows UCNs that refer to control characters and basic source
	  // characters inside character and string literals
	  if (UcnVal < 0xa0 &&
	      (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
	    bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
	    if (Diags) {
	      char BasicSCSChar = UcnVal;
	      if (UcnVal >= 0x20 && UcnVal < 0x7f)
	        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
	             IsError ? diag::err_ucn_escape_basic_scs :
	                       diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
	            << StringRef(&BasicSCSChar, 1);
	      else
	        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
	             IsError ? diag::err_ucn_control_character :
	                       diag::warn_cxx98_compat_literal_ucn_control_character);
	    }
	    if (IsError)
	      return false;
	  }

	  if (!Features.CPlusPlus && !Features.C99 && Diags)
	    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
	         diag::warn_ucn_not_valid_in_c89_literal);

	  return true;
	}

	/// MeasureUCNEscape - Determine the number of bytes within the resulting string
	/// which this UCN will occupy.
	///
	/// For 4-byte characters the answer is returned without parsing, so
	/// \p ThisTokBuf is left untouched in that case. Otherwise the UCN is parsed
	/// with diagnostics disabled and \p ThisTokBuf is advanced by the parse.
	///
	/// \param HadError Set to true when the UCN fails validation.
	/// \returns the byte count, or 0 when the UCN is invalid.
	static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
	                            const char *ThisTokEnd, unsigned CharByteWidth,
	                            const LangOptions &Features, bool &HadError) {
	  // UTF-32: 4 bytes per escape.
	  if (CharByteWidth == 4)
	    return 4;

	  uint32_t UcnVal = 0;
	  unsigned short UcnLen = 0;
	  FullSourceLoc Loc;

	  // A null DiagnosticsEngine makes this a silent validation pass.
	  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
	                        UcnLen, Loc, nullptr, Features, true)) {
	    HadError = true;
	    return 0;
	  }

	  // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
	  if (CharByteWidth == 2)
	    return UcnVal <= 0xFFFF ? 2 : 4;

	  // UTF-8.
	  if (UcnVal < 0x80)
	    return 1;
	  if (UcnVal < 0x800)
	    return 2;
	  if (UcnVal < 0x10000)
	    return 3;
	  return 4;
	}

	/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
	/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
	/// StringLiteralParser. When we decide to implement UCN's for identifiers,
	/// we will likely rework our support for UCN's.
	static void EncodeUCNEscape(const char ThisTokBegin, const char &ThisTokBuf,
	const char *ThisTokEnd,
	char *&ResultBuf, bool &HadError,
	FullSourceLoc Loc, unsigned CharByteWidth,
	DiagnosticsEngine *Diags,
	const LangOptions &Features) {
	typedef uint32_t UTF32;
	UTF32 UcnVal = 0;
	unsigned short UcnLen = 0;
	if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
	Loc, Diags, Features, true)) {
	HadError = true;
	return;
	}

	assert((CharByteWidth == 1 \|\| CharByteWidth == 2 \|\| CharByteWidth == 4) &&
	"only character widths of 1, 2, or 4 bytes supported");

	(void)UcnLen;
	assert((UcnLen== 4 \|\| UcnLen== 8) && "only ucn length of 4 or 8 supported");

	if (CharByteWidth == 4) {
	// FIXME: Make the type of the result buffer correct instead of
	// using reinterpret_cast.
	llvm::UTF32 ResultPtr = reinterpret_cast<llvm::UTF32>(ResultBuf);
	*ResultPtr = UcnVal;
	ResultBuf += 4;
	return;
	}

	if (CharByteWidth == 2) {
	// FIXME: Make the type of the result buffer correct instead of
	// using reinterpret_cast.
	llvm::UTF16 ResultPtr = reinterpret_cast<llvm::UTF16>(ResultBuf);

	if (UcnVal <= (UTF32)0xFFFF) {
	*ResultPtr = UcnVal;
	ResultBuf += 2;
	return;
	}

	// Convert to UTF16.
	UcnVal -= 0x10000;
	*ResultPtr = 0xD800 + (UcnVal >> 10);
	*(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
	ResultBuf += 4;
	return;
	}

	assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");

	// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
	// The conversion below was inspired by:
	// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
	// First, we determine how many bytes the result will require.
	typedef uint8_t UTF8;

	unsigned short bytesToWrite = 0;
	if (UcnVal < (UTF32)0x80)
	bytesToWrite = 1;
	else if (UcnVal < (UTF32)0x800)
	bytesToWrite = 2;
	else if (UcnVal < (UTF32)0x10000)
	bytesToWrite = 3;
	else
	bytesToWrite = 4;

	const unsigned byteMask = 0xBF;
	const unsigned byteMark = 0x80;

	// Once the bits are split out into bytes of UTF8, this is a mask OR-ed
	// into the first byte, depending on how many bytes follow.
	static const UTF8 firstByteMark[5] = {
	0x00, 0x00, 0xC0, 0xE0, 0xF0
	};
	// Finally, we write the bytes into ResultBuf.
	ResultBuf += bytesToWrite;
	switch (bytesToWrite) { // note: everything falls through.
	case 4:
	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= 6;
	LLVM_FALLTHROUGH;
	case 3:
	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= 6;
	LLVM_FALLTHROUGH;
	case 2:
	*--ResultBuf = (UTF8)((UcnVal \| byteMark) & byteMask); UcnVal >>= 6;
	LLVM_FALLTHROUGH;
	case 1:
	*--ResultBuf = (UTF8) (UcnVal \| firstByteMark[bytesToWrite]);
	}
	// Update the buffer.
	ResultBuf += bytesToWrite;
	}

	/// integer-constant: [C99 6.4.4.1]
	/// decimal-constant integer-suffix
	/// octal-constant integer-suffix
	/// hexadecimal-constant integer-suffix
	/// binary-literal integer-suffix [GNU, C++1y]
	/// user-defined-integer-literal: [C++11 lex.ext]
	/// decimal-literal ud-suffix
	/// octal-literal ud-suffix
	/// hexadecimal-literal ud-suffix
	/// binary-literal ud-suffix [GNU, C++1y]
	/// decimal-constant:
	/// nonzero-digit
	/// decimal-constant digit
	/// octal-constant:
	/// 0
	/// octal-constant octal-digit
	/// hexadecimal-constant:
	/// hexadecimal-prefix hexadecimal-digit
	/// hexadecimal-constant hexadecimal-digit
	/// hexadecimal-prefix: one of
	/// 0x 0X
	/// binary-literal:
	/// 0b binary-digit
	/// 0B binary-digit
	/// binary-literal binary-digit
	/// integer-suffix:
	/// unsigned-suffix [long-suffix]
	/// unsigned-suffix [long-long-suffix]
	/// long-suffix [unsigned-suffix]
	/// long-long-suffix [unsigned-suffix]
	/// nonzero-digit:
	/// 1 2 3 4 5 6 7 8 9
	/// octal-digit:
	/// 0 1 2 3 4 5 6 7
	/// hexadecimal-digit:
	/// 0 1 2 3 4 5 6 7 8 9
	/// a b c d e f
	/// A B C D E F
	/// binary-digit:
	/// 0
	/// 1
	/// unsigned-suffix: one of
	/// u U
	/// long-suffix: one of
	/// l L
	/// long-long-suffix: one of
	/// ll LL
	///
	/// floating-constant: [C99 6.4.4.2]
	/// TODO: add rules...
	///
	/// Parse the spelling of a pp-number token into its digits and suffix.
	///
	/// \param TokSpelling The token text; the character just past its end must be
	///                    readable and must not be a pp-number body character.
	/// \param TokLoc      Location of the token, used for diagnostics.
	/// \param PP          Supplies language options and the diagnostic sink.
	///
	/// On return, hadError reports whether the literal was malformed, and
	/// saw_ud_suffix reports whether the suffix was accepted as a ud-suffix.
	NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
	                                           SourceLocation TokLoc,
	                                           Preprocessor &PP)
	  : PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {

	  // This routine assumes that the range begin/end matches the regex for integer
	  // and FP constants (specifically, the 'pp-number' regex), and assumes that
	  // the byte at "*end" is both valid and not part of the regex.  Because of
	  // this, it doesn't have to check for 'overscan' in various places.
	  assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?");

	  s = DigitsBegin = ThisTokBegin;
	  saw_exponent = false;
	  saw_period = false;
	  saw_ud_suffix = false;
	  isLong = false;
	  isUnsigned = false;
	  isLongLong = false;
	  isHalf = false;
	  isFloat = false;
	  isImaginary = false;
	  isFloat16 = false;
	  isFloat128 = false;
	  MicrosoftInteger = 0;
	  hadError = false;

	  if (*s == '0') { // parse radix
	    ParseNumberStartingWithZero(TokLoc);
	    if (hadError)
	      return;
	  } else { // the first digit is non-zero
	    radix = 10;
	    s = SkipDigits(s);
	    if (s == ThisTokEnd) {
	      // Done.
	    } else {
	      ParseDecimalOrOctalCommon(TokLoc);
	      if (hadError)
	        return;
	    }
	  }

	  SuffixBegin = s;
	  checkSeparator(TokLoc, s, CSK_AfterDigits);

	  // Parse the suffix.  At this point we can classify whether we have an FP or
	  // integer constant.
	  bool isFPConstant = isFloatingLiteral();

	  // Loop over all of the characters of the suffix.  If we see something bad,
	  // we break out of the loop.
	  for (; s != ThisTokEnd; ++s) {
	    switch (*s) {
	    case 'h':      // FP Suffix for "half".
	    case 'H':
	      // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
	      if (!PP.getLangOpts().Half) break;
	      if (!isFPConstant) break;  // Error for integer constant.
	      if (isHalf || isFloat || isLong) break; // HH, FH, LH invalid.
	      isHalf = true;
	      continue;  // Success.
	    case 'f':      // FP Suffix for "float"
	    case 'F':
	      if (!isFPConstant) break;  // Error for integer constant.
	      if (isHalf || isFloat || isLong || isFloat128)
	        break; // HF, FF, LF, QF invalid.

	      if (s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {
	        s += 2; // success, eat up 2 characters.
	        isFloat16 = true;
	        continue;
	      }

	      isFloat = true;
	      continue;  // Success.
	    case 'q':    // FP Suffix for "__float128"
	    case 'Q':
	      if (!isFPConstant) break;  // Error for integer constant.
	      if (isHalf || isFloat || isLong || isFloat128)
	        break; // HQ, FQ, LQ, QQ invalid.
	      isFloat128 = true;
	      continue;  // Success.
	    case 'u':
	    case 'U':
	      if (isFPConstant) break;  // Error for floating constant.
	      if (isUnsigned) break;    // Cannot be repeated.
	      isUnsigned = true;
	      continue;  // Success.
	    case 'l':
	    case 'L':
	      if (isLong || isLongLong) break;  // Cannot be repeated.
	      if (isHalf || isFloat || isFloat128) break;     // LH, LF, LQ invalid.

	      // Check for long long.  The L's need to be adjacent and the same case.
	      if (s[1] == s[0]) {
	        assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
	        if (isFPConstant) break;        // long long invalid for floats.
	        isLongLong = true;
	        ++s;  // Eat both of them.
	      } else {
	        isLong = true;
	      }
	      continue;  // Success.
	    case 'i':
	    case 'I':
	      if (PP.getLangOpts().MicrosoftExt) {
	        if (isLong || isLongLong || MicrosoftInteger)
	          break;

	        if (!isFPConstant) {
	          // Allow i8, i16, i32, and i64.
	          switch (s[1]) {
	          case '8':
	            s += 2; // i8 suffix
	            MicrosoftInteger = 8;
	            break;
	          case '1':
	            if (s[2] == '6') {
	              s += 3; // i16 suffix
	              MicrosoftInteger = 16;
	            }
	            break;
	          case '3':
	            if (s[2] == '2') {
	              s += 3; // i32 suffix
	              MicrosoftInteger = 32;
	            }
	            break;
	          case '6':
	            if (s[2] == '4') {
	              s += 3; // i64 suffix
	              MicrosoftInteger = 64;
	            }
	            break;
	          default:
	            break;
	          }
	        }
	        if (MicrosoftInteger) {
	          assert(s <= ThisTokEnd && "didn't maximally munch?");
	          break;
	        }
	      }
	      // fall through.
	    case 'j':
	    case 'J':
	      if (isImaginary) break;   // Cannot be repeated.
	      isImaginary = true;
	      continue;  // Success.
	    }
	    // If we reached here, there was an error or a ud-suffix.
	    break;
	  }

	  // "i", "if", and "il" are user-defined suffixes in C++1y.
	  if (s != ThisTokEnd || isImaginary) {
	    // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
	    expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
	    if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
	      if (!isImaginary) {
	        // Any suffix pieces we might have parsed are actually part of the
	        // ud-suffix.
	        isLong = false;
	        isUnsigned = false;
	        isLongLong = false;
	        isFloat = false;
	        isFloat16 = false;
	        isHalf = false;
	        isImaginary = false;
	        MicrosoftInteger = 0;
	      }

	      saw_ud_suffix = true;
	      return;
	    }

	    if (s != ThisTokEnd) {
	      // Report an error if there are any.
	      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin),
	              diag::err_invalid_suffix_constant)
	          << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin) << isFPConstant;
	      hadError = true;
	    }
	  }
	}

	/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
	/// numbers. It issues an error for illegal digits, and handles floating point
	/// parsing. If it detects a floating point number, the radix is set to 10.
	///
	/// On entry `s` points at the first character after the leading run of
	/// digits. On an error, hadError is set and the function returns early.
	void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
	  assert((radix == 8 || radix == 10) && "Unexpected radix");

	  // If we have a hex digit other than 'e' (which denotes a FP exponent) then
	  // the code is using an incorrect base.
	  if (isHexDigit(*s) && *s != 'e' && *s != 'E') {
	    PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
	            diag::err_invalid_digit) << StringRef(s, 1) << (radix == 8 ? 1 : 0);
	    hadError = true;
	    return;
	  }

	  if (*s == '.') {
	    checkSeparator(TokLoc, s, CSK_AfterDigits);
	    s++;
	    radix = 10;
	    saw_period = true;
	    checkSeparator(TokLoc, s, CSK_BeforeDigits);
	    s = SkipDigits(s); // Skip suffix.
	  }
	  if (*s == 'e' || *s == 'E') { // exponent
	    checkSeparator(TokLoc, s, CSK_AfterDigits);
	    const char *Exponent = s;
	    s++;
	    radix = 10;
	    saw_exponent = true;
	-   if (*s == '+' || *s == '-') s++; // sign
	+   if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
	    const char *first_non_digit = SkipDigits(s);
	    if (containsDigits(s, first_non_digit)) {
	      checkSeparator(TokLoc, s, CSK_BeforeDigits);
	      s = first_non_digit;
	    } else {
	-     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
	-             diag::err_exponent_has_no_digits);
	-     hadError = true;
	+     if (!hadError) {
	+       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
	+               diag::err_exponent_has_no_digits);
	+       hadError = true;
	+     }
	      return;
	    }
	  }
	}

	/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
	/// suffixes as ud-suffixes, because the diagnostic experience is better if we
	/// treat it as an invalid suffix.
	///
	/// \returns false outside C++11 or for an empty suffix; true for any suffix
	/// starting with '_'; otherwise true only in C++14 mode for the library
	/// suffixes listed below.
	bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
	                                           StringRef Suffix) {
	  if (!LangOpts.CPlusPlus11 || Suffix.empty())
	    return false;

	  // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
	  if (Suffix[0] == '_')
	    return true;

	  // In C++11, there are no library suffixes.
	  if (!LangOpts.CPlusPlus14)
	    return false;

	  // In C++1y, "s", "h", "min", "ms", "us", and "ns" are used in the library.
	  // Per tweaked N3660, "il", "i", and "if" are also used in the library.
	  return llvm::StringSwitch<bool>(Suffix)
	      .Cases("h", "min", "s", true)
	      .Cases("ms", "us", "ns", true)
	      .Cases("il", "i", "if", true)
	      .Default(false);
	}

	// Diagnose a digit separator that is not between two digits.
	//
	// With CSK_AfterDigits the character immediately before Pos is examined
	// (nothing is checked when Pos is the start of the token); otherwise the
	// character at Pos is examined (nothing is checked when Pos is the end of the
	// token). The '+' lines of this hunk additionally record the problem in
	// hadError instead of only emitting the diagnostic.
	void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
	const char *Pos,
	CheckSeparatorKind IsAfterDigits) {
	if (IsAfterDigits == CSK_AfterDigits) {
	if (Pos == ThisTokBegin)
	return;
	// Look at the last character of the digit run that just ended.
	--Pos;
	} else if (Pos == ThisTokEnd)
	return;

	- if (isDigitSeparator(*Pos))
	+ if (isDigitSeparator(*Pos)) {
	PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin),
	diag::err_digit_separator_not_between_digits)
	<< IsAfterDigits;
	+ hadError = true;
	+ }
	}

	/// ParseNumberStartingWithZero - This method is called when the first character
	/// of the number is found to be a zero. This means it is either an octal
	/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
	/// a floating point number (01239.123e4). Eat the prefix, determining the
	/// radix etc.
	///
	/// Sets radix and DigitsBegin, advances `s` past the digits (and exponent, if
	/// any), and sets hadError when the literal is malformed.
	void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
	  assert(s[0] == '0' && "Invalid method call");
	  s++;

	  int c1 = s[0];

	  // Handle a hex number like 0x1234.
	  if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
	    s++;
	    assert(s < ThisTokEnd && "didn't maximally munch?");
	    radix = 16;
	    DigitsBegin = s;
	    s = SkipHexDigits(s);
	    bool HasSignificandDigits = containsDigits(DigitsBegin, s);
	    if (s == ThisTokEnd) {
	      // Done.
	    } else if (*s == '.') {
	      s++;
	      saw_period = true;
	      const char *floatDigitsBegin = s;
	      s = SkipHexDigits(s);
	      if (containsDigits(floatDigitsBegin, s))
	        HasSignificandDigits = true;
	      if (HasSignificandDigits)
	        checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
	    }

	    if (!HasSignificandDigits) {
	      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
	              diag::err_hex_constant_requires)
	          << PP.getLangOpts().CPlusPlus << 1;
	      hadError = true;
	      return;
	    }

	    // A binary exponent can appear with or without a '.'. If dotted, the
	    // binary exponent is required.
	    if (*s == 'p' || *s == 'P') {
	      checkSeparator(TokLoc, s, CSK_AfterDigits);
	      const char *Exponent = s;
	      s++;
	      saw_exponent = true;
	-     if (*s == '+' || *s == '-') s++; // sign
	+     if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
	      const char *first_non_digit = SkipDigits(s);
	      if (!containsDigits(s, first_non_digit)) {
	-       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
	-               diag::err_exponent_has_no_digits);
	-       hadError = true;
	+       if (!hadError) {
	+         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
	+                 diag::err_exponent_has_no_digits);
	+         hadError = true;
	+       }
	        return;
	      }
	      checkSeparator(TokLoc, s, CSK_BeforeDigits);
	      s = first_non_digit;

	      if (!PP.getLangOpts().HexFloats)
	        PP.Diag(TokLoc, PP.getLangOpts().CPlusPlus
	                            ? diag::ext_hex_literal_invalid
	                            : diag::ext_hex_constant_invalid);
	      else if (PP.getLangOpts().CPlusPlus17)
	        PP.Diag(TokLoc, diag::warn_cxx17_hex_literal);
	    } else if (saw_period) {
	      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
	              diag::err_hex_constant_requires)
	          << PP.getLangOpts().CPlusPlus << 0;
	      hadError = true;
	    }
	    return;
	  }

	  // Handle simple binary numbers 0b01010
	  if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
	    // 0b101010 is a C++1y / GCC extension.
	    PP.Diag(TokLoc,
	            PP.getLangOpts().CPlusPlus14
	              ? diag::warn_cxx11_compat_binary_literal
	              : PP.getLangOpts().CPlusPlus
	                ? diag::ext_binary_literal_cxx14
	                : diag::ext_binary_literal);
	    ++s;
	    assert(s < ThisTokEnd && "didn't maximally munch?");
	    radix = 2;
	    DigitsBegin = s;
	    s = SkipBinaryDigits(s);
	    if (s == ThisTokEnd) {
	      // Done.
	    } else if (isHexDigit(*s)) {
	      PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
	              diag::err_invalid_digit) << StringRef(s, 1) << 2;
	      hadError = true;
	    }
	    // Other suffixes will be diagnosed by the caller.
	    return;
	  }

	  // For now, the radix is set to 8. If we discover that we have a
	  // floating point constant, the radix will change to 10. Octal floating
	  // point constants are not permitted (only decimal and hexadecimal).
	  radix = 8;
	  DigitsBegin = s;
	  s = SkipOctalDigits(s);
	  if (s == ThisTokEnd)
	    return; // Done, simple octal number like 01234

	  // If we have some other non-octal digit that is a decimal digit, see if
	  // this is part of a floating point number like 094.123 or 09e1.
	  if (isDigit(*s)) {
	    const char *EndDecimal = SkipDigits(s);
	    if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
	      s = EndDecimal;
	      radix = 10;
	    }
	  }

	  ParseDecimalOrOctalCommon(TokLoc);
	}

	/// Conservative check used to pick the fast conversion path: returns true
	/// when any literal made of \p NumDigits digits in base \p Radix is
	/// guaranteed to be representable in an unsigned 64-bit integer.
	///
	/// Only radices 2, 8, 10 and 16 are valid; anything else is unreachable.
	static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
	  unsigned MaxSafeDigits;
	  switch (Radix) {
	  case 2:
	    // One bit per digit.
	    MaxSafeDigits = 64;
	    break;
	  case 8:
	    // Digits are groups of 3 bits.
	    MaxSafeDigits = 64 / 3;
	    break;
	  case 10:
	    // floor(log10(2^64))
	    MaxSafeDigits = 19;
	    break;
	  case 16:
	    // Digits are groups of 4 bits.
	    MaxSafeDigits = 64 / 4;
	    break;
	  default:
	    llvm_unreachable("impossible Radix");
	  }
	  return NumDigits <= MaxSafeDigits;
	}

	/// GetIntegerValue - Convert this numeric literal value to an APInt that
	/// matches Val's input width. If there is an overflow, set Val to the low bits
	/// of the result and return true. Otherwise, return false.
	///
	/// The digits in [DigitsBegin, SuffixBegin) are interpreted in the parser's
	/// current radix; digit separators inside that range are skipped.
	bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
	  // Fast path: Compute a conservative bound on the maximum number of
	  // bits per digit in this radix. If we can't possibly overflow a
	  // uint64 based on that bound then do the simple conversion to
	  // integer. This avoids the expensive overflow checking below, and
	  // handles the common cases that matter (small decimal integers and
	  // hex/octal values which don't overflow).
	  // Note that NumDigits also counts any digit separators, which only makes
	  // the bound more conservative.
	  const unsigned NumDigits = SuffixBegin - DigitsBegin;
	  if (alwaysFitsInto64Bits(radix, NumDigits)) {
	    uint64_t N = 0;
	    for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
	      if (!isDigitSeparator(*Ptr))
	        N = N * radix + llvm::hexDigitValue(*Ptr);

	    // This will truncate the value to Val's input width. Simply check
	    // for overflow by comparing.
	    Val = N;
	    return Val.getZExtValue() != N;
	  }

	  // Slow path: accumulate in APInt arithmetic at Val's width and detect
	  // overflow after every multiply and every add.
	  Val = 0;
	  const char *Ptr = DigitsBegin;

	  llvm::APInt RadixVal(Val.getBitWidth(), radix);
	  llvm::APInt CharVal(Val.getBitWidth(), 0);
	  llvm::APInt OldVal = Val;

	  bool OverflowOccurred = false;
	  while (Ptr < SuffixBegin) {
	    if (isDigitSeparator(*Ptr)) {
	      ++Ptr;
	      continue;
	    }

	    unsigned C = llvm::hexDigitValue(*Ptr++);

	    // If this letter is out of bound for this radix, reject it.
	    assert(C < radix && "NumericLiteralParser ctor should have rejected this");

	    CharVal = C;

	    // Add the digit to the value in the appropriate radix. If adding in digits
	    // made the value smaller, then this overflowed.
	    OldVal = Val;

	    // Multiply by radix, did overflow occur on the multiply?
	    // (Dividing the product back by the radix must reproduce the old value.)
	    Val *= RadixVal;
	    OverflowOccurred |= Val.udiv(RadixVal) != OldVal;

	    // Add value, did overflow occur on the value?
	    // (a + b) ult b <=> overflow
	    Val += CharVal;
	    OverflowOccurred |= Val.ult(CharVal);
	  }
	  return OverflowOccurred;
	}

	/// Convert the literal's spelling, from the start of the token up to the
	/// suffix (and never past the end of the token), into \p Result using
	/// round-to-nearest-ties-to-even. Digit separators are dropped before the
	/// text is handed to APFloat. Returns the status of that conversion.
	llvm::APFloat::opStatus
	NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
	  using llvm::APFloat;

	  // Length of the numeric part, clamped to the token length.
	  unsigned Len = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);

	  StringRef Spelling(ThisTokBegin, Len);
	  llvm::SmallString<16> Stripped;

	  // Only build a cleaned-up copy when a separator is actually present.
	  const bool HasSeparator = Spelling.find('\'') != StringRef::npos;
	  if (HasSeparator) {
	    Stripped.reserve(Len);
	    for (const char *It = Spelling.begin(), *ItEnd = Spelling.end();
	         It != ItEnd; ++It) {
	      if (!isDigitSeparator(*It))
	        Stripped.push_back(*It);
	    }
	    Spelling = Stripped;
	  }

	  return Result.convertFromString(Spelling, APFloat::rmNearestTiesToEven);
	}

	/// \verbatim
	/// user-defined-character-literal: [C++11 lex.ext]
	/// character-literal ud-suffix
	/// ud-suffix:
	/// identifier
	/// character-literal: [C++11 lex.ccon]
	/// ' c-char-sequence '
	/// u' c-char-sequence '
	/// U' c-char-sequence '
	/// L' c-char-sequence '
	/// u8' c-char-sequence ' [C++1z lex.ccon]
	/// c-char-sequence:
	/// c-char
	/// c-char-sequence c-char
	/// c-char:
	/// any member of the source character set except the single-quote ',
	/// backslash \, or new-line character
	/// escape-sequence
	/// universal-character-name
	/// escape-sequence:
	/// simple-escape-sequence
	/// octal-escape-sequence
	/// hexadecimal-escape-sequence
	/// simple-escape-sequence:
	/// one of \' \" \? \\ \a \b \f \n \r \t \v
	/// octal-escape-sequence:
	/// \ octal-digit
	/// \ octal-digit octal-digit
	/// \ octal-digit octal-digit octal-digit
	/// hexadecimal-escape-sequence:
	/// \x hexadecimal-digit
	/// hexadecimal-escape-sequence hexadecimal-digit
	/// universal-character-name: [C++11 lex.charset]
	/// \u hex-quad
	/// \U hex-quad hex-quad
	/// hex-quad:
	/// hex-digit hex-digit hex-digit hex-digit
	/// \endverbatim
	///
	/// Parse the spelling [begin, end) of a character-literal token of kind
	/// \p kind. On return, Value holds the literal's value, IsMultiChar says
	/// whether more than one character was seen, HadError records any hard
	/// error, and UDSuffixBuf / UDSuffixOffset describe an optional ud-suffix.
	/// Diagnostics are reported through \p PP at \p Loc.
	CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
	                                     SourceLocation Loc, Preprocessor &PP,
	                                     tok::TokenKind kind) {
	  // At this point we know that the character matches the regex "(L|u|U)?'.*'".
	  HadError = false;

	  Kind = kind;

	  const char *TokBegin = begin;

	  // Skip over wide character determinant.
	  if (Kind != tok::char_constant)
	    ++begin;
	  // The u8 prefix is two characters long, so skip one more.
	  if (Kind == tok::utf8_char_constant)
	    ++begin;

	  // Skip over the entry quote.
	  assert(begin[0] == '\'' && "Invalid token lexed");
	  ++begin;

	  // Remove an optional ud-suffix.
	  if (end[-1] != '\'') {
	    const char *UDSuffixEnd = end;
	    // Walk back until 'end' sits just past the closing quote.
	    do {
	      --end;
	    } while (end[-1] != '\'');
	    // FIXME: Don't bother with this if !tok.hasUCN().
	    expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
	    UDSuffixOffset = end - TokBegin;
	  }

	  // Trim the ending quote.
	  assert(end != begin && "Invalid token lexed");
	  --end;

	  // FIXME: The "Value" is an uint64_t so we can handle char literals of
	  // up to 64-bits.
	  // FIXME: This extensively assumes that 'char' is 8-bits.
	  assert(PP.getTargetInfo().getCharWidth() == 8 &&
	         "Assumes char is 8 bits");
	  assert(PP.getTargetInfo().getIntWidth() <= 64 &&
	         (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
	         "Assumes sizeof(int) on target is <= 64 and a multiple of char");
	  assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
	         "Assumes sizeof(wchar) on target is <= 64");

	  // One output slot per source byte is an upper bound on the number of
	  // decoded characters.
	  SmallVector<uint32_t, 4> codepoint_buffer;
	  codepoint_buffer.resize(end - begin);
	  uint32_t *buffer_begin = &codepoint_buffer.front();
	  uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();

	  // Unicode escapes representing characters that cannot be correctly
	  // represented in a single code unit are disallowed in character literals
	  // by this implementation.
	  uint32_t largest_character_for_kind;
	  if (tok::wide_char_constant == Kind) {
	    // NOTE(review): this shift is only well-defined for wchar widths in
	    // [1, 32], while the assert above admits widths up to 64 - confirm that
	    // no target exceeds 32.
	    largest_character_for_kind =
	        0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
	  } else if (tok::utf8_char_constant == Kind) {
	    largest_character_for_kind = 0x7F;
	  } else if (tok::utf16_char_constant == Kind) {
	    largest_character_for_kind = 0xFFFF;
	  } else if (tok::utf32_char_constant == Kind) {
	    largest_character_for_kind = 0x10FFFF;
	  } else {
	    largest_character_for_kind = 0x7Fu;
	  }

	  while (begin != end) {
	    // Is this a span of non-escape characters?
	    if (begin[0] != '\\') {
	      char const *start = begin;
	      do {
	        ++begin;
	      } while (begin != end && *begin != '\\');

	      // Remember where this span started on both sides so that a failed
	      // conversion can be redone as a raw byte copy.
	      char const *tmp_in_start = start;
	      uint32_t *tmp_out_start = buffer_begin;
	      llvm::ConversionResult res =
	          llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
	                                   reinterpret_cast<llvm::UTF8 const *>(begin),
	                                   &buffer_begin, buffer_end, llvm::strictConversion);
	      if (res != llvm::conversionOK) {
	        // If we see bad encoding for unprefixed character literals, warn and
	        // simply copy the byte values, for compatibility with gcc and
	        // older versions of clang.
	        bool NoErrorOnBadEncoding = isAscii();
	        unsigned Msg = diag::err_bad_character_encoding;
	        if (NoErrorOnBadEncoding)
	          Msg = diag::warn_bad_character_encoding;
	        PP.Diag(Loc, Msg);
	        if (NoErrorOnBadEncoding) {
	          start = tmp_in_start;
	          buffer_begin = tmp_out_start;
	          // Store each source byte as its own (unsigned) character value.
	          for (; start != begin; ++start, ++buffer_begin)
	            *buffer_begin = static_cast<uint8_t>(*start);
	        } else {
	          HadError = true;
	        }
	      } else {
	        // Every character decoded from this span must fit the literal kind.
	        for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
	          if (*tmp_out_start > largest_character_for_kind) {
	            HadError = true;
	            PP.Diag(Loc, diag::err_character_too_large);
	          }
	        }
	      }

	      continue;
	    }
	    // Is this a Universal Character Name escape?
	    if (begin[1] == 'u' || begin[1] == 'U') {
	      unsigned short UcnLen = 0;
	      if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
	                            FullSourceLoc(Loc, PP.getSourceManager()),
	                            &PP.getDiagnostics(), PP.getLangOpts(), true)) {
	        HadError = true;
	      } else if (*buffer_begin > largest_character_for_kind) {
	        HadError = true;
	        PP.Diag(Loc, diag::err_character_too_large);
	      }

	      ++buffer_begin;
	      continue;
	    }
	    // Otherwise this is an ordinary (non-UCN) escape sequence.
	    unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
	    uint64_t result =
	        ProcessCharEscape(TokBegin, begin, end, HadError,
	                          FullSourceLoc(Loc,PP.getSourceManager()),
	                          CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
	    *buffer_begin++ = result;
	  }

	  unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();

	  if (NumCharsSoFar > 1) {
	    if (isWide())
	      PP.Diag(Loc, diag::warn_extraneous_char_constant);
	    else if (isAscii() && NumCharsSoFar == 4)
	      PP.Diag(Loc, diag::ext_four_char_character_literal);
	    else if (isAscii())
	      PP.Diag(Loc, diag::ext_multichar_character_literal);
	    else
	      PP.Diag(Loc, diag::err_multichar_utf_character_literal);
	    IsMultiChar = true;
	  } else {
	    IsMultiChar = false;
	  }

	  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);

	  // Narrow character literals act as though their value is concatenated
	  // in this implementation, but warn on overflow.
	  bool multi_char_too_long = false;
	  if (isAscii() && isMultiChar()) {
	    LitVal = 0;
	    for (size_t i = 0; i < NumCharsSoFar; ++i) {
	      // check for enough leading zeros to shift into
	      multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
	      LitVal <<= 8;
	      LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
	    }
	  } else if (NumCharsSoFar > 0) {
	    // otherwise just take the last character
	    LitVal = buffer_begin[-1];
	  }

	  if (!HadError && multi_char_too_long) {
	    PP.Diag(Loc, diag::warn_char_constant_too_large);
	  }

	  // Transfer the value from APInt to uint64_t
	  Value = LitVal.getZExtValue();

	  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
	  // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
	  // character constants are not sign extended in this implementation:
	  // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
	  if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
	      PP.getLangOpts().CharIsSigned)
	    Value = (signed char)Value;
	}

	/// \verbatim
	/// string-literal: [C++0x lex.string]
	/// encoding-prefix " [s-char-sequence] "
	/// encoding-prefix R raw-string
	/// encoding-prefix:
	/// u8
	/// u
	/// U
	/// L
	/// s-char-sequence:
	/// s-char
	/// s-char-sequence s-char
	/// s-char:
	/// any member of the source character set except the double-quote ",
	/// backslash \, or new-line character
	/// escape-sequence
	/// universal-character-name
	/// raw-string:
	/// " d-char-sequence ( r-char-sequence ) d-char-sequence "
	/// r-char-sequence:
	/// r-char
	/// r-char-sequence r-char
	/// r-char:
	/// any member of the source character set, except a right parenthesis )
	/// followed by the initial d-char-sequence (which may be empty)
	/// followed by a double quote ".
	/// d-char-sequence:
	/// d-char
	/// d-char-sequence d-char
	/// d-char:
	/// any member of the basic source character set except:
	/// space, the left parenthesis (, the right parenthesis ),
	/// the backslash \, and the control characters representing horizontal
	/// tab, vertical tab, form feed, and newline.
	/// escape-sequence: [C++0x lex.ccon]
	/// simple-escape-sequence
	/// octal-escape-sequence
	/// hexadecimal-escape-sequence
	/// simple-escape-sequence:
	/// one of \' \" \? \\ \a \b \f \n \r \t \v
	/// octal-escape-sequence:
	/// \ octal-digit
	/// \ octal-digit octal-digit
	/// \ octal-digit octal-digit octal-digit
	/// hexadecimal-escape-sequence:
	/// \x hexadecimal-digit
	/// hexadecimal-escape-sequence hexadecimal-digit
	/// universal-character-name:
	/// \u hex-quad
	/// \U hex-quad hex-quad
	/// hex-quad:
	/// hex-digit hex-digit hex-digit hex-digit
	/// \endverbatim
	///
	StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
	                                         Preprocessor &PP, bool Complain)
	    : SM(PP.getSourceManager()),
	      Features(PP.getLangOpts()),
	      Target(PP.getTargetInfo()),
	      // Diagnostics are only emitted when the caller asked for them.
	      Diags(Complain ? &PP.getDiagnostics() : nullptr),
	      MaxTokenLength(0),
	      SizeBound(0),
	      CharByteWidth(0),
	      Kind(tok::unknown),
	      ResultPtr(ResultBuf.data()),
	      hadError(false),
	      Pascal(false) {
	  // All of the real work (concatenation, decoding, length checks) happens
	  // in init().
	  init(StringToks);
	}

	/// Concatenate and decode the string-literal tokens in \p StringToks.
	///
	/// The first pass computes the result kind, the longest token and an upper
	/// bound on the output size; the second pass gets each token's spelling,
	/// strips prefixes, quotes and ud-suffixes, and decodes the contents into
	/// ResultBuf (advancing ResultPtr). Pascal-string length prefixes and the
	/// "string too long" extension warning are handled at the end. Any failure
	/// sets hadError; diagnostics are only emitted when Diags is non-null.
	void StringLiteralParser::init(ArrayRef<Token> StringToks){
	  // The literal token may have come from an invalid source location (e.g. due
	  // to a PCH error), in which case the token length will be 0.
	  if (StringToks.empty() || StringToks[0].getLength() < 2)
	    return DiagnoseLexingError(SourceLocation());

	  // Scan all of the string portions, remember the max individual token length,
	  // computing a bound on the concatenated string length, and see whether any
	  // piece is a wide-string. If any of the string portions is a wide-string
	  // literal, the result is a wide-string literal [C99 6.4.5p4].
	  assert(!StringToks.empty() && "expected at least one token");
	  MaxTokenLength = StringToks[0].getLength();
	  assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
	  SizeBound = StringToks[0].getLength()-2; // -2 for "".
	  Kind = StringToks[0].getKind();

	  hadError = false;

	  // Implement Translation Phase #6: concatenation of string literals
	  // (C99 5.1.1.2p1). The common case is only one string fragment.
	  for (unsigned i = 1; i != StringToks.size(); ++i) {
	    if (StringToks[i].getLength() < 2)
	      return DiagnoseLexingError(StringToks[i].getLocation());

	    // The string could be shorter than this if it needs cleaning, but this is a
	    // reasonable bound, which is all we need.
	    assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
	    SizeBound += StringToks[i].getLength()-2; // -2 for "".

	    // Remember maximum string piece length.
	    if (StringToks[i].getLength() > MaxTokenLength)
	      MaxTokenLength = StringToks[i].getLength();

	    // Remember if we see any wide or utf-8/16/32 strings.
	    // Also check for illegal concatenations.
	    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
	      if (isAscii()) {
	        // A plain string so far: adopt the prefixed kind of this piece.
	        Kind = StringToks[i].getKind();
	      } else {
	        // Two different prefixed kinds cannot be concatenated.
	        if (Diags)
	          Diags->Report(StringToks[i].getLocation(),
	                        diag::err_unsupported_string_concat);
	        hadError = true;
	      }
	    }
	  }

	  // Include space for the null terminator.
	  ++SizeBound;

	  // TODO: K&R warning: "traditional C rejects string constant concatenation"

	  // Get the width in bytes of char/wchar_t/char16_t/char32_t
	  CharByteWidth = getCharWidth(Kind, Target);
	  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
	  CharByteWidth /= 8;

	  // The output buffer size needs to be large enough to hold wide characters.
	  // This is a worst-case assumption which basically corresponds to L"" "long".
	  SizeBound *= CharByteWidth;

	  // Size the temporary buffer to hold the result string data.
	  ResultBuf.resize(SizeBound);

	  // Likewise, but for each string piece.
	  SmallString<512> TokenBuf;
	  TokenBuf.resize(MaxTokenLength);

	  // Loop over all the strings, getting their spelling, and expanding them to
	  // wide strings as appropriate.
	  ResultPtr = &ResultBuf[0]; // Next byte to fill in.

	  Pascal = false;

	  SourceLocation UDSuffixTokLoc;

	  for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
	    const char *ThisTokBuf = &TokenBuf[0];
	    // Get the spelling of the token, which eliminates trigraphs, etc. We know
	    // that ThisTokBuf points to a buffer that is big enough for the whole token
	    // and 'spelled' tokens can only shrink.
	    bool StringInvalid = false;
	    unsigned ThisTokLen =
	        Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
	                           &StringInvalid);
	    if (StringInvalid)
	      return DiagnoseLexingError(StringToks[i].getLocation());

	    const char *ThisTokBegin = ThisTokBuf;
	    const char *ThisTokEnd = ThisTokBuf+ThisTokLen;

	    // Remove an optional ud-suffix.
	    if (ThisTokEnd[-1] != '"') {
	      const char *UDSuffixEnd = ThisTokEnd;
	      // Walk back until ThisTokEnd sits just past the closing quote.
	      do {
	        --ThisTokEnd;
	      } while (ThisTokEnd[-1] != '"');

	      StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);

	      if (UDSuffixBuf.empty()) {
	        // First suffix seen: record it together with where it came from.
	        if (StringToks[i].hasUCN())
	          expandUCNs(UDSuffixBuf, UDSuffix);
	        else
	          UDSuffixBuf.assign(UDSuffix);
	        UDSuffixToken = i;
	        UDSuffixOffset = ThisTokEnd - ThisTokBuf;
	        UDSuffixTokLoc = StringToks[i].getLocation();
	      } else {
	        SmallString<32> ExpandedUDSuffix;
	        if (StringToks[i].hasUCN()) {
	          expandUCNs(ExpandedUDSuffix, UDSuffix);
	          UDSuffix = ExpandedUDSuffix;
	        }

	        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
	        // result of a concatenation involving at least one user-defined-string-
	        // literal, all the participating user-defined-string-literals shall
	        // have the same ud-suffix.
	        if (UDSuffixBuf != UDSuffix) {
	          if (Diags) {
	            SourceLocation TokLoc = StringToks[i].getLocation();
	            Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
	                << UDSuffixBuf << UDSuffix
	                << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
	                << SourceRange(TokLoc, TokLoc);
	          }
	          hadError = true;
	        }
	      }
	    }

	    // Strip the end quote.
	    --ThisTokEnd;

	    // TODO: Input character set mapping support.

	    // Skip marker for wide or unicode strings.
	    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
	      ++ThisTokBuf;
	      // Skip 8 of u8 marker for utf8 strings.
	      if (ThisTokBuf[0] == '8')
	        ++ThisTokBuf;
	    }

	    // Check for raw string
	    if (ThisTokBuf[0] == 'R') {
	      ThisTokBuf += 2; // skip R"

	      const char *Prefix = ThisTokBuf;
	      while (ThisTokBuf[0] != '(')
	        ++ThisTokBuf;
	      ++ThisTokBuf; // skip '('

	      // Remove same number of characters from the end
	      ThisTokEnd -= ThisTokBuf - Prefix;
	      assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");

	      // C++14 [lex.string]p4: A source-file new-line in a raw string literal
	      // results in a new-line in the resulting execution string-literal.
	      StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
	      while (!RemainingTokenSpan.empty()) {
	        // Split the string literal on \r\n boundaries.
	        size_t CRLFPos = RemainingTokenSpan.find("\r\n");
	        StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
	        StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);

	        // Copy everything before the \r\n sequence into the string literal.
	        if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
	          hadError = true;

	        // Point into the \n inside the \r\n sequence and operate on the
	        // remaining portion of the literal.
	        RemainingTokenSpan = AfterCRLF.substr(1);
	      }
	    } else {
	      if (ThisTokBuf[0] != '"') {
	        // The file may have come from PCH and then changed after loading the
	        // PCH; Fail gracefully.
	        return DiagnoseLexingError(StringToks[i].getLocation());
	      }
	      ++ThisTokBuf; // skip "

	      // Check if this is a pascal string
	      if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
	          ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {

	        // If the \p sequence is found in the first token, we have a pascal string
	        // Otherwise, if we already have a pascal string, ignore the first \p
	        if (i == 0) {
	          // Only the backslash is skipped here; the 'p' is copied like any
	          // other character and the first result element is overwritten with
	          // the length further down.
	          ++ThisTokBuf;
	          Pascal = true;
	        } else if (Pascal)
	          ThisTokBuf += 2;
	      }

	      while (ThisTokBuf != ThisTokEnd) {
	        // Is this a span of non-escape characters?
	        if (ThisTokBuf[0] != '\\') {
	          const char *InStart = ThisTokBuf;
	          do {
	            ++ThisTokBuf;
	          } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');

	          // Copy the character span over.
	          if (CopyStringFragment(StringToks[i], ThisTokBegin,
	                                 StringRef(InStart, ThisTokBuf - InStart)))
	            hadError = true;
	          continue;
	        }
	        // Is this a Universal Character Name escape?
	        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
	          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
	                          ResultPtr, hadError,
	                          FullSourceLoc(StringToks[i].getLocation(), SM),
	                          CharByteWidth, Diags, Features);
	          continue;
	        }
	        // Otherwise, this is a non-UCN escape character. Process it.
	        unsigned ResultChar =
	            ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
	                              FullSourceLoc(StringToks[i].getLocation(), SM),
	                              CharByteWidth*8, Diags, Features);

	        // Store the escaped value as one code unit of the result width.
	        if (CharByteWidth == 4) {
	          // FIXME: Make the type of the result buffer correct instead of
	          // using reinterpret_cast.
	          llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
	          *ResultWidePtr = ResultChar;
	          ResultPtr += 4;
	        } else if (CharByteWidth == 2) {
	          // FIXME: Make the type of the result buffer correct instead of
	          // using reinterpret_cast.
	          llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
	          *ResultWidePtr = ResultChar & 0xFFFF;
	          ResultPtr += 2;
	        } else {
	          assert(CharByteWidth == 1 && "Unexpected char width");
	          *ResultPtr++ = ResultChar & 0xFF;
	        }
	      }
	    }
	  }

	  if (Pascal) {
	    // Overwrite the first code unit with the string length, excluding the
	    // length unit itself.
	    if (CharByteWidth == 4) {
	      // FIXME: Make the type of the result buffer correct instead of
	      // using reinterpret_cast.
	      llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
	      ResultWidePtr[0] = GetNumStringChars() - 1;
	    } else if (CharByteWidth == 2) {
	      // FIXME: Make the type of the result buffer correct instead of
	      // using reinterpret_cast.
	      llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
	      ResultWidePtr[0] = GetNumStringChars() - 1;
	    } else {
	      assert(CharByteWidth == 1 && "Unexpected char width");
	      ResultBuf[0] = GetNumStringChars() - 1;
	    }

	    // Verify that pascal strings aren't too large.
	    if (GetStringLength() > 256) {
	      if (Diags)
	        Diags->Report(StringToks.front().getLocation(),
	                      diag::err_pascal_string_too_long)
	            << SourceRange(StringToks.front().getLocation(),
	                           StringToks.back().getLocation());
	      hadError = true;
	      return;
	    }
	  } else if (Diags) {
	    // Complain if this string literal has too many characters.
	    unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;

	    if (GetNumStringChars() > MaxChars)
	      Diags->Report(StringToks.front().getLocation(),
	                    diag::ext_string_too_long)
	          << GetNumStringChars() << MaxChars
	          << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
	          << SourceRange(StringToks.front().getLocation(),
	                         StringToks.back().getLocation());
	  }
	}

	/// Given \p Err pointing at a byte where UTF-8 decoding failed, return the
	/// position at which to resume within [Err, End).
	///
	/// The search window is limited to the sequence length that
	/// llvm::getNumBytesForUTF8 reports for the byte at \p Err (clamped to
	/// \p End); within that window, bytes of the form 10xxxxxx that follow
	/// \p Err are skipped. Returns \p End unchanged when \p Err == \p End.
	static const char *resyncUTF8(const char *Err, const char *End) {
	  if (Err == End)
	    return End;
	  End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
	  // Always step past the offending byte, then past any continuation bytes.
	  while (++Err != End && (*Err & 0xC0) == 0x80)
	    ;
	  return Err;
	}

	/// \brief This function copies from Fragment, which is a sequence of bytes
	/// within Tok's contents (which begin at TokBegin) into ResultPtr.
	/// Performs widening for multi-byte characters.
	///
	/// \returns false when the fragment converted cleanly, or when it was badly
	/// encoded but belongs to an unprefixed literal (the bytes are then copied
	/// verbatim and only a warning is issued); true when the bad encoding is a
	/// hard error.
	bool StringLiteralParser::CopyStringFragment(const Token &Tok,
	                                             const char *TokBegin,
	                                             StringRef Fragment) {
	  const llvm::UTF8 *ErrorPtrTmp;
	  if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
	    return false;

	  // If we see bad encoding for unprefixed string literals, warn and
	  // simply copy the byte values, for compatibility with gcc and older
	  // versions of clang.
	  bool NoErrorOnBadEncoding = isAscii();
	  if (NoErrorOnBadEncoding) {
	    memcpy(ResultPtr, Fragment.data(), Fragment.size());
	    ResultPtr += Fragment.size();
	  }

	  if (Diags) {
	    const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);

	    // Report the first bad sequence; further ones are attached to the same
	    // diagnostic as extra source ranges below.
	    FullSourceLoc SourceLoc(Tok.getLocation(), SM);
	    const DiagnosticBuilder &Builder =
	        Diag(Diags, Features, SourceLoc, TokBegin,
	             ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
	             NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
	                                  : diag::err_bad_string_encoding);

	    const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
	    StringRef NextFragment(NextStart, Fragment.end()-NextStart);

	    // Decode into a dummy buffer.
	    SmallString<512> Dummy;
	    Dummy.reserve(Fragment.size() * CharByteWidth);
	    char *Ptr = Dummy.data();

	    // Keep converting the remainder just to locate each additional bad
	    // sequence; the decoded output itself is discarded.
	    while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
	      const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
	      NextStart = resyncUTF8(ErrorPtr, Fragment.end());
	      Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
	                                     ErrorPtr, NextStart);
	      NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
	    }
	  }
	  return !NoErrorOnBadEncoding;
	}

	/// Flag the literal as erroneous and, if a diagnostics engine is attached,
	/// report err_lexing_string at \p Loc.
	void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
	  // The error flag is raised whether or not diagnostics are enabled.
	  hadError = true;

	  if (!Diags)
	    return;

	  Diags->Report(Loc, diag::err_lexing_string);
	}

	/// getOffsetOfStringByte - This function returns the offset of the
	/// specified byte of the string data represented by Token. This handles
	/// advancing over escape sequences in the string.
	///
	/// Returns 0 if the token's spelling cannot be obtained. Only narrow and
	/// u8 literals are supported (asserted below).
	unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
	                                                    unsigned ByteNo) const {
	  // Get the spelling of the token.
	  SmallString<32> SpellingBuffer;
	  SpellingBuffer.resize(Tok.getLength());

	  bool StringInvalid = false;
	  const char *SpellingPtr = &SpellingBuffer[0];
	  unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
	                                       &StringInvalid);
	  if (StringInvalid)
	    return 0;

	  const char *SpellingStart = SpellingPtr;
	  const char *SpellingEnd = SpellingPtr+TokLen;

	  // Handle UTF-8 strings just like narrow strings.
	  if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
	    SpellingPtr += 2;

	  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
	         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");

	  // For raw string literals, this is easy.
	  if (SpellingPtr[0] == 'R') {
	    assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
	    // Skip 'R"'.
	    SpellingPtr += 2;
	    while (*SpellingPtr != '(') {
	      ++SpellingPtr;
	      assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
	    }
	    // Skip '('.
	    ++SpellingPtr;
	    // Raw strings have no escapes, so the byte maps directly to an offset.
	    return SpellingPtr - SpellingStart + ByteNo;
	  }

	  // Skip over the leading quote
	  assert(SpellingPtr[0] == '"' && "Should be a string literal!");
	  ++SpellingPtr;

	  // Skip over bytes until we find the offset we're looking for.
	  while (ByteNo) {
	    assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");

	    // Step over non-escapes simply.
	    if (*SpellingPtr != '\\') {
	      ++SpellingPtr;
	      --ByteNo;
	      continue;
	    }

	    // Otherwise, this is an escape character. Advance over it.
	    bool HadError = false;
	    if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
	      const char *EscapePtr = SpellingPtr;
	      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
	                                      1, Features, HadError);
	      if (Len > ByteNo) {
	        // ByteNo is somewhere within the escape sequence.
	        SpellingPtr = EscapePtr;
	        break;
	      }
	      ByteNo -= Len;
	    } else {
	      // A non-UCN escape accounts for exactly one byte of string data.
	      ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
	                        FullSourceLoc(Tok.getLocation(), SM),
	                        CharByteWidth*8, Diags, Features);
	      --ByteNo;
	    }
	    assert(!HadError && "This method isn't valid on erroneous strings");
	  }

	  return SpellingPtr-SpellingStart;
	}

	/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
	/// suffixes as ud-suffixes, because the diagnostic experience is better if we
	/// treat it as an invalid suffix.
	///
	/// Accepts everything NumericLiteralParser::isValidUDSuffix accepts, plus
	/// the suffix "sv".
	bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
	                                          StringRef Suffix) {
	  return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
	         Suffix == "sv";
	}
	Index: head/contrib/llvm/tools/clang/lib/Sema/SemaChecking.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Sema/SemaChecking.cpp (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/Sema/SemaChecking.cpp (revision 329410)
	@@ -1,12500 +1,12510 @@
	//===- SemaChecking.cpp - Extra Semantic Checking -------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements extra semantic analysis beyond what is enforced
	// by the C type system.
	//
	//===----------------------------------------------------------------------===//

	#include "clang/AST/APValue.h"
	#include "clang/AST/ASTContext.h"
	#include "clang/AST/Attr.h"
	#include "clang/AST/AttrIterator.h"
	#include "clang/AST/CharUnits.h"
	#include "clang/AST/Decl.h"
	#include "clang/AST/DeclBase.h"
	#include "clang/AST/DeclCXX.h"
	#include "clang/AST/DeclObjC.h"
	#include "clang/AST/DeclarationName.h"
	#include "clang/AST/EvaluatedExprVisitor.h"
	#include "clang/AST/Expr.h"
	#include "clang/AST/ExprCXX.h"
	#include "clang/AST/ExprObjC.h"
	#include "clang/AST/ExprOpenMP.h"
	#include "clang/AST/NSAPI.h"
	#include "clang/AST/OperationKinds.h"
	#include "clang/AST/Stmt.h"
	#include "clang/AST/TemplateBase.h"
	#include "clang/AST/Type.h"
	#include "clang/AST/TypeLoc.h"
	#include "clang/AST/UnresolvedSet.h"
	#include "clang/Analysis/Analyses/FormatString.h"
	#include "clang/Basic/AddressSpaces.h"
	#include "clang/Basic/CharInfo.h"
	#include "clang/Basic/Diagnostic.h"
	#include "clang/Basic/IdentifierTable.h"
	#include "clang/Basic/LLVM.h"
	#include "clang/Basic/LangOptions.h"
	#include "clang/Basic/OpenCLOptions.h"
	#include "clang/Basic/OperatorKinds.h"
	#include "clang/Basic/PartialDiagnostic.h"
	#include "clang/Basic/SourceLocation.h"
	#include "clang/Basic/SourceManager.h"
	#include "clang/Basic/Specifiers.h"
	#include "clang/Basic/SyncScope.h"
	#include "clang/Basic/TargetBuiltins.h"
	#include "clang/Basic/TargetCXXABI.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Basic/TypeTraits.h"
	#include "clang/Lex/Lexer.h" // TODO: Extract static functions to fix layering.
	#include "clang/Sema/Initialization.h"
	#include "clang/Sema/Lookup.h"
	#include "clang/Sema/Ownership.h"
	#include "clang/Sema/Scope.h"
	#include "clang/Sema/ScopeInfo.h"
	#include "clang/Sema/Sema.h"
	#include "clang/Sema/SemaInternal.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/APSInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/FoldingSet.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallBitVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/Support/AtomicOrdering.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/ConvertUTF.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/Format.h"
	#include "llvm/Support/Locale.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <functional>
	#include <limits>
	#include <string>
	#include <tuple>
	#include <utility>

	using namespace clang;
	using namespace sema;

	/// Forward to StringLiteral::getLocationOfByte for byte \p ByteNo of \p SL,
	/// supplying this Sema's source manager, language options and target info.
	SourceLocation Sema::getLocationOfStringLiteralByte(const StringLiteral *SL,
	                                                    unsigned ByteNo) const {
	  const auto &SrcMgr = getSourceManager();
	  const auto &TargetDesc = Context.getTargetInfo();
	  return SL->getLocationOfByte(ByteNo, SrcMgr, LangOpts, TargetDesc);
	}

	/// Checks that a call expression's argument count is the desired number.
	/// This is useful when doing custom type-checking. Returns true on error.
	///
	/// With too few arguments the whole call is highlighted; with too many, the
	/// span of excess arguments is highlighted.
	static bool checkArgCount(Sema &S, CallExpr *call, unsigned desiredArgCount) {
	  unsigned argCount = call->getNumArgs();
	  if (argCount == desiredArgCount) return false;

	  if (argCount < desiredArgCount)
	    return S.Diag(call->getLocEnd(), diag::err_typecheck_call_too_few_args)
	           << 0 /*function call*/ << desiredArgCount << argCount
	           << call->getSourceRange();

	  // Highlight all the excess arguments.
	  SourceRange range(call->getArg(desiredArgCount)->getLocStart(),
	                    call->getArg(argCount - 1)->getLocEnd());

	  // Use the computed range rather than getArg(1): argument 1 does not exist
	  // when a single argument is passed to a call that expects none, and it is
	  // not necessarily one of the excess arguments otherwise.
	  return S.Diag(range.getBegin(), diag::err_typecheck_call_too_many_args)
	         << 0 /*function call*/ << desiredArgCount << argCount
	         << range;
	}

	/// Check that the first argument to __builtin_annotation is an integer
	/// and the second argument is a non-wide string literal.
	///
	/// On success the type of the call is set to the type of the first argument.
	/// \return true if a diagnostic was emitted, false otherwise.
	static bool SemaBuiltinAnnotation(Sema &S, CallExpr *TheCall) {
	  if (checkArgCount(S, TheCall, 2))
	    return true;

	  // First argument should be an integer.
	  Expr *ValArg = TheCall->getArg(0);
	  QualType Ty = ValArg->getType();
	  if (!Ty->isIntegerType()) {
	    S.Diag(ValArg->getLocStart(), diag::err_builtin_annotation_first_arg)
	        << ValArg->getSourceRange();
	    return true;
	  }

	  // Second argument should be a constant string (parens and casts are
	  // looked through before testing for a literal).
	  Expr *StrArg = TheCall->getArg(1)->IgnoreParenCasts();
	  StringLiteral *Literal = dyn_cast<StringLiteral>(StrArg);
	  if (!Literal || !Literal->isAscii()) {
	    S.Diag(StrArg->getLocStart(), diag::err_builtin_annotation_second_arg)
	        << StrArg->getSourceRange();
	    return true;
	  }

	  TheCall->setType(Ty);
	  return false;
	}

	/// Check the arguments of a call to the __annotation builtin: there must be
	/// at least one argument and every argument must be a wide string literal
	/// (after looking through parentheses and casts).
	///
	/// \return true if a diagnostic was emitted, false otherwise.
	static bool SemaBuiltinMSVCAnnotation(Sema &S, CallExpr *TheCall) {
	  // We need at least one argument.
	  if (TheCall->getNumArgs() < 1) {
	    S.Diag(TheCall->getLocEnd(), diag::err_typecheck_call_too_few_args_at_least)
	        << 0 << 1 << TheCall->getNumArgs()
	        << TheCall->getCallee()->getSourceRange();
	    return true;
	  }

	  // All arguments should be wide string literals.
	  for (Expr *Arg : TheCall->arguments()) {
	    auto *Literal = dyn_cast<StringLiteral>(Arg->IgnoreParenCasts());
	    if (!Literal || !Literal->isWide()) {
	      S.Diag(Arg->getLocStart(), diag::err_msvc_annotation_wide_str)
	          << Arg->getSourceRange();
	      return true;
	    }
	  }

	  return false;
	}

	/// Validate a __builtin_addressof call: it takes exactly one argument, which
	/// must be acceptable to Sema::CheckAddressOfOperand. On success the
	/// (possibly rewritten) operand is stored back into the call and the call's
	/// type becomes the pointer type that check produced.
	///
	/// \return true on error, false otherwise.
	static bool SemaBuiltinAddressof(Sema &S, CallExpr *TheCall) {
	  if (checkArgCount(S, TheCall, 1))
	    return true;

	  ExprResult Operand(TheCall->getArg(0));
	  const SourceLocation CallLoc = TheCall->getLocStart();
	  QualType PointerTy = S.CheckAddressOfOperand(Operand, CallLoc);

	  // A null type signals that the operand was rejected.
	  const bool Rejected = PointerTy.isNull();
	  if (!Rejected) {
	    TheCall->setArg(0, Operand.get());
	    TheCall->setType(PointerTy);
	  }
	  return Rejected;
	}

	/// Type-check a three-argument overflow builtin call: the first two
	/// arguments must have integer type and the third must be a pointer to a
	/// non-const integer.
	///
	/// \return true if a diagnostic was emitted, false otherwise.
	static bool SemaBuiltinOverflow(Sema &S, CallExpr *TheCall) {
	  if (checkArgCount(S, TheCall, 3))
	    return true;

	  // Operands 0 and 1 must be integers; stop at the first offender.
	  for (unsigned Idx = 0; Idx != 2; ++Idx) {
	    Expr *Operand = TheCall->getArg(Idx);
	    QualType OperandTy = Operand->getType();
	    if (OperandTy->isIntegerType())
	      continue;
	    S.Diag(Operand->getLocStart(), diag::err_overflow_builtin_must_be_int)
	        << OperandTy << Operand->getSourceRange();
	    return true;
	  }

	  // The result slot must be a pointer to a non-const integer.
	  // IRGen correctly handles volatile, restrict, and address spaces, and
	  // the other qualifiers aren't possible.
	  Expr *ResultArg = TheCall->getArg(2);
	  QualType ResultArgTy = ResultArg->getType();
	  bool IsWritableIntPtr = false;
	  if (const auto *ResultPtrTy = ResultArgTy->getAs<PointerType>()) {
	    QualType Pointee = ResultPtrTy->getPointeeType();
	    IsWritableIntPtr = Pointee->isIntegerType() && !Pointee.isConstQualified();
	  }
	  if (!IsWritableIntPtr) {
	    S.Diag(ResultArg->getLocStart(), diag::err_overflow_builtin_must_be_ptr_int)
	        << ResultArgTy << ResultArg->getSourceRange();
	    return true;
	  }

	  return false;
	}

	/// Warn when a checked memory builtin call is known at compile time to
	/// overflow, i.e. when the argument at \p SizeIdx evaluates to a value
	/// greater (unsigned comparison) than the argument at \p DstSizeIdx.
	///
	/// Nothing is diagnosed if either index is past the end of the argument
	/// list or if either argument cannot be evaluated as an integer.
	///
	/// \param FDecl Declaration of the callee; its identifier is named in the
	///        warning.
	/// \param SizeIdx Index of the argument holding the number of bytes used.
	/// \param DstSizeIdx Index of the argument holding the destination size.
	static void SemaBuiltinMemChkCall(Sema &S, FunctionDecl *FDecl,
	                                  CallExpr *TheCall, unsigned SizeIdx,
	                                  unsigned DstSizeIdx) {
	  if (TheCall->getNumArgs() <= SizeIdx ||
	      TheCall->getNumArgs() <= DstSizeIdx)
	    return;

	  const Expr *SizeArg = TheCall->getArg(SizeIdx);
	  const Expr *DstSizeArg = TheCall->getArg(DstSizeIdx);

	  llvm::APSInt Size, DstSize;

	  // find out if both sizes are known at compile time
	  if (!SizeArg->EvaluateAsInt(Size, S.Context) ||
	      !DstSizeArg->EvaluateAsInt(DstSize, S.Context))
	    return;

	  if (Size.ule(DstSize))
	    return;

	  // confirmed overflow so generate the diagnostic.
	  IdentifierInfo *FnName = FDecl->getIdentifier();
	  SourceLocation SL = TheCall->getLocStart();
	  SourceRange SR = TheCall->getSourceRange();

	  S.Diag(SL, diag::warn_memcpy_chk_overflow) << SR << FnName;
	}

	/// Semantic checks for a two-argument "call with static chain" builtin
	/// (abbreviated "cwsc" in the diagnostic names).
	///
	/// The first argument must be a plain call expression that is not a block
	/// call, not a call to a builtin and not a pseudo-destructor call. The
	/// second argument, after the usual unary conversions, must be a pointer.
	/// On success the builtin call takes over the inner call's type, value kind
	/// and object kind, its callee is cast to a matching function pointer type,
	/// and its second argument is replaced by the converted chain expression.
	///
	/// \return true if a diagnostic was emitted, false otherwise.
	static bool SemaBuiltinCallWithStaticChain(Sema &S, CallExpr *BuiltinCall) {
	  if (checkArgCount(S, BuiltinCall, 2))
	    return true;

	  const SourceLocation Loc = BuiltinCall->getLocStart();
	  Expr *BuiltinCallee = BuiltinCall->getCallee()->IgnoreImpCasts();
	  Expr *FirstArg = BuiltinCall->getArg(0);
	  Expr *ChainArg = BuiltinCall->getArg(1);

	  // The statement class must be exactly CallExprClass.
	  if (FirstArg->getStmtClass() != Stmt::CallExprClass) {
	    S.Diag(Loc, diag::err_first_argument_to_cwsc_not_call)
	        << FirstArg->getSourceRange();
	    return true;
	  }
	  CallExpr *InnerCall = cast<CallExpr>(FirstArg);
	  Expr *InnerCallee = InnerCall->getCallee();

	  if (InnerCallee->getType()->isBlockPointerType()) {
	    S.Diag(Loc, diag::err_first_argument_to_cwsc_block_call)
	        << FirstArg->getSourceRange();
	    return true;
	  }

	  const auto *TargetFn =
	      dyn_cast_or_null<FunctionDecl>(InnerCall->getCalleeDecl());
	  if (TargetFn && TargetFn->getBuiltinID()) {
	    S.Diag(Loc, diag::err_first_argument_to_cwsc_builtin_call)
	        << FirstArg->getSourceRange();
	    return true;
	  }

	  if (isa<CXXPseudoDestructorExpr>(InnerCallee->IgnoreParens())) {
	    S.Diag(Loc, diag::err_first_argument_to_cwsc_pdtor_call)
	        << FirstArg->getSourceRange();
	    return true;
	  }

	  ExprResult ConvertedChain = S.UsualUnaryConversions(ChainArg);
	  if (ConvertedChain.isInvalid())
	    return true;
	  Expr *ChainExpr = ConvertedChain.get();
	  QualType ChainTy = ChainExpr->getType();
	  if (!ChainTy->isPointerType()) {
	    S.Diag(Loc, diag::err_second_argument_to_cwsc_not_pointer)
	        << ChainArg->getSourceRange();
	    return true;
	  }

	  // Build the function pointer type the builtin callee is cast to.
	  QualType ResultTy = InnerCall->getCallReturnType(S.Context);
	  QualType ParamTys[2] = {ResultTy, ChainTy};
	  QualType FnTy = S.Context.getFunctionType(
	      ResultTy, ParamTys, FunctionProtoType::ExtProtoInfo());
	  QualType FnPtrTy = S.Context.getPointerType(FnTy);

	  BuiltinCallee =
	      S.ImpCastExprToType(BuiltinCallee, FnPtrTy, CK_BuiltinFnToFnPtr).get();

	  // Make the builtin call look like the inner call to its users.
	  BuiltinCall->setType(InnerCall->getType());
	  BuiltinCall->setValueKind(InnerCall->getValueKind());
	  BuiltinCall->setObjectKind(InnerCall->getObjectKind());
	  BuiltinCall->setCallee(BuiltinCallee);
	  BuiltinCall->setArg(1, ChainExpr);

	  return false;
	}

	/// Check that a builtin call appears inside a suitable SEH scope.
	///
	/// Starting at the current scope, walks up the parent chain to the nearest
	/// scope for which isSEHExceptScope() holds. If there is none, or that
	/// scope has none of \p NeededScopeFlags set, emits \p DiagID naming the
	/// builtin.
	///
	/// \param SemaRef Semantic analyzer providing the current scope.
	/// \param TheCall The builtin call being checked.
	/// \param NeededScopeFlags Flags of which at least one must be set on the
	///        scope that was found.
	/// \param DiagID Diagnostic to emit on failure.
	/// \return true if a diagnostic was emitted, false otherwise.
	static bool SemaBuiltinSEHScopeCheck(Sema &SemaRef, CallExpr *TheCall,
	                                     Scope::ScopeFlags NeededScopeFlags,
	                                     unsigned DiagID) {
	  // Scopes aren't available during instantiation. Fortunately, builtin
	  // functions cannot be template args so they cannot be formed through template
	  // instantiation. Therefore checking once during the parse is sufficient.
	  if (SemaRef.inTemplateInstantiation())
	    return false;

	  Scope *S = SemaRef.getCurScope();
	  while (S && !S->isSEHExceptScope())
	    S = S->getParent();
	  if (!S || !(S->getFlags() & NeededScopeFlags)) {
	    auto *DRE = cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
	    SemaRef.Diag(TheCall->getExprLoc(), DiagID)
	        << DRE->getDecl()->getIdentifier();
	    return true;
	  }

	  return false;
	}

	/// Returns true if the type of \p Arg is a block pointer type.
	static inline bool isBlockPointer(Expr *Arg) {
	  QualType ArgTy = Arg->getType();
	  return ArgTy->isBlockPointerType();
	}

	/// OpenCL C v2.0, s6.13.17.2 - Checks that the block parameters are all local
	/// void*, which is a requirement of device side enqueue.
	///
	/// Every offending parameter is diagnosed; the loop does not stop at the
	/// first one.
	///
	/// \param BlockArg Expression of block pointer type whose parameters are
	///        inspected.
	/// \return true if at least one parameter is not a 'local void*'.
	static bool checkOpenCLBlockArgs(Sema &S, Expr *BlockArg) {
	  const BlockPointerType *BPT =
	      cast<BlockPointerType>(BlockArg->getType().getCanonicalType());
	  ArrayRef<QualType> Params =
	      BPT->getPointeeType()->getAs<FunctionProtoType>()->getParamTypes();
	  unsigned ArgCounter = 0;
	  bool IllegalParams = false;
	  // Walk all block parameters and diagnose each one that is not a
	  // local void*.
	  for (ArrayRef<QualType>::iterator I = Params.begin(), E = Params.end();
	       I != E; ++I, ++ArgCounter) {
	    if (!(*I)->isPointerType() || !(*I)->getPointeeType()->isVoidType() ||
	        (*I)->getPointeeType().getQualifiers().getAddressSpace() !=
	            LangAS::opencl_local) {
	      // Get the location of the error. If a block literal has been passed
	      // (BlockExpr) then we can point straight to the offending argument,
	      // else we just point to the variable reference.
	      // For any other kind of expression ErrorLoc stays default-constructed.
	      SourceLocation ErrorLoc;
	      if (isa<BlockExpr>(BlockArg)) {
	        BlockDecl *BD = cast<BlockExpr>(BlockArg)->getBlockDecl();
	        ErrorLoc = BD->getParamDecl(ArgCounter)->getLocStart();
	      } else if (isa<DeclRefExpr>(BlockArg)) {
	        ErrorLoc = cast<DeclRefExpr>(BlockArg)->getLocStart();
	      }
	      S.Diag(ErrorLoc,
	             diag::err_opencl_enqueue_kernel_blocks_non_local_void_args);
	      IllegalParams = true;
	    }
	  }

	  return IllegalParams;
	}

	/// Diagnose a call whose callee needs the cl_khr_subgroups OpenCL extension
	/// when that extension is not enabled.
	///
	/// \return true if a diagnostic was emitted, false if the extension is on.
	static bool checkOpenCLSubgroupExt(Sema &S, CallExpr *Call) {
	  const bool SubgroupsEnabled =
	      S.getOpenCLOptions().isEnabled("cl_khr_subgroups");
	  if (SubgroupsEnabled)
	    return false;

	  S.Diag(Call->getLocStart(), diag::err_opencl_requires_extension)
	      << 1 << Call->getDirectCallee() << "cl_khr_subgroups";
	  return true;
	}

	/// Check a two-argument OpenCL builtin call of the form (ndrange_t, block).
	///
	/// Requires exactly two arguments and the cl_khr_subgroups extension. The
	/// first argument's unqualified type must print as "ndrange_t", and the
	/// second must be a block pointer whose parameters pass
	/// checkOpenCLBlockArgs.
	///
	/// \return true if a diagnostic was emitted, false otherwise.
	static bool SemaOpenCLBuiltinNDRangeAndBlock(Sema &S, CallExpr *TheCall) {
	  if (checkArgCount(S, TheCall, 2) || checkOpenCLSubgroupExt(S, TheCall))
	    return true;

	  // Argument 0: the ndrange_t value.
	  Expr *RangeArg = TheCall->getArg(0);
	  const bool IsNDRange =
	      RangeArg->getType().getUnqualifiedType().getAsString() == "ndrange_t";
	  if (!IsNDRange) {
	    S.Diag(RangeArg->getLocStart(), diag::err_opencl_builtin_expected_type)
	        << TheCall->getDirectCallee() << "'ndrange_t'";
	    return true;
	  }

	  // Argument 1: the block.
	  Expr *Block = TheCall->getArg(1);
	  if (isBlockPointer(Block))
	    return checkOpenCLBlockArgs(S, Block);

	  S.Diag(Block->getLocStart(), diag::err_opencl_builtin_expected_type)
	      << TheCall->getDirectCallee() << "block";
	  return true;
	}

	/// OpenCL C v2.0, s6.13.17.6 - Check the single argument of the
	/// get_kernel_work_group_size and
	/// get_kernel_preferred_work_group_size_multiple builtin functions: it must
	/// be a block pointer whose parameters pass checkOpenCLBlockArgs.
	///
	/// \return true if a diagnostic was emitted, false otherwise.
	static bool SemaOpenCLBuiltinKernelWorkGroupSize(Sema &S, CallExpr *TheCall) {
	  if (checkArgCount(S, TheCall, 1))
	    return true;

	  Expr *Block = TheCall->getArg(0);
	  if (isBlockPointer(Block))
	    return checkOpenCLBlockArgs(S, Block);

	  S.Diag(Block->getLocStart(), diag::err_opencl_builtin_expected_type)
	      << TheCall->getDirectCallee() << "block";
	  return true;
	}

	/// Diagnose integer type and any valid implicit conversion to it.
	static bool checkOpenCLEnqueueIntType(Sema &S, Expr *E,
	const QualType &IntType);

	/// Run checkOpenCLEnqueueIntType against the size type on every argument of
	/// \p TheCall with index in the inclusive range [\p Start, \p End].
	///
	/// All arguments in the range are checked even after one fails.
	///
	/// \return true if any of the checks reported an error.
	static bool checkOpenCLEnqueueLocalSizeArgs(Sema &S, CallExpr *TheCall,
	                                            unsigned Start, unsigned End) {
	  bool IllegalParams = false;
	  for (unsigned I = Start; I <= End; ++I)
	    IllegalParams |= checkOpenCLEnqueueIntType(S, TheCall->getArg(I),
	                                               S.Context.getSizeType());
	  return IllegalParams;
	}

	/// OpenCL v2.0, s6.13.17.1 - Check that a size is supplied for every
	/// 'local void*' parameter of the passed block.
	///
	/// \param BlockArg The block argument; its prototype supplies the number of
	///        block parameters.
	/// \param NumNonVarArgs Number of leading, non-variadic call arguments.
	/// \return true if a diagnostic was emitted, false otherwise.
	static bool checkOpenCLEnqueueVariadicArgs(Sema &S, CallExpr *TheCall,
	                                           Expr *BlockArg,
	                                           unsigned NumNonVarArgs) {
	  const auto *BlockTy =
	      cast<BlockPointerType>(BlockArg->getType().getCanonicalType());
	  const auto *Proto = BlockTy->getPointeeType()->getAs<FunctionProtoType>();
	  const unsigned NumBlockParams = Proto->getNumParams();
	  const unsigned NumCallArgs = TheCall->getNumArgs();

	  // For each argument passed to the block, a corresponding uint needs to
	  // be passed to describe the size of the local memory. When the counts
	  // line up, check that those trailing sizes are specified by integers.
	  if (NumCallArgs == NumBlockParams + NumNonVarArgs)
	    return checkOpenCLEnqueueLocalSizeArgs(S, TheCall, NumNonVarArgs,
	                                           NumCallArgs - 1);

	  S.Diag(TheCall->getLocStart(),
	         diag::err_opencl_enqueue_kernel_local_size_args);
	  return true;
	}

	/// OpenCL C v2.0, s6.13.17 - Enqueue kernel function contains four different
	/// overload formats specified in Table 6.13.17.1.
	/// \code
	/// int enqueue_kernel(queue_t queue,
	///                    kernel_enqueue_flags_t flags,
	///                    const ndrange_t ndrange,
	///                    void (^block)(void))
	/// int enqueue_kernel(queue_t queue,
	///                    kernel_enqueue_flags_t flags,
	///                    const ndrange_t ndrange,
	///                    uint num_events_in_wait_list,
	///                    clk_event_t *event_wait_list,
	///                    clk_event_t *event_ret,
	///                    void (^block)(void))
	/// int enqueue_kernel(queue_t queue,
	///                    kernel_enqueue_flags_t flags,
	///                    const ndrange_t ndrange,
	///                    void (^block)(local void*, ...),
	///                    uint size0, ...)
	/// int enqueue_kernel(queue_t queue,
	///                    kernel_enqueue_flags_t flags,
	///                    const ndrange_t ndrange,
	///                    uint num_events_in_wait_list,
	///                    clk_event_t *event_wait_list,
	///                    clk_event_t *event_ret,
	///                    void (^block)(local void*, ...),
	///                    uint size0, ...)
	/// \endcode
	///
	/// \return true if a diagnostic was emitted, false otherwise.
	static bool SemaOpenCLBuiltinEnqueueKernel(Sema &S, CallExpr *TheCall) {
	  unsigned NumArgs = TheCall->getNumArgs();

	  if (NumArgs < 4) {
	    // Every overload takes at least four arguments. Pass the select index,
	    // the minimum count and the actual count to the diagnostic.
	    S.Diag(TheCall->getLocStart(),
	           diag::err_typecheck_call_too_few_args_at_least)
	        << 0 << 4 << NumArgs;
	    return true;
	  }

	  Expr *Arg0 = TheCall->getArg(0);
	  Expr *Arg1 = TheCall->getArg(1);
	  Expr *Arg2 = TheCall->getArg(2);
	  Expr *Arg3 = TheCall->getArg(3);

	  // First argument always needs to be a queue_t type.
	  if (!Arg0->getType()->isQueueT()) {
	    S.Diag(TheCall->getArg(0)->getLocStart(),
	           diag::err_opencl_builtin_expected_type)
	        << TheCall->getDirectCallee() << S.Context.OCLQueueTy;
	    return true;
	  }

	  // Second argument always needs to be a kernel_enqueue_flags_t enum value.
	  if (!Arg1->getType()->isIntegerType()) {
	    S.Diag(TheCall->getArg(1)->getLocStart(),
	           diag::err_opencl_builtin_expected_type)
	        << TheCall->getDirectCallee() << "'kernel_enqueue_flags_t' (i.e. uint)";
	    return true;
	  }

	  // Third argument is always an ndrange_t type.
	  if (Arg2->getType().getUnqualifiedType().getAsString() != "ndrange_t") {
	    S.Diag(TheCall->getArg(2)->getLocStart(),
	           diag::err_opencl_builtin_expected_type)
	        << TheCall->getDirectCallee() << "'ndrange_t'";
	    return true;
	  }

	  // With four arguments, there is only one form that the function could be
	  // called in: no events and no variable arguments.
	  if (NumArgs == 4) {
	    // check that the last argument is the right block type.
	    if (!isBlockPointer(Arg3)) {
	      S.Diag(Arg3->getLocStart(), diag::err_opencl_builtin_expected_type)
	          << TheCall->getDirectCallee() << "block";
	      return true;
	    }
	    // we have a block type, check the prototype
	    const BlockPointerType *BPT =
	        cast<BlockPointerType>(Arg3->getType().getCanonicalType());
	    if (BPT->getPointeeType()->getAs<FunctionProtoType>()->getNumParams() > 0) {
	      S.Diag(Arg3->getLocStart(),
	             diag::err_opencl_enqueue_kernel_blocks_no_args);
	      return true;
	    }
	    return false;
	  }
	  // we can have block + varargs.
	  if (isBlockPointer(Arg3))
	    return (checkOpenCLBlockArgs(S, Arg3) ||
	            checkOpenCLEnqueueVariadicArgs(S, TheCall, Arg3, 4));
	  // last two cases with either exactly 7 args or 7 args and varargs.
	  if (NumArgs >= 7) {
	    // check common block argument.
	    Expr *Arg6 = TheCall->getArg(6);
	    if (!isBlockPointer(Arg6)) {
	      S.Diag(Arg6->getLocStart(), diag::err_opencl_builtin_expected_type)
	          << TheCall->getDirectCallee() << "block";
	      return true;
	    }
	    if (checkOpenCLBlockArgs(S, Arg6))
	      return true;

	    // Fourth argument has to be any integer type.
	    if (!Arg3->getType()->isIntegerType()) {
	      S.Diag(TheCall->getArg(3)->getLocStart(),
	             diag::err_opencl_builtin_expected_type)
	          << TheCall->getDirectCallee() << "integer";
	      return true;
	    }
	    // check remaining common arguments.
	    Expr *Arg4 = TheCall->getArg(4);
	    Expr *Arg5 = TheCall->getArg(5);

	    // Fifth argument is always passed as a pointer to clk_event_t.
	    if (!Arg4->isNullPointerConstant(S.Context,
	                                     Expr::NPC_ValueDependentIsNotNull) &&
	        !Arg4->getType()->getPointeeOrArrayElementType()->isClkEventT()) {
	      S.Diag(TheCall->getArg(4)->getLocStart(),
	             diag::err_opencl_builtin_expected_type)
	          << TheCall->getDirectCallee()
	          << S.Context.getPointerType(S.Context.OCLClkEventTy);
	      return true;
	    }

	    // Sixth argument is always passed as a pointer to clk_event_t.
	    if (!Arg5->isNullPointerConstant(S.Context,
	                                     Expr::NPC_ValueDependentIsNotNull) &&
	        !(Arg5->getType()->isPointerType() &&
	          Arg5->getType()->getPointeeType()->isClkEventT())) {
	      S.Diag(TheCall->getArg(5)->getLocStart(),
	             diag::err_opencl_builtin_expected_type)
	          << TheCall->getDirectCallee()
	          << S.Context.getPointerType(S.Context.OCLClkEventTy);
	      return true;
	    }

	    if (NumArgs == 7)
	      return false;

	    return checkOpenCLEnqueueVariadicArgs(S, TheCall, Arg6, 7);
	  }

	  // None of the specific case has been detected, give generic error
	  S.Diag(TheCall->getLocStart(),
	         diag::err_opencl_enqueue_kernel_incorrect_args);
	  return true;
	}

	/// Returns the OpenCL access qualifier attribute of declaration \p D, as
	/// produced by Decl::getAttr<OpenCLAccessAttr>(). Callers test the result
	/// for null before use.
	static OpenCLAccessAttr *getOpenCLArgAccess(const Decl *D) {
	  return D->getAttr<OpenCLAccessAttr>();
	}

	/// Checks the first argument of a pipe builtin call: it must have pipe type
	/// and its access qualifier must be compatible with the builtin being
	/// called (read builtins accept a missing or read_only qualifier, write
	/// builtins require write_only).
	///
	/// \return true if a diagnostic was emitted, false otherwise.
	static bool checkOpenCLPipeArg(Sema &S, CallExpr *Call) {
	  const Expr *Arg0 = Call->getArg(0);
	  // First argument type should always be pipe.
	  if (!Arg0->getType()->isPipeType()) {
	    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_first_arg)
	        << Call->getDirectCallee() << Arg0->getSourceRange();
	    return true;
	  }
	  // NOTE(review): cast<> assumes Arg0 is a DeclRefExpr; confirm that no
	  // other expression form can reach this point.
	  OpenCLAccessAttr *AccessQual =
	      getOpenCLArgAccess(cast<DeclRefExpr>(Arg0)->getDecl());
	  // Validates the access qualifier is compatible with the call.
	  // OpenCL v2.0 s6.13.16 - The access qualifiers for pipe should only be
	  // read_only and write_only, and assumed to be read_only if no qualifier is
	  // specified.
	  switch (Call->getDirectCallee()->getBuiltinID()) {
	  case Builtin::BIread_pipe:
	  case Builtin::BIreserve_read_pipe:
	  case Builtin::BIcommit_read_pipe:
	  case Builtin::BIwork_group_reserve_read_pipe:
	  case Builtin::BIsub_group_reserve_read_pipe:
	  case Builtin::BIwork_group_commit_read_pipe:
	  case Builtin::BIsub_group_commit_read_pipe:
	    if (!(!AccessQual || AccessQual->isReadOnly())) {
	      S.Diag(Arg0->getLocStart(),
	             diag::err_opencl_builtin_pipe_invalid_access_modifier)
	          << "read_only" << Arg0->getSourceRange();
	      return true;
	    }
	    break;
	  case Builtin::BIwrite_pipe:
	  case Builtin::BIreserve_write_pipe:
	  case Builtin::BIcommit_write_pipe:
	  case Builtin::BIwork_group_reserve_write_pipe:
	  case Builtin::BIsub_group_reserve_write_pipe:
	  case Builtin::BIwork_group_commit_write_pipe:
	  case Builtin::BIsub_group_commit_write_pipe:
	    if (!(AccessQual && AccessQual->isWriteOnly())) {
	      S.Diag(Arg0->getLocStart(),
	             diag::err_opencl_builtin_pipe_invalid_access_modifier)
	          << "write_only" << Arg0->getSourceRange();
	      return true;
	    }
	    break;
	  default:
	    break;
	  }
	  return false;
	}

	/// Returns true (after emitting a diagnostic) if argument \p Idx of \p Call
	/// is not a pointer, or if its pointee type differs from the element type
	/// of the pipe passed as argument 0.
	///
	/// The first argument is expected to already have pipe type; the cast<>
	/// below relies on it.
	static bool checkOpenCLPipePacketType(Sema &S, CallExpr *Call, unsigned Idx) {
	  const Expr *Arg0 = Call->getArg(0);
	  const Expr *ArgIdx = Call->getArg(Idx);
	  const PipeType *PipeTy = cast<PipeType>(Arg0->getType());
	  const QualType EltTy = PipeTy->getElementType();
	  const PointerType *ArgTy = ArgIdx->getType()->getAs<PointerType>();
	  // The Idx argument should be a pointer and the type of the pointer and
	  // the type of pipe element should also be the same.
	  if (!ArgTy ||
	      !S.Context.hasSameType(
	          EltTy, ArgTy->getPointeeType()->getCanonicalTypeInternal())) {
	    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_invalid_arg)
	        << Call->getDirectCallee() << S.Context.getPointerType(EltTy)
	        << ArgIdx->getType() << ArgIdx->getSourceRange();
	    return true;
	  }
	  return false;
	}

	/// Performs semantic analysis for the read/write_pipe call.
	///
	/// OpenCL v2.0 s6.13.16.2 - The built-in read/write functions have two
	/// forms:
	///   read/write_pipe(pipe T, T*)
	///   read/write_pipe(pipe T, reserve_id_t, uint, T*)
	///
	/// \param S Reference to the semantic analyzer.
	/// \param Call A pointer to the builtin call.
	/// \return True if a semantic error has been found, false otherwise.
	static bool SemaBuiltinRWPipe(Sema &S, CallExpr *Call) {
	  const unsigned NumArgs = Call->getNumArgs();

	  // Any arity other than 2 or 4 is rejected before the arguments are
	  // examined.
	  if (NumArgs != 2 && NumArgs != 4) {
	    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_arg_num)
	        << Call->getDirectCallee() << Call->getSourceRange();
	    return true;
	  }

	  // Both forms start with the pipe argument.
	  if (checkOpenCLPipeArg(S, Call))
	    return true;

	  // Two-argument form: only the packet type T is left to check.
	  if (NumArgs == 2)
	    return checkOpenCLPipePacketType(S, Call, 1);

	  // Four-argument form. Check reserve_id_t.
	  const Expr *ReserveArg = Call->getArg(1);
	  if (!ReserveArg->getType()->isReserveIDT()) {
	    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_invalid_arg)
	        << Call->getDirectCallee() << S.Context.OCLReserveIDTy
	        << ReserveArg->getType() << ReserveArg->getSourceRange();
	    return true;
	  }

	  // Check the index.
	  const Expr *IndexArg = Call->getArg(2);
	  QualType IndexTy = IndexArg->getType();
	  if (!(IndexTy->isIntegerType() || IndexTy->isUnsignedIntegerType())) {
	    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_invalid_arg)
	        << Call->getDirectCallee() << S.Context.UnsignedIntTy
	        << IndexTy << IndexArg->getSourceRange();
	    return true;
	  }

	  // Check packet type T.
	  return checkOpenCLPipePacketType(S, Call, 3);
	}

	/// Performs a semantic analysis on the
	/// {work_group_/sub_group_/_}reserve_{read/write}_pipe builtins.
	///
	/// The call must have exactly two arguments: a pipe accepted by
	/// checkOpenCLPipeArg and an integer reserve size. On success the type of
	/// the call is overridden to reserve_id_t.
	///
	/// \param S Reference to the semantic analyzer.
	/// \param Call The call to the builtin function to be analyzed.
	/// \return True if a semantic error was found, false otherwise.
	static bool SemaBuiltinReserveRWPipe(Sema &S, CallExpr *Call) {
	  if (checkArgCount(S, Call, 2) || checkOpenCLPipeArg(S, Call))
	    return true;

	  // Check the reserve size.
	  const Expr *SizeArg = Call->getArg(1);
	  QualType SizeTy = SizeArg->getType();
	  if (!(SizeTy->isIntegerType() || SizeTy->isUnsignedIntegerType())) {
	    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_invalid_arg)
	        << Call->getDirectCallee() << S.Context.UnsignedIntTy
	        << SizeTy << SizeArg->getSourceRange();
	    return true;
	  }

	  // The builtin def file cannot name reserve_id_t, so these builtins are
	  // declared as returning int and the real return type is patched in here.
	  Call->setType(S.Context.OCLReserveIDTy);
	  return false;
	}

	/// Performs a semantic analysis on the
	/// {work_group_/sub_group_/_}commit_{read/write}_pipe builtins.
	///
	/// The call must have exactly two arguments: a pipe accepted by
	/// checkOpenCLPipeArg and a value of type reserve_id_t.
	///
	/// \param S Reference to the semantic analyzer.
	/// \param Call The call to the builtin function to be analyzed.
	/// \return True if a semantic error was found, false otherwise.
	static bool SemaBuiltinCommitRWPipe(Sema &S, CallExpr *Call) {
	  if (checkArgCount(S, Call, 2) || checkOpenCLPipeArg(S, Call))
	    return true;

	  // Check reserve_id_t.
	  const Expr *ReserveArg = Call->getArg(1);
	  QualType ReserveTy = ReserveArg->getType();
	  if (ReserveTy->isReserveIDT())
	    return false;

	  S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_invalid_arg)
	      << Call->getDirectCallee() << S.Context.OCLReserveIDTy
	      << ReserveTy << ReserveArg->getSourceRange();
	  return true;
	}

	/// Performs a semantic analysis on the call to built-in Pipe Query
	/// Functions: exactly one argument, which must have pipe type.
	///
	/// \param S Reference to the semantic analyzer.
	/// \param Call The call to the builtin function to be analyzed.
	/// \return True if a semantic error was found, false otherwise.
	static bool SemaBuiltinPipePackets(Sema &S, CallExpr *Call) {
	  if (checkArgCount(S, Call, 1))
	    return true;

	  const Expr *PipeArg = Call->getArg(0);
	  if (PipeArg->getType()->isPipeType())
	    return false;

	  S.Diag(Call->getLocStart(), diag::err_opencl_builtin_pipe_first_arg)
	      << Call->getDirectCallee() << PipeArg->getSourceRange();
	  return true;
	}

	/// OpenCL v2.0 s6.13.9 - Address space qualifier functions.
	/// Performs semantic analysis for the to_global/local/private call.
	///
	/// The call must have exactly one argument, which must be a pointer whose
	/// pointee is not in the constant address space. On success the type of
	/// the call becomes a pointer to the same pointee type with its address
	/// space replaced by the one matching \p BuiltinID.
	///
	/// \param S Reference to the semantic analyzer.
	/// \param BuiltinID ID of the builtin function.
	/// \param Call A pointer to the builtin call.
	/// \return True if a semantic error has been found, false otherwise.
	static bool SemaOpenCLBuiltinToAddr(Sema &S, unsigned BuiltinID,
	                                    CallExpr *Call) {
	  if (Call->getNumArgs() != 1) {
	    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_to_addr_arg_num)
	        << Call->getDirectCallee() << Call->getSourceRange();
	    return true;
	  }

	  auto RT = Call->getArg(0)->getType();
	  if (!RT->isPointerType() ||
	      RT->getPointeeType().getAddressSpace() == LangAS::opencl_constant) {
	    S.Diag(Call->getLocStart(), diag::err_opencl_builtin_to_addr_invalid_arg)
	        << Call->getArg(0) << Call->getDirectCallee() << Call->getSourceRange();
	    return true;
	  }

	  // Rebuild the pointee type with the address space the builtin converts to.
	  RT = RT->getPointeeType();
	  auto Qual = RT.getQualifiers();
	  switch (BuiltinID) {
	  case Builtin::BIto_global:
	    Qual.setAddressSpace(LangAS::opencl_global);
	    break;
	  case Builtin::BIto_local:
	    Qual.setAddressSpace(LangAS::opencl_local);
	    break;
	  case Builtin::BIto_private:
	    Qual.setAddressSpace(LangAS::opencl_private);
	    break;
	  default:
	    llvm_unreachable("Invalid builtin function");
	  }
	  Call->setType(S.Context.getPointerType(S.Context.getQualifiedType(
	      RT.getUnqualifiedType(), Qual)));

	  return false;
	}

	ExprResult
	Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
	CallExpr *TheCall) {
	ExprResult TheCallResult(TheCall);

	// Find out if any arguments are required to be integer constant expressions.
	unsigned ICEArguments = 0;
	ASTContext::GetBuiltinTypeError Error;
	Context.GetBuiltinType(BuiltinID, Error, &ICEArguments);
	if (Error != ASTContext::GE_None)
	ICEArguments = 0; // Don't diagnose previously diagnosed errors.

	// If any arguments are required to be ICE's, check and diagnose.
	for (unsigned ArgNo = 0; ICEArguments != 0; ++ArgNo) {
	// Skip arguments not required to be ICE's.
	if ((ICEArguments & (1 << ArgNo)) == 0) continue;

	llvm::APSInt Result;
	if (SemaBuiltinConstantArg(TheCall, ArgNo, Result))
	return true;
	ICEArguments &= ~(1 << ArgNo);
	}

	switch (BuiltinID) {
	case Builtin::BI__builtin___CFStringMakeConstantString:
	assert(TheCall->getNumArgs() == 1 &&
	"Wrong # arguments to builtin CFStringMakeConstantString");
	if (CheckObjCString(TheCall->getArg(0)))
	return ExprError();
	break;
	case Builtin::BI__builtin_ms_va_start:
	case Builtin::BI__builtin_stdarg_start:
	case Builtin::BI__builtin_va_start:
	if (SemaBuiltinVAStart(BuiltinID, TheCall))
	return ExprError();
	break;
	case Builtin::BI__va_start: {
	switch (Context.getTargetInfo().getTriple().getArch()) {
	case llvm::Triple::arm:
	case llvm::Triple::thumb:
	if (SemaBuiltinVAStartARMMicrosoft(TheCall))
	return ExprError();
	break;
	default:
	if (SemaBuiltinVAStart(BuiltinID, TheCall))
	return ExprError();
	break;
	}
	break;
	}
	case Builtin::BI__builtin_isgreater:
	case Builtin::BI__builtin_isgreaterequal:
	case Builtin::BI__builtin_isless:
	case Builtin::BI__builtin_islessequal:
	case Builtin::BI__builtin_islessgreater:
	case Builtin::BI__builtin_isunordered:
	if (SemaBuiltinUnorderedCompare(TheCall))
	return ExprError();
	break;
	case Builtin::BI__builtin_fpclassify:
	if (SemaBuiltinFPClassification(TheCall, 6))
	return ExprError();
	break;
	case Builtin::BI__builtin_isfinite:
	case Builtin::BI__builtin_isinf:
	case Builtin::BI__builtin_isinf_sign:
	case Builtin::BI__builtin_isnan:
	case Builtin::BI__builtin_isnormal:
	if (SemaBuiltinFPClassification(TheCall, 1))
	return ExprError();
	break;
	case Builtin::BI__builtin_shufflevector:
	return SemaBuiltinShuffleVector(TheCall);
	// TheCall will be freed by the smart pointer here, but that's fine, since
	// SemaBuiltinShuffleVector guts it, but then doesn't release it.
	case Builtin::BI__builtin_prefetch:
	if (SemaBuiltinPrefetch(TheCall))
	return ExprError();
	break;
	case Builtin::BI__builtin_alloca_with_align:
	if (SemaBuiltinAllocaWithAlign(TheCall))
	return ExprError();
	break;
	case Builtin::BI__assume:
	case Builtin::BI__builtin_assume:
	if (SemaBuiltinAssume(TheCall))
	return ExprError();
	break;
	case Builtin::BI__builtin_assume_aligned:
	if (SemaBuiltinAssumeAligned(TheCall))
	return ExprError();
	break;
	case Builtin::BI__builtin_object_size:
	if (SemaBuiltinConstantArgRange(TheCall, 1, 0, 3))
	return ExprError();
	break;
	case Builtin::BI__builtin_longjmp:
	if (SemaBuiltinLongjmp(TheCall))
	return ExprError();
	break;
	case Builtin::BI__builtin_setjmp:
	if (SemaBuiltinSetjmp(TheCall))
	return ExprError();
	break;
	case Builtin::BI_setjmp:
	case Builtin::BI_setjmpex:
	if (checkArgCount(*this, TheCall, 1))
	return true;
	break;
	case Builtin::BI__builtin_classify_type:
	if (checkArgCount(*this, TheCall, 1)) return true;
	TheCall->setType(Context.IntTy);
	break;
	case Builtin::BI__builtin_constant_p:
	if (checkArgCount(*this, TheCall, 1)) return true;
	TheCall->setType(Context.IntTy);
	break;
	case Builtin::BI__sync_fetch_and_add:
	case Builtin::BI__sync_fetch_and_add_1:
	case Builtin::BI__sync_fetch_and_add_2:
	case Builtin::BI__sync_fetch_and_add_4:
	case Builtin::BI__sync_fetch_and_add_8:
	case Builtin::BI__sync_fetch_and_add_16:
	case Builtin::BI__sync_fetch_and_sub:
	case Builtin::BI__sync_fetch_and_sub_1:
	case Builtin::BI__sync_fetch_and_sub_2:
	case Builtin::BI__sync_fetch_and_sub_4:
	case Builtin::BI__sync_fetch_and_sub_8:
	case Builtin::BI__sync_fetch_and_sub_16:
	case Builtin::BI__sync_fetch_and_or:
	case Builtin::BI__sync_fetch_and_or_1:
	case Builtin::BI__sync_fetch_and_or_2:
	case Builtin::BI__sync_fetch_and_or_4:
	case Builtin::BI__sync_fetch_and_or_8:
	case Builtin::BI__sync_fetch_and_or_16:
	case Builtin::BI__sync_fetch_and_and:
	case Builtin::BI__sync_fetch_and_and_1:
	case Builtin::BI__sync_fetch_and_and_2:
	case Builtin::BI__sync_fetch_and_and_4:
	case Builtin::BI__sync_fetch_and_and_8:
	case Builtin::BI__sync_fetch_and_and_16:
	case Builtin::BI__sync_fetch_and_xor:
	case Builtin::BI__sync_fetch_and_xor_1:
	case Builtin::BI__sync_fetch_and_xor_2:
	case Builtin::BI__sync_fetch_and_xor_4:
	case Builtin::BI__sync_fetch_and_xor_8:
	case Builtin::BI__sync_fetch_and_xor_16:
	case Builtin::BI__sync_fetch_and_nand:
	case Builtin::BI__sync_fetch_and_nand_1:
	case Builtin::BI__sync_fetch_and_nand_2:
	case Builtin::BI__sync_fetch_and_nand_4:
	case Builtin::BI__sync_fetch_and_nand_8:
	case Builtin::BI__sync_fetch_and_nand_16:
	case Builtin::BI__sync_add_and_fetch:
	case Builtin::BI__sync_add_and_fetch_1:
	case Builtin::BI__sync_add_and_fetch_2:
	case Builtin::BI__sync_add_and_fetch_4:
	case Builtin::BI__sync_add_and_fetch_8:
	case Builtin::BI__sync_add_and_fetch_16:
	case Builtin::BI__sync_sub_and_fetch:
	case Builtin::BI__sync_sub_and_fetch_1:
	case Builtin::BI__sync_sub_and_fetch_2:
	case Builtin::BI__sync_sub_and_fetch_4:
	case Builtin::BI__sync_sub_and_fetch_8:
	case Builtin::BI__sync_sub_and_fetch_16:
	case Builtin::BI__sync_and_and_fetch:
	case Builtin::BI__sync_and_and_fetch_1:
	case Builtin::BI__sync_and_and_fetch_2:
	case Builtin::BI__sync_and_and_fetch_4:
	case Builtin::BI__sync_and_and_fetch_8:
	case Builtin::BI__sync_and_and_fetch_16:
	case Builtin::BI__sync_or_and_fetch:
	case Builtin::BI__sync_or_and_fetch_1:
	case Builtin::BI__sync_or_and_fetch_2:
	case Builtin::BI__sync_or_and_fetch_4:
	case Builtin::BI__sync_or_and_fetch_8:
	case Builtin::BI__sync_or_and_fetch_16:
	case Builtin::BI__sync_xor_and_fetch:
	case Builtin::BI__sync_xor_and_fetch_1:
	case Builtin::BI__sync_xor_and_fetch_2:
	case Builtin::BI__sync_xor_and_fetch_4:
	case Builtin::BI__sync_xor_and_fetch_8:
	case Builtin::BI__sync_xor_and_fetch_16:
	case Builtin::BI__sync_nand_and_fetch:
	case Builtin::BI__sync_nand_and_fetch_1:
	case Builtin::BI__sync_nand_and_fetch_2:
	case Builtin::BI__sync_nand_and_fetch_4:
	case Builtin::BI__sync_nand_and_fetch_8:
	case Builtin::BI__sync_nand_and_fetch_16:
	case Builtin::BI__sync_val_compare_and_swap:
	case Builtin::BI__sync_val_compare_and_swap_1:
	case Builtin::BI__sync_val_compare_and_swap_2:
	case Builtin::BI__sync_val_compare_and_swap_4:
	case Builtin::BI__sync_val_compare_and_swap_8:
	case Builtin::BI__sync_val_compare_and_swap_16:
	case Builtin::BI__sync_bool_compare_and_swap:
	case Builtin::BI__sync_bool_compare_and_swap_1:
	case Builtin::BI__sync_bool_compare_and_swap_2:
	case Builtin::BI__sync_bool_compare_and_swap_4:
	case Builtin::BI__sync_bool_compare_and_swap_8:
	case Builtin::BI__sync_bool_compare_and_swap_16:
	case Builtin::BI__sync_lock_test_and_set:
	case Builtin::BI__sync_lock_test_and_set_1:
	case Builtin::BI__sync_lock_test_and_set_2:
	case Builtin::BI__sync_lock_test_and_set_4:
	case Builtin::BI__sync_lock_test_and_set_8:
	case Builtin::BI__sync_lock_test_and_set_16:
	case Builtin::BI__sync_lock_release:
	case Builtin::BI__sync_lock_release_1:
	case Builtin::BI__sync_lock_release_2:
	case Builtin::BI__sync_lock_release_4:
	case Builtin::BI__sync_lock_release_8:
	case Builtin::BI__sync_lock_release_16:
	case Builtin::BI__sync_swap:
	case Builtin::BI__sync_swap_1:
	case Builtin::BI__sync_swap_2:
	case Builtin::BI__sync_swap_4:
	case Builtin::BI__sync_swap_8:
	case Builtin::BI__sync_swap_16:
	return SemaBuiltinAtomicOverloaded(TheCallResult);
	case Builtin::BI__builtin_nontemporal_load:
	case Builtin::BI__builtin_nontemporal_store:
	return SemaBuiltinNontemporalOverloaded(TheCallResult);
	#define BUILTIN(ID, TYPE, ATTRS)
	#define ATOMIC_BUILTIN(ID, TYPE, ATTRS) \
	case Builtin::BI##ID: \
	return SemaAtomicOpsOverloaded(TheCallResult, AtomicExpr::AO##ID);
	#include "clang/Basic/Builtins.def"
	case Builtin::BI__annotation:
	if (SemaBuiltinMSVCAnnotation(*this, TheCall))
	return ExprError();
	break;
	case Builtin::BI__builtin_annotation:
	if (SemaBuiltinAnnotation(*this, TheCall))
	return ExprError();
	break;
	case Builtin::BI__builtin_addressof:
	if (SemaBuiltinAddressof(*this, TheCall))
	return ExprError();
	break;
	case Builtin::BI__builtin_add_overflow:
	case Builtin::BI__builtin_sub_overflow:
	case Builtin::BI__builtin_mul_overflow:
	if (SemaBuiltinOverflow(*this, TheCall))
	return ExprError();
	break;
	case Builtin::BI__builtin_operator_new:
	case Builtin::BI__builtin_operator_delete:
	if (!getLangOpts().CPlusPlus) {
	Diag(TheCall->getExprLoc(), diag::err_builtin_requires_language)
	<< (BuiltinID == Builtin::BI__builtin_operator_new
	? "__builtin_operator_new"
	: "__builtin_operator_delete")
	<< "C++";
	return ExprError();
	}
	// CodeGen assumes it can find the global new and delete to call,
	// so ensure that they are declared.
	DeclareGlobalNewDelete();
	break;

	// check secure string manipulation functions where overflows
	// are detectable at compile time
	case Builtin::BI__builtin___memcpy_chk:
	case Builtin::BI__builtin___memmove_chk:
	case Builtin::BI__builtin___memset_chk:
	case Builtin::BI__builtin___strlcat_chk:
	case Builtin::BI__builtin___strlcpy_chk:
	case Builtin::BI__builtin___strncat_chk:
	case Builtin::BI__builtin___strncpy_chk:
	case Builtin::BI__builtin___stpncpy_chk:
	SemaBuiltinMemChkCall(*this, FDecl, TheCall, 2, 3);
	break;
	case Builtin::BI__builtin___memccpy_chk:
	SemaBuiltinMemChkCall(*this, FDecl, TheCall, 3, 4);
	break;
	case Builtin::BI__builtin___snprintf_chk:
	case Builtin::BI__builtin___vsnprintf_chk:
	SemaBuiltinMemChkCall(*this, FDecl, TheCall, 1, 3);
	break;
	case Builtin::BI__builtin_call_with_static_chain:
	if (SemaBuiltinCallWithStaticChain(*this, TheCall))
	return ExprError();
	break;
	case Builtin::BI__exception_code:
	case Builtin::BI_exception_code:
	if (SemaBuiltinSEHScopeCheck(*this, TheCall, Scope::SEHExceptScope,
	diag::err_seh___except_block))
	return ExprError();
	break;
	case Builtin::BI__exception_info:
	case Builtin::BI_exception_info:
	if (SemaBuiltinSEHScopeCheck(*this, TheCall, Scope::SEHFilterScope,
	diag::err_seh___except_filter))
	return ExprError();
	break;
	case Builtin::BI__GetExceptionInfo:
	if (checkArgCount(*this, TheCall, 1))
	return ExprError();

	if (CheckCXXThrowOperand(
	TheCall->getLocStart(),
	Context.getExceptionObjectType(FDecl->getParamDecl(0)->getType()),
	TheCall))
	return ExprError();

	TheCall->setType(Context.VoidPtrTy);
	break;
	// OpenCL v2.0, s6.13.16 - Pipe functions
	case Builtin::BIread_pipe:
	case Builtin::BIwrite_pipe:
	// Since those two functions are declared with var args, we need a semantic
	// check for the argument.
	if (SemaBuiltinRWPipe(*this, TheCall))
	return ExprError();
	TheCall->setType(Context.IntTy);
	break;
	case Builtin::BIreserve_read_pipe:
	case Builtin::BIreserve_write_pipe:
	case Builtin::BIwork_group_reserve_read_pipe:
	case Builtin::BIwork_group_reserve_write_pipe:
	if (SemaBuiltinReserveRWPipe(*this, TheCall))
	return ExprError();
	break;
	case Builtin::BIsub_group_reserve_read_pipe:
	case Builtin::BIsub_group_reserve_write_pipe:
	if (checkOpenCLSubgroupExt(*this, TheCall) \|\|
	SemaBuiltinReserveRWPipe(*this, TheCall))
	return ExprError();
	break;
	case Builtin::BIcommit_read_pipe:
	case Builtin::BIcommit_write_pipe:
	case Builtin::BIwork_group_commit_read_pipe:
	case Builtin::BIwork_group_commit_write_pipe:
	if (SemaBuiltinCommitRWPipe(*this, TheCall))
	return ExprError();
	break;
	case Builtin::BIsub_group_commit_read_pipe:
	case Builtin::BIsub_group_commit_write_pipe:
	if (checkOpenCLSubgroupExt(*this, TheCall) \|\|
	SemaBuiltinCommitRWPipe(*this, TheCall))
	return ExprError();
	break;
	case Builtin::BIget_pipe_num_packets:
	case Builtin::BIget_pipe_max_packets:
	if (SemaBuiltinPipePackets(*this, TheCall))
	return ExprError();
	TheCall->setType(Context.UnsignedIntTy);
	break;
	case Builtin::BIto_global:
	case Builtin::BIto_local:
	case Builtin::BIto_private:
	if (SemaOpenCLBuiltinToAddr(*this, BuiltinID, TheCall))
	return ExprError();
	break;
	// OpenCL v2.0, s6.13.17 - Enqueue kernel functions.
	case Builtin::BIenqueue_kernel:
	if (SemaOpenCLBuiltinEnqueueKernel(*this, TheCall))
	return ExprError();
	break;
	case Builtin::BIget_kernel_work_group_size:
	case Builtin::BIget_kernel_preferred_work_group_size_multiple:
	if (SemaOpenCLBuiltinKernelWorkGroupSize(*this, TheCall))
	return ExprError();
	break;
	break;
	case Builtin::BIget_kernel_max_sub_group_size_for_ndrange:
	case Builtin::BIget_kernel_sub_group_count_for_ndrange:
	if (SemaOpenCLBuiltinNDRangeAndBlock(*this, TheCall))
	return ExprError();
	break;
	case Builtin::BI__builtin_os_log_format:
	case Builtin::BI__builtin_os_log_format_buffer_size:
	if (SemaBuiltinOSLogFormat(TheCall))
	return ExprError();
	break;
	}

	// Since the target specific builtins for each arch overlap, only check those
	// of the arch we are compiling for.
	if (Context.BuiltinInfo.isTSBuiltin(BuiltinID)) {
	switch (Context.getTargetInfo().getTriple().getArch()) {
	case llvm::Triple::arm:
	case llvm::Triple::armeb:
	case llvm::Triple::thumb:
	case llvm::Triple::thumbeb:
	if (CheckARMBuiltinFunctionCall(BuiltinID, TheCall))
	return ExprError();
	break;
	case llvm::Triple::aarch64:
	case llvm::Triple::aarch64_be:
	if (CheckAArch64BuiltinFunctionCall(BuiltinID, TheCall))
	return ExprError();
	break;
	case llvm::Triple::mips:
	case llvm::Triple::mipsel:
	case llvm::Triple::mips64:
	case llvm::Triple::mips64el:
	if (CheckMipsBuiltinFunctionCall(BuiltinID, TheCall))
	return ExprError();
	break;
	case llvm::Triple::systemz:
	if (CheckSystemZBuiltinFunctionCall(BuiltinID, TheCall))
	return ExprError();
	break;
	case llvm::Triple::x86:
	case llvm::Triple::x86_64:
	if (CheckX86BuiltinFunctionCall(BuiltinID, TheCall))
	return ExprError();
	break;
	case llvm::Triple::ppc:
	case llvm::Triple::ppc64:
	case llvm::Triple::ppc64le:
	if (CheckPPCBuiltinFunctionCall(BuiltinID, TheCall))
	return ExprError();
	break;
	default:
	break;
	}
	}

	return TheCallResult;
	}

	// Get the valid immediate range for the specified NEON type code.
	static unsigned RFT(unsigned t, bool shift = false, bool ForceQuad = false) {
	NeonTypeFlags Type(t);
	int IsQuad = ForceQuad ? true : Type.isQuad();
	switch (Type.getEltType()) {
	case NeonTypeFlags::Int8:
	case NeonTypeFlags::Poly8:
	return shift ? 7 : (8 << IsQuad) - 1;
	case NeonTypeFlags::Int16:
	case NeonTypeFlags::Poly16:
	return shift ? 15 : (4 << IsQuad) - 1;
	case NeonTypeFlags::Int32:
	return shift ? 31 : (2 << IsQuad) - 1;
	case NeonTypeFlags::Int64:
	case NeonTypeFlags::Poly64:
	return shift ? 63 : (1 << IsQuad) - 1;
	case NeonTypeFlags::Poly128:
	return shift ? 127 : (1 << IsQuad) - 1;
	case NeonTypeFlags::Float16:
	assert(!shift && "cannot shift float types!");
	return (4 << IsQuad) - 1;
	case NeonTypeFlags::Float32:
	assert(!shift && "cannot shift float types!");
	return (2 << IsQuad) - 1;
	case NeonTypeFlags::Float64:
	assert(!shift && "cannot shift float types!");
	return (1 << IsQuad) - 1;
	}
	llvm_unreachable("Invalid NeonTypeFlag!");
	}

	/// getNeonEltType - Map the element kind encoded in \p Flags to the scalar
	/// QualType used for the pointee of Neon load/store intrinsic pointer
	/// arguments.
	///
	/// \param IsPolyUnsigned selects unsigned char/short for Poly8/Poly16.
	/// \param IsInt64Long selects 'long' rather than 'long long' for the 64-bit
	///        element kinds.
	///
	/// Poly128 has no mapping and reaches llvm_unreachable.
	static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context,
	                               bool IsPolyUnsigned, bool IsInt64Long) {
	  const bool IsUnsigned = Flags.isUnsigned();
	  switch (Flags.getEltType()) {
	  case NeonTypeFlags::Int8:
	    if (IsUnsigned)
	      return Context.UnsignedCharTy;
	    return Context.SignedCharTy;
	  case NeonTypeFlags::Int16:
	    if (IsUnsigned)
	      return Context.UnsignedShortTy;
	    return Context.ShortTy;
	  case NeonTypeFlags::Int32:
	    if (IsUnsigned)
	      return Context.UnsignedIntTy;
	    return Context.IntTy;
	  case NeonTypeFlags::Int64:
	    if (IsInt64Long) {
	      if (IsUnsigned)
	        return Context.UnsignedLongTy;
	      return Context.LongTy;
	    }
	    if (IsUnsigned)
	      return Context.UnsignedLongLongTy;
	    return Context.LongLongTy;
	  case NeonTypeFlags::Poly8:
	    if (IsPolyUnsigned)
	      return Context.UnsignedCharTy;
	    return Context.SignedCharTy;
	  case NeonTypeFlags::Poly16:
	    if (IsPolyUnsigned)
	      return Context.UnsignedShortTy;
	    return Context.ShortTy;
	  case NeonTypeFlags::Poly64:
	    // Poly64 is always unsigned; only the width spelling varies.
	    if (IsInt64Long)
	      return Context.UnsignedLongTy;
	    return Context.UnsignedLongLongTy;
	  case NeonTypeFlags::Poly128:
	    break;
	  case NeonTypeFlags::Float16:
	    return Context.HalfTy;
	  case NeonTypeFlags::Float32:
	    return Context.FloatTy;
	  case NeonTypeFlags::Float64:
	    return Context.DoubleTy;
	  }
	  llvm_unreachable("Invalid NeonTypeFlag!");
	}

	/// Validate a call to a NEON builtin.
	///
	/// Up to three checks run in order:
	///  1. for builtins overloaded on element type, the type-code immediate
	///     (the last argument) must be a constant selecting a permitted variant;
	///  2. when a pointer argument is registered for the builtin, its type must
	///     be assignable to a pointer to the selected element type;
	///  3. immediates encoded in the instruction must lie in their valid range.
	///
	/// \returns true when the call is rejected, false when it is acceptable.
	bool Sema::CheckNeonBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
	  llvm::APSInt Result;
	  uint64_t mask = 0;
	  unsigned TV = 0;
	  int PtrArgNum = -1;
	  bool HasConstPtr = false;
	  // NOTE(review): the generated cases presumably fill in mask, PtrArgNum and
	  // HasConstPtr for the builtin; the table itself is not visible here.
	  switch (BuiltinID) {
	#define GET_NEON_OVERLOAD_CHECK
	#include "clang/Basic/arm_neon.inc"
	#undef GET_NEON_OVERLOAD_CHECK
	  }

	  // For NEON intrinsics which are overloaded on vector element type, validate
	  // the immediate which specifies which variant to emit.
	  unsigned ImmArg = TheCall->getNumArgs()-1;
	  if (mask) {
	    if (SemaBuiltinConstantArg(TheCall, ImmArg, Result))
	      return true;

	    // Clamp to 64 so that any out-of-range code is caught by the TV > 63
	    // test before it is used as a shift count.
	    TV = Result.getLimitedValue(64);
	    if ((TV > 63) || (mask & (1ULL << TV)) == 0)
	      return Diag(TheCall->getLocStart(), diag::err_invalid_neon_type_code)
	             << TheCall->getArg(ImmArg)->getSourceRange();
	  }

	  if (PtrArgNum >= 0) {
	    // Check that pointer arguments have the specified type.
	    Expr *Arg = TheCall->getArg(PtrArgNum);
	    if (ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(Arg))
	      Arg = ICE->getSubExpr();
	    ExprResult RHS = DefaultFunctionArrayLvalueConversion(Arg);
	    // The conversion can fail; bail out instead of dereferencing a null
	    // expression below (CheckARMBuiltinExclusiveCall does the same).
	    if (RHS.isInvalid())
	      return true;
	    QualType RHSTy = RHS.get()->getType();

	    llvm::Triple::ArchType Arch = Context.getTargetInfo().getTriple().getArch();
	    bool IsPolyUnsigned = Arch == llvm::Triple::aarch64 ||
	                          Arch == llvm::Triple::aarch64_be;
	    bool IsInt64Long =
	        Context.getTargetInfo().getInt64Type() == TargetInfo::SignedLong;
	    QualType EltTy =
	        getNeonEltType(NeonTypeFlags(TV), Context, IsPolyUnsigned, IsInt64Long);
	    if (HasConstPtr)
	      EltTy = EltTy.withConst();
	    QualType LHSTy = Context.getPointerType(EltTy);
	    AssignConvertType ConvTy;
	    ConvTy = CheckSingleAssignmentConstraints(LHSTy, RHS);
	    if (RHS.isInvalid())
	      return true;
	    if (DiagnoseAssignmentResult(ConvTy, Arg->getLocStart(), LHSTy, RHSTy,
	                                 RHS.get(), AA_Assigning))
	      return true;
	  }

	  // For NEON intrinsics which take an immediate value as part of the
	  // instruction, range check them here.
	  // NOTE(review): the generated cases presumably set i (argument index),
	  // l (lower bound) and u (extent above l); builtins without an entry take
	  // the default and are accepted.
	  unsigned i = 0, l = 0, u = 0;
	  switch (BuiltinID) {
	  default:
	    return false;
	#define GET_NEON_IMMEDIATE_CHECK
	#include "clang/Basic/arm_neon.inc"
	#undef GET_NEON_IMMEDIATE_CHECK
	  }

	  return SemaBuiltinConstantArgRange(TheCall, i, l, u + l);
	}

	bool Sema::CheckARMBuiltinExclusiveCall(unsigned BuiltinID, CallExpr *TheCall,
	unsigned MaxWidth) {
	assert((BuiltinID == ARM::BI__builtin_arm_ldrex \|\|
	BuiltinID == ARM::BI__builtin_arm_ldaex \|\|
	BuiltinID == ARM::BI__builtin_arm_strex \|\|
	BuiltinID == ARM::BI__builtin_arm_stlex \|\|
	BuiltinID == AArch64::BI__builtin_arm_ldrex \|\|
	BuiltinID == AArch64::BI__builtin_arm_ldaex \|\|
	BuiltinID == AArch64::BI__builtin_arm_strex \|\|
	BuiltinID == AArch64::BI__builtin_arm_stlex) &&
	"unexpected ARM builtin");
	bool IsLdrex = BuiltinID == ARM::BI__builtin_arm_ldrex \|\|
	BuiltinID == ARM::BI__builtin_arm_ldaex \|\|
	BuiltinID == AArch64::BI__builtin_arm_ldrex \|\|
	BuiltinID == AArch64::BI__builtin_arm_ldaex;

	DeclRefExpr *DRE =cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());

	// Ensure that we have the proper number of arguments.
	if (checkArgCount(*this, TheCall, IsLdrex ? 1 : 2))
	return true;

	// Inspect the pointer argument of the atomic builtin. This should always be
	// a pointer type, whose element is an integral scalar or pointer type.
	// Because it is a pointer type, we don't have to worry about any implicit
	// casts here.
	Expr *PointerArg = TheCall->getArg(IsLdrex ? 0 : 1);
	ExprResult PointerArgRes = DefaultFunctionArrayLvalueConversion(PointerArg);
	if (PointerArgRes.isInvalid())
	return true;
	PointerArg = PointerArgRes.get();

	const PointerType *pointerType = PointerArg->getType()->getAs<PointerType>();
	if (!pointerType) {
	Diag(DRE->getLocStart(), diag::err_atomic_builtin_must_be_pointer)
	<< PointerArg->getType() << PointerArg->getSourceRange();
	return true;
	}

	// ldrex takes a "const volatile T" and strex takes a "volatile T". Our next
	// task is to insert the appropriate casts into the AST. First work out just
	// what the appropriate type is.
	QualType ValType = pointerType->getPointeeType();
	QualType AddrType = ValType.getUnqualifiedType().withVolatile();
	if (IsLdrex)
	AddrType.addConst();

	// Issue a warning if the cast is dodgy.
	CastKind CastNeeded = CK_NoOp;
	if (!AddrType.isAtLeastAsQualifiedAs(ValType)) {
	CastNeeded = CK_BitCast;
	Diag(DRE->getLocStart(), diag::ext_typecheck_convert_discards_qualifiers)
	<< PointerArg->getType()
	<< Context.getPointerType(AddrType)
	<< AA_Passing << PointerArg->getSourceRange();
	}

	// Finally, do the cast and replace the argument with the corrected version.
	AddrType = Context.getPointerType(AddrType);
	PointerArgRes = ImpCastExprToType(PointerArg, AddrType, CastNeeded);
	if (PointerArgRes.isInvalid())
	return true;
	PointerArg = PointerArgRes.get();

	TheCall->setArg(IsLdrex ? 0 : 1, PointerArg);

	// In general, we allow ints, floats and pointers to be loaded and stored.
	if (!ValType->isIntegerType() && !ValType->isAnyPointerType() &&
	!ValType->isBlockPointerType() && !ValType->isFloatingType()) {
	Diag(DRE->getLocStart(), diag::err_atomic_builtin_must_be_pointer_intfltptr)
	<< PointerArg->getType() << PointerArg->getSourceRange();
	return true;
	}

	// But ARM doesn't have instructions to deal with 128-bit versions.
	if (Context.getTypeSize(ValType) > MaxWidth) {
	assert(MaxWidth == 64 && "Diagnostic unexpectedly inaccurate");
	Diag(DRE->getLocStart(), diag::err_atomic_exclusive_builtin_pointer_size)
	<< PointerArg->getType() << PointerArg->getSourceRange();
	return true;
	}

	switch (ValType.getObjCLifetime()) {
	case Qualifiers::OCL_None:
	case Qualifiers::OCL_ExplicitNone:
	// okay
	break;

	case Qualifiers::OCL_Weak:
	case Qualifiers::OCL_Strong:
	case Qualifiers::OCL_Autoreleasing:
	Diag(DRE->getLocStart(), diag::err_arc_atomic_ownership)
	<< ValType << PointerArg->getSourceRange();
	return true;
	}

	if (IsLdrex) {
	TheCall->setType(ValType);
	return false;
	}

	// Initialize the argument to be stored.
	ExprResult ValArg = TheCall->getArg(0);
	InitializedEntity Entity = InitializedEntity::InitializeParameter(
	Context, ValType, /consume/ false);
	ValArg = PerformCopyInitialization(Entity, SourceLocation(), ValArg);
	if (ValArg.isInvalid())
	return true;
	TheCall->setArg(0, ValArg.get());

	// __builtin_arm_strex always returns an int. It's marked as such in the .def,
	// but the custom checker bypasses all default analysis.
	TheCall->setType(Context.IntTy);
	return false;
	}

	/// Perform ARM-specific semantic checks on a target builtin call.
	///
	/// Exclusive load/store, prefetch and special-register builtins are handed
	/// to their dedicated checkers; everything else goes through the NEON checks
	/// and then a range check of its immediate operand, if it has one.
	///
	/// \returns true when the call is rejected, false when it is acceptable.
	bool Sema::CheckARMBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
	  if (BuiltinID == ARM::BI__builtin_arm_ldrex ||
	      BuiltinID == ARM::BI__builtin_arm_ldaex ||
	      BuiltinID == ARM::BI__builtin_arm_strex ||
	      BuiltinID == ARM::BI__builtin_arm_stlex) {
	    // Pointee types wider than 64 bits are rejected on this target.
	    return CheckARMBuiltinExclusiveCall(BuiltinID, TheCall, 64);
	  }

	  if (BuiltinID == ARM::BI__builtin_arm_prefetch) {
	    // Arguments 1 and 2 must each be a constant in [0, 1].
	    return SemaBuiltinConstantArgRange(TheCall, 1, 0, 1) ||
	           SemaBuiltinConstantArgRange(TheCall, 2, 0, 1);
	  }

	  if (BuiltinID == ARM::BI__builtin_arm_rsr64 ||
	      BuiltinID == ARM::BI__builtin_arm_wsr64)
	    return SemaBuiltinARMSpecialReg(BuiltinID, TheCall, 0, 3, false);

	  if (BuiltinID == ARM::BI__builtin_arm_rsr ||
	      BuiltinID == ARM::BI__builtin_arm_rsrp ||
	      BuiltinID == ARM::BI__builtin_arm_wsr ||
	      BuiltinID == ARM::BI__builtin_arm_wsrp)
	    return SemaBuiltinARMSpecialReg(BuiltinID, TheCall, 0, 5, true);

	  if (CheckNeonBuiltinFunctionCall(BuiltinID, TheCall))
	    return true;

	  // For intrinsics which take an immediate value as part of the instruction,
	  // range check them here.
	  // FIXME: VFP Intrinsics should error if VFP not present.
	  switch (BuiltinID) {
	  default: return false;
	  case ARM::BI__builtin_arm_ssat:
	    return SemaBuiltinConstantArgRange(TheCall, 1, 1, 32);
	  case ARM::BI__builtin_arm_usat:
	    return SemaBuiltinConstantArgRange(TheCall, 1, 0, 31);
	  case ARM::BI__builtin_arm_ssat16:
	    return SemaBuiltinConstantArgRange(TheCall, 1, 1, 16);
	  case ARM::BI__builtin_arm_usat16:
	    return SemaBuiltinConstantArgRange(TheCall, 1, 0, 15);
	  case ARM::BI__builtin_arm_vcvtr_f:
	  case ARM::BI__builtin_arm_vcvtr_d:
	    return SemaBuiltinConstantArgRange(TheCall, 1, 0, 1);
	  case ARM::BI__builtin_arm_dmb:
	  case ARM::BI__builtin_arm_dsb:
	  case ARM::BI__builtin_arm_isb:
	  case ARM::BI__builtin_arm_dbg:
	    return SemaBuiltinConstantArgRange(TheCall, 0, 0, 15);
	  }
	}

	/// Perform AArch64-specific semantic checks on a target builtin call.
	///
	/// Exclusive load/store, prefetch and special-register builtins are handed
	/// to their dedicated checkers; everything else goes through the NEON checks
	/// and then a range check of its immediate operand, if it has one.
	///
	/// \returns true when the call is rejected, false when it is acceptable.
	bool Sema::CheckAArch64BuiltinFunctionCall(unsigned BuiltinID,
	                                           CallExpr *TheCall) {
	  if (BuiltinID == AArch64::BI__builtin_arm_ldrex ||
	      BuiltinID == AArch64::BI__builtin_arm_ldaex ||
	      BuiltinID == AArch64::BI__builtin_arm_strex ||
	      BuiltinID == AArch64::BI__builtin_arm_stlex) {
	    // Pointee types up to 128 bits wide are accepted on this target.
	    return CheckARMBuiltinExclusiveCall(BuiltinID, TheCall, 128);
	  }

	  if (BuiltinID == AArch64::BI__builtin_arm_prefetch) {
	    // Arguments 1, 3 and 4 must be constants in [0, 1]; argument 2 must be
	    // a constant in [0, 2].
	    return SemaBuiltinConstantArgRange(TheCall, 1, 0, 1) ||
	           SemaBuiltinConstantArgRange(TheCall, 2, 0, 2) ||
	           SemaBuiltinConstantArgRange(TheCall, 3, 0, 1) ||
	           SemaBuiltinConstantArgRange(TheCall, 4, 0, 1);
	  }

	  if (BuiltinID == AArch64::BI__builtin_arm_rsr64 ||
	      BuiltinID == AArch64::BI__builtin_arm_wsr64)
	    return SemaBuiltinARMSpecialReg(BuiltinID, TheCall, 0, 5, true);

	  if (BuiltinID == AArch64::BI__builtin_arm_rsr ||
	      BuiltinID == AArch64::BI__builtin_arm_rsrp ||
	      BuiltinID == AArch64::BI__builtin_arm_wsr ||
	      BuiltinID == AArch64::BI__builtin_arm_wsrp)
	    return SemaBuiltinARMSpecialReg(BuiltinID, TheCall, 0, 5, true);

	  if (CheckNeonBuiltinFunctionCall(BuiltinID, TheCall))
	    return true;

	  // For intrinsics which take an immediate value as part of the instruction,
	  // range check them here. i is the argument index, l the lower bound and
	  // u the extent above l.
	  unsigned i = 0, l = 0, u = 0;
	  switch (BuiltinID) {
	  default: return false;
	  case AArch64::BI__builtin_arm_dmb:
	  case AArch64::BI__builtin_arm_dsb:
	  case AArch64::BI__builtin_arm_isb: l = 0; u = 15; break;
	  }

	  return SemaBuiltinConstantArgRange(TheCall, i, l, u + l);
	}

	// CheckMipsBuiltinFunctionCall - Checks the constant value passed to the
	// intrinsic is correct. The switch statement is ordered by DSP, MSA. The
	// ordering for DSP is unspecified. MSA is ordered by the data format used
	// by the underlying instruction i.e., df/m, df/n and then by size.
	//
	// Each case records the index of the immediate argument (i), its inclusive
	// bounds (l, u) and, for the memory builtins, the value the offset must be
	// a multiple of (m; 0 means no such constraint). Builtins not listed are
	// accepted unchecked. Returns true when the call is rejected.
	//
	// FIXME: The size tests here should instead be tablegen'd along with the
	// definitions from include/clang/Basic/BuiltinsMips.def.
	// FIXME: GCC is strict on signedness for some of these intrinsics, we should
	// be too.
	bool Sema::CheckMipsBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
	  // The bounds are signed so that the negative lower limits below are stored
	  // directly rather than through unsigned wrap-around.
	  int i = 0, l = 0, u = 0;
	  unsigned m = 0;
	  switch (BuiltinID) {
	  default: return false;
	  case Mips::BI__builtin_mips_wrdsp: i = 1; l = 0; u = 63; break;
	  case Mips::BI__builtin_mips_rddsp: i = 0; l = 0; u = 63; break;
	  case Mips::BI__builtin_mips_append: i = 2; l = 0; u = 31; break;
	  case Mips::BI__builtin_mips_balign: i = 2; l = 0; u = 3; break;
	  case Mips::BI__builtin_mips_precr_sra_ph_w: i = 2; l = 0; u = 31; break;
	  case Mips::BI__builtin_mips_precr_sra_r_ph_w: i = 2; l = 0; u = 31; break;
	  case Mips::BI__builtin_mips_prepend: i = 2; l = 0; u = 31; break;
	  // MSA intrinsics. Instructions (which the intrinsics map to) which use the
	  // df/m field.
	  // These intrinsics take an unsigned 3 bit immediate.
	  case Mips::BI__builtin_msa_bclri_b:
	  case Mips::BI__builtin_msa_bnegi_b:
	  case Mips::BI__builtin_msa_bseti_b:
	  case Mips::BI__builtin_msa_sat_s_b:
	  case Mips::BI__builtin_msa_sat_u_b:
	  case Mips::BI__builtin_msa_slli_b:
	  case Mips::BI__builtin_msa_srai_b:
	  case Mips::BI__builtin_msa_srari_b:
	  case Mips::BI__builtin_msa_srli_b:
	  case Mips::BI__builtin_msa_srlri_b: i = 1; l = 0; u = 7; break;
	  case Mips::BI__builtin_msa_binsli_b:
	  case Mips::BI__builtin_msa_binsri_b: i = 2; l = 0; u = 7; break;
	  // These intrinsics take an unsigned 4 bit immediate.
	  case Mips::BI__builtin_msa_bclri_h:
	  case Mips::BI__builtin_msa_bnegi_h:
	  case Mips::BI__builtin_msa_bseti_h:
	  case Mips::BI__builtin_msa_sat_s_h:
	  case Mips::BI__builtin_msa_sat_u_h:
	  case Mips::BI__builtin_msa_slli_h:
	  case Mips::BI__builtin_msa_srai_h:
	  case Mips::BI__builtin_msa_srari_h:
	  case Mips::BI__builtin_msa_srli_h:
	  case Mips::BI__builtin_msa_srlri_h: i = 1; l = 0; u = 15; break;
	  case Mips::BI__builtin_msa_binsli_h:
	  case Mips::BI__builtin_msa_binsri_h: i = 2; l = 0; u = 15; break;
	  // These intrinsics take an unsigned 5 bit immediate.
	  // The first block of intrinsics actually have an unsigned 5 bit field,
	  // not a df/n field.
	  case Mips::BI__builtin_msa_clei_u_b:
	  case Mips::BI__builtin_msa_clei_u_h:
	  case Mips::BI__builtin_msa_clei_u_w:
	  case Mips::BI__builtin_msa_clei_u_d:
	  case Mips::BI__builtin_msa_clti_u_b:
	  case Mips::BI__builtin_msa_clti_u_h:
	  case Mips::BI__builtin_msa_clti_u_w:
	  case Mips::BI__builtin_msa_clti_u_d:
	  case Mips::BI__builtin_msa_maxi_u_b:
	  case Mips::BI__builtin_msa_maxi_u_h:
	  case Mips::BI__builtin_msa_maxi_u_w:
	  case Mips::BI__builtin_msa_maxi_u_d:
	  case Mips::BI__builtin_msa_mini_u_b:
	  case Mips::BI__builtin_msa_mini_u_h:
	  case Mips::BI__builtin_msa_mini_u_w:
	  case Mips::BI__builtin_msa_mini_u_d:
	  case Mips::BI__builtin_msa_addvi_b:
	  case Mips::BI__builtin_msa_addvi_h:
	  case Mips::BI__builtin_msa_addvi_w:
	  case Mips::BI__builtin_msa_addvi_d:
	  case Mips::BI__builtin_msa_bclri_w:
	  case Mips::BI__builtin_msa_bnegi_w:
	  case Mips::BI__builtin_msa_bseti_w:
	  case Mips::BI__builtin_msa_sat_s_w:
	  case Mips::BI__builtin_msa_sat_u_w:
	  case Mips::BI__builtin_msa_slli_w:
	  case Mips::BI__builtin_msa_srai_w:
	  case Mips::BI__builtin_msa_srari_w:
	  case Mips::BI__builtin_msa_srli_w:
	  case Mips::BI__builtin_msa_srlri_w:
	  case Mips::BI__builtin_msa_subvi_b:
	  case Mips::BI__builtin_msa_subvi_h:
	  case Mips::BI__builtin_msa_subvi_w:
	  case Mips::BI__builtin_msa_subvi_d: i = 1; l = 0; u = 31; break;
	  case Mips::BI__builtin_msa_binsli_w:
	  case Mips::BI__builtin_msa_binsri_w: i = 2; l = 0; u = 31; break;
	  // These intrinsics take an unsigned 6 bit immediate.
	  case Mips::BI__builtin_msa_bclri_d:
	  case Mips::BI__builtin_msa_bnegi_d:
	  case Mips::BI__builtin_msa_bseti_d:
	  case Mips::BI__builtin_msa_sat_s_d:
	  case Mips::BI__builtin_msa_sat_u_d:
	  case Mips::BI__builtin_msa_slli_d:
	  case Mips::BI__builtin_msa_srai_d:
	  case Mips::BI__builtin_msa_srari_d:
	  case Mips::BI__builtin_msa_srli_d:
	  case Mips::BI__builtin_msa_srlri_d: i = 1; l = 0; u = 63; break;
	  case Mips::BI__builtin_msa_binsli_d:
	  case Mips::BI__builtin_msa_binsri_d: i = 2; l = 0; u = 63; break;
	  // These intrinsics take a signed 5 bit immediate.
	  case Mips::BI__builtin_msa_ceqi_b:
	  case Mips::BI__builtin_msa_ceqi_h:
	  case Mips::BI__builtin_msa_ceqi_w:
	  case Mips::BI__builtin_msa_ceqi_d:
	  case Mips::BI__builtin_msa_clti_s_b:
	  case Mips::BI__builtin_msa_clti_s_h:
	  case Mips::BI__builtin_msa_clti_s_w:
	  case Mips::BI__builtin_msa_clti_s_d:
	  case Mips::BI__builtin_msa_clei_s_b:
	  case Mips::BI__builtin_msa_clei_s_h:
	  case Mips::BI__builtin_msa_clei_s_w:
	  case Mips::BI__builtin_msa_clei_s_d:
	  case Mips::BI__builtin_msa_maxi_s_b:
	  case Mips::BI__builtin_msa_maxi_s_h:
	  case Mips::BI__builtin_msa_maxi_s_w:
	  case Mips::BI__builtin_msa_maxi_s_d:
	  case Mips::BI__builtin_msa_mini_s_b:
	  case Mips::BI__builtin_msa_mini_s_h:
	  case Mips::BI__builtin_msa_mini_s_w:
	  case Mips::BI__builtin_msa_mini_s_d: i = 1; l = -16; u = 15; break;
	  // These intrinsics take an unsigned 8 bit immediate.
	  case Mips::BI__builtin_msa_andi_b:
	  case Mips::BI__builtin_msa_nori_b:
	  case Mips::BI__builtin_msa_ori_b:
	  case Mips::BI__builtin_msa_shf_b:
	  case Mips::BI__builtin_msa_shf_h:
	  case Mips::BI__builtin_msa_shf_w:
	  case Mips::BI__builtin_msa_xori_b: i = 1; l = 0; u = 255; break;
	  case Mips::BI__builtin_msa_bseli_b:
	  case Mips::BI__builtin_msa_bmnzi_b:
	  case Mips::BI__builtin_msa_bmzi_b: i = 2; l = 0; u = 255; break;
	  // df/n format
	  // These intrinsics take an unsigned 4 bit immediate.
	  case Mips::BI__builtin_msa_copy_s_b:
	  case Mips::BI__builtin_msa_copy_u_b:
	  case Mips::BI__builtin_msa_insve_b:
	  case Mips::BI__builtin_msa_splati_b: i = 1; l = 0; u = 15; break;
	  case Mips::BI__builtin_msa_sldi_b: i = 2; l = 0; u = 15; break;
	  // These intrinsics take an unsigned 3 bit immediate.
	  case Mips::BI__builtin_msa_copy_s_h:
	  case Mips::BI__builtin_msa_copy_u_h:
	  case Mips::BI__builtin_msa_insve_h:
	  case Mips::BI__builtin_msa_splati_h: i = 1; l = 0; u = 7; break;
	  case Mips::BI__builtin_msa_sldi_h: i = 2; l = 0; u = 7; break;
	  // These intrinsics take an unsigned 2 bit immediate.
	  case Mips::BI__builtin_msa_copy_s_w:
	  case Mips::BI__builtin_msa_copy_u_w:
	  case Mips::BI__builtin_msa_insve_w:
	  case Mips::BI__builtin_msa_splati_w: i = 1; l = 0; u = 3; break;
	  case Mips::BI__builtin_msa_sldi_w: i = 2; l = 0; u = 3; break;
	  // These intrinsics take an unsigned 1 bit immediate.
	  case Mips::BI__builtin_msa_copy_s_d:
	  case Mips::BI__builtin_msa_copy_u_d:
	  case Mips::BI__builtin_msa_insve_d:
	  case Mips::BI__builtin_msa_splati_d: i = 1; l = 0; u = 1; break;
	  case Mips::BI__builtin_msa_sldi_d: i = 2; l = 0; u = 1; break;
	  // Memory offsets and immediate loads.
	  // These intrinsics take a signed 10 bit immediate.
	  case Mips::BI__builtin_msa_ldi_b: i = 0; l = -128; u = 255; break;
	  case Mips::BI__builtin_msa_ldi_h:
	  case Mips::BI__builtin_msa_ldi_w:
	  case Mips::BI__builtin_msa_ldi_d: i = 0; l = -512; u = 511; break;
	  // Load/store offsets are a signed 10 bit value scaled by the element
	  // size, so they must be a multiple of that size (1, 2, 4 or 8 bytes).
	  // Requiring a multiple of 16 would reject valid offsets, including the
	  // upper bounds listed here.
	  case Mips::BI__builtin_msa_ld_b: i = 1; l = -512; u = 511; m = 1; break;
	  case Mips::BI__builtin_msa_ld_h: i = 1; l = -1024; u = 1022; m = 2; break;
	  case Mips::BI__builtin_msa_ld_w: i = 1; l = -2048; u = 2044; m = 4; break;
	  case Mips::BI__builtin_msa_ld_d: i = 1; l = -4096; u = 4088; m = 8; break;
	  case Mips::BI__builtin_msa_st_b: i = 2; l = -512; u = 511; m = 1; break;
	  case Mips::BI__builtin_msa_st_h: i = 2; l = -1024; u = 1022; m = 2; break;
	  case Mips::BI__builtin_msa_st_w: i = 2; l = -2048; u = 2044; m = 4; break;
	  case Mips::BI__builtin_msa_st_d: i = 2; l = -4096; u = 4088; m = 8; break;
	  }

	  if (!m)
	    return SemaBuiltinConstantArgRange(TheCall, i, l, u);

	  return SemaBuiltinConstantArgRange(TheCall, i, l, u) ||
	         SemaBuiltinConstantArgMultiple(TheCall, i, m);
	}

	bool Sema::CheckPPCBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
	unsigned i = 0, l = 0, u = 0;
	bool Is64BitBltin = BuiltinID == PPC::BI__builtin_divde \|\|
	BuiltinID == PPC::BI__builtin_divdeu \|\|
	BuiltinID == PPC::BI__builtin_bpermd;
	bool IsTarget64Bit = Context.getTargetInfo()
	.getTypeWidth(Context
	.getTargetInfo()
	.getIntPtrType()) == 64;
	bool IsBltinExtDiv = BuiltinID == PPC::BI__builtin_divwe \|\|
	BuiltinID == PPC::BI__builtin_divweu \|\|
	BuiltinID == PPC::BI__builtin_divde \|\|
	BuiltinID == PPC::BI__builtin_divdeu;

	if (Is64BitBltin && !IsTarget64Bit)
	return Diag(TheCall->getLocStart(), diag::err_64_bit_builtin_32_bit_tgt)
	<< TheCall->getSourceRange();

	if ((IsBltinExtDiv && !Context.getTargetInfo().hasFeature("extdiv")) \|\|
	(BuiltinID == PPC::BI__builtin_bpermd &&
	!Context.getTargetInfo().hasFeature("bpermd")))
	return Diag(TheCall->getLocStart(), diag::err_ppc_builtin_only_on_pwr7)
	<< TheCall->getSourceRange();

	switch (BuiltinID) {
	default: return false;
	case PPC::BI__builtin_altivec_crypto_vshasigmaw:
	case PPC::BI__builtin_altivec_crypto_vshasigmad:
	return SemaBuiltinConstantArgRange(TheCall, 1, 0, 1) \|\|
	SemaBuiltinConstantArgRange(TheCall, 2, 0, 15);
	case PPC::BI__builtin_tbegin:
	case PPC::BI__builtin_tend: i = 0; l = 0; u = 1; break;
	case PPC::BI__builtin_tsr: i = 0; l = 0; u = 7; break;
	case PPC::BI__builtin_tabortwc:
	case PPC::BI__builtin_tabortdc: i = 0; l = 0; u = 31; break;
	case PPC::BI__builtin_tabortwci:
	case PPC::BI__builtin_tabortdci:
	return SemaBuiltinConstantArgRange(TheCall, 0, 0, 31) \|\|
	SemaBuiltinConstantArgRange(TheCall, 2, 0, 31);
	case PPC::BI__builtin_vsx_xxpermdi:
	case PPC::BI__builtin_vsx_xxsldwi:
	return SemaBuiltinVSX(TheCall);
	}
	return SemaBuiltinConstantArgRange(TheCall, i, l, u);
	}

	/// Perform SystemZ-specific semantic checks on a target builtin call.
	///
	/// __builtin_tabort is rejected when its argument is an integer constant in
	/// [0, 255]; the remaining listed builtins have a constant immediate that is
	/// range-checked. Builtins not listed are accepted unchecked.
	///
	/// \returns true when the call is rejected, false when it is acceptable.
	bool Sema::CheckSystemZBuiltinFunctionCall(unsigned BuiltinID,
	                                           CallExpr *TheCall) {
	  if (BuiltinID == SystemZ::BI__builtin_tabort) {
	    Expr *Arg = TheCall->getArg(0);
	    llvm::APSInt AbortCode(32);
	    // Only constant abort codes can be checked here; non-constant arguments
	    // pass through.
	    if (Arg->isIntegerConstantExpr(AbortCode, Context) &&
	        AbortCode.getSExtValue() >= 0 && AbortCode.getSExtValue() < 256)
	      return Diag(Arg->getLocStart(), diag::err_systemz_invalid_tabort_code)
	             << Arg->getSourceRange();
	  }

	  // For intrinsics which take an immediate value as part of the instruction,
	  // range check them here. i is the argument index; [l, u] is the inclusive
	  // range.
	  unsigned i = 0, l = 0, u = 0;
	  switch (BuiltinID) {
	  default: return false;
	  case SystemZ::BI__builtin_s390_lcbb: i = 1; l = 0; u = 15; break;
	  case SystemZ::BI__builtin_s390_verimb:
	  case SystemZ::BI__builtin_s390_verimh:
	  case SystemZ::BI__builtin_s390_verimf:
	  case SystemZ::BI__builtin_s390_verimg: i = 3; l = 0; u = 255; break;
	  case SystemZ::BI__builtin_s390_vfaeb:
	  case SystemZ::BI__builtin_s390_vfaeh:
	  case SystemZ::BI__builtin_s390_vfaef:
	  case SystemZ::BI__builtin_s390_vfaebs:
	  case SystemZ::BI__builtin_s390_vfaehs:
	  case SystemZ::BI__builtin_s390_vfaefs:
	  case SystemZ::BI__builtin_s390_vfaezb:
	  case SystemZ::BI__builtin_s390_vfaezh:
	  case SystemZ::BI__builtin_s390_vfaezf:
	  case SystemZ::BI__builtin_s390_vfaezbs:
	  case SystemZ::BI__builtin_s390_vfaezhs:
	  case SystemZ::BI__builtin_s390_vfaezfs: i = 2; l = 0; u = 15; break;
	  case SystemZ::BI__builtin_s390_vfisb:
	  case SystemZ::BI__builtin_s390_vfidb:
	    return SemaBuiltinConstantArgRange(TheCall, 1, 0, 15) ||
	           SemaBuiltinConstantArgRange(TheCall, 2, 0, 15);
	  case SystemZ::BI__builtin_s390_vftcisb:
	  case SystemZ::BI__builtin_s390_vftcidb: i = 1; l = 0; u = 4095; break;
	  case SystemZ::BI__builtin_s390_vlbb: i = 1; l = 0; u = 15; break;
	  case SystemZ::BI__builtin_s390_vpdi: i = 2; l = 0; u = 15; break;
	  case SystemZ::BI__builtin_s390_vsldb: i = 2; l = 0; u = 15; break;
	  case SystemZ::BI__builtin_s390_vstrcb:
	  case SystemZ::BI__builtin_s390_vstrch:
	  case SystemZ::BI__builtin_s390_vstrcf:
	  case SystemZ::BI__builtin_s390_vstrczb:
	  case SystemZ::BI__builtin_s390_vstrczh:
	  case SystemZ::BI__builtin_s390_vstrczf:
	  case SystemZ::BI__builtin_s390_vstrcbs:
	  case SystemZ::BI__builtin_s390_vstrchs:
	  case SystemZ::BI__builtin_s390_vstrcfs:
	  case SystemZ::BI__builtin_s390_vstrczbs:
	  case SystemZ::BI__builtin_s390_vstrczhs:
	  case SystemZ::BI__builtin_s390_vstrczfs: i = 3; l = 0; u = 15; break;
	  case SystemZ::BI__builtin_s390_vmslg: i = 3; l = 0; u = 15; break;
	  case SystemZ::BI__builtin_s390_vfminsb:
	  case SystemZ::BI__builtin_s390_vfmaxsb:
	  case SystemZ::BI__builtin_s390_vfmindb:
	  case SystemZ::BI__builtin_s390_vfmaxdb: i = 2; l = 0; u = 15; break;
	  }
	  return SemaBuiltinConstantArgRange(TheCall, i, l, u);
	}

	/// SemaBuiltinCpuSupports - Handle __builtin_cpu_supports(char *).
	/// The argument must be a string literal (parentheses and implicit casts
	/// are looked through) whose contents the target accepts as a feature name.
	/// Returns true after emitting a diagnostic, false when the call is valid.
	static bool SemaBuiltinCpuSupports(Sema &S, CallExpr *TheCall) {
	  Expr *Arg = TheCall->getArg(0);
	  Expr *Stripped = Arg->IgnoreParenImpCasts();

	  // Anything other than a string literal is rejected outright.
	  if (!isa<StringLiteral>(Stripped))
	    return S.Diag(TheCall->getLocStart(), diag::err_expr_not_string_literal)
	           << Arg->getSourceRange();

	  // Ask the target whether it recognises the feature name.
	  StringRef Feature = cast<StringLiteral>(Stripped)->getString();
	  if (S.Context.getTargetInfo().validateCpuSupports(Feature))
	    return false;

	  return S.Diag(TheCall->getLocStart(), diag::err_invalid_cpu_supports)
	         << Arg->getSourceRange();
	}

	/// SemaBuiltinCpuIs - Validate a call to __builtin_cpu_is(char *).
	///
	/// The first argument must be a string literal (parentheses and implicit
	/// casts are looked through) whose contents are accepted by the target's
	/// validateCpuIs(). Returns false when the call is acceptable; otherwise a
	/// diagnostic is emitted and its result is returned.
	static bool SemaBuiltinCpuIs(Sema &S, CallExpr *TheCall) {
	  Expr *Arg = TheCall->getArg(0);
	  const auto *Literal = dyn_cast<StringLiteral>(Arg->IgnoreParenImpCasts());

	  // Anything other than a string literal is rejected outright.
	  if (!Literal)
	    return S.Diag(TheCall->getLocStart(), diag::err_expr_not_string_literal)
	           << Arg->getSourceRange();

	  // The literal has to name a CPU the target recognizes.
	  StringRef Feature = Literal->getString();
	  if (S.Context.getTargetInfo().validateCpuIs(Feature))
	    return false;

	  return S.Diag(TheCall->getLocStart(), diag::err_invalid_cpu_is)
	         << Arg->getSourceRange();
	}

	/// Check that the rounding-mode/SAE immediate of an X86 builtin is legal.
	///
	/// The switch maps \p BuiltinID to the index of its rounding/SAE argument and
	/// records whether the builtin has rounding control (HasRC) or only SAE.
	/// Builtins that are not listed are accepted without any check, as are
	/// type- or value-dependent arguments.
	///
	/// Accepted constants are 4 (ROUND_CUR_DIRECTION) and 8 (ROUND_NO_EXC); when
	/// the builtin has rounding control, any value in [8, 11] is accepted too,
	/// i.e. ROUND_NO_EXC combined with the rounding bits 1:0.
	///
	/// \returns false when nothing is wrong; true when SemaBuiltinConstantArg
	/// rejects the argument; otherwise the result of the emitted
	/// err_x86_builtin_invalid_rounding diagnostic.
	bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) {
	  // Indicates if this instruction has rounding control or just SAE.
	  bool HasRC = false;

	  unsigned ArgNum = 0;
	  switch (BuiltinID) {
	  default:
	    return false;
	  case X86::BI__builtin_ia32_vcvttsd2si32:
	  case X86::BI__builtin_ia32_vcvttsd2si64:
	  case X86::BI__builtin_ia32_vcvttsd2usi32:
	  case X86::BI__builtin_ia32_vcvttsd2usi64:
	  case X86::BI__builtin_ia32_vcvttss2si32:
	  case X86::BI__builtin_ia32_vcvttss2si64:
	  case X86::BI__builtin_ia32_vcvttss2usi32:
	  case X86::BI__builtin_ia32_vcvttss2usi64:
	    ArgNum = 1;
	    break;
	  case X86::BI__builtin_ia32_cvtps2pd512_mask:
	  case X86::BI__builtin_ia32_cvttpd2dq512_mask:
	  case X86::BI__builtin_ia32_cvttpd2qq512_mask:
	  case X86::BI__builtin_ia32_cvttpd2udq512_mask:
	  case X86::BI__builtin_ia32_cvttpd2uqq512_mask:
	  case X86::BI__builtin_ia32_cvttps2dq512_mask:
	  case X86::BI__builtin_ia32_cvttps2qq512_mask:
	  case X86::BI__builtin_ia32_cvttps2udq512_mask:
	  case X86::BI__builtin_ia32_cvttps2uqq512_mask:
	  case X86::BI__builtin_ia32_exp2pd_mask:
	  case X86::BI__builtin_ia32_exp2ps_mask:
	  case X86::BI__builtin_ia32_getexppd512_mask:
	  case X86::BI__builtin_ia32_getexpps512_mask:
	  case X86::BI__builtin_ia32_rcp28pd_mask:
	  case X86::BI__builtin_ia32_rcp28ps_mask:
	  case X86::BI__builtin_ia32_rsqrt28pd_mask:
	  case X86::BI__builtin_ia32_rsqrt28ps_mask:
	  case X86::BI__builtin_ia32_vcomisd:
	  case X86::BI__builtin_ia32_vcomiss:
	  case X86::BI__builtin_ia32_vcvtph2ps512_mask:
	    ArgNum = 3;
	    break;
	  case X86::BI__builtin_ia32_cmppd512_mask:
	  case X86::BI__builtin_ia32_cmpps512_mask:
	  case X86::BI__builtin_ia32_cmpsd_mask:
	  case X86::BI__builtin_ia32_cmpss_mask:
	  case X86::BI__builtin_ia32_cvtss2sd_round_mask:
	  case X86::BI__builtin_ia32_getexpsd128_round_mask:
	  case X86::BI__builtin_ia32_getexpss128_round_mask:
	  case X86::BI__builtin_ia32_maxpd512_mask:
	  case X86::BI__builtin_ia32_maxps512_mask:
	  case X86::BI__builtin_ia32_maxsd_round_mask:
	  case X86::BI__builtin_ia32_maxss_round_mask:
	  case X86::BI__builtin_ia32_minpd512_mask:
	  case X86::BI__builtin_ia32_minps512_mask:
	  case X86::BI__builtin_ia32_minsd_round_mask:
	  case X86::BI__builtin_ia32_minss_round_mask:
	  case X86::BI__builtin_ia32_rcp28sd_round_mask:
	  case X86::BI__builtin_ia32_rcp28ss_round_mask:
	  case X86::BI__builtin_ia32_reducepd512_mask:
	  case X86::BI__builtin_ia32_reduceps512_mask:
	  case X86::BI__builtin_ia32_rndscalepd_mask:
	  case X86::BI__builtin_ia32_rndscaleps_mask:
	  case X86::BI__builtin_ia32_rsqrt28sd_round_mask:
	  case X86::BI__builtin_ia32_rsqrt28ss_round_mask:
	    ArgNum = 4;
	    break;
	  case X86::BI__builtin_ia32_fixupimmpd512_mask:
	  case X86::BI__builtin_ia32_fixupimmpd512_maskz:
	  case X86::BI__builtin_ia32_fixupimmps512_mask:
	  case X86::BI__builtin_ia32_fixupimmps512_maskz:
	  case X86::BI__builtin_ia32_fixupimmsd_mask:
	  case X86::BI__builtin_ia32_fixupimmsd_maskz:
	  case X86::BI__builtin_ia32_fixupimmss_mask:
	  case X86::BI__builtin_ia32_fixupimmss_maskz:
	  case X86::BI__builtin_ia32_rangepd512_mask:
	  case X86::BI__builtin_ia32_rangeps512_mask:
	  case X86::BI__builtin_ia32_rangesd128_round_mask:
	  case X86::BI__builtin_ia32_rangess128_round_mask:
	  case X86::BI__builtin_ia32_reducesd_mask:
	  case X86::BI__builtin_ia32_reducess_mask:
	  case X86::BI__builtin_ia32_rndscalesd_round_mask:
	  case X86::BI__builtin_ia32_rndscaless_round_mask:
	    ArgNum = 5;
	    break;
	  case X86::BI__builtin_ia32_vcvtsd2si64:
	  case X86::BI__builtin_ia32_vcvtsd2si32:
	  case X86::BI__builtin_ia32_vcvtsd2usi32:
	  case X86::BI__builtin_ia32_vcvtsd2usi64:
	  case X86::BI__builtin_ia32_vcvtss2si32:
	  case X86::BI__builtin_ia32_vcvtss2si64:
	  case X86::BI__builtin_ia32_vcvtss2usi32:
	  case X86::BI__builtin_ia32_vcvtss2usi64:
	    ArgNum = 1;
	    HasRC = true;
	    break;
	  case X86::BI__builtin_ia32_cvtsi2sd64:
	  case X86::BI__builtin_ia32_cvtsi2ss32:
	  case X86::BI__builtin_ia32_cvtsi2ss64:
	  case X86::BI__builtin_ia32_cvtusi2sd64:
	  case X86::BI__builtin_ia32_cvtusi2ss32:
	  case X86::BI__builtin_ia32_cvtusi2ss64:
	    ArgNum = 2;
	    HasRC = true;
	    break;
	  case X86::BI__builtin_ia32_cvtdq2ps512_mask:
	  case X86::BI__builtin_ia32_cvtudq2ps512_mask:
	  case X86::BI__builtin_ia32_cvtpd2ps512_mask:
	  case X86::BI__builtin_ia32_cvtpd2qq512_mask:
	  case X86::BI__builtin_ia32_cvtpd2uqq512_mask:
	  case X86::BI__builtin_ia32_cvtps2qq512_mask:
	  case X86::BI__builtin_ia32_cvtps2uqq512_mask:
	  case X86::BI__builtin_ia32_cvtqq2pd512_mask:
	  case X86::BI__builtin_ia32_cvtqq2ps512_mask:
	  case X86::BI__builtin_ia32_cvtuqq2pd512_mask:
	  case X86::BI__builtin_ia32_cvtuqq2ps512_mask:
	  case X86::BI__builtin_ia32_sqrtpd512_mask:
	  case X86::BI__builtin_ia32_sqrtps512_mask:
	    ArgNum = 3;
	    HasRC = true;
	    break;
	  case X86::BI__builtin_ia32_addpd512_mask:
	  case X86::BI__builtin_ia32_addps512_mask:
	  case X86::BI__builtin_ia32_divpd512_mask:
	  case X86::BI__builtin_ia32_divps512_mask:
	  case X86::BI__builtin_ia32_mulpd512_mask:
	  case X86::BI__builtin_ia32_mulps512_mask:
	  case X86::BI__builtin_ia32_subpd512_mask:
	  case X86::BI__builtin_ia32_subps512_mask:
	  case X86::BI__builtin_ia32_addss_round_mask:
	  case X86::BI__builtin_ia32_addsd_round_mask:
	  case X86::BI__builtin_ia32_divss_round_mask:
	  case X86::BI__builtin_ia32_divsd_round_mask:
	  case X86::BI__builtin_ia32_mulss_round_mask:
	  case X86::BI__builtin_ia32_mulsd_round_mask:
	  case X86::BI__builtin_ia32_subss_round_mask:
	  case X86::BI__builtin_ia32_subsd_round_mask:
	  case X86::BI__builtin_ia32_scalefpd512_mask:
	  case X86::BI__builtin_ia32_scalefps512_mask:
	  case X86::BI__builtin_ia32_scalefsd_round_mask:
	  case X86::BI__builtin_ia32_scalefss_round_mask:
	  case X86::BI__builtin_ia32_getmantpd512_mask:
	  case X86::BI__builtin_ia32_getmantps512_mask:
	  case X86::BI__builtin_ia32_cvtsd2ss_round_mask:
	  case X86::BI__builtin_ia32_sqrtsd_round_mask:
	  case X86::BI__builtin_ia32_sqrtss_round_mask:
	  case X86::BI__builtin_ia32_vfmaddpd512_mask:
	  case X86::BI__builtin_ia32_vfmaddpd512_mask3:
	  case X86::BI__builtin_ia32_vfmaddpd512_maskz:
	  case X86::BI__builtin_ia32_vfmaddps512_mask:
	  case X86::BI__builtin_ia32_vfmaddps512_mask3:
	  case X86::BI__builtin_ia32_vfmaddps512_maskz:
	  case X86::BI__builtin_ia32_vfmaddsubpd512_mask:
	  case X86::BI__builtin_ia32_vfmaddsubpd512_mask3:
	  case X86::BI__builtin_ia32_vfmaddsubpd512_maskz:
	  case X86::BI__builtin_ia32_vfmaddsubps512_mask:
	  case X86::BI__builtin_ia32_vfmaddsubps512_mask3:
	  case X86::BI__builtin_ia32_vfmaddsubps512_maskz:
	  case X86::BI__builtin_ia32_vfmsubpd512_mask3:
	  case X86::BI__builtin_ia32_vfmsubps512_mask3:
	  case X86::BI__builtin_ia32_vfmsubaddpd512_mask3:
	  case X86::BI__builtin_ia32_vfmsubaddps512_mask3:
	  case X86::BI__builtin_ia32_vfnmaddpd512_mask:
	  case X86::BI__builtin_ia32_vfnmaddps512_mask:
	  case X86::BI__builtin_ia32_vfnmsubpd512_mask:
	  case X86::BI__builtin_ia32_vfnmsubpd512_mask3:
	  case X86::BI__builtin_ia32_vfnmsubps512_mask:
	  case X86::BI__builtin_ia32_vfnmsubps512_mask3:
	  case X86::BI__builtin_ia32_vfmaddsd3_mask:
	  case X86::BI__builtin_ia32_vfmaddsd3_maskz:
	  case X86::BI__builtin_ia32_vfmaddsd3_mask3:
	  case X86::BI__builtin_ia32_vfmaddss3_mask:
	  case X86::BI__builtin_ia32_vfmaddss3_maskz:
	  case X86::BI__builtin_ia32_vfmaddss3_mask3:
	    ArgNum = 4;
	    HasRC = true;
	    break;
	  case X86::BI__builtin_ia32_getmantsd_round_mask:
	  case X86::BI__builtin_ia32_getmantss_round_mask:
	    ArgNum = 5;
	    HasRC = true;
	    break;
	  }

	  llvm::APSInt Result;

	  // We can't check the value of a dependent argument.
	  Expr *Arg = TheCall->getArg(ArgNum);
	  if (Arg->isTypeDependent() || Arg->isValueDependent())
	    return false;

	  // Check constant-ness first.
	  if (SemaBuiltinConstantArg(TheCall, ArgNum, Result))
	    return true;

	  // Make sure rounding mode is either ROUND_CUR_DIRECTION or ROUND_NO_EXC bit
	  // is set. If the intrinsic has rounding control (bits 1:0), make sure it is
	  // only combined with ROUND_NO_EXC.
	  if (Result == 4/*ROUND_CUR_DIRECTION*/ ||
	      Result == 8/*ROUND_NO_EXC*/ ||
	      (HasRC && Result.getZExtValue() >= 8 && Result.getZExtValue() <= 11))
	    return false;

	  return Diag(TheCall->getLocStart(), diag::err_x86_builtin_invalid_rounding)
	         << Arg->getSourceRange();
	}

	// Check if the gather/scatter scale is legal.
	bool Sema::CheckX86BuiltinGatherScatterScale(unsigned BuiltinID,
	CallExpr *TheCall) {
	unsigned ArgNum = 0;
	switch (BuiltinID) {
	default:
	return false;
	case X86::BI__builtin_ia32_gatherpfdpd:
	case X86::BI__builtin_ia32_gatherpfdps:
	case X86::BI__builtin_ia32_gatherpfqpd:
	case X86::BI__builtin_ia32_gatherpfqps:
	case X86::BI__builtin_ia32_scatterpfdpd:
	case X86::BI__builtin_ia32_scatterpfdps:
	case X86::BI__builtin_ia32_scatterpfqpd:
	case X86::BI__builtin_ia32_scatterpfqps:
	ArgNum = 3;
	break;
	case X86::BI__builtin_ia32_gatherd_pd:
	case X86::BI__builtin_ia32_gatherd_pd256:
	case X86::BI__builtin_ia32_gatherq_pd:
	case X86::BI__builtin_ia32_gatherq_pd256:
	case X86::BI__builtin_ia32_gatherd_ps:
	case X86::BI__builtin_ia32_gatherd_ps256:
	case X86::BI__builtin_ia32_gatherq_ps:
	case X86::BI__builtin_ia32_gatherq_ps256:
	case X86::BI__builtin_ia32_gatherd_q:
	case X86::BI__builtin_ia32_gatherd_q256:
	case X86::BI__builtin_ia32_gatherq_q:
	case X86::BI__builtin_ia32_gatherq_q256:
	case X86::BI__builtin_ia32_gatherd_d:
	case X86::BI__builtin_ia32_gatherd_d256:
	case X86::BI__builtin_ia32_gatherq_d:
	case X86::BI__builtin_ia32_gatherq_d256:
	case X86::BI__builtin_ia32_gather3div2df:
	case X86::BI__builtin_ia32_gather3div2di:
	case X86::BI__builtin_ia32_gather3div4df:
	case X86::BI__builtin_ia32_gather3div4di:
	case X86::BI__builtin_ia32_gather3div4sf:
	case X86::BI__builtin_ia32_gather3div4si:
	case X86::BI__builtin_ia32_gather3div8sf:
	case X86::BI__builtin_ia32_gather3div8si:
	case X86::BI__builtin_ia32_gather3siv2df:
	case X86::BI__builtin_ia32_gather3siv2di:
	case X86::BI__builtin_ia32_gather3siv4df:
	case X86::BI__builtin_ia32_gather3siv4di:
	case X86::BI__builtin_ia32_gather3siv4sf:
	case X86::BI__builtin_ia32_gather3siv4si:
	case X86::BI__builtin_ia32_gather3siv8sf:
	case X86::BI__builtin_ia32_gather3siv8si:
	case X86::BI__builtin_ia32_gathersiv8df:
	case X86::BI__builtin_ia32_gathersiv16sf:
	case X86::BI__builtin_ia32_gatherdiv8df:
	case X86::BI__builtin_ia32_gatherdiv16sf:
	case X86::BI__builtin_ia32_gathersiv8di:
	case X86::BI__builtin_ia32_gathersiv16si:
	case X86::BI__builtin_ia32_gatherdiv8di:
	case X86::BI__builtin_ia32_gatherdiv16si:
	case X86::BI__builtin_ia32_scatterdiv2df:
	case X86::BI__builtin_ia32_scatterdiv2di:
	case X86::BI__builtin_ia32_scatterdiv4df:
	case X86::BI__builtin_ia32_scatterdiv4di:
	case X86::BI__builtin_ia32_scatterdiv4sf:
	case X86::BI__builtin_ia32_scatterdiv4si:
	case X86::BI__builtin_ia32_scatterdiv8sf:
	case X86::BI__builtin_ia32_scatterdiv8si:
	case X86::BI__builtin_ia32_scattersiv2df:
	case X86::BI__builtin_ia32_scattersiv2di:
	case X86::BI__builtin_ia32_scattersiv4df:
	case X86::BI__builtin_ia32_scattersiv4di:
	case X86::BI__builtin_ia32_scattersiv4sf:
	case X86::BI__builtin_ia32_scattersiv4si:
	case X86::BI__builtin_ia32_scattersiv8sf:
	case X86::BI__builtin_ia32_scattersiv8si:
	case X86::BI__builtin_ia32_scattersiv8df:
	case X86::BI__builtin_ia32_scattersiv16sf:
	case X86::BI__builtin_ia32_scatterdiv8df:
	case X86::BI__builtin_ia32_scatterdiv16sf:
	case X86::BI__builtin_ia32_scattersiv8di:
	case X86::BI__builtin_ia32_scattersiv16si:
	case X86::BI__builtin_ia32_scatterdiv8di:
	case X86::BI__builtin_ia32_scatterdiv16si:
	ArgNum = 4;
	break;
	}

	llvm::APSInt Result;

	// We can't check the value of a dependent argument.
	Expr *Arg = TheCall->getArg(ArgNum);
	if (Arg->isTypeDependent() \|\| Arg->isValueDependent())
	return false;

	// Check constant-ness first.
	if (SemaBuiltinConstantArg(TheCall, ArgNum, Result))
	return true;

	if (Result == 1 \|\| Result == 2 \|\| Result == 4 \|\| Result == 8)
	return false;

	return Diag(TheCall->getLocStart(), diag::err_x86_builtin_invalid_scale)
	<< Arg->getSourceRange();
	}

	/// Perform the X86-specific semantic checks for a builtin call.
	///
	/// __builtin_cpu_supports and __builtin_cpu_is are forwarded to their
	/// dedicated checkers. Every other builtin first goes through the
	/// rounding/SAE check and the gather/scatter scale check; a failure in
	/// either returns true immediately. Finally, builtins listed in the switch
	/// have one immediate argument range-checked: `i` is the argument index and
	/// `l`/`u` are the lower and upper bounds handed to
	/// SemaBuiltinConstantArgRange. Builtins not listed return false.
	bool Sema::CheckX86BuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
	if (BuiltinID == X86::BI__builtin_cpu_supports)
	return SemaBuiltinCpuSupports(*this, TheCall);

	if (BuiltinID == X86::BI__builtin_cpu_is)
	return SemaBuiltinCpuIs(*this, TheCall);

	// If the intrinsic has rounding or SAE make sure its valid.
	if (CheckX86BuiltinRoundingOrSAE(BuiltinID, TheCall))
	return true;

	// If the intrinsic has a gather/scatter scale immediate make sure its valid.
	if (CheckX86BuiltinGatherScatterScale(BuiltinID, TheCall))
	return true;

	// For intrinsics which take an immediate value as part of the instruction,
	// range check them here.
	// i: index of the immediate argument; l, u: bounds passed to the range check.
	int i = 0, l = 0, u = 0;
	switch (BuiltinID) {
	default:
	return false;
	case X86::BI_mm_prefetch:
	i = 1; l = 0; u = 7;
	break;
	case X86::BI__builtin_ia32_sha1rnds4:
	case X86::BI__builtin_ia32_shuf_f32x4_256_mask:
	case X86::BI__builtin_ia32_shuf_f64x2_256_mask:
	case X86::BI__builtin_ia32_shuf_i32x4_256_mask:
	case X86::BI__builtin_ia32_shuf_i64x2_256_mask:
	i = 2; l = 0; u = 3;
	break;
	case X86::BI__builtin_ia32_vpermil2pd:
	case X86::BI__builtin_ia32_vpermil2pd256:
	case X86::BI__builtin_ia32_vpermil2ps:
	case X86::BI__builtin_ia32_vpermil2ps256:
	i = 3; l = 0; u = 3;
	break;
	case X86::BI__builtin_ia32_cmpb128_mask:
	case X86::BI__builtin_ia32_cmpw128_mask:
	case X86::BI__builtin_ia32_cmpd128_mask:
	case X86::BI__builtin_ia32_cmpq128_mask:
	case X86::BI__builtin_ia32_cmpb256_mask:
	case X86::BI__builtin_ia32_cmpw256_mask:
	case X86::BI__builtin_ia32_cmpd256_mask:
	case X86::BI__builtin_ia32_cmpq256_mask:
	case X86::BI__builtin_ia32_cmpb512_mask:
	case X86::BI__builtin_ia32_cmpw512_mask:
	case X86::BI__builtin_ia32_cmpd512_mask:
	case X86::BI__builtin_ia32_cmpq512_mask:
	case X86::BI__builtin_ia32_ucmpb128_mask:
	case X86::BI__builtin_ia32_ucmpw128_mask:
	case X86::BI__builtin_ia32_ucmpd128_mask:
	case X86::BI__builtin_ia32_ucmpq128_mask:
	case X86::BI__builtin_ia32_ucmpb256_mask:
	case X86::BI__builtin_ia32_ucmpw256_mask:
	case X86::BI__builtin_ia32_ucmpd256_mask:
	case X86::BI__builtin_ia32_ucmpq256_mask:
	case X86::BI__builtin_ia32_ucmpb512_mask:
	case X86::BI__builtin_ia32_ucmpw512_mask:
	case X86::BI__builtin_ia32_ucmpd512_mask:
	case X86::BI__builtin_ia32_ucmpq512_mask:
	case X86::BI__builtin_ia32_vpcomub:
	case X86::BI__builtin_ia32_vpcomuw:
	case X86::BI__builtin_ia32_vpcomud:
	case X86::BI__builtin_ia32_vpcomuq:
	case X86::BI__builtin_ia32_vpcomb:
	case X86::BI__builtin_ia32_vpcomw:
	case X86::BI__builtin_ia32_vpcomd:
	case X86::BI__builtin_ia32_vpcomq:
	i = 2; l = 0; u = 7;
	break;
	case X86::BI__builtin_ia32_roundps:
	case X86::BI__builtin_ia32_roundpd:
	case X86::BI__builtin_ia32_roundps256:
	case X86::BI__builtin_ia32_roundpd256:
	i = 1; l = 0; u = 15;
	break;
	case X86::BI__builtin_ia32_roundss:
	case X86::BI__builtin_ia32_roundsd:
	case X86::BI__builtin_ia32_rangepd128_mask:
	case X86::BI__builtin_ia32_rangepd256_mask:
	case X86::BI__builtin_ia32_rangepd512_mask:
	case X86::BI__builtin_ia32_rangeps128_mask:
	case X86::BI__builtin_ia32_rangeps256_mask:
	case X86::BI__builtin_ia32_rangeps512_mask:
	case X86::BI__builtin_ia32_getmantsd_round_mask:
	case X86::BI__builtin_ia32_getmantss_round_mask:
	i = 2; l = 0; u = 15;
	break;
	case X86::BI__builtin_ia32_cmpps:
	case X86::BI__builtin_ia32_cmpss:
	case X86::BI__builtin_ia32_cmppd:
	case X86::BI__builtin_ia32_cmpsd:
	case X86::BI__builtin_ia32_cmpps256:
	case X86::BI__builtin_ia32_cmppd256:
	case X86::BI__builtin_ia32_cmpps128_mask:
	case X86::BI__builtin_ia32_cmppd128_mask:
	case X86::BI__builtin_ia32_cmpps256_mask:
	case X86::BI__builtin_ia32_cmppd256_mask:
	case X86::BI__builtin_ia32_cmpps512_mask:
	case X86::BI__builtin_ia32_cmppd512_mask:
	case X86::BI__builtin_ia32_cmpsd_mask:
	case X86::BI__builtin_ia32_cmpss_mask:
	i = 2; l = 0; u = 31;
	break;
	// The groups below with l = -128 admit negative immediates as well as
	// values up to 255.
	case X86::BI__builtin_ia32_xabort:
	i = 0; l = -128; u = 255;
	break;
	case X86::BI__builtin_ia32_pshufw:
	case X86::BI__builtin_ia32_aeskeygenassist128:
	i = 1; l = -128; u = 255;
	break;
	case X86::BI__builtin_ia32_vcvtps2ph:
	case X86::BI__builtin_ia32_vcvtps2ph_mask:
	case X86::BI__builtin_ia32_vcvtps2ph256:
	case X86::BI__builtin_ia32_vcvtps2ph256_mask:
	case X86::BI__builtin_ia32_vcvtps2ph512_mask:
	case X86::BI__builtin_ia32_rndscaleps_128_mask:
	case X86::BI__builtin_ia32_rndscalepd_128_mask:
	case X86::BI__builtin_ia32_rndscaleps_256_mask:
	case X86::BI__builtin_ia32_rndscalepd_256_mask:
	case X86::BI__builtin_ia32_rndscaleps_mask:
	case X86::BI__builtin_ia32_rndscalepd_mask:
	case X86::BI__builtin_ia32_reducepd128_mask:
	case X86::BI__builtin_ia32_reducepd256_mask:
	case X86::BI__builtin_ia32_reducepd512_mask:
	case X86::BI__builtin_ia32_reduceps128_mask:
	case X86::BI__builtin_ia32_reduceps256_mask:
	case X86::BI__builtin_ia32_reduceps512_mask:
	case X86::BI__builtin_ia32_prold512_mask:
	case X86::BI__builtin_ia32_prolq512_mask:
	case X86::BI__builtin_ia32_prold128_mask:
	case X86::BI__builtin_ia32_prold256_mask:
	case X86::BI__builtin_ia32_prolq128_mask:
	case X86::BI__builtin_ia32_prolq256_mask:
	case X86::BI__builtin_ia32_prord128_mask:
	case X86::BI__builtin_ia32_prord256_mask:
	case X86::BI__builtin_ia32_prorq128_mask:
	case X86::BI__builtin_ia32_prorq256_mask:
	case X86::BI__builtin_ia32_fpclasspd128_mask:
	case X86::BI__builtin_ia32_fpclasspd256_mask:
	case X86::BI__builtin_ia32_fpclassps128_mask:
	case X86::BI__builtin_ia32_fpclassps256_mask:
	case X86::BI__builtin_ia32_fpclassps512_mask:
	case X86::BI__builtin_ia32_fpclasspd512_mask:
	case X86::BI__builtin_ia32_fpclasssd_mask:
	case X86::BI__builtin_ia32_fpclassss_mask:
	i = 1; l = 0; u = 255;
	break;
	case X86::BI__builtin_ia32_palignr:
	case X86::BI__builtin_ia32_insertps128:
	case X86::BI__builtin_ia32_dpps:
	case X86::BI__builtin_ia32_dppd:
	case X86::BI__builtin_ia32_dpps256:
	case X86::BI__builtin_ia32_mpsadbw128:
	case X86::BI__builtin_ia32_mpsadbw256:
	case X86::BI__builtin_ia32_pcmpistrm128:
	case X86::BI__builtin_ia32_pcmpistri128:
	case X86::BI__builtin_ia32_pcmpistria128:
	case X86::BI__builtin_ia32_pcmpistric128:
	case X86::BI__builtin_ia32_pcmpistrio128:
	case X86::BI__builtin_ia32_pcmpistris128:
	case X86::BI__builtin_ia32_pcmpistriz128:
	case X86::BI__builtin_ia32_pclmulqdq128:
	case X86::BI__builtin_ia32_vperm2f128_pd256:
	case X86::BI__builtin_ia32_vperm2f128_ps256:
	case X86::BI__builtin_ia32_vperm2f128_si256:
	case X86::BI__builtin_ia32_permti256:
	i = 2; l = -128; u = 255;
	break;
	case X86::BI__builtin_ia32_palignr128:
	case X86::BI__builtin_ia32_palignr256:
	case X86::BI__builtin_ia32_palignr512_mask:
	case X86::BI__builtin_ia32_vcomisd:
	case X86::BI__builtin_ia32_vcomiss:
	case X86::BI__builtin_ia32_shuf_f32x4_mask:
	case X86::BI__builtin_ia32_shuf_f64x2_mask:
	case X86::BI__builtin_ia32_shuf_i32x4_mask:
	case X86::BI__builtin_ia32_shuf_i64x2_mask:
	case X86::BI__builtin_ia32_dbpsadbw128_mask:
	case X86::BI__builtin_ia32_dbpsadbw256_mask:
	case X86::BI__builtin_ia32_dbpsadbw512_mask:
	i = 2; l = 0; u = 255;
	break;
	case X86::BI__builtin_ia32_fixupimmpd512_mask:
	case X86::BI__builtin_ia32_fixupimmpd512_maskz:
	case X86::BI__builtin_ia32_fixupimmps512_mask:
	case X86::BI__builtin_ia32_fixupimmps512_maskz:
	case X86::BI__builtin_ia32_fixupimmsd_mask:
	case X86::BI__builtin_ia32_fixupimmsd_maskz:
	case X86::BI__builtin_ia32_fixupimmss_mask:
	case X86::BI__builtin_ia32_fixupimmss_maskz:
	case X86::BI__builtin_ia32_fixupimmpd128_mask:
	case X86::BI__builtin_ia32_fixupimmpd128_maskz:
	case X86::BI__builtin_ia32_fixupimmpd256_mask:
	case X86::BI__builtin_ia32_fixupimmpd256_maskz:
	case X86::BI__builtin_ia32_fixupimmps128_mask:
	case X86::BI__builtin_ia32_fixupimmps128_maskz:
	case X86::BI__builtin_ia32_fixupimmps256_mask:
	case X86::BI__builtin_ia32_fixupimmps256_maskz:
	case X86::BI__builtin_ia32_pternlogd512_mask:
	case X86::BI__builtin_ia32_pternlogd512_maskz:
	case X86::BI__builtin_ia32_pternlogq512_mask:
	case X86::BI__builtin_ia32_pternlogq512_maskz:
	case X86::BI__builtin_ia32_pternlogd128_mask:
	case X86::BI__builtin_ia32_pternlogd128_maskz:
	case X86::BI__builtin_ia32_pternlogd256_mask:
	case X86::BI__builtin_ia32_pternlogd256_maskz:
	case X86::BI__builtin_ia32_pternlogq128_mask:
	case X86::BI__builtin_ia32_pternlogq128_maskz:
	case X86::BI__builtin_ia32_pternlogq256_mask:
	case X86::BI__builtin_ia32_pternlogq256_maskz:
	i = 3; l = 0; u = 255;
	break;
	// The prefetch gather/scatter builtins also had argument 3 checked as a
	// scale by CheckX86BuiltinGatherScatterScale above; here argument 4 is
	// restricted to 2 or 3.
	case X86::BI__builtin_ia32_gatherpfdpd:
	case X86::BI__builtin_ia32_gatherpfdps:
	case X86::BI__builtin_ia32_gatherpfqpd:
	case X86::BI__builtin_ia32_gatherpfqps:
	case X86::BI__builtin_ia32_scatterpfdpd:
	case X86::BI__builtin_ia32_scatterpfdps:
	case X86::BI__builtin_ia32_scatterpfqpd:
	case X86::BI__builtin_ia32_scatterpfqps:
	i = 4; l = 2; u = 3;
	break;
	case X86::BI__builtin_ia32_pcmpestrm128:
	case X86::BI__builtin_ia32_pcmpestri128:
	case X86::BI__builtin_ia32_pcmpestria128:
	case X86::BI__builtin_ia32_pcmpestric128:
	case X86::BI__builtin_ia32_pcmpestrio128:
	case X86::BI__builtin_ia32_pcmpestris128:
	case X86::BI__builtin_ia32_pcmpestriz128:
	i = 4; l = -128; u = 255;
	break;
	case X86::BI__builtin_ia32_rndscalesd_round_mask:
	case X86::BI__builtin_ia32_rndscaless_round_mask:
	i = 4; l = 0; u = 255;
	break;
	}
	return SemaBuiltinConstantArgRange(TheCall, i, l, u);
	}

	/// Given a FunctionDecl's FormatAttr, attempts to populate the
	/// FormatStringInfo parameter with the FormatAttr's zero-based format_idx and
	/// firstDataArg. A first-argument value of 0 in the attribute marks a
	/// va_list-style function; FirstDataArg is then 0.
	///
	/// Returns true when the format fits the function and the FormatStringInfo
	/// has been populated; returns false for a C++ member whose format index
	/// would refer to the implicit this argument.
	bool Sema::getFormatStringInfo(const FormatAttr *Format, bool IsCXXMember,
	                               FormatStringInfo *FSI) {
	  // The attribute stores one-based positions; convert them to zero-based.
	  FSI->HasVAListArg = Format->getFirstArg() == 0;
	  FSI->FormatIdx = Format->getFormatIdx() - 1;
	  if (FSI->HasVAListArg)
	    FSI->FirstDataArg = 0;
	  else
	    FSI->FirstDataArg = Format->getFirstArg() - 1;

	  if (!IsCXXMember)
	    return true;

	  // The way the format attribute works in GCC, the implicit this argument
	  // of member functions is counted. However, it doesn't appear in our own
	  // lists, so shift both indices down by one in that case.
	  if (FSI->FormatIdx == 0)
	    return false;
	  --FSI->FormatIdx;
	  if (FSI->FirstDataArg != 0)
	    --FSI->FirstDataArg;
	  return true;
	}

	/// Checks whether the given expression evaluates to null.
	///
	/// \brief Returns true if the value is known to evaluate to null. Returns
	/// false when the expression has a non-null type, is value-dependent, cannot
	/// be evaluated as a boolean condition, or evaluates to a non-null value.
	static bool CheckNonNullExpr(Sema &S, const Expr *Expr) {
	  // An expression of non-null type never evaluates to null.
	  auto Nullability =
	      Expr->IgnoreImplicit()->getType()->getNullability(S.Context);
	  if (Nullability && *Nullability == NullabilityKind::NonNull)
	    return false;

	  // As a special case, transparent unions initialized with zero are
	  // considered null for the purposes of the nonnull attribute, so look at
	  // the first initializer of the compound literal instead.
	  const RecordType *UT = Expr->getType()->getAsUnionType();
	  if (UT && UT->getDecl()->hasAttr<TransparentUnionAttr>()) {
	    if (const auto *CLE = dyn_cast<CompoundLiteralExpr>(Expr))
	      if (const auto *ILE = dyn_cast<InitListExpr>(CLE->getInitializer()))
	        Expr = ILE->getInit(0);
	  }

	  if (Expr->isValueDependent())
	    return false;

	  bool Result;
	  if (!Expr->EvaluateAsBooleanCondition(Result, S.Context))
	    return false;
	  return !Result;
	}

	/// Emit warn_null_arg at \p CallSiteLoc (through DiagRuntimeBehavior) when
	/// CheckNonNullExpr reports that \p ArgExpr evaluates to null.
	static void CheckNonNullArgument(Sema &S,
	                                 const Expr *ArgExpr,
	                                 SourceLocation CallSiteLoc) {
	  if (!CheckNonNullExpr(S, ArgExpr))
	    return;

	  S.DiagRuntimeBehavior(CallSiteLoc, ArgExpr,
	                        S.PDiag(diag::warn_null_arg)
	                            << ArgExpr->getSourceRange());
	}

	/// If \p Format describes an NSString format and its indices can be
	/// resolved for a non-member function, store the zero-based format index in
	/// \p Idx and return true. Otherwise return false and leave \p Idx untouched.
	bool Sema::GetFormatNSStringIdx(const FormatAttr *Format, unsigned &Idx) {
	  if (GetFormatStringType(Format) != FST_NSString)
	    return false;

	  FormatStringInfo FSI;
	  if (!getFormatStringInfo(Format, false, &FSI))
	    return false;

	  Idx = FSI.FormatIdx;
	  return true;
	}

	/// \brief Diagnose use of %s directive in an NSString which is being passed
	/// as formatting string to formatting method.
	///
	/// The format argument index is 2 for declarations in the CFString
	/// formatting family; otherwise it comes from the first FormatAttr for which
	/// GetFormatNSStringIdx succeeds. Nothing is diagnosed when no index is
	/// found, when the call has too few arguments, or when the format argument
	/// is not an ObjC or plain string literal (one C-style cast plus parentheses
	/// and implicit casts are looked through).
	static void
	DiagnoseCStringFormatDirectiveInCFAPI(Sema &S,
	                                      const NamedDecl *FDecl,
	                                      Expr **Args,
	                                      unsigned NumArgs) {
	  unsigned Idx = 0;
	  bool Format = false;
	  ObjCStringFormatFamily SFFamily = FDecl->getObjCFStringFormattingFamily();
	  if (SFFamily == ObjCStringFormatFamily::SFF_CFString) {
	    Idx = 2;
	    Format = true;
	  } else {
	    for (const auto *I : FDecl->specific_attrs<FormatAttr>()) {
	      if (S.GetFormatNSStringIdx(I, Idx)) {
	        Format = true;
	        break;
	      }
	    }
	  }

	  // No format argument known, or the call does not supply it.
	  if (!Format || NumArgs <= Idx)
	    return;

	  const Expr *FormatExpr = Args[Idx];
	  if (const CStyleCastExpr *CSCE = dyn_cast<CStyleCastExpr>(FormatExpr))
	    FormatExpr = CSCE->getSubExpr();

	  const StringLiteral *FormatString;
	  if (const ObjCStringLiteral *OSL =
	          dyn_cast<ObjCStringLiteral>(FormatExpr->IgnoreParenImpCasts()))
	    FormatString = OSL->getString();
	  else
	    FormatString = dyn_cast<StringLiteral>(FormatExpr->IgnoreParenImpCasts());
	  if (!FormatString)
	    return;

	  if (S.FormatStringHasSArg(FormatString)) {
	    S.Diag(FormatExpr->getExprLoc(), diag::warn_objc_cdirective_format_string)
	        << "%s" << 1 << 1;
	    S.Diag(FDecl->getLocation(), diag::note_entity_declared_at)
	        << FDecl->getDeclName();
	  }
	}

	/// Determine whether the given type carries a nullability annotation and
	/// that annotation is NullabilityKind::NonNull.
	static bool isNonNullType(ASTContext &ctx, QualType type) {
	  auto nullability = type->getNullability(ctx);
	  if (!nullability)
	    return false;

	  return *nullability == NullabilityKind::NonNull;
	}

	/// Diagnose null arguments passed to parameters that must not be null.
	///
	/// The set of non-null parameters is collected from, in order:
	///  - nonnull attributes on the declaration itself (an attribute without
	///    an argument list covers every argument of valid pointer type and ends
	///    the check immediately);
	///  - nonnull attributes or non-null nullability on the parameters of a
	///    function or ObjC method declaration;
	///  - for any other declaration, non-null nullability on the parameter
	///    types of the prototype, which is dug out of the declaration's type if
	///    the caller did not supply one.
	/// Each selected argument is then passed to CheckNonNullArgument.
	///
	/// At least one of \p FDecl and \p Proto must be non-null.
	static void CheckNonNullArguments(Sema &S,
	                                  const NamedDecl *FDecl,
	                                  const FunctionProtoType *Proto,
	                                  ArrayRef<const Expr *> Args,
	                                  SourceLocation CallSiteLoc) {
	  assert((FDecl || Proto) && "Need a function declaration or prototype");

	  // Check the attributes attached to the method/function itself.
	  // The bit vector is sized lazily, only once a non-null parameter is found.
	  llvm::SmallBitVector NonNullArgs;
	  if (FDecl) {
	    // Handle the nonnull attribute on the function/method declaration itself.
	    for (const auto *NonNull : FDecl->specific_attrs<NonNullAttr>()) {
	      if (!NonNull->args_size()) {
	        // Easy case: all pointer arguments are nonnull.
	        for (const auto *Arg : Args)
	          if (S.isValidPointerAttrType(Arg->getType()))
	            CheckNonNullArgument(S, Arg, CallSiteLoc);
	        return;
	      }

	      for (unsigned Val : NonNull->args()) {
	        // Ignore attribute indices beyond the arguments actually passed.
	        if (Val >= Args.size())
	          continue;
	        if (NonNullArgs.empty())
	          NonNullArgs.resize(Args.size());
	        NonNullArgs.set(Val);
	      }
	    }
	  }

	  if (FDecl && (isa<FunctionDecl>(FDecl) || isa<ObjCMethodDecl>(FDecl))) {
	    // Handle the nonnull attribute on the parameters of the
	    // function/method.
	    ArrayRef<ParmVarDecl*> parms;
	    if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(FDecl))
	      parms = FD->parameters();
	    else
	      parms = cast<ObjCMethodDecl>(FDecl)->parameters();

	    unsigned ParamIndex = 0;
	    for (ArrayRef<ParmVarDecl*>::iterator I = parms.begin(), E = parms.end();
	         I != E; ++I, ++ParamIndex) {
	      const ParmVarDecl *PVD = *I;
	      if (PVD->hasAttr<NonNullAttr>() ||
	          isNonNullType(S.Context, PVD->getType())) {
	        if (NonNullArgs.empty())
	          NonNullArgs.resize(Args.size());

	        NonNullArgs.set(ParamIndex);
	      }
	    }
	  } else {
	    // If we have a non-function, non-method declaration but no
	    // function prototype, try to dig out the function prototype.
	    if (!Proto) {
	      if (const ValueDecl *VD = dyn_cast<ValueDecl>(FDecl)) {
	        QualType type = VD->getType().getNonReferenceType();
	        if (auto pointerType = type->getAs<PointerType>())
	          type = pointerType->getPointeeType();
	        else if (auto blockType = type->getAs<BlockPointerType>())
	          type = blockType->getPointeeType();
	        // FIXME: data member pointers?

	        // Dig out the function prototype, if there is one.
	        Proto = type->getAs<FunctionProtoType>();
	      }
	    }

	    // Fill in non-null argument information from the nullability
	    // information on the parameter types (if we have them).
	    if (Proto) {
	      unsigned Index = 0;
	      for (auto paramType : Proto->getParamTypes()) {
	        if (isNonNullType(S.Context, paramType)) {
	          if (NonNullArgs.empty())
	            NonNullArgs.resize(Args.size());

	          NonNullArgs.set(Index);
	        }

	        ++Index;
	      }
	    }
	  }

	  // Check for non-null arguments.
	  for (unsigned ArgIndex = 0, ArgIndexEnd = NonNullArgs.size();
	       ArgIndex != ArgIndexEnd; ++ArgIndex) {
	    if (NonNullArgs[ArgIndex])
	      CheckNonNullArgument(S, Args[ArgIndex], CallSiteLoc);
	  }
	}

	/// Handles the checks for format strings, non-POD arguments to vararg
	/// functions, NULL arguments passed to non-NULL parameters, and diagnose_if
	/// attributes.
	void Sema::checkCall(NamedDecl FDecl, const FunctionProtoType Proto,
	const Expr ThisArg, ArrayRef<const Expr > Args,
	bool IsMemberFunction, SourceLocation Loc,
	SourceRange Range, VariadicCallType CallType) {
	// FIXME: We should check as much as we can in the template definition.
	if (CurContext->isDependentContext())
	return;

	// Printf and scanf checking.
	llvm::SmallBitVector CheckedVarArgs;
	if (FDecl) {
	for (const auto *I : FDecl->specific_attrs<FormatAttr>()) {
	// Only create vector if there are format attributes.
	CheckedVarArgs.resize(Args.size());

	CheckFormatArguments(I, Args, IsMemberFunction, CallType, Loc, Range,
	CheckedVarArgs);
	}
	}

	// Refuse POD arguments that weren't caught by the format string
	// checks above.
	auto *FD = dyn_cast_or_null<FunctionDecl>(FDecl);
	if (CallType != VariadicDoesNotApply &&
	(!FD \|\| FD->getBuiltinID() != Builtin::BI__noop)) {
	unsigned NumParams = Proto ? Proto->getNumParams()
	: FDecl && isa<FunctionDecl>(FDecl)
	? cast<FunctionDecl>(FDecl)->getNumParams()
	: FDecl && isa<ObjCMethodDecl>(FDecl)
	? cast<ObjCMethodDecl>(FDecl)->param_size()
	: 0;

	for (unsigned ArgIdx = NumParams; ArgIdx < Args.size(); ++ArgIdx) {
	// Args[ArgIdx] can be null in malformed code.
	if (const Expr *Arg = Args[ArgIdx]) {
	if (CheckedVarArgs.empty() \|\| !CheckedVarArgs[ArgIdx])
	checkVariadicArgument(Arg, CallType);
	}
	}
	}

	if (FDecl \|\| Proto) {
	CheckNonNullArguments(*this, FDecl, Proto, Args, Loc);

	// Type safety checking.
	if (FDecl) {
	for (const auto *I : FDecl->specific_attrs<ArgumentWithTypeTagAttr>())
	CheckArgumentWithTypeTag(I, Args, Loc);
	}
	}

	if (FD)
	diagnoseArgDependentDiagnoseIfAttrs(FD, ThisArg, Args, Loc);
	}

	/// CheckConstructorCall - Check a constructor call for correctness and safety
	/// properties not enforced by the C type system.
	///
	/// \p Proto is dereferenced unconditionally and so must be non-null. The
	/// call is forwarded to checkCall as a member function with no this
	/// argument and an empty source range.
	void Sema::CheckConstructorCall(FunctionDecl *FDecl,
	                                ArrayRef<const Expr *> Args,
	                                const FunctionProtoType *Proto,
	                                SourceLocation Loc) {
	  VariadicCallType CallType =
	      Proto->isVariadic() ? VariadicConstructor : VariadicDoesNotApply;
	  checkCall(FDecl, Proto, /*ThisArg=*/nullptr, Args, /*IsMemberFunction=*/true,
	            Loc, SourceRange(), CallType);
	}

	/// CheckFunctionCall - Check a direct function call for various correctness
	/// and safety properties not strictly enforced by the C type system.
	bool Sema::CheckFunctionCall(FunctionDecl FDecl, CallExpr TheCall,
	const FunctionProtoType *Proto) {
	bool IsMemberOperatorCall = isa<CXXOperatorCallExpr>(TheCall) &&
	isa<CXXMethodDecl>(FDecl);
	bool IsMemberFunction = isa<CXXMemberCallExpr>(TheCall) \|\|
	IsMemberOperatorCall;
	VariadicCallType CallType = getVariadicCallType(FDecl, Proto,
	TheCall->getCallee());
	Expr** Args = TheCall->getArgs();
	unsigned NumArgs = TheCall->getNumArgs();

	Expr *ImplicitThis = nullptr;
	if (IsMemberOperatorCall) {
	// If this is a call to a member operator, hide the first argument
	// from checkCall.
	// FIXME: Our choice of AST representation here is less than ideal.
	ImplicitThis = Args[0];
	++Args;
	--NumArgs;
	} else if (IsMemberFunction)
	ImplicitThis =
	cast<CXXMemberCallExpr>(TheCall)->getImplicitObjectArgument();

	checkCall(FDecl, Proto, ImplicitThis, llvm::makeArrayRef(Args, NumArgs),
	IsMemberFunction, TheCall->getRParenLoc(),
	TheCall->getCallee()->getSourceRange(), CallType);

	IdentifierInfo *FnInfo = FDecl->getIdentifier();
	// None of the checks below are needed for functions that don't have
	// simple names (e.g., C++ conversion functions).
	if (!FnInfo)
	return false;

	CheckAbsoluteValueFunction(TheCall, FDecl);
	CheckMaxUnsignedZero(TheCall, FDecl);

	if (getLangOpts().ObjC1)
	DiagnoseCStringFormatDirectiveInCFAPI(*this, FDecl, Args, NumArgs);

	unsigned CMId = FDecl->getMemoryFunctionKind();
	if (CMId == 0)
	return false;

	// Handle memory setting and copying functions.
	if (CMId == Builtin::BIstrlcpy \|\| CMId == Builtin::BIstrlcat)
	CheckStrlcpycatArguments(TheCall, FnInfo);
	else if (CMId == Builtin::BIstrncat)
	CheckStrncatArguments(TheCall, FnInfo);
	else
	CheckMemaccessArguments(TheCall, CMId, FnInfo);

	return false;
	}

	/// CheckObjCMethodCall - Run the generic call checks on an Objective-C
	/// message send.
	///
	/// \param Method the method being invoked.
	/// \param lbrac  source location forwarded to checkCall.
	/// \param Args   the message arguments.
	///
	/// \returns false on every path.
	bool Sema::CheckObjCMethodCall(ObjCMethodDecl *Method, SourceLocation lbrac,
	                               ArrayRef<const Expr *> Args) {
	  VariadicCallType CallType =
	      Method->isVariadic() ? VariadicMethod : VariadicDoesNotApply;

	  // No prototype is passed for a method, so checkCall works from the
	  // declaration alone.
	  checkCall(Method, nullptr, /*ThisArg=*/nullptr, Args,
	            /*IsMemberFunction=*/false, lbrac, Method->getSourceRange(),
	            CallType);

	  return false;
	}

	bool Sema::CheckPointerCall(NamedDecl NDecl, CallExpr TheCall,
	const FunctionProtoType *Proto) {
	QualType Ty;
	if (const auto *V = dyn_cast<VarDecl>(NDecl))
	Ty = V->getType().getNonReferenceType();
	else if (const auto *F = dyn_cast<FieldDecl>(NDecl))
	Ty = F->getType().getNonReferenceType();
	else
	return false;

	if (!Ty->isBlockPointerType() && !Ty->isFunctionPointerType() &&
	!Ty->isFunctionProtoType())
	return false;

	VariadicCallType CallType;
	if (!Proto \|\| !Proto->isVariadic()) {
	CallType = VariadicDoesNotApply;
	} else if (Ty->isBlockPointerType()) {
	CallType = VariadicBlock;
	} else { // Ty->isFunctionPointerType()
	CallType = VariadicFunction;
	}

	checkCall(NDecl, Proto, /ThisArg=/nullptr,
	llvm::makeArrayRef(TheCall->getArgs(), TheCall->getNumArgs()),
	/IsMemberFunction=/false, TheCall->getRParenLoc(),
	TheCall->getCallee()->getSourceRange(), CallType);

	return false;
	}

	/// Checks function calls when a FunctionDecl or a NamedDecl is not available,
	/// such as function pointers returned from functions.
	bool Sema::CheckOtherCall(CallExpr TheCall, const FunctionProtoType Proto) {
	VariadicCallType CallType = getVariadicCallType(/FDecl=/nullptr, Proto,
	TheCall->getCallee());
	checkCall(/FDecl=/nullptr, Proto, /ThisArg=/nullptr,
	llvm::makeArrayRef(TheCall->getArgs(), TheCall->getNumArgs()),
	/IsMemberFunction=/false, TheCall->getRParenLoc(),
	TheCall->getCallee()->getSourceRange(), CallType);

	return false;
	}

	/// Decide whether the memory ordering \p Ordering may be used with the atomic
	/// operation \p Op.
	///
	/// Values that are not valid C ABI orderings are always rejected. Loads
	/// additionally reject release and acq_rel; stores additionally reject
	/// consume, acquire and acq_rel. Every other operation accepts any valid
	/// ordering. The init operations have no ordering argument and must not be
	/// passed here.
	static bool isValidOrderingForOp(int64_t Ordering, AtomicExpr::AtomicOp Op) {
	  if (!llvm::isValidAtomicOrderingCABI(Ordering))
	    return false;

	  using CABI = llvm::AtomicOrderingCABI;
	  const CABI Order = (CABI)Ordering;

	  switch (Op) {
	  case AtomicExpr::AO__c11_atomic_init:
	  case AtomicExpr::AO__opencl_atomic_init:
	    llvm_unreachable("There is no ordering argument for an init");

	  case AtomicExpr::AO__c11_atomic_load:
	  case AtomicExpr::AO__opencl_atomic_load:
	  case AtomicExpr::AO__atomic_load_n:
	  case AtomicExpr::AO__atomic_load: {
	    // A load may not use an ordering with release semantics.
	    const bool Releases = Order == CABI::release || Order == CABI::acq_rel;
	    return !Releases;
	  }

	  case AtomicExpr::AO__c11_atomic_store:
	  case AtomicExpr::AO__opencl_atomic_store:
	  case AtomicExpr::AO__atomic_store:
	  case AtomicExpr::AO__atomic_store_n: {
	    // A store may not use an ordering with acquire/consume semantics.
	    const bool Acquires = Order == CABI::consume ||
	                          Order == CABI::acquire ||
	                          Order == CABI::acq_rel;
	    return !Acquires;
	  }

	  default:
	    return true;
	  }
	}

	/// SemaAtomicOpsOverloaded - Type-check a call to one of the overloaded
	/// atomic builtins (the __c11_atomic_*, __opencl_atomic_* and __atomic_*
	/// families) and rebuild it as an AtomicExpr.
	///
	/// The operation is first classified into one of a small number of argument
	/// "forms"; the argument count, the pointer operand, the value operands and
	/// the ordering/scope operands are then validated and converted, and finally
	/// the arguments are permuted into the order the AtomicExpr is built with.
	///
	/// \param TheCallResult the call being checked; it must hold a CallExpr whose
	///                      callee (ignoring parens and casts) is a DeclRefExpr.
	/// \param Op            the atomic operation the callee names.
	///
	/// \returns the newly created AtomicExpr on success, or an invalid result
	/// if any check fails.
	ExprResult Sema::SemaAtomicOpsOverloaded(ExprResult TheCallResult,
	AtomicExpr::AtomicOp Op) {
	CallExpr *TheCall = cast<CallExpr>(TheCallResult.get());
	DeclRefExpr *DRE =cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());

	// All the non-OpenCL operations take one of the following forms.
	// The OpenCL operations take the __c11 forms with one extra argument for
	// synchronization scope.
	enum {
	// C __c11_atomic_init(A *, C)
	Init,

	// C __c11_atomic_load(A *, int)
	Load,

	// void __atomic_load(A *, CP, int)
	LoadCopy,

	// void __atomic_store(A *, CP, int)
	Copy,

	// C __c11_atomic_add(A *, M, int)
	Arithmetic,

	// C __atomic_exchange_n(A *, CP, int)
	Xchg,

	// void __atomic_exchange(A *, C *, CP, int)
	GNUXchg,

	// bool __c11_atomic_compare_exchange_strong(A *, C *, CP, int, int)
	C11CmpXchg,

	// bool __atomic_compare_exchange(A *, C *, CP, bool, int, int)
	GNUCmpXchg
	} Form = Init;

	// Both tables are indexed by Form. NumArgs is the exact argument count of
	// the non-OpenCL builtin; NumVals is the number of arguments following the
	// pointer that are typed as value operands (the rest are converted to int).
	const unsigned NumForm = GNUCmpXchg + 1;
	const unsigned NumArgs[] = { 2, 2, 3, 3, 3, 3, 4, 5, 6 };
	const unsigned NumVals[] = { 1, 0, 1, 1, 1, 1, 2, 2, 3 };
	// where:
	// C is an appropriate type,
	// A is volatile _Atomic(C) for __c11 builtins and is C for GNU builtins,
	// CP is C for __c11 builtins and GNU _n builtins and is C * otherwise,
	// M is C if C is an integer, and ptrdiff_t if C is a pointer, and
	// the int parameters are for orderings.

	static_assert(sizeof(NumArgs)/sizeof(NumArgs[0]) == NumForm
	&& sizeof(NumVals)/sizeof(NumVals[0]) == NumForm,
	"need to update code for modified forms");
	// The range tests below rely on the __c11 operations being contiguous and
	// immediately followed by the GNU ones.
	static_assert(AtomicExpr::AO__c11_atomic_init == 0 &&
	AtomicExpr::AO__c11_atomic_fetch_xor + 1 ==
	AtomicExpr::AO__atomic_load,
	"need to update code for modified C11 atomics");
	bool IsOpenCL = Op >= AtomicExpr::AO__opencl_atomic_init &&
	Op <= AtomicExpr::AO__opencl_atomic_fetch_max;
	// OpenCL operations are treated as C11-style operations throughout.
	bool IsC11 = (Op >= AtomicExpr::AO__c11_atomic_init &&
	Op <= AtomicExpr::AO__c11_atomic_fetch_xor) \|\|
	IsOpenCL;
	bool IsN = Op == AtomicExpr::AO__atomic_load_n \|\|
	Op == AtomicExpr::AO__atomic_store_n \|\|
	Op == AtomicExpr::AO__atomic_exchange_n \|\|
	Op == AtomicExpr::AO__atomic_compare_exchange_n;
	bool IsAddSub = false;

	// Classify the operation into its argument form.
	switch (Op) {
	case AtomicExpr::AO__c11_atomic_init:
	case AtomicExpr::AO__opencl_atomic_init:
	Form = Init;
	break;

	case AtomicExpr::AO__c11_atomic_load:
	case AtomicExpr::AO__opencl_atomic_load:
	case AtomicExpr::AO__atomic_load_n:
	Form = Load;
	break;

	case AtomicExpr::AO__atomic_load:
	Form = LoadCopy;
	break;

	case AtomicExpr::AO__c11_atomic_store:
	case AtomicExpr::AO__opencl_atomic_store:
	case AtomicExpr::AO__atomic_store:
	case AtomicExpr::AO__atomic_store_n:
	Form = Copy;
	break;

	// Note that the OpenCL fetch_min/fetch_max operations are grouped with
	// add/sub here, so they are subject to the same value-type rules below.
	case AtomicExpr::AO__c11_atomic_fetch_add:
	case AtomicExpr::AO__c11_atomic_fetch_sub:
	case AtomicExpr::AO__opencl_atomic_fetch_add:
	case AtomicExpr::AO__opencl_atomic_fetch_sub:
	case AtomicExpr::AO__opencl_atomic_fetch_min:
	case AtomicExpr::AO__opencl_atomic_fetch_max:
	case AtomicExpr::AO__atomic_fetch_add:
	case AtomicExpr::AO__atomic_fetch_sub:
	case AtomicExpr::AO__atomic_add_fetch:
	case AtomicExpr::AO__atomic_sub_fetch:
	IsAddSub = true;
	LLVM_FALLTHROUGH;
	case AtomicExpr::AO__c11_atomic_fetch_and:
	case AtomicExpr::AO__c11_atomic_fetch_or:
	case AtomicExpr::AO__c11_atomic_fetch_xor:
	case AtomicExpr::AO__opencl_atomic_fetch_and:
	case AtomicExpr::AO__opencl_atomic_fetch_or:
	case AtomicExpr::AO__opencl_atomic_fetch_xor:
	case AtomicExpr::AO__atomic_fetch_and:
	case AtomicExpr::AO__atomic_fetch_or:
	case AtomicExpr::AO__atomic_fetch_xor:
	case AtomicExpr::AO__atomic_fetch_nand:
	case AtomicExpr::AO__atomic_and_fetch:
	case AtomicExpr::AO__atomic_or_fetch:
	case AtomicExpr::AO__atomic_xor_fetch:
	case AtomicExpr::AO__atomic_nand_fetch:
	Form = Arithmetic;
	break;

	case AtomicExpr::AO__c11_atomic_exchange:
	case AtomicExpr::AO__opencl_atomic_exchange:
	case AtomicExpr::AO__atomic_exchange_n:
	Form = Xchg;
	break;

	case AtomicExpr::AO__atomic_exchange:
	Form = GNUXchg;
	break;

	case AtomicExpr::AO__c11_atomic_compare_exchange_strong:
	case AtomicExpr::AO__c11_atomic_compare_exchange_weak:
	case AtomicExpr::AO__opencl_atomic_compare_exchange_strong:
	case AtomicExpr::AO__opencl_atomic_compare_exchange_weak:
	Form = C11CmpXchg;
	break;

	case AtomicExpr::AO__atomic_compare_exchange:
	case AtomicExpr::AO__atomic_compare_exchange_n:
	Form = GNUCmpXchg;
	break;
	}

	// Every OpenCL operation except init takes one additional trailing
	// argument (the synchronization scope).
	unsigned AdjustedNumArgs = NumArgs[Form];
	if (IsOpenCL && Op != AtomicExpr::AO__opencl_atomic_init)
	++AdjustedNumArgs;
	// Check we have the right number of arguments.
	if (TheCall->getNumArgs() < AdjustedNumArgs) {
	Diag(TheCall->getLocEnd(), diag::err_typecheck_call_too_few_args)
	<< 0 << AdjustedNumArgs << TheCall->getNumArgs()
	<< TheCall->getCallee()->getSourceRange();
	return ExprError();
	} else if (TheCall->getNumArgs() > AdjustedNumArgs) {
	Diag(TheCall->getArg(AdjustedNumArgs)->getLocStart(),
	diag::err_typecheck_call_too_many_args)
	<< 0 << AdjustedNumArgs << TheCall->getNumArgs()
	<< TheCall->getCallee()->getSourceRange();
	return ExprError();
	}

	// Inspect the first argument of the atomic operation.
	Expr *Ptr = TheCall->getArg(0);
	ExprResult ConvertedPtr = DefaultFunctionArrayLvalueConversion(Ptr);
	if (ConvertedPtr.isInvalid())
	return ExprError();

	Ptr = ConvertedPtr.get();
	const PointerType *pointerType = Ptr->getType()->getAs<PointerType>();
	if (!pointerType) {
	Diag(DRE->getLocStart(), diag::err_atomic_builtin_must_be_pointer)
	<< Ptr->getType() << Ptr->getSourceRange();
	return ExprError();
	}

	// For a __c11 builtin, this should be a pointer to an _Atomic type.
	QualType AtomTy = pointerType->getPointeeType(); // 'A'
	QualType ValType = AtomTy; // 'C'
	if (IsC11) {
	if (!AtomTy->isAtomicType()) {
	Diag(DRE->getLocStart(), diag::err_atomic_op_needs_atomic)
	<< Ptr->getType() << Ptr->getSourceRange();
	return ExprError();
	}
	// The diagnostic's first operand selects between the "const" (0) and the
	// "constant address space" (1) wording.
	if (AtomTy.isConstQualified() \|\|
	AtomTy.getAddressSpace() == LangAS::opencl_constant) {
	Diag(DRE->getLocStart(), diag::err_atomic_op_needs_non_const_atomic)
	<< (AtomTy.isConstQualified() ? 0 : 1) << Ptr->getType()
	<< Ptr->getSourceRange();
	return ExprError();
	}
	ValType = AtomTy->getAs<AtomicType>()->getValueType();
	} else if (Form != Load && Form != LoadCopy) {
	// GNU builtins other than loads write through the pointer, so the pointee
	// must not be const.
	if (ValType.isConstQualified()) {
	Diag(DRE->getLocStart(), diag::err_atomic_op_needs_non_const_pointer)
	<< Ptr->getType() << Ptr->getSourceRange();
	return ExprError();
	}
	}

	// For an arithmetic operation, the implied arithmetic must be well-formed.
	if (Form == Arithmetic) {
	// gcc does not enforce these rules for GNU atomics, but we do so for sanity.
	if (IsAddSub && !ValType->isIntegerType() && !ValType->isPointerType()) {
	Diag(DRE->getLocStart(), diag::err_atomic_op_needs_atomic_int_or_ptr)
	<< IsC11 << Ptr->getType() << Ptr->getSourceRange();
	return ExprError();
	}
	if (!IsAddSub && !ValType->isIntegerType()) {
	Diag(DRE->getLocStart(), diag::err_atomic_op_bitwise_needs_atomic_int)
	<< IsC11 << Ptr->getType() << Ptr->getSourceRange();
	return ExprError();
	}
	// C11-style arithmetic on a pointer value requires a complete pointee type.
	if (IsC11 && ValType->isPointerType() &&
	RequireCompleteType(Ptr->getLocStart(), ValType->getPointeeType(),
	diag::err_incomplete_type)) {
	return ExprError();
	}
	} else if (IsN && !ValType->isIntegerType() && !ValType->isPointerType()) {
	// For __atomic_*_n operations, the value type must be a scalar integral or
	// pointer type which is 1, 2, 4, 8 or 16 bytes in length.
	Diag(DRE->getLocStart(), diag::err_atomic_op_needs_atomic_int_or_ptr)
	<< IsC11 << Ptr->getType() << Ptr->getSourceRange();
	return ExprError();
	}

	if (!IsC11 && !AtomTy.isTriviallyCopyableType(Context) &&
	!AtomTy->isScalarType()) {
	// For GNU atomics, require a trivially-copyable type. This is not part of
	// the GNU atomics specification, but we enforce it for sanity.
	Diag(DRE->getLocStart(), diag::err_atomic_op_needs_trivial_copy)
	<< Ptr->getType() << Ptr->getSourceRange();
	return ExprError();
	}

	// Reject value types that carry an Objective-C ownership qualifier.
	switch (ValType.getObjCLifetime()) {
	case Qualifiers::OCL_None:
	case Qualifiers::OCL_ExplicitNone:
	// okay
	break;

	case Qualifiers::OCL_Weak:
	case Qualifiers::OCL_Strong:
	case Qualifiers::OCL_Autoreleasing:
	// FIXME: Can this happen? By this point, ValType should be known
	// to be trivially copyable.
	Diag(DRE->getLocStart(), diag::err_arc_atomic_ownership)
	<< ValType << Ptr->getSourceRange();
	return ExprError();
	}

	// atomic_fetch_or takes a pointer to a volatile 'A'. We shouldn't let the
	// volatile-ness of the pointee-type inject itself into the result or the
	// other operands. Similarly atomic_load can take a pointer to a const 'A'.
	ValType.removeLocalVolatile();
	ValType.removeLocalConst();
	// The result is the value type by default, void for the forms that return
	// nothing, and bool for the compare-exchange forms.
	QualType ResultType = ValType;
	if (Form == Copy \|\| Form == LoadCopy \|\| Form == GNUXchg \|\|
	Form == Init)
	ResultType = Context.VoidTy;
	else if (Form == C11CmpXchg \|\| Form == GNUCmpXchg)
	ResultType = Context.BoolTy;

	// The type of a parameter passed 'by value'. In the GNU atomics, such
	// arguments are actually passed as pointers.
	QualType ByValType = ValType; // 'CP'
	if (!IsC11 && !IsN)
	ByValType = Ptr->getType();

	// The first argument --- the pointer --- has a fixed type; we
	// deduce the types of the rest of the arguments accordingly. Walk
	// the remaining arguments, converting them to the deduced value type.
	for (unsigned i = 1; i != TheCall->getNumArgs(); ++i) {
	QualType Ty;
	// Arguments 1..NumVals[Form] are value operands; everything after them is
	// an ordering or scope.
	if (i < NumVals[Form] + 1) {
	switch (i) {
	case 1:
	// The second argument is the non-atomic operand. For arithmetic, this
	// is always passed by value, and for a compare_exchange it is always
	// passed by address. For the rest, GNU uses by-address and C11 uses
	// by-value.
	assert(Form != Load);
	if (Form == Init \|\| (Form == Arithmetic && ValType->isIntegerType()))
	Ty = ValType;
	else if (Form == Copy \|\| Form == Xchg)
	Ty = ByValType;
	else if (Form == Arithmetic)
	Ty = Context.getPointerDiffType();
	else {
	Expr *ValArg = TheCall->getArg(i);
	// Treat this argument as _Nonnull as we want to show a warning if
	// NULL is passed into it.
	CheckNonNullArgument(*this, ValArg, DRE->getLocStart());
	LangAS AS = LangAS::Default;
	// Keep address space of non-atomic pointer type.
	if (const PointerType *PtrTy =
	ValArg->getType()->getAs<PointerType>()) {
	AS = PtrTy->getPointeeType().getAddressSpace();
	}
	Ty = Context.getPointerType(
	Context.getAddrSpaceQualType(ValType.getUnqualifiedType(), AS));
	}
	break;
	case 2:
	// The third argument to compare_exchange / GNU exchange is a
	// (pointer to a) desired value.
	Ty = ByValType;
	break;
	case 3:
	// The fourth argument to GNU compare_exchange is a 'weak' flag.
	Ty = Context.BoolTy;
	break;
	}
	} else {
	// The order(s) and scope are always converted to int.
	Ty = Context.IntTy;
	}

	InitializedEntity Entity =
	InitializedEntity::InitializeParameter(Context, Ty, false);
	ExprResult Arg = TheCall->getArg(i);
	Arg = PerformCopyInitialization(Entity, SourceLocation(), Arg);
	// NOTE(review): every other failure path returns ExprError(); `true` is
	// presumably converted to an invalid ExprResult as well -- confirm.
	if (Arg.isInvalid())
	return true;
	TheCall->setArg(i, Arg.get());
	}

	// Permute the arguments into a 'consistent' order.
	// The pointer always comes first; for every form except Init the ordering
	// is pushed second, which the memory-order check below relies on.
	SmallVector<Expr*, 5> SubExprs;
	SubExprs.push_back(Ptr);
	switch (Form) {
	case Init:
	// Note, AtomicExpr::getVal1() has a special case for this atomic.
	SubExprs.push_back(TheCall->getArg(1)); // Val1
	break;
	case Load:
	SubExprs.push_back(TheCall->getArg(1)); // Order
	break;
	case LoadCopy:
	case Copy:
	case Arithmetic:
	case Xchg:
	SubExprs.push_back(TheCall->getArg(2)); // Order
	SubExprs.push_back(TheCall->getArg(1)); // Val1
	break;
	case GNUXchg:
	// Note, AtomicExpr::getVal2() has a special case for this atomic.
	SubExprs.push_back(TheCall->getArg(3)); // Order
	SubExprs.push_back(TheCall->getArg(1)); // Val1
	SubExprs.push_back(TheCall->getArg(2)); // Val2
	break;
	case C11CmpXchg:
	SubExprs.push_back(TheCall->getArg(3)); // Order
	SubExprs.push_back(TheCall->getArg(1)); // Val1
	SubExprs.push_back(TheCall->getArg(4)); // OrderFail
	SubExprs.push_back(TheCall->getArg(2)); // Val2
	break;
	case GNUCmpXchg:
	SubExprs.push_back(TheCall->getArg(4)); // Order
	SubExprs.push_back(TheCall->getArg(1)); // Val1
	SubExprs.push_back(TheCall->getArg(5)); // OrderFail
	SubExprs.push_back(TheCall->getArg(2)); // Val2
	SubExprs.push_back(TheCall->getArg(3)); // Weak
	break;
	}

	// Warn (without failing) when a constant memory order is not valid for
	// this operation. Init is skipped because its second operand is a value,
	// not an ordering.
	if (SubExprs.size() >= 2 && Form != Init) {
	llvm::APSInt Result(32);
	if (SubExprs[1]->isIntegerConstantExpr(Result, Context) &&
	!isValidOrderingForOp(Result.getSExtValue(), Op))
	Diag(SubExprs[1]->getLocStart(),
	diag::warn_atomic_op_has_invalid_memory_order)
	<< SubExprs[1]->getSourceRange();
	}

	// For operations that have a scope model, the scope is the last call
	// argument; it is diagnosed if it is an invalid constant and is appended
	// to the sub-expressions either way.
	if (auto ScopeModel = AtomicExpr::getScopeModel(Op)) {
	auto *Scope = TheCall->getArg(TheCall->getNumArgs() - 1);
	llvm::APSInt Result(32);
	if (Scope->isIntegerConstantExpr(Result, Context) &&
	!ScopeModel->isValid(Result.getZExtValue())) {
	Diag(Scope->getLocStart(), diag::err_atomic_op_has_invalid_synch_scope)
	<< Scope->getSourceRange();
	}
	SubExprs.push_back(Scope);
	}

	AtomicExpr *AE = new (Context) AtomicExpr(TheCall->getCallee()->getLocStart(),
	SubExprs, ResultType, Op,
	TheCall->getRParenLoc());

	// Diagnose C11/OpenCL loads and stores that would need an unsupported
	// library call; the diagnostic operand is 0 for a load and 1 for a store.
	// The AtomicExpr is still returned afterwards.
	if ((Op == AtomicExpr::AO__c11_atomic_load \|\|
	Op == AtomicExpr::AO__c11_atomic_store \|\|
	Op == AtomicExpr::AO__opencl_atomic_load \|\|
	Op == AtomicExpr::AO__opencl_atomic_store ) &&
	Context.AtomicUsesUnsupportedLibcall(AE))
	Diag(AE->getLocStart(), diag::err_atomic_load_store_uses_lib)
	<< ((Op == AtomicExpr::AO__c11_atomic_load \|\|
	Op == AtomicExpr::AO__opencl_atomic_load)
	? 0 : 1);

	return AE;
	}

	/// checkBuiltinArgument - Given a call to a builtin function, perform
	/// normal type-checking on the given argument, updating the call in
	/// place. This is useful when a builtin function requires custom
	/// type-checking for some of its arguments but not necessarily all of
	/// them.
	///
	/// \param S        the semantic analyzer performing the conversion.
	/// \param E        the builtin call; it must have a direct callee.
	/// \param ArgIndex index of the argument (and of the matching parameter of
	///                 the callee) to check.
	///
	/// Returns true on error.
	static bool checkBuiltinArgument(Sema &S, CallExpr *E, unsigned ArgIndex) {
	  FunctionDecl *Fn = E->getDirectCallee();
	  assert(Fn && "builtin call without direct callee!");

	  ParmVarDecl *Param = Fn->getParamDecl(ArgIndex);
	  InitializedEntity Entity =
	      InitializedEntity::InitializeParameter(S.Context, Param);

	  // Convert the argument that corresponds to Param. Reading argument 0 here
	  // would check the wrong expression for any ArgIndex other than 0 and then
	  // overwrite argument ArgIndex with a copy of argument 0.
	  ExprResult Arg = E->getArg(ArgIndex);
	  Arg = S.PerformCopyInitialization(Entity, SourceLocation(), Arg);
	  if (Arg.isInvalid())
	    return true;

	  E->setArg(ArgIndex, Arg.get());
	  return false;
	}

	/// SemaBuiltinAtomicOverloaded - We have a call to a function like
	/// __sync_fetch_and_add, which is an overloaded function based on the pointer
	/// type of its first argument. The main ActOnCallExpr routines have already
	/// promoted the types of arguments because all of these calls are prototyped as
	/// void(...).
	///
	/// This function goes through and does final semantic checking for these
	/// builtins. On success the callee is replaced by the size-specific builtin
	/// (e.g. __sync_fetch_and_add_4), the fixed value arguments are converted to
	/// the pointee type, the call's type is set, and TheCallResult is returned.
	/// On failure ExprError() is returned.
	ExprResult
	Sema::SemaBuiltinAtomicOverloaded(ExprResult TheCallResult) {
	  CallExpr *TheCall = (CallExpr *)TheCallResult.get();
	  DeclRefExpr *DRE =cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
	  FunctionDecl *FDecl = cast<FunctionDecl>(DRE->getDecl());

	  // Ensure that we have at least one argument to do type inference from.
	  if (TheCall->getNumArgs() < 1) {
	    Diag(TheCall->getLocEnd(), diag::err_typecheck_call_too_few_args_at_least)
	      << 0 << 1 << TheCall->getNumArgs()
	      << TheCall->getCallee()->getSourceRange();
	    return ExprError();
	  }

	  // Inspect the first argument of the atomic builtin.  This should always be
	  // a pointer type, whose element is an integral scalar or pointer type.
	  // Because it is a pointer type, we don't have to worry about any implicit
	  // casts here.
	  // FIXME: We don't allow floating point scalars as input.
	  Expr *FirstArg = TheCall->getArg(0);
	  ExprResult FirstArgResult = DefaultFunctionArrayLvalueConversion(FirstArg);
	  if (FirstArgResult.isInvalid())
	    return ExprError();
	  FirstArg = FirstArgResult.get();
	  TheCall->setArg(0, FirstArg);

	  const PointerType *pointerType = FirstArg->getType()->getAs<PointerType>();
	  if (!pointerType) {
	    Diag(DRE->getLocStart(), diag::err_atomic_builtin_must_be_pointer)
	      << FirstArg->getType() << FirstArg->getSourceRange();
	    return ExprError();
	  }

	  QualType ValType = pointerType->getPointeeType();
	  if (!ValType->isIntegerType() && !ValType->isAnyPointerType() &&
	      !ValType->isBlockPointerType()) {
	    Diag(DRE->getLocStart(), diag::err_atomic_builtin_must_be_pointer_intptr)
	      << FirstArg->getType() << FirstArg->getSourceRange();
	    return ExprError();
	  }

	  // Reject pointee types that carry an Objective-C ownership qualifier.
	  switch (ValType.getObjCLifetime()) {
	  case Qualifiers::OCL_None:
	  case Qualifiers::OCL_ExplicitNone:
	    // okay
	    break;

	  case Qualifiers::OCL_Weak:
	  case Qualifiers::OCL_Strong:
	  case Qualifiers::OCL_Autoreleasing:
	    Diag(DRE->getLocStart(), diag::err_arc_atomic_ownership)
	      << ValType << FirstArg->getSourceRange();
	    return ExprError();
	  }

	  // Strip any qualifiers off ValType.
	  ValType = ValType.getUnqualifiedType();

	  // The majority of builtins return a value, but a few have special return
	  // types, so allow them to override appropriately below.
	  QualType ResultType = ValType;

	  // We need to figure out which concrete builtin this maps onto.  For example,
	  // __sync_fetch_and_add with a 2 byte object turns into
	  // __sync_fetch_and_add_2.
	#define BUILTIN_ROW(x) \
	  { Builtin::BI##x##_1, Builtin::BI##x##_2, Builtin::BI##x##_4, \
	    Builtin::BI##x##_8, Builtin::BI##x##_16 }

	  // One row per builtin family (indexed by BuiltinIndex below), one column
	  // per operand size (indexed by SizeIndex below).
	  static const unsigned BuiltinIndices[][5] = {
	    BUILTIN_ROW(__sync_fetch_and_add),
	    BUILTIN_ROW(__sync_fetch_and_sub),
	    BUILTIN_ROW(__sync_fetch_and_or),
	    BUILTIN_ROW(__sync_fetch_and_and),
	    BUILTIN_ROW(__sync_fetch_and_xor),
	    BUILTIN_ROW(__sync_fetch_and_nand),

	    BUILTIN_ROW(__sync_add_and_fetch),
	    BUILTIN_ROW(__sync_sub_and_fetch),
	    BUILTIN_ROW(__sync_and_and_fetch),
	    BUILTIN_ROW(__sync_or_and_fetch),
	    BUILTIN_ROW(__sync_xor_and_fetch),
	    BUILTIN_ROW(__sync_nand_and_fetch),

	    BUILTIN_ROW(__sync_val_compare_and_swap),
	    BUILTIN_ROW(__sync_bool_compare_and_swap),
	    BUILTIN_ROW(__sync_lock_test_and_set),
	    BUILTIN_ROW(__sync_lock_release),
	    BUILTIN_ROW(__sync_swap)
	  };
	#undef BUILTIN_ROW

	  // Determine the index of the size.
	  unsigned SizeIndex;
	  switch (Context.getTypeSizeInChars(ValType).getQuantity()) {
	  case 1: SizeIndex = 0; break;
	  case 2: SizeIndex = 1; break;
	  case 4: SizeIndex = 2; break;
	  case 8: SizeIndex = 3; break;
	  case 16: SizeIndex = 4; break;
	  default:
	    Diag(DRE->getLocStart(), diag::err_atomic_builtin_pointer_size)
	      << FirstArg->getType() << FirstArg->getSourceRange();
	    return ExprError();
	  }

	  // Each of these builtins has one pointer argument, followed by some number of
	  // values (0, 1 or 2) followed by a potentially empty varargs list of stuff
	  // that we ignore.  Find out which row of BuiltinIndices to read from as well
	  // as the number of fixed args.
	  unsigned BuiltinID = FDecl->getBuiltinID();
	  unsigned BuiltinIndex, NumFixed = 1;
	  bool WarnAboutSemanticsChange = false;
	  switch (BuiltinID) {
	  default: llvm_unreachable("Unknown overloaded atomic builtin!");
	  case Builtin::BI__sync_fetch_and_add:
	  case Builtin::BI__sync_fetch_and_add_1:
	  case Builtin::BI__sync_fetch_and_add_2:
	  case Builtin::BI__sync_fetch_and_add_4:
	  case Builtin::BI__sync_fetch_and_add_8:
	  case Builtin::BI__sync_fetch_and_add_16:
	    BuiltinIndex = 0;
	    break;

	  case Builtin::BI__sync_fetch_and_sub:
	  case Builtin::BI__sync_fetch_and_sub_1:
	  case Builtin::BI__sync_fetch_and_sub_2:
	  case Builtin::BI__sync_fetch_and_sub_4:
	  case Builtin::BI__sync_fetch_and_sub_8:
	  case Builtin::BI__sync_fetch_and_sub_16:
	    BuiltinIndex = 1;
	    break;

	  case Builtin::BI__sync_fetch_and_or:
	  case Builtin::BI__sync_fetch_and_or_1:
	  case Builtin::BI__sync_fetch_and_or_2:
	  case Builtin::BI__sync_fetch_and_or_4:
	  case Builtin::BI__sync_fetch_and_or_8:
	  case Builtin::BI__sync_fetch_and_or_16:
	    BuiltinIndex = 2;
	    break;

	  case Builtin::BI__sync_fetch_and_and:
	  case Builtin::BI__sync_fetch_and_and_1:
	  case Builtin::BI__sync_fetch_and_and_2:
	  case Builtin::BI__sync_fetch_and_and_4:
	  case Builtin::BI__sync_fetch_and_and_8:
	  case Builtin::BI__sync_fetch_and_and_16:
	    BuiltinIndex = 3;
	    break;

	  case Builtin::BI__sync_fetch_and_xor:
	  case Builtin::BI__sync_fetch_and_xor_1:
	  case Builtin::BI__sync_fetch_and_xor_2:
	  case Builtin::BI__sync_fetch_and_xor_4:
	  case Builtin::BI__sync_fetch_and_xor_8:
	  case Builtin::BI__sync_fetch_and_xor_16:
	    BuiltinIndex = 4;
	    break;

	  case Builtin::BI__sync_fetch_and_nand:
	  case Builtin::BI__sync_fetch_and_nand_1:
	  case Builtin::BI__sync_fetch_and_nand_2:
	  case Builtin::BI__sync_fetch_and_nand_4:
	  case Builtin::BI__sync_fetch_and_nand_8:
	  case Builtin::BI__sync_fetch_and_nand_16:
	    BuiltinIndex = 5;
	    WarnAboutSemanticsChange = true;
	    break;

	  case Builtin::BI__sync_add_and_fetch:
	  case Builtin::BI__sync_add_and_fetch_1:
	  case Builtin::BI__sync_add_and_fetch_2:
	  case Builtin::BI__sync_add_and_fetch_4:
	  case Builtin::BI__sync_add_and_fetch_8:
	  case Builtin::BI__sync_add_and_fetch_16:
	    BuiltinIndex = 6;
	    break;

	  case Builtin::BI__sync_sub_and_fetch:
	  case Builtin::BI__sync_sub_and_fetch_1:
	  case Builtin::BI__sync_sub_and_fetch_2:
	  case Builtin::BI__sync_sub_and_fetch_4:
	  case Builtin::BI__sync_sub_and_fetch_8:
	  case Builtin::BI__sync_sub_and_fetch_16:
	    BuiltinIndex = 7;
	    break;

	  case Builtin::BI__sync_and_and_fetch:
	  case Builtin::BI__sync_and_and_fetch_1:
	  case Builtin::BI__sync_and_and_fetch_2:
	  case Builtin::BI__sync_and_and_fetch_4:
	  case Builtin::BI__sync_and_and_fetch_8:
	  case Builtin::BI__sync_and_and_fetch_16:
	    BuiltinIndex = 8;
	    break;

	  case Builtin::BI__sync_or_and_fetch:
	  case Builtin::BI__sync_or_and_fetch_1:
	  case Builtin::BI__sync_or_and_fetch_2:
	  case Builtin::BI__sync_or_and_fetch_4:
	  case Builtin::BI__sync_or_and_fetch_8:
	  case Builtin::BI__sync_or_and_fetch_16:
	    BuiltinIndex = 9;
	    break;

	  case Builtin::BI__sync_xor_and_fetch:
	  case Builtin::BI__sync_xor_and_fetch_1:
	  case Builtin::BI__sync_xor_and_fetch_2:
	  case Builtin::BI__sync_xor_and_fetch_4:
	  case Builtin::BI__sync_xor_and_fetch_8:
	  case Builtin::BI__sync_xor_and_fetch_16:
	    BuiltinIndex = 10;
	    break;

	  case Builtin::BI__sync_nand_and_fetch:
	  case Builtin::BI__sync_nand_and_fetch_1:
	  case Builtin::BI__sync_nand_and_fetch_2:
	  case Builtin::BI__sync_nand_and_fetch_4:
	  case Builtin::BI__sync_nand_and_fetch_8:
	  case Builtin::BI__sync_nand_and_fetch_16:
	    BuiltinIndex = 11;
	    WarnAboutSemanticsChange = true;
	    break;

	  case Builtin::BI__sync_val_compare_and_swap:
	  case Builtin::BI__sync_val_compare_and_swap_1:
	  case Builtin::BI__sync_val_compare_and_swap_2:
	  case Builtin::BI__sync_val_compare_and_swap_4:
	  case Builtin::BI__sync_val_compare_and_swap_8:
	  case Builtin::BI__sync_val_compare_and_swap_16:
	    BuiltinIndex = 12;
	    NumFixed = 2;
	    break;

	  case Builtin::BI__sync_bool_compare_and_swap:
	  case Builtin::BI__sync_bool_compare_and_swap_1:
	  case Builtin::BI__sync_bool_compare_and_swap_2:
	  case Builtin::BI__sync_bool_compare_and_swap_4:
	  case Builtin::BI__sync_bool_compare_and_swap_8:
	  case Builtin::BI__sync_bool_compare_and_swap_16:
	    BuiltinIndex = 13;
	    NumFixed = 2;
	    ResultType = Context.BoolTy;
	    break;

	  case Builtin::BI__sync_lock_test_and_set:
	  case Builtin::BI__sync_lock_test_and_set_1:
	  case Builtin::BI__sync_lock_test_and_set_2:
	  case Builtin::BI__sync_lock_test_and_set_4:
	  case Builtin::BI__sync_lock_test_and_set_8:
	  case Builtin::BI__sync_lock_test_and_set_16:
	    BuiltinIndex = 14;
	    break;

	  case Builtin::BI__sync_lock_release:
	  case Builtin::BI__sync_lock_release_1:
	  case Builtin::BI__sync_lock_release_2:
	  case Builtin::BI__sync_lock_release_4:
	  case Builtin::BI__sync_lock_release_8:
	  case Builtin::BI__sync_lock_release_16:
	    BuiltinIndex = 15;
	    NumFixed = 0;
	    ResultType = Context.VoidTy;
	    break;

	  case Builtin::BI__sync_swap:
	  case Builtin::BI__sync_swap_1:
	  case Builtin::BI__sync_swap_2:
	  case Builtin::BI__sync_swap_4:
	  case Builtin::BI__sync_swap_8:
	  case Builtin::BI__sync_swap_16:
	    BuiltinIndex = 16;
	    break;
	  }

	  // Now that we know how many fixed arguments we expect, first check that we
	  // have at least that many.
	  if (TheCall->getNumArgs() < 1+NumFixed) {
	    Diag(TheCall->getLocEnd(), diag::err_typecheck_call_too_few_args_at_least)
	      << 0 << 1+NumFixed << TheCall->getNumArgs()
	      << TheCall->getCallee()->getSourceRange();
	    return ExprError();
	  }

	  // Set only for the nand families above.
	  if (WarnAboutSemanticsChange) {
	    Diag(TheCall->getLocEnd(), diag::warn_sync_fetch_and_nand_semantics_change)
	      << TheCall->getCallee()->getSourceRange();
	  }

	  // Get the decl for the concrete builtin from this, we can tell what the
	  // concrete integer type we should convert to is.
	  unsigned NewBuiltinID = BuiltinIndices[BuiltinIndex][SizeIndex];
	  const char *NewBuiltinName = Context.BuiltinInfo.getName(NewBuiltinID);
	  FunctionDecl *NewBuiltinDecl;
	  if (NewBuiltinID == BuiltinID)
	    NewBuiltinDecl = FDecl;
	  else {
	    // Perform builtin lookup to avoid redeclaring it.
	    DeclarationName DN(&Context.Idents.get(NewBuiltinName));
	    LookupResult Res(*this, DN, DRE->getLocStart(), LookupOrdinaryName);
	    LookupName(Res, TUScope, /*AllowBuiltinCreation=*/true);
	    assert(Res.getFoundDecl());
	    NewBuiltinDecl = dyn_cast<FunctionDecl>(Res.getFoundDecl());
	    if (!NewBuiltinDecl)
	      return ExprError();
	  }

	  // The first argument --- the pointer --- has a fixed type; we
	  // deduce the types of the rest of the arguments accordingly.  Walk
	  // the remaining arguments, converting them to the deduced value type.
	  for (unsigned i = 0; i != NumFixed; ++i) {
	    ExprResult Arg = TheCall->getArg(i+1);

	    // GCC does an implicit conversion to the pointer or integer ValType.  This
	    // can fail in some cases (1i -> int**), check for this error case now.
	    // Initialize the argument.
	    InitializedEntity Entity = InitializedEntity::InitializeParameter(Context,
	                                                   ValType, /*consume*/ false);
	    Arg = PerformCopyInitialization(Entity, SourceLocation(), Arg);
	    if (Arg.isInvalid())
	      return ExprError();

	    // Okay, we have something that can be converted to the right type.  Check
	    // to see if there is a potentially weird extension going on here.  This can
	    // happen when you do an atomic operation on something like an char* and
	    // pass in 42.  The 42 gets converted to char.  This is even more strange
	    // for things like 45.123 -> char, etc.
	    // FIXME: Do this check.
	    TheCall->setArg(i+1, Arg.get());
	  }

	  // Note: from here on this local shadows the Context used above.
	  ASTContext& Context = this->getASTContext();

	  // Create a new DeclRefExpr to refer to the new decl.
	  DeclRefExpr* NewDRE = DeclRefExpr::Create(
	      Context,
	      DRE->getQualifierLoc(),
	      SourceLocation(),
	      NewBuiltinDecl,
	      /*enclosing*/ false,
	      DRE->getLocation(),
	      Context.BuiltinFnTy,
	      DRE->getValueKind());

	  // Set the callee in the CallExpr.
	  // FIXME: This loses syntactic information.
	  QualType CalleePtrTy = Context.getPointerType(NewBuiltinDecl->getType());
	  ExprResult PromotedCall = ImpCastExprToType(NewDRE, CalleePtrTy,
	                                              CK_BuiltinFnToFnPtr);
	  TheCall->setCallee(PromotedCall.get());

	  // Change the result type of the call to match the original value type. This
	  // is arbitrary, but the codegen for these builtins is designed to handle it
	  // gracefully.
	  TheCall->setType(ResultType);

	  return TheCallResult;
	}

	/// SemaBuiltinNontemporalOverloaded - We have a call to
	/// __builtin_nontemporal_store or __builtin_nontemporal_load, which is an
	/// overloaded function based on the pointer type of its last argument.
	///
	/// This function goes through and does final semantic checking for these
	/// builtins. For a load the call's type becomes the pointee type; for a
	/// store the value argument is converted to the pointee type and the call's
	/// type becomes void. Returns TheCallResult on success and ExprError() on
	/// failure.
	ExprResult Sema::SemaBuiltinNontemporalOverloaded(ExprResult TheCallResult) {
	  CallExpr *TheCall = (CallExpr *)TheCallResult.get();
	  DeclRefExpr *DRE =
	      cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
	  FunctionDecl *FDecl = cast<FunctionDecl>(DRE->getDecl());
	  unsigned BuiltinID = FDecl->getBuiltinID();
	  assert((BuiltinID == Builtin::BI__builtin_nontemporal_store ||
	          BuiltinID == Builtin::BI__builtin_nontemporal_load) &&
	         "Unexpected nontemporal load/store builtin!");
	  bool isStore = BuiltinID == Builtin::BI__builtin_nontemporal_store;
	  // A store takes (value, pointer); a load takes only the pointer.
	  unsigned numArgs = isStore ? 2 : 1;

	  // Ensure that we have the proper number of arguments.
	  if (checkArgCount(*this, TheCall, numArgs))
	    return ExprError();

	  // Inspect the last argument of the nontemporal builtin.  This should always
	  // be a pointer type, from which we imply the type of the memory access.
	  // Because it is a pointer type, we don't have to worry about any implicit
	  // casts here.
	  Expr *PointerArg = TheCall->getArg(numArgs - 1);
	  ExprResult PointerArgResult =
	      DefaultFunctionArrayLvalueConversion(PointerArg);

	  if (PointerArgResult.isInvalid())
	    return ExprError();
	  PointerArg = PointerArgResult.get();
	  TheCall->setArg(numArgs - 1, PointerArg);

	  const PointerType *pointerType = PointerArg->getType()->getAs<PointerType>();
	  if (!pointerType) {
	    Diag(DRE->getLocStart(), diag::err_nontemporal_builtin_must_be_pointer)
	        << PointerArg->getType() << PointerArg->getSourceRange();
	    return ExprError();
	  }

	  QualType ValType = pointerType->getPointeeType();

	  // Strip any qualifiers off ValType.
	  ValType = ValType.getUnqualifiedType();
	  if (!ValType->isIntegerType() && !ValType->isAnyPointerType() &&
	      !ValType->isBlockPointerType() && !ValType->isFloatingType() &&
	      !ValType->isVectorType()) {
	    Diag(DRE->getLocStart(),
	         diag::err_nontemporal_builtin_must_be_pointer_intfltptr_or_vector)
	        << PointerArg->getType() << PointerArg->getSourceRange();
	    return ExprError();
	  }

	  // A load has no value operand to convert; it simply yields the pointee type.
	  if (!isStore) {
	    TheCall->setType(ValType);
	    return TheCallResult;
	  }

	  // For a store, convert the value operand (argument 0) to the pointee type.
	  ExprResult ValArg = TheCall->getArg(0);
	  InitializedEntity Entity = InitializedEntity::InitializeParameter(
	      Context, ValType, /*consume*/ false);
	  ValArg = PerformCopyInitialization(Entity, SourceLocation(), ValArg);
	  if (ValArg.isInvalid())
	    return ExprError();

	  TheCall->setArg(0, ValArg.get());
	  TheCall->setType(Context.VoidTy);
	  return TheCallResult;
	}

	/// CheckObjCString - Checks that the argument to the builtin
	/// CFString constructor is correct
	/// Note: It might also make sense to do the UTF-16 conversion here (would
	/// simplify the backend).
	///
	/// \param Arg the argument expression; parentheses and casts are ignored.
	/// \returns true (after emitting an error) if the argument is not an ASCII
	/// string literal, false otherwise.  A literal that fails a strict UTF-8 to
	/// UTF-16 conversion only produces a warning.
	bool Sema::CheckObjCString(Expr *Arg) {
	  Arg = Arg->IgnoreParenCasts();
	  StringLiteral *Literal = dyn_cast<StringLiteral>(Arg);

	  if (!Literal || !Literal->isAscii()) {
	    Diag(Arg->getLocStart(), diag::err_cfstring_literal_not_string_constant)
	        << Arg->getSourceRange();
	    return true;
	  }

	  if (Literal->containsNonAsciiOrNull()) {
	    StringRef String = Literal->getString();
	    unsigned NumBytes = String.size();
	    // The UTF-16 output buffer is sized with one code unit per input byte.
	    SmallVector<llvm::UTF16, 128> ToBuf(NumBytes);
	    const llvm::UTF8 *FromPtr = (const llvm::UTF8 *)String.data();
	    llvm::UTF16 *ToPtr = &ToBuf[0];

	    llvm::ConversionResult Result =
	        llvm::ConvertUTF8toUTF16(&FromPtr, FromPtr + NumBytes, &ToPtr,
	                                 ToPtr + NumBytes, llvm::strictConversion);
	    // Check for conversion failure.
	    if (Result != llvm::conversionOK)
	      Diag(Arg->getLocStart(),
	           diag::warn_cfstring_truncated) << Arg->getSourceRange();
	  }
	  return false;
	}

	/// CheckOSLogFormatStringArg - Checks that the format string argument to the
	/// os_log() and os_trace() functions is correct, and converts it to
	/// const char *.
	///
	/// \param Arg the format argument; parentheses and casts are ignored.  Either
	/// a plain string literal or an Objective-C string literal is accepted, and
	/// the literal must be ASCII or UTF-8.
	/// \returns the literal copy-initialized to 'const char *', or ExprError()
	/// after emitting err_os_log_format_not_string_constant.
	ExprResult Sema::CheckOSLogFormatStringArg(Expr *Arg) {
	  Arg = Arg->IgnoreParenCasts();
	  auto *Literal = dyn_cast<StringLiteral>(Arg);
	  if (!Literal) {
	    // Look through an Objective-C string literal to its underlying string.
	    if (auto *ObjcLiteral = dyn_cast<ObjCStringLiteral>(Arg)) {
	      Literal = ObjcLiteral->getString();
	    }
	  }

	  if (!Literal || (!Literal->isAscii() && !Literal->isUTF8())) {
	    return ExprError(
	        Diag(Arg->getLocStart(), diag::err_os_log_format_not_string_constant)
	        << Arg->getSourceRange());
	  }

	  ExprResult Result(Literal);
	  QualType ResultTy = Context.getPointerType(Context.CharTy.withConst());
	  InitializedEntity Entity =
	      InitializedEntity::InitializeParameter(Context, ResultTy, false);
	  Result = PerformCopyInitialization(Entity, SourceLocation(), Result);
	  return Result;
	}

	/// Check that the user is calling the appropriate va_start builtin for the
	/// target and calling convention.
	///
	/// \param S the Sema instance used for the target triple, the current
	/// function declaration and diagnostics.
	/// \param BuiltinID the builtin being checked; only
	/// Builtin::BI__builtin_ms_va_start is treated specially.
	/// \param Fn the callee expression, used for the diagnostic location.
	/// \returns true if an error was diagnosed, false otherwise.
	static bool checkVAStartABI(Sema &S, unsigned BuiltinID, Expr *Fn) {
	  const llvm::Triple &TT = S.Context.getTargetInfo().getTriple();
	  bool IsX64 = TT.getArch() == llvm::Triple::x86_64;
	  bool IsAArch64 = TT.getArch() == llvm::Triple::aarch64;
	  bool IsWindows = TT.isOSWindows();
	  bool IsMSVAStart = BuiltinID == Builtin::BI__builtin_ms_va_start;
	  if (IsX64 || IsAArch64) {
	    // Default to the C calling convention when there is no current function
	    // declaration.
	    CallingConv CC = CC_C;
	    if (const FunctionDecl *FD = S.getCurFunctionDecl())
	      CC = FD->getType()->getAs<FunctionType>()->getCallConv();
	    if (IsMSVAStart) {
	      // Don't allow this in System V ABI functions.
	      if (CC == CC_X86_64SysV || (!IsWindows && CC != CC_Win64))
	        return S.Diag(Fn->getLocStart(),
	                      diag::err_ms_va_start_used_in_sysv_function);
	    } else {
	      // On x86-64/AArch64 Unix, don't allow this in Win64 ABI functions.
	      // On x64 Windows, don't allow this in System V ABI functions.
	      // (Yes, that means there's no corresponding way to support variadic
	      // System V ABI functions on Windows.)
	      if ((IsWindows && CC == CC_X86_64SysV) ||
	          (!IsWindows && CC == CC_Win64))
	        return S.Diag(Fn->getLocStart(),
	                      diag::err_va_start_used_in_wrong_abi_function)
	               << !IsWindows;
	    }
	    return false;
	  }

	  // On every other architecture __builtin_ms_va_start is rejected outright.
	  if (IsMSVAStart)
	    return S.Diag(Fn->getLocStart(), diag::err_builtin_x64_aarch64_only);
	  return false;
	}

	/// Verify that va_start is being used inside a variadic block, function or
	/// Objective-C method.
	///
	/// \param S the Sema instance providing the current context and diagnostics.
	/// \param Fn the callee expression, used for the diagnostic location.
	/// \param LastParam if non-null, receives the enclosing entity's last
	/// parameter on success (nullptr when it has no parameters).
	/// \returns true if an error was diagnosed, false on success.
	static bool checkVAStartIsInVariadicFunction(Sema &S, Expr *Fn,
	                                             ParmVarDecl **LastParam = nullptr) {
	  DeclContext *Ctx = S.CurContext;
	  ArrayRef<ParmVarDecl *> CallerParams;
	  bool CallerIsVariadic = false;

	  // Work out which kind of entity encloses the call and collect both its
	  // variadic-ness and its parameter list.
	  if (auto *BD = dyn_cast<BlockDecl>(Ctx)) {
	    CallerIsVariadic = BD->isVariadic();
	    CallerParams = BD->parameters();
	  } else if (auto *Func = dyn_cast<FunctionDecl>(Ctx)) {
	    CallerIsVariadic = Func->isVariadic();
	    CallerParams = Func->parameters();
	  } else if (auto *Method = dyn_cast<ObjCMethodDecl>(Ctx)) {
	    CallerIsVariadic = Method->isVariadic();
	    // FIXME: This isn't correct for methods (results in bogus warning).
	    CallerParams = Method->parameters();
	  } else {
	    // va_start is not supported in a CapturedDecl; anything else must be
	    // some other declcontext that parses exprs.
	    unsigned DiagID = isa<CapturedDecl>(Ctx)
	                          ? diag::err_va_start_captured_stmt
	                          : diag::err_va_start_outside_function;
	    S.Diag(Fn->getLocStart(), DiagID);
	    return true;
	  }

	  if (!CallerIsVariadic) {
	    S.Diag(Fn->getLocStart(), diag::err_va_start_fixed_function);
	    return true;
	  }

	  if (LastParam) {
	    if (CallerParams.empty())
	      *LastParam = nullptr;
	    else
	      *LastParam = CallerParams.back();
	  }

	  return false;
	}

	/// Check the arguments to '__builtin_va_start' or '__builtin_ms_va_start'
	/// for validity. Emit an error and return true on failure; return false
	/// on success.
	bool Sema::SemaBuiltinVAStart(unsigned BuiltinID, CallExpr *TheCall) {
	Expr *Fn = TheCall->getCallee();

	if (checkVAStartABI(*this, BuiltinID, Fn))
	return true;

	if (TheCall->getNumArgs() > 2) {
	Diag(TheCall->getArg(2)->getLocStart(),
	diag::err_typecheck_call_too_many_args)
	<< 0 /function call/ << 2 << TheCall->getNumArgs()
	<< Fn->getSourceRange()
	<< SourceRange(TheCall->getArg(2)->getLocStart(),
	(*(TheCall->arg_end()-1))->getLocEnd());
	return true;
	}

	if (TheCall->getNumArgs() < 2) {
	return Diag(TheCall->getLocEnd(),
	diag::err_typecheck_call_too_few_args_at_least)
	<< 0 /function call/ << 2 << TheCall->getNumArgs();
	}

	// Type-check the first argument normally.
	if (checkBuiltinArgument(*this, TheCall, 0))
	return true;

	// Check that the current function is variadic, and get its last parameter.
	ParmVarDecl *LastParam;
	if (checkVAStartIsInVariadicFunction(*this, Fn, &LastParam))
	return true;

	// Verify that the second argument to the builtin is the last argument of the
	// current function or method.
	bool SecondArgIsLastNamedArgument = false;
	const Expr *Arg = TheCall->getArg(1)->IgnoreParenCasts();

	// These are valid if SecondArgIsLastNamedArgument is false after the next
	// block.
	QualType Type;
	SourceLocation ParamLoc;
	bool IsCRegister = false;

	if (const DeclRefExpr *DR = dyn_cast<DeclRefExpr>(Arg)) {
	if (const ParmVarDecl *PV = dyn_cast<ParmVarDecl>(DR->getDecl())) {
	SecondArgIsLastNamedArgument = PV == LastParam;

	Type = PV->getType();
	ParamLoc = PV->getLocation();
	IsCRegister =
	PV->getStorageClass() == SC_Register && !getLangOpts().CPlusPlus;
	}
	}

	if (!SecondArgIsLastNamedArgument)
	Diag(TheCall->getArg(1)->getLocStart(),
	diag::warn_second_arg_of_va_start_not_last_named_param);
	else if (IsCRegister \|\| Type->isReferenceType() \|\|
	Type->isSpecificBuiltinType(BuiltinType::Float) \|\| [=] {
	// Promotable integers are UB, but enumerations need a bit of
	// extra checking to see what their promotable type actually is.
	if (!Type->isPromotableIntegerType())
	return false;
	if (!Type->isEnumeralType())
	return true;
	const EnumDecl *ED = Type->getAs<EnumType>()->getDecl();
	return !(ED &&
	Context.typesAreCompatible(ED->getPromotionType(), Type));
	}()) {
	unsigned Reason = 0;
	if (Type->isReferenceType()) Reason = 1;
	else if (IsCRegister) Reason = 2;
	Diag(Arg->getLocStart(), diag::warn_va_start_type_is_undefined) << Reason;
	Diag(ParamLoc, diag::note_parameter_type) << Type;
	}

	TheCall->setType(Context.VoidTy);
	return false;
	}

	/// Check a call to the Microsoft ARM '__va_start' builtin.
	///
	/// Requires at least three arguments and a variadic enclosing function, then
	/// diagnoses a second argument that is not a pointer to char and a third
	/// argument whose type is not size_t.
	///
	/// \returns true when the argument-count, first-argument or variadic-function
	/// check fails; false otherwise.  The type-mismatch diagnostics for arguments
	/// 1 and 2 are emitted but do not change the return value.
	bool Sema::SemaBuiltinVAStartARMMicrosoft(CallExpr *Call) {
	  // void __va_start(va_list *ap, const char *named_addr, size_t slot_size,
	  //                 const char *named_addr);

	  Expr *Func = Call->getCallee();

	  if (Call->getNumArgs() < 3)
	    return Diag(Call->getLocEnd(),
	                diag::err_typecheck_call_too_few_args_at_least)
	           << 0 /*function call*/ << 3 << Call->getNumArgs();

	  // Type-check the first argument normally.
	  if (checkBuiltinArgument(*this, Call, 0))
	    return true;

	  // Check that the current function is variadic.
	  if (checkVAStartIsInVariadicFunction(*this, Func))
	    return true;

	  // __va_start on Windows does not validate the parameter qualifiers

	  const Expr *Arg1 = Call->getArg(1)->IgnoreParens();
	  const Type *Arg1Ty = Arg1->getType().getCanonicalType().getTypePtr();

	  const Expr *Arg2 = Call->getArg(2)->IgnoreParens();
	  const Type *Arg2Ty = Arg2->getType().getCanonicalType().getTypePtr();

	  // Held by value rather than by reference to a temporary.
	  const QualType ConstCharPtrTy =
	      Context.getPointerType(Context.CharTy.withConst());
	  if (!Arg1Ty->isPointerType() ||
	      Arg1Ty->getPointeeType().withoutLocalFastQualifiers() != Context.CharTy)
	    Diag(Arg1->getLocStart(), diag::err_typecheck_convert_incompatible)
	        << Arg1->getType() << ConstCharPtrTy
	        << 1 /* different class */
	        << 0 /* qualifier difference */
	        << 3 /* parameter mismatch */
	        << 2 << Arg1->getType() << ConstCharPtrTy;

	  const QualType SizeTy = Context.getSizeType();
	  if (Arg2Ty->getCanonicalTypeInternal().withoutLocalFastQualifiers() != SizeTy)
	    Diag(Arg2->getLocStart(), diag::err_typecheck_convert_incompatible)
	        << Arg2->getType() << SizeTy
	        << 1 /* different class */
	        << 0 /* qualifier difference */
	        << 3 /* parameter mismatch */
	        << 3 << Arg2->getType() << SizeTy;

	  return false;
	}

	/// SemaBuiltinUnorderedCompare - Handle functions like __builtin_isgreater and
	/// friends.  This is declared to take (...), so we have to check everything.
	///
	/// Requires exactly two arguments, applies the usual arithmetic conversions
	/// to them, stores the converted operands back into the call, and requires
	/// the common type to be a real floating type unless an operand is
	/// type-dependent.
	///
	/// \returns true if an error was diagnosed or a conversion failed, false
	/// otherwise.
	bool Sema::SemaBuiltinUnorderedCompare(CallExpr *TheCall) {
	  if (TheCall->getNumArgs() < 2)
	    return Diag(TheCall->getLocEnd(), diag::err_typecheck_call_too_few_args)
	           << 0 << 2 << TheCall->getNumArgs()/*function call*/;
	  if (TheCall->getNumArgs() > 2)
	    return Diag(TheCall->getArg(2)->getLocStart(),
	                diag::err_typecheck_call_too_many_args)
	           << 0 /*function call*/ << 2 << TheCall->getNumArgs()
	           << SourceRange(TheCall->getArg(2)->getLocStart(),
	                          (*(TheCall->arg_end()-1))->getLocEnd());

	  ExprResult OrigArg0 = TheCall->getArg(0);
	  ExprResult OrigArg1 = TheCall->getArg(1);

	  // Do standard promotions between the two arguments, returning their common
	  // type.
	  QualType Res = UsualArithmeticConversions(OrigArg0, OrigArg1, false);
	  if (OrigArg0.isInvalid() || OrigArg1.isInvalid())
	    return true;

	  // Make sure any conversions are pushed back into the call; this is
	  // type safe since unordered compare builtins are declared as "_Bool
	  // foo(...)".
	  TheCall->setArg(0, OrigArg0.get());
	  TheCall->setArg(1, OrigArg1.get());

	  if (OrigArg0.get()->isTypeDependent() || OrigArg1.get()->isTypeDependent())
	    return false;

	  // If the common type isn't a real floating type, then the arguments were
	  // invalid for this operation.
	  if (Res.isNull() || !Res->isRealFloatingType())
	    return Diag(OrigArg0.get()->getLocStart(),
	                diag::err_typecheck_call_invalid_ordered_compare)
	           << OrigArg0.get()->getType() << OrigArg1.get()->getType()
	           << SourceRange(OrigArg0.get()->getLocStart(),
	                          OrigArg1.get()->getLocEnd());

	  return false;
	}

	/// SemaBuiltinFPClassification - Handle functions like
	/// __builtin_isnan and friends.  This is declared to take (...), so we have
	/// to check everything. We expect the last argument to be a floating point
	/// value.
	///
	/// \param TheCall the builtin call being checked.
	/// \param NumArgs the exact number of arguments the builtin requires.
	/// \returns true if an error was diagnosed, false otherwise.
	bool Sema::SemaBuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs) {
	  if (TheCall->getNumArgs() < NumArgs)
	    return Diag(TheCall->getLocEnd(), diag::err_typecheck_call_too_few_args)
	           << 0 << NumArgs << TheCall->getNumArgs()/*function call*/;
	  if (TheCall->getNumArgs() > NumArgs)
	    return Diag(TheCall->getArg(NumArgs)->getLocStart(),
	                diag::err_typecheck_call_too_many_args)
	           << 0 /*function call*/ << NumArgs << TheCall->getNumArgs()
	           << SourceRange(TheCall->getArg(NumArgs)->getLocStart(),
	                          (*(TheCall->arg_end()-1))->getLocEnd());

	  Expr *OrigArg = TheCall->getArg(NumArgs-1);

	  if (OrigArg->isTypeDependent())
	    return false;

	  // This operation requires a non-_Complex floating-point number.
	  if (!OrigArg->getType()->isRealFloatingType())
	    return Diag(OrigArg->getLocStart(),
	                diag::err_typecheck_call_invalid_unary_fp)
	           << OrigArg->getType() << OrigArg->getSourceRange();

	  // If this is an implicit conversion from float -> float or double, remove it.
	  if (ImplicitCastExpr *Cast = dyn_cast<ImplicitCastExpr>(OrigArg)) {
	    // Only remove standard FloatCasts, leaving other casts inplace
	    if (Cast->getCastKind() == CK_FloatingCast) {
	      Expr *CastArg = Cast->getSubExpr();
	      if (CastArg->getType()->isSpecificBuiltinType(BuiltinType::Float)) {
	        assert((Cast->getType()->isSpecificBuiltinType(BuiltinType::Double) ||
	                Cast->getType()->isSpecificBuiltinType(BuiltinType::Float)) &&
	               "promotion from float to either float or double is the only expected cast here");
	        // Detach the operand from the cast and pass it to the call directly.
	        Cast->setSubExpr(nullptr);
	        TheCall->setArg(NumArgs-1, CastArg);
	      }
	    }
	  }

	  return false;
	}

	// Customized Sema Checking for VSX builtins that have the following signature:
	// vector [...] builtinName(vector [...], vector [...], const int);
	// Which takes the same type of vectors (any legal vector type) for the first
	// two arguments and takes compile time constant for the third argument.
	// Example builtins are :
	// vector double vec_xxpermdi(vector double, vector double, int);
	// vector short vec_xxsldwi(vector short, vector short, int);
	//
	// Returns true if an error was diagnosed; otherwise sets the call's type to
	// the type of the first argument and returns false.
	bool Sema::SemaBuiltinVSX(CallExpr *TheCall) {
	  unsigned ExpectedNumArgs = 3;
	  if (TheCall->getNumArgs() < ExpectedNumArgs)
	    return Diag(TheCall->getLocEnd(),
	                diag::err_typecheck_call_too_few_args_at_least)
	           << 0 /*function call*/ << ExpectedNumArgs << TheCall->getNumArgs()
	           << TheCall->getSourceRange();

	  if (TheCall->getNumArgs() > ExpectedNumArgs)
	    return Diag(TheCall->getLocEnd(),
	                diag::err_typecheck_call_too_many_args_at_most)
	           << 0 /*function call*/ << ExpectedNumArgs << TheCall->getNumArgs()
	           << TheCall->getSourceRange();

	  // Check the third argument is a compile time constant
	  llvm::APSInt Value;
	  if (!TheCall->getArg(2)->isIntegerConstantExpr(Value, Context))
	    return Diag(TheCall->getLocStart(),
	                diag::err_vsx_builtin_nonconstant_argument)
	           << 3 /* argument index */ << TheCall->getDirectCallee()
	           << SourceRange(TheCall->getArg(2)->getLocStart(),
	                          TheCall->getArg(2)->getLocEnd());

	  QualType Arg1Ty = TheCall->getArg(0)->getType();
	  QualType Arg2Ty = TheCall->getArg(1)->getType();

	  // Check the type of argument 1 and argument 2 are vectors.
	  SourceLocation BuiltinLoc = TheCall->getLocStart();
	  if ((!Arg1Ty->isVectorType() && !Arg1Ty->isDependentType()) ||
	      (!Arg2Ty->isVectorType() && !Arg2Ty->isDependentType())) {
	    return Diag(BuiltinLoc, diag::err_vec_builtin_non_vector)
	           << TheCall->getDirectCallee()
	           << SourceRange(TheCall->getArg(0)->getLocStart(),
	                          TheCall->getArg(1)->getLocEnd());
	  }

	  // Check the first two arguments are the same type.
	  if (!Context.hasSameUnqualifiedType(Arg1Ty, Arg2Ty)) {
	    return Diag(BuiltinLoc, diag::err_vec_builtin_incompatible_vector)
	           << TheCall->getDirectCallee()
	           << SourceRange(TheCall->getArg(0)->getLocStart(),
	                          TheCall->getArg(1)->getLocEnd());
	  }

	  // When default clang type checking is turned off and the customized type
	  // checking is used, the returning type of the function must be explicitly
	  // set. Otherwise it is _Bool by default.
	  TheCall->setType(Arg1Ty);

	  return false;
	}

	/// SemaBuiltinShuffleVector - Handle __builtin_shufflevector.
	// This is declared to take (...), so we have to check everything.
	//
	// On success the call's arguments are moved into a newly allocated
	// ShuffleVectorExpr, which is returned; on failure a diagnostic is emitted
	// and ExprError() is returned.
	ExprResult Sema::SemaBuiltinShuffleVector(CallExpr *TheCall) {
	  if (TheCall->getNumArgs() < 2)
	    return ExprError(Diag(TheCall->getLocEnd(),
	                          diag::err_typecheck_call_too_few_args_at_least)
	                     << 0 /*function call*/ << 2 << TheCall->getNumArgs()
	                     << TheCall->getSourceRange());

	  // Determine which of the following types of shufflevector we're checking:
	  // 1) unary, vector mask: (lhs, mask)
	  // 2) binary, scalar mask: (lhs, rhs, index, ..., index)
	  QualType resType = TheCall->getArg(0)->getType();
	  unsigned numElements = 0;

	  if (!TheCall->getArg(0)->isTypeDependent() &&
	      !TheCall->getArg(1)->isTypeDependent()) {
	    QualType LHSType = TheCall->getArg(0)->getType();
	    QualType RHSType = TheCall->getArg(1)->getType();

	    if (!LHSType->isVectorType() || !RHSType->isVectorType())
	      return ExprError(Diag(TheCall->getLocStart(),
	                            diag::err_vec_builtin_non_vector)
	                       << TheCall->getDirectCallee()
	                       << SourceRange(TheCall->getArg(0)->getLocStart(),
	                                      TheCall->getArg(1)->getLocEnd()));

	    numElements = LHSType->getAs<VectorType>()->getNumElements();
	    unsigned numResElements = TheCall->getNumArgs() - 2;

	    // Check to see if we have a call with 2 vector arguments, the unary shuffle
	    // with mask.  If so, verify that RHS is an integer vector type with the
	    // same number of elts as lhs.
	    if (TheCall->getNumArgs() == 2) {
	      if (!RHSType->hasIntegerRepresentation() ||
	          RHSType->getAs<VectorType>()->getNumElements() != numElements)
	        return ExprError(Diag(TheCall->getLocStart(),
	                              diag::err_vec_builtin_incompatible_vector)
	                         << TheCall->getDirectCallee()
	                         << SourceRange(TheCall->getArg(1)->getLocStart(),
	                                        TheCall->getArg(1)->getLocEnd()));
	    } else if (!Context.hasSameUnqualifiedType(LHSType, RHSType)) {
	      return ExprError(Diag(TheCall->getLocStart(),
	                            diag::err_vec_builtin_incompatible_vector)
	                       << TheCall->getDirectCallee()
	                       << SourceRange(TheCall->getArg(0)->getLocStart(),
	                                      TheCall->getArg(1)->getLocEnd()));
	    } else if (numElements != numResElements) {
	      // The result has one element per index argument, which may differ from
	      // the operand width.
	      QualType eltType = LHSType->getAs<VectorType>()->getElementType();
	      resType = Context.getVectorType(eltType, numResElements,
	                                      VectorType::GenericVector);
	    }
	  }

	  // Every index argument must be an integer constant that is either -1 or
	  // less than twice the element count of the first operand.
	  for (unsigned i = 2; i < TheCall->getNumArgs(); i++) {
	    if (TheCall->getArg(i)->isTypeDependent() ||
	        TheCall->getArg(i)->isValueDependent())
	      continue;

	    llvm::APSInt Result(32);
	    if (!TheCall->getArg(i)->isIntegerConstantExpr(Result, Context))
	      return ExprError(Diag(TheCall->getLocStart(),
	                            diag::err_shufflevector_nonconstant_argument)
	                       << TheCall->getArg(i)->getSourceRange());

	    // Allow -1 which will be translated to undef in the IR.
	    if (Result.isSigned() && Result.isAllOnesValue())
	      continue;

	    if (Result.getActiveBits() > 64 || Result.getZExtValue() >= numElements*2)
	      return ExprError(Diag(TheCall->getLocStart(),
	                            diag::err_shufflevector_argument_too_large)
	                       << TheCall->getArg(i)->getSourceRange());
	  }

	  SmallVector<Expr*, 32> exprs;

	  // Move the arguments out of the call; each slot is cleared once its
	  // expression has been handed to the new node.
	  for (unsigned i = 0, e = TheCall->getNumArgs(); i != e; i++) {
	    exprs.push_back(TheCall->getArg(i));
	    TheCall->setArg(i, nullptr);
	  }

	  return new (Context) ShuffleVectorExpr(Context, exprs, resType,
	                                         TheCall->getCallee()->getLocStart(),
	                                         TheCall->getRParenLoc());
	}

	/// SemaConvertVectorExpr - Handle __builtin_convertvector
	///
	/// \param E the source expression; must have vector (or dependent) type.
	/// \param TInfo the destination type; must be a vector (or dependent) type.
	/// \param BuiltinLoc location used for diagnostics and stored in the result.
	/// \param RParenLoc location of the closing parenthesis.
	/// \returns a new ConvertVectorExpr, or ExprError() if either type is not a
	/// vector or the two vectors have different element counts.
	ExprResult Sema::SemaConvertVectorExpr(Expr *E, TypeSourceInfo *TInfo,
	                                       SourceLocation BuiltinLoc,
	                                       SourceLocation RParenLoc) {
	  ExprValueKind VK = VK_RValue;
	  ExprObjectKind OK = OK_Ordinary;
	  QualType DstTy = TInfo->getType();
	  QualType SrcTy = E->getType();

	  if (!SrcTy->isVectorType() && !SrcTy->isDependentType())
	    return ExprError(Diag(BuiltinLoc,
	                          diag::err_convertvector_non_vector)
	                     << E->getSourceRange());
	  if (!DstTy->isVectorType() && !DstTy->isDependentType())
	    return ExprError(Diag(BuiltinLoc,
	                          diag::err_convertvector_non_vector_type));

	  // Element counts can only be compared once both types are known.
	  if (!SrcTy->isDependentType() && !DstTy->isDependentType()) {
	    unsigned SrcElts = SrcTy->getAs<VectorType>()->getNumElements();
	    unsigned DstElts = DstTy->getAs<VectorType>()->getNumElements();
	    if (SrcElts != DstElts)
	      return ExprError(Diag(BuiltinLoc,
	                            diag::err_convertvector_incompatible_vector)
	                       << E->getSourceRange());
	  }

	  return new (Context)
	      ConvertVectorExpr(E, TInfo, DstTy, VK, OK, BuiltinLoc, RParenLoc);
	}

	/// SemaBuiltinPrefetch - Handle __builtin_prefetch.
	// This is declared to take (const void*, ...) and can take two
	// optional constant int args.
	//
	// Argument 1, when present, must be a constant in [0, 1]; argument 2, when
	// present, must be a constant in [0, 3].  Returns true if an error was
	// diagnosed, false otherwise.
	bool Sema::SemaBuiltinPrefetch(CallExpr *TheCall) {
	  unsigned NumArgs = TheCall->getNumArgs();

	  if (NumArgs > 3)
	    return Diag(TheCall->getLocEnd(),
	                diag::err_typecheck_call_too_many_args_at_most)
	           << 0 /*function call*/ << 3 << NumArgs
	           << TheCall->getSourceRange();

	  // Argument 0 is checked for us and the remaining arguments must be
	  // constant integers.
	  for (unsigned i = 1; i != NumArgs; ++i)
	    if (SemaBuiltinConstantArgRange(TheCall, i, 0, i == 1 ? 1 : 3))
	      return true;

	  return false;
	}

	/// SemaBuiltinAssume - Handle __assume (MS Extension).
	// The operand of __assume is not evaluated, so an operand with side effects
	// gets a warning.  Instantiation-dependent operands are not inspected.
	// Always returns false.
	bool Sema::SemaBuiltinAssume(CallExpr *TheCall) {
	  Expr *Assumption = TheCall->getArg(0);

	  if (!Assumption->isInstantiationDependent() &&
	      Assumption->HasSideEffects(Context)) {
	    // Name the builtin in the warning via the callee's identifier.
	    const FunctionDecl *Callee = cast<FunctionDecl>(TheCall->getCalleeDecl());
	    Diag(Assumption->getLocStart(), diag::warn_assume_side_effects)
	        << Assumption->getSourceRange()
	        << Callee->getIdentifier();
	  }

	  return false;
	}

	/// Handle __builtin_alloca_with_align. This is declared
	/// as (size_t, size_t).  The second argument (the alignment) is evaluated as
	/// a constant and must be a power of 2, no smaller than the target's char
	/// width and no larger than the maximum int32_t value.  A warning is issued
	/// when the alignment is spelled with alignof.
	///
	/// \returns true if an error was diagnosed, false otherwise (including when
	/// the alignment argument is type- or value-dependent and cannot be checked).
	bool Sema::SemaBuiltinAllocaWithAlign(CallExpr *TheCall) {
	  Expr *AlignArg = TheCall->getArg(1);

	  // Nothing can be verified about a dependent alignment.
	  if (AlignArg->isTypeDependent() || AlignArg->isValueDependent())
	    return false;

	  const auto *Trait =
	      dyn_cast<UnaryExprOrTypeTraitExpr>(AlignArg->IgnoreParenImpCasts());
	  if (Trait && Trait->getKind() == UETT_AlignOf)
	    Diag(TheCall->getLocStart(), diag::warn_alloca_align_alignof)
	        << AlignArg->getSourceRange();

	  llvm::APSInt AlignValue = AlignArg->EvaluateKnownConstInt(Context);

	  if (!AlignValue.isPowerOf2())
	    return Diag(TheCall->getLocStart(),
	                diag::err_alignment_not_power_of_two)
	           << AlignArg->getSourceRange();

	  if (AlignValue < Context.getCharWidth())
	    return Diag(TheCall->getLocStart(), diag::err_alignment_too_small)
	           << (unsigned)Context.getCharWidth()
	           << AlignArg->getSourceRange();

	  if (AlignValue > std::numeric_limits<int32_t>::max())
	    return Diag(TheCall->getLocStart(), diag::err_alignment_too_big)
	           << std::numeric_limits<int32_t>::max()
	           << AlignArg->getSourceRange();

	  return false;
	}

	/// Handle __builtin_assume_aligned. This is declared
	/// as (const void*, size_t, ...) and can take one optional constant int arg.
	///
	/// The alignment (argument 1) must be a constant power of 2 unless it is
	/// dependent.  The optional third argument is copy-initialized to size_t and
	/// stored back into the call.
	///
	/// \returns true if an error was diagnosed, false otherwise.
	bool Sema::SemaBuiltinAssumeAligned(CallExpr *TheCall) {
	  unsigned NumArgs = TheCall->getNumArgs();

	  if (NumArgs > 3)
	    return Diag(TheCall->getLocEnd(),
	                diag::err_typecheck_call_too_many_args_at_most)
	           << 0 /*function call*/ << 3 << NumArgs
	           << TheCall->getSourceRange();

	  // The alignment must be a constant integer.
	  Expr *Arg = TheCall->getArg(1);

	  // We can't check the value of a dependent argument.
	  if (!Arg->isTypeDependent() && !Arg->isValueDependent()) {
	    llvm::APSInt Result;
	    if (SemaBuiltinConstantArg(TheCall, 1, Result))
	      return true;

	    if (!Result.isPowerOf2())
	      return Diag(TheCall->getLocStart(),
	                  diag::err_alignment_not_power_of_two)
	             << Arg->getSourceRange();
	  }

	  if (NumArgs > 2) {
	    // Named distinctly so it does not shadow the alignment argument above.
	    ExprResult OffsetArg(TheCall->getArg(2));
	    InitializedEntity Entity = InitializedEntity::InitializeParameter(Context,
	      Context.getSizeType(), false);
	    OffsetArg = PerformCopyInitialization(Entity, SourceLocation(), OffsetArg);
	    if (OffsetArg.isInvalid()) return true;
	    TheCall->setArg(2, OffsetArg.get());
	  }

	  return false;
	}

	/// Check a call to one of the os_log format builtins.
	///
	/// Builtin::BI__builtin_os_log_format_buffer_size (the "size call") takes
	/// (format, ...); any other builtin routed here takes (buffer, format, ...).
	/// The buffer is converted to void *, the format string is validated by
	/// CheckOSLogFormatStringArg, each variadic argument is default-promoted and
	/// must be smaller than 0x100 bytes, and at most 0xff variadic arguments are
	/// accepted.  The result type is size_t for the size call and void *
	/// otherwise.
	///
	/// \returns true if an error was diagnosed, false otherwise.
	bool Sema::SemaBuiltinOSLogFormat(CallExpr *TheCall) {
	  unsigned BuiltinID =
	      cast<FunctionDecl>(TheCall->getCalleeDecl())->getBuiltinID();
	  bool IsSizeCall = BuiltinID == Builtin::BI__builtin_os_log_format_buffer_size;

	  unsigned NumArgs = TheCall->getNumArgs();
	  unsigned NumRequiredArgs = IsSizeCall ? 1 : 2;
	  if (NumArgs < NumRequiredArgs) {
	    return Diag(TheCall->getLocEnd(), diag::err_typecheck_call_too_few_args)
	           << 0 /* function call */ << NumRequiredArgs << NumArgs
	           << TheCall->getSourceRange();
	  }
	  if (NumArgs >= NumRequiredArgs + 0x100) {
	    return Diag(TheCall->getLocEnd(),
	                diag::err_typecheck_call_too_many_args_at_most)
	           << 0 /* function call */ << (NumRequiredArgs + 0xff) << NumArgs
	           << TheCall->getSourceRange();
	  }
	  unsigned i = 0;

	  // For formatting call, check buffer arg.
	  if (!IsSizeCall) {
	    ExprResult Arg(TheCall->getArg(i));
	    InitializedEntity Entity = InitializedEntity::InitializeParameter(
	        Context, Context.VoidPtrTy, false);
	    Arg = PerformCopyInitialization(Entity, SourceLocation(), Arg);
	    if (Arg.isInvalid())
	      return true;
	    TheCall->setArg(i, Arg.get());
	    i++;
	  }

	  // Check string literal arg.
	  unsigned FormatIdx = i;
	  {
	    ExprResult Arg = CheckOSLogFormatStringArg(TheCall->getArg(i));
	    if (Arg.isInvalid())
	      return true;
	    TheCall->setArg(i, Arg.get());
	    i++;
	  }

	  // Make sure variadic args are scalar.
	  unsigned FirstDataArg = i;
	  while (i < NumArgs) {
	    ExprResult Arg = DefaultVariadicArgumentPromotion(
	        TheCall->getArg(i), VariadicFunction, nullptr);
	    if (Arg.isInvalid())
	      return true;
	    CharUnits ArgSize = Context.getTypeSizeInChars(Arg.get()->getType());
	    if (ArgSize.getQuantity() >= 0x100) {
	      return Diag(Arg.get()->getLocEnd(), diag::err_os_log_argument_too_big)
	             << i << (int)ArgSize.getQuantity() << 0xff
	             << TheCall->getSourceRange();
	    }
	    TheCall->setArg(i, Arg.get());
	    i++;
	  }

	  // Check formatting specifiers. NOTE: We're only doing this for the non-size
	  // call to avoid duplicate diagnostics.
	  if (!IsSizeCall) {
	    llvm::SmallBitVector CheckedVarArgs(NumArgs, false);
	    ArrayRef<const Expr *> Args(TheCall->getArgs(), TheCall->getNumArgs());
	    bool Success = CheckFormatArguments(
	        Args, /*HasVAListArg*/ false, FormatIdx, FirstDataArg, FST_OSLog,
	        VariadicFunction, TheCall->getLocStart(), SourceRange(),
	        CheckedVarArgs);
	    if (!Success)
	      return true;
	  }

	  if (IsSizeCall) {
	    TheCall->setType(Context.getSizeType());
	  } else {
	    TheCall->setType(Context.VoidPtrTy);
	  }
	  return false;
	}

	/// SemaBuiltinConstantArg - Handle a check if argument ArgNum of CallExpr
	/// TheCall is a constant expression.
	///
	/// \param TheCall the builtin call being checked.
	/// \param ArgNum index of the argument to evaluate.
	/// \param Result receives the value when the argument is an integer constant
	/// expression.
	/// \returns true if an error was diagnosed.  Returns false, without
	/// evaluating the argument, when it is type- or value-dependent.
	bool Sema::SemaBuiltinConstantArg(CallExpr *TheCall, int ArgNum,
	                                  llvm::APSInt &Result) {
	  Expr *Arg = TheCall->getArg(ArgNum);
	  DeclRefExpr *DRE =cast<DeclRefExpr>(TheCall->getCallee()->IgnoreParenCasts());
	  FunctionDecl *FDecl = cast<FunctionDecl>(DRE->getDecl());

	  if (Arg->isTypeDependent() || Arg->isValueDependent()) return false;

	  if (!Arg->isIntegerConstantExpr(Result, Context))
	    return Diag(TheCall->getLocStart(), diag::err_constant_integer_arg_type)
	           << FDecl->getDeclName() << Arg->getSourceRange();

	  return false;
	}

	/// SemaBuiltinConstantArgRange - Handle a check if argument ArgNum of CallExpr
	/// TheCall is a constant expression in the range [Low, High].
	///
	/// \returns true if the argument is not a constant or lies outside the
	/// inclusive range (a diagnostic is emitted); false otherwise, including
	/// when the argument is type- or value-dependent.
	bool Sema::SemaBuiltinConstantArgRange(CallExpr *TheCall, int ArgNum,
	                                       int Low, int High) {
	  llvm::APSInt Result;

	  // We can't check the value of a dependent argument.
	  Expr *Arg = TheCall->getArg(ArgNum);
	  if (Arg->isTypeDependent() || Arg->isValueDependent())
	    return false;

	  // Check constant-ness first.
	  if (SemaBuiltinConstantArg(TheCall, ArgNum, Result))
	    return true;

	  if (Result.getSExtValue() < Low || Result.getSExtValue() > High)
	    return Diag(TheCall->getLocStart(), diag::err_argument_invalid_range)
	           << Low << High << Arg->getSourceRange();

	  return false;
	}

	/// SemaBuiltinConstantArgMultiple - Handle a check if argument ArgNum of
	/// CallExpr TheCall is a constant expression that is a multiple of Num.
	///
	/// \returns true if the argument is not a constant or is not a multiple of
	/// Num (a diagnostic is emitted); false otherwise, including when the
	/// argument is type- or value-dependent.
	bool Sema::SemaBuiltinConstantArgMultiple(CallExpr *TheCall, int ArgNum,
	                                          unsigned Num) {
	  llvm::APSInt Result;

	  // We can't check the value of a dependent argument.
	  Expr *Arg = TheCall->getArg(ArgNum);
	  if (Arg->isTypeDependent() || Arg->isValueDependent())
	    return false;

	  // Check constant-ness first.
	  if (SemaBuiltinConstantArg(TheCall, ArgNum, Result))
	    return true;

	  if (Result.getSExtValue() % Num != 0)
	    return Diag(TheCall->getLocStart(), diag::err_argument_not_multiple)
	           << Num << Arg->getSourceRange();

	  return false;
	}

	/// SemaBuiltinARMSpecialReg - Handle a check if argument ArgNum of CallExpr
	/// TheCall is an ARM/AArch64 special register string literal.
	bool Sema::SemaBuiltinARMSpecialReg(unsigned BuiltinID, CallExpr *TheCall,
	int ArgNum, unsigned ExpectedFieldNum,
	bool AllowName) {
	bool IsARMBuiltin = BuiltinID == ARM::BI__builtin_arm_rsr64 \|\|
	BuiltinID == ARM::BI__builtin_arm_wsr64 \|\|
	BuiltinID == ARM::BI__builtin_arm_rsr \|\|
	BuiltinID == ARM::BI__builtin_arm_rsrp \|\|
	BuiltinID == ARM::BI__builtin_arm_wsr \|\|
	BuiltinID == ARM::BI__builtin_arm_wsrp;
	bool IsAArch64Builtin = BuiltinID == AArch64::BI__builtin_arm_rsr64 \|\|
	BuiltinID == AArch64::BI__builtin_arm_wsr64 \|\|
	BuiltinID == AArch64::BI__builtin_arm_rsr \|\|
	BuiltinID == AArch64::BI__builtin_arm_rsrp \|\|
	BuiltinID == AArch64::BI__builtin_arm_wsr \|\|
	BuiltinID == AArch64::BI__builtin_arm_wsrp;
	assert((IsARMBuiltin \|\| IsAArch64Builtin) && "Unexpected ARM builtin.");

	// We can't check the value of a dependent argument.
	Expr *Arg = TheCall->getArg(ArgNum);
	if (Arg->isTypeDependent() \|\| Arg->isValueDependent())
	return false;

	// Check if the argument is a string literal.
	if (!isa<StringLiteral>(Arg->IgnoreParenImpCasts()))
	return Diag(TheCall->getLocStart(), diag::err_expr_not_string_literal)
	<< Arg->getSourceRange();

	// Check the type of special register given.
	StringRef Reg = cast<StringLiteral>(Arg->IgnoreParenImpCasts())->getString();
	SmallVector<StringRef, 6> Fields;
	Reg.split(Fields, ":");

	if (Fields.size() != ExpectedFieldNum && !(AllowName && Fields.size() == 1))
	return Diag(TheCall->getLocStart(), diag::err_arm_invalid_specialreg)
	<< Arg->getSourceRange();

	// If the string is the name of a register then we cannot check that it is
	// valid here but if the string is of one the forms described in ACLE then we
	// can check that the supplied fields are integers and within the valid
	// ranges.
	if (Fields.size() > 1) {
	bool FiveFields = Fields.size() == 5;

	bool ValidString = true;
	if (IsARMBuiltin) {
	ValidString &= Fields[0].startswith_lower("cp") \|\|
	Fields[0].startswith_lower("p");
	if (ValidString)
	Fields[0] =
	Fields[0].drop_front(Fields[0].startswith_lower("cp") ? 2 : 1);

	ValidString &= Fields[2].startswith_lower("c");
	if (ValidString)
	Fields[2] = Fields[2].drop_front(1);

	if (FiveFields) {
	ValidString &= Fields[3].startswith_lower("c");
	if (ValidString)
	Fields[3] = Fields[3].drop_front(1);
	}
	}

	SmallVector<int, 5> Ranges;
	if (FiveFields)
	Ranges.append({IsAArch64Builtin ? 1 : 15, 7, 15, 15, 7});
	else
	Ranges.append({15, 7, 15});

	for (unsigned i=0; i<Fields.size(); ++i) {
	int IntField;
	ValidString &= !Fields[i].getAsInteger(10, IntField);
	ValidString &= (IntField >= 0 && IntField <= Ranges[i]);
	}

	if (!ValidString)
	return Diag(TheCall->getLocStart(), diag::err_arm_invalid_specialreg)
	<< Arg->getSourceRange();
	} else if (IsAArch64Builtin && Fields.size() == 1) {
	// If the register name is one of those that appear in the condition below
	// and the special register builtin being used is one of the write builtins,
	// then we require that the argument provided for writing to the register
	// is an integer constant expression. This is because it will be lowered to
	// an MSR (immediate) instruction, so we need to know the immediate at
	// compile time.
	if (TheCall->getNumArgs() != 2)
	return false;

	std::string RegLower = Reg.lower();
	if (RegLower != "spsel" && RegLower != "daifset" && RegLower != "daifclr" &&
	RegLower != "pan" && RegLower != "uao")
	return false;

	return SemaBuiltinConstantArgRange(TheCall, 1, 0, 15);
	}

	return false;
	}

	/// SemaBuiltinLongjmp - Validate a call to
	/// __builtin_longjmp(void *env[5], int val).
	///
	/// Diagnoses the call when the target has no SjLj lowering, and requires
	/// the second argument to be an integer constant equal to 1.
	///
	/// \returns true if the call is invalid (a diagnostic was produced),
	/// false otherwise.
	bool Sema::SemaBuiltinLongjmp(CallExpr *TheCall) {
	  const bool TargetHasSjLj = Context.getTargetInfo().hasSjLjLowering();
	  if (!TargetHasSjLj) {
	    SourceRange CallRange(TheCall->getLocStart(), TheCall->getLocEnd());
	    return Diag(TheCall->getLocStart(), diag::err_builtin_longjmp_unsupported)
	           << CallRange;
	  }

	  // TODO: This is less than ideal. Overload this to take a value.
	  llvm::APSInt Val;
	  if (SemaBuiltinConstantArg(TheCall, 1, Val))
	    return true;

	  // The only accepted constant is 1.
	  if (Val == 1)
	    return false;

	  Expr *ValExpr = TheCall->getArg(1);
	  SourceRange ValRange(ValExpr->getLocStart(), ValExpr->getLocEnd());
	  return Diag(TheCall->getLocStart(), diag::err_builtin_longjmp_invalid_val)
	         << ValRange;
	}

	/// SemaBuiltinSetjmp - Validate a call to __builtin_setjmp(void *env[5]).
	///
	/// The only check performed is that the target supports SjLj lowering.
	///
	/// \returns true if a diagnostic was produced, false otherwise.
	bool Sema::SemaBuiltinSetjmp(CallExpr *TheCall) {
	  if (Context.getTargetInfo().hasSjLjLowering())
	    return false;

	  SourceRange CallRange(TheCall->getLocStart(), TheCall->getLocEnd());
	  return Diag(TheCall->getLocStart(), diag::err_builtin_setjmp_unsupported)
	         << CallRange;
	}

	namespace {

	/// Accumulates, across every candidate format string of one call, the
	/// highest index of the first data argument left uncovered, together with
	/// the format-string expressions that produced that index.
	class UncoveredArgHandler {
	  // Sentinel states stored in FirstUncoveredArg; real indices are >= 0.
	  enum { Unknown = -1, AllCovered = -2 };

	  signed FirstUncoveredArg = Unknown;
	  SmallVector<const Expr *, 4> DiagnosticExprs;

	public:
	  UncoveredArgHandler() = default;

	  /// True once a real (non-sentinel) uncovered index has been recorded.
	  bool hasUncoveredArg() const { return !(FirstUncoveredArg < 0); }

	  /// Index of the first uncovered argument; only valid when
	  /// hasUncoveredArg() is true.
	  unsigned getUncoveredArg() const {
	    assert(hasUncoveredArg() && "no uncovered argument");
	    return FirstUncoveredArg;
	  }

	  /// Record that some string covers every argument. Pending diagnostics
	  /// are dropped and later Update() calls become no-ops.
	  void setAllCovered() {
	    FirstUncoveredArg = AllCovered;
	    DiagnosticExprs.clear();
	  }

	  /// Merge the result for one format string (\p StrExpr) whose first
	  /// uncovered argument is \p NewFirstUncoveredArg.
	  void Update(signed NewFirstUncoveredArg, const Expr *StrExpr) {
	    assert(NewFirstUncoveredArg >= 0 && "Outside range");

	    // A previous string already covered everything: nothing to track.
	    if (FirstUncoveredArg == AllCovered)
	      return;

	    // Only the highest index is kept; lower ones are ignored.
	    if (NewFirstUncoveredArg < FirstUncoveredArg)
	      return;

	    // A strictly higher index supersedes everything collected so far.
	    if (NewFirstUncoveredArg > FirstUncoveredArg) {
	      DiagnosticExprs.clear();
	      FirstUncoveredArg = NewFirstUncoveredArg;
	    }

	    // Equal or new highest: remember the string that produced it.
	    DiagnosticExprs.push_back(StrExpr);
	  }

	  void Diagnose(Sema &S, bool IsFunctionCall, const Expr *ArgExpr);
	};

	/// Outcome of classifying a format argument. The enumerator order is
	/// significant: callers compare values with '<'.
	enum StringLiteralCheckType {
	  SLCT_NotALiteral,
	  SLCT_UncheckedLiteral,
	  SLCT_CheckedLiteral
	};

	} // namespace

	/// Fold \p Addend into \p Offset (in place) using \p BinOpKind, widening
	/// the operands as needed so the signed result never wraps.
	///
	/// \param Offset        running offset; updated with the result.
	/// \param Addend        value to add or subtract; taken by value because
	///                      it may be widened and made signed locally.
	/// \param BinOpKind     BO_Add, or BO_Sub when the addend is on the right.
	/// \param AddendIsRight whether the addend was the right-hand operand.
	static void sumOffsets(llvm::APSInt &Offset, llvm::APSInt Addend,
	                       BinaryOperatorKind BinOpKind,
	                       bool AddendIsRight) {
	  unsigned OffsetWidth = Offset.getBitWidth();
	  unsigned AddendWidth = Addend.getBitWidth();

	  // Interim results may be negative, so work with a signed addend. One
	  // extra bit keeps the unsigned value representable.
	  if (Addend.isUnsigned()) {
	    AddendWidth += 1;
	    Addend = Addend.zext(AddendWidth);
	    Addend.setIsSigned(true);
	  }

	  // Bring both operands to the wider of the two bit widths.
	  if (OffsetWidth < AddendWidth) {
	    Offset = Offset.sext(AddendWidth);
	    OffsetWidth = AddendWidth;
	  } else if (AddendWidth < OffsetWidth) {
	    Addend = Addend.sext(OffsetWidth);
	  }

	  const bool IsAddition = BinOpKind == BO_Add;
	  assert((IsAddition || (AddendIsRight && BinOpKind == BO_Sub)) &&
	         "operator must be add or sub with addend on the right");

	  bool Overflowed = false;
	  llvm::APSInt Sum = Offset;
	  if (IsAddition)
	    Sum = Offset.sadd_ov(Addend, Overflowed);
	  else
	    Sum = Offset.ssub_ov(Addend, Overflowed);

	  if (!Overflowed) {
	    Offset = Sum;
	    return;
	  }

	  // We add an offset to a pointer here so we should support an offset as
	  // big as possible: on overflow, double the width and try again.
	  assert(OffsetWidth <= std::numeric_limits<unsigned>::max() / 2 &&
	         "index (intermediate) result too big");
	  Offset = Offset.sext(2 * OffsetWidth);
	  sumOffsets(Offset, Addend, BinOpKind, AddendIsRight);
	}

	namespace {

	// This is a wrapper class around StringLiteral to support offsetted string
	// literals as format strings. It takes the offset into account when returning
	// the string and its length or the source locations to display notes correctly.
	class FormatStringLiteral {
	const StringLiteral *FExpr;
	int64_t Offset;

	public:
	FormatStringLiteral(const StringLiteral *fexpr, int64_t Offset = 0)
	: FExpr(fexpr), Offset(Offset) {}

	StringRef getString() const {
	return FExpr->getString().drop_front(Offset);
	}

	unsigned getByteLength() const {
	return FExpr->getByteLength() - getCharByteWidth() * Offset;
	}

	unsigned getLength() const { return FExpr->getLength() - Offset; }
	unsigned getCharByteWidth() const { return FExpr->getCharByteWidth(); }

	StringLiteral::StringKind getKind() const { return FExpr->getKind(); }

	QualType getType() const { return FExpr->getType(); }

	bool isAscii() const { return FExpr->isAscii(); }
	bool isWide() const { return FExpr->isWide(); }
	bool isUTF8() const { return FExpr->isUTF8(); }
	bool isUTF16() const { return FExpr->isUTF16(); }
	bool isUTF32() const { return FExpr->isUTF32(); }
	bool isPascal() const { return FExpr->isPascal(); }

	SourceLocation getLocationOfByte(
	unsigned ByteNo, const SourceManager &SM, const LangOptions &Features,
	const TargetInfo &Target, unsigned *StartToken = nullptr,
	unsigned *StartTokenByteOffset = nullptr) const {
	return FExpr->getLocationOfByte(ByteNo + Offset, SM, Features, Target,
	StartToken, StartTokenByteOffset);
	}

	SourceLocation getLocStart() const LLVM_READONLY {
	return FExpr->getLocStart().getLocWithOffset(Offset);
	}

	SourceLocation getLocEnd() const LLVM_READONLY { return FExpr->getLocEnd(); }
	};

	} // namespace

	// Forward declaration: checkFormatStringExpr below calls this once it has
	// resolved its argument to a string literal (wrapped with any offset in a
	// FormatStringLiteral). The definition is not part of this section.
	// NOTE: the parameter order here (..., Type, inFunctionCall, CallType, ...)
	// differs from checkFormatStringExpr (..., Type, CallType, InFunctionCall,
	// ...), so the two argument lists are not interchangeable.
	static void CheckFormatString(Sema &S, const FormatStringLiteral *FExpr,
	const Expr *OrigFormatExpr,
	ArrayRef<const Expr *> Args,
	bool HasVAListArg, unsigned format_idx,
	unsigned firstDataArg,
	Sema::FormatStringType Type,
	bool inFunctionCall,
	Sema::VariadicCallType CallType,
	llvm::SmallBitVector &CheckedVarArgs,
	UncoveredArgHandler &UncoveredArg);

	// Determine if an expression is a string literal or constant string.
	// If this function returns false on the arguments to a function expecting a
	// format string, we will usually need to emit a warning.
	// True string literals are then checked by CheckFormatString.
	//
	// E is the candidate format expression. Offset is the signed constant
	// offset accumulated so far from pointer arithmetic and '&lit[i]' forms;
	// it is applied when a string literal is finally reached. The remaining
	// parameters are forwarded unchanged to recursive calls and to
	// CheckFormatString.
	//
	// Returns SLCT_NotALiteral when the expression cannot be treated as a
	// literal, SLCT_UncheckedLiteral when it is acceptable but was not
	// analysed, and SLCT_CheckedLiteral when CheckFormatString was run on it.
	static StringLiteralCheckType
	checkFormatStringExpr(Sema &S, const Expr *E, ArrayRef<const Expr *> Args,
	                      bool HasVAListArg, unsigned format_idx,
	                      unsigned firstDataArg, Sema::FormatStringType Type,
	                      Sema::VariadicCallType CallType, bool InFunctionCall,
	                      llvm::SmallBitVector &CheckedVarArgs,
	                      UncoveredArgHandler &UncoveredArg,
	                      llvm::APSInt Offset) {
	 tryAgain:
	  assert(Offset.isSigned() && "invalid offset");

	  if (E->isTypeDependent() || E->isValueDependent())
	    return SLCT_NotALiteral;

	  E = E->IgnoreParenCasts();

	  if (E->isNullPointerConstant(S.Context, Expr::NPC_ValueDependentIsNotNull))
	    // Technically -Wformat-nonliteral does not warn about this case.
	    // The behavior of printf and friends in this case is implementation
	    // dependent.  Ideally if the format string cannot be null then
	    // it should have a 'nonnull' attribute in the function prototype.
	    return SLCT_UncheckedLiteral;

	  switch (E->getStmtClass()) {
	  case Stmt::BinaryConditionalOperatorClass:
	  case Stmt::ConditionalOperatorClass: {
	    // The expression is a literal if both sub-expressions were, and it was
	    // completely checked only if both sub-expressions were checked.
	    const AbstractConditionalOperator *C =
	        cast<AbstractConditionalOperator>(E);

	    // Determine whether it is necessary to check both sub-expressions, for
	    // example, because the condition expression is a constant that can be
	    // evaluated at compile time.
	    bool CheckLeft = true, CheckRight = true;

	    bool Cond;
	    if (C->getCond()->EvaluateAsBooleanCondition(Cond, S.getASTContext())) {
	      if (Cond)
	        CheckRight = false;
	      else
	        CheckLeft = false;
	    }

	    // We need to maintain the offsets for the right and the left hand side
	    // separately to check if every possible indexed expression is a valid
	    // string literal. They might have different offsets for different string
	    // literals in the end.
	    StringLiteralCheckType Left;
	    if (!CheckLeft)
	      Left = SLCT_UncheckedLiteral;
	    else {
	      Left = checkFormatStringExpr(S, C->getTrueExpr(), Args,
	                                   HasVAListArg, format_idx, firstDataArg,
	                                   Type, CallType, InFunctionCall,
	                                   CheckedVarArgs, UncoveredArg, Offset);
	      if (Left == SLCT_NotALiteral || !CheckRight) {
	        return Left;
	      }
	    }

	    StringLiteralCheckType Right =
	        checkFormatStringExpr(S, C->getFalseExpr(), Args,
	                              HasVAListArg, format_idx, firstDataArg,
	                              Type, CallType, InFunctionCall, CheckedVarArgs,
	                              UncoveredArg, Offset);

	    // The weaker (smaller) classification of the checked branches wins.
	    return (CheckLeft && Left < Right) ? Left : Right;
	  }

	  case Stmt::ImplicitCastExprClass:
	    E = cast<ImplicitCastExpr>(E)->getSubExpr();
	    goto tryAgain;

	  case Stmt::OpaqueValueExprClass:
	    if (const Expr *src = cast<OpaqueValueExpr>(E)->getSourceExpr()) {
	      E = src;
	      goto tryAgain;
	    }
	    return SLCT_NotALiteral;

	  case Stmt::PredefinedExprClass:
	    // While __func__, etc., are technically not string literals, they
	    // cannot contain format specifiers and thus are not a security
	    // liability.
	    return SLCT_UncheckedLiteral;

	  case Stmt::DeclRefExprClass: {
	    const DeclRefExpr *DR = cast<DeclRefExpr>(E);

	    // As an exception, do not flag errors for variables binding to
	    // const string literals.
	    if (const VarDecl *VD = dyn_cast<VarDecl>(DR->getDecl())) {
	      bool isConstant = false;
	      QualType T = DR->getType();

	      if (const ArrayType *AT = S.Context.getAsArrayType(T)) {
	        isConstant = AT->getElementType().isConstant(S.Context);
	      } else if (const PointerType *PT = T->getAs<PointerType>()) {
	        isConstant = T.isConstant(S.Context) &&
	                     PT->getPointeeType().isConstant(S.Context);
	      } else if (T->isObjCObjectPointerType()) {
	        // In ObjC, there is usually no "const ObjectPointer" type,
	        // so don't check if the pointee type is constant.
	        isConstant = T.isConstant(S.Context);
	      }

	      if (isConstant) {
	        if (const Expr *Init = VD->getAnyInitializer()) {
	          // Look through initializers like const char c[] = { "foo" }
	          if (const InitListExpr *InitList = dyn_cast<InitListExpr>(Init)) {
	            if (InitList->isStringLiteralInit())
	              Init = InitList->getInit(0)->IgnoreParenImpCasts();
	          }
	          return checkFormatStringExpr(S, Init, Args,
	                                       HasVAListArg, format_idx,
	                                       firstDataArg, Type, CallType,
	                                       /*InFunctionCall*/ false, CheckedVarArgs,
	                                       UncoveredArg, Offset);
	        }
	      }

	      // For vprintf* functions (i.e., HasVAListArg==true), we add a
	      // special check to see if the format string is a function parameter
	      // of the function calling the printf function.  If the function
	      // has an attribute indicating it is a printf-like function, then we
	      // should suppress warnings concerning non-literals being used in a call
	      // to a vprintf function.  For example:
	      //
	      // void
	      // logmessage(char const *fmt __attribute__ (format (printf, 1, 2)), ...){
	      //      va_list ap;
	      //      va_start(ap, fmt);
	      //      vprintf(fmt, ap);  // Do NOT emit a warning about "fmt".
	      //      ...
	      // }
	      if (HasVAListArg) {
	        if (const ParmVarDecl *PV = dyn_cast<ParmVarDecl>(VD)) {
	          if (const NamedDecl *ND = dyn_cast<NamedDecl>(PV->getDeclContext())) {
	            int PVIndex = PV->getFunctionScopeIndex() + 1;
	            // Adjust for the implicit 'this' parameter. This must happen
	            // exactly once, outside the attribute loop; otherwise the
	            // index drifts by one for every additional format attribute
	            // on the enclosing method.
	            if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(ND))
	              if (MD->isInstance())
	                ++PVIndex;
	            for (const auto *PVFormat : ND->specific_attrs<FormatAttr>()) {
	              // We also check if the formats are compatible.
	              // We can't pass a 'scanf' string to a 'printf' function.
	              if (PVIndex == PVFormat->getFormatIdx() &&
	                  Type == S.GetFormatStringType(PVFormat))
	                return SLCT_UncheckedLiteral;
	            }
	          }
	        }
	      }
	    }

	    return SLCT_NotALiteral;
	  }

	  case Stmt::CallExprClass:
	  case Stmt::CXXMemberCallExprClass: {
	    const CallExpr *CE = cast<CallExpr>(E);
	    if (const NamedDecl *ND = dyn_cast_or_null<NamedDecl>(CE->getCalleeDecl())) {
	      if (const FormatArgAttr *FA = ND->getAttr<FormatArgAttr>()) {
	        // The attribute index is 1-based and counts the implicit 'this'
	        // of instance methods; convert it to a 0-based call argument.
	        unsigned ArgIndex = FA->getFormatIdx();
	        if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(ND))
	          if (MD->isInstance())
	            --ArgIndex;
	        const Expr *Arg = CE->getArg(ArgIndex - 1);

	        return checkFormatStringExpr(S, Arg, Args,
	                                     HasVAListArg, format_idx, firstDataArg,
	                                     Type, CallType, InFunctionCall,
	                                     CheckedVarArgs, UncoveredArg, Offset);
	      } else if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND)) {
	        unsigned BuiltinID = FD->getBuiltinID();
	        if (BuiltinID == Builtin::BI__builtin___CFStringMakeConstantString ||
	            BuiltinID == Builtin::BI__builtin___NSStringMakeConstantString) {
	          const Expr *Arg = CE->getArg(0);
	          return checkFormatStringExpr(S, Arg, Args,
	                                       HasVAListArg, format_idx,
	                                       firstDataArg, Type, CallType,
	                                       InFunctionCall, CheckedVarArgs,
	                                       UncoveredArg, Offset);
	        }
	      }
	    }

	    return SLCT_NotALiteral;
	  }
	  case Stmt::ObjCMessageExprClass: {
	    const auto *ME = cast<ObjCMessageExpr>(E);
	    if (const auto *ND = ME->getMethodDecl()) {
	      if (const auto *FA = ND->getAttr<FormatArgAttr>()) {
	        unsigned ArgIndex = FA->getFormatIdx();
	        const Expr *Arg = ME->getArg(ArgIndex - 1);
	        return checkFormatStringExpr(
	            S, Arg, Args, HasVAListArg, format_idx, firstDataArg, Type,
	            CallType, InFunctionCall, CheckedVarArgs, UncoveredArg, Offset);
	      }
	    }

	    return SLCT_NotALiteral;
	  }
	  case Stmt::ObjCStringLiteralClass:
	  case Stmt::StringLiteralClass: {
	    const StringLiteral *StrE = nullptr;

	    if (const ObjCStringLiteral *ObjCFExpr = dyn_cast<ObjCStringLiteral>(E))
	      StrE = ObjCFExpr->getString();
	    else
	      StrE = cast<StringLiteral>(E);

	    if (StrE) {
	      if (Offset.isNegative() || Offset > StrE->getLength()) {
	        // TODO: It would be better to have an explicit warning for out of
	        // bounds literals.
	        return SLCT_NotALiteral;
	      }
	      FormatStringLiteral FStr(StrE, Offset.sextOrTrunc(64).getSExtValue());
	      CheckFormatString(S, &FStr, E, Args, HasVAListArg, format_idx,
	                        firstDataArg, Type, InFunctionCall, CallType,
	                        CheckedVarArgs, UncoveredArg);
	      return SLCT_CheckedLiteral;
	    }

	    return SLCT_NotALiteral;
	  }
	  case Stmt::BinaryOperatorClass: {
	    llvm::APSInt LResult;
	    llvm::APSInt RResult;

	    const BinaryOperator *BinOp = cast<BinaryOperator>(E);

	    // A string literal + an int offset is still a string literal.
	    if (BinOp->isAdditiveOp()) {
	      bool LIsInt = BinOp->getLHS()->EvaluateAsInt(LResult, S.Context);
	      bool RIsInt = BinOp->getRHS()->EvaluateAsInt(RResult, S.Context);

	      // Exactly one side must be the integer constant.
	      if (LIsInt != RIsInt) {
	        BinaryOperatorKind BinOpKind = BinOp->getOpcode();

	        if (LIsInt) {
	          // 'int - string' is not an offset into the string; only
	          // addition is accepted with the integer on the left.
	          if (BinOpKind == BO_Add) {
	            sumOffsets(Offset, LResult, BinOpKind, RIsInt);
	            E = BinOp->getRHS();
	            goto tryAgain;
	          }
	        } else {
	          sumOffsets(Offset, RResult, BinOpKind, RIsInt);
	          E = BinOp->getLHS();
	          goto tryAgain;
	        }
	      }
	    }

	    return SLCT_NotALiteral;
	  }
	  case Stmt::UnaryOperatorClass: {
	    // Handle '&base[constant]' by folding the index into the offset.
	    const UnaryOperator *UnaOp = cast<UnaryOperator>(E);
	    auto ASE = dyn_cast<ArraySubscriptExpr>(UnaOp->getSubExpr());
	    if (UnaOp->getOpcode() == UO_AddrOf && ASE) {
	      llvm::APSInt IndexResult;
	      if (ASE->getRHS()->EvaluateAsInt(IndexResult, S.Context)) {
	        sumOffsets(Offset, IndexResult, BO_Add, /*RHS is int*/ true);
	        E = ASE->getBase();
	        goto tryAgain;
	      }
	    }

	    return SLCT_NotALiteral;
	  }

	  default:
	    return SLCT_NotALiteral;
	  }
	}

	/// Map the archetype name of a format attribute to the corresponding
	/// FormatStringType; names not listed here yield FST_Unknown.
	Sema::FormatStringType Sema::GetFormatStringType(const FormatAttr *Format) {
	  StringRef Archetype = Format->getType()->getName();
	  llvm::StringSwitch<FormatStringType> Kind(Archetype);
	  return Kind.Cases("printf", "printf0", FST_Printf)
	      .Case("scanf", FST_Scanf)
	      .Cases("NSString", "CFString", FST_NSString)
	      .Case("strftime", FST_Strftime)
	      .Case("strfmon", FST_Strfmon)
	      .Cases("kprintf", "cmn_err", "vcmn_err", "zcmn_err", FST_Kprintf)
	      .Case("freebsd_kprintf", FST_FreeBSDKPrintf)
	      .Cases("os_trace", "os_log", FST_OSLog)
	      .Default(FST_Unknown);
	}

	/// CheckFormatArguments - Check calls to printf and scanf (and similar
	/// functions) for correct use of format strings.
	/// Returns true if a format string has been fully checked.
	bool Sema::CheckFormatArguments(const FormatAttr *Format,
	ArrayRef<const Expr *> Args,
	bool IsCXXMember,
	VariadicCallType CallType,
	SourceLocation Loc, SourceRange Range,
	llvm::SmallBitVector &CheckedVarArgs) {
	FormatStringInfo FSI;
	if (getFormatStringInfo(Format, IsCXXMember, &FSI))
	return CheckFormatArguments(Args, FSI.HasVAListArg, FSI.FormatIdx,
	FSI.FirstDataArg, GetFormatStringType(Format),
	CallType, Loc, Range, CheckedVarArgs);
	return false;
	}

	/// Check the format string of a printf/scanf-like call.
	///
	/// \param Args          all arguments of the call.
	/// \param HasVAListArg  whether the data arguments are passed as a va_list.
	/// \param format_idx    index into \p Args of the format string.
	/// \param firstDataArg  index into \p Args of the first data argument.
	/// \param Type          kind of format string expected.
	/// \param CallType      forwarded to the format-string checker.
	/// \param Loc, Range    location and range used for the "missing format
	///                      string" diagnostic.
	/// \param CheckedVarArgs forwarded to the format-string checker.
	///
	/// \returns true only if the format argument resolved to a string literal
	/// that was actually checked; false in every other case (including when a
	/// non-literal warning is emitted).
	bool Sema::CheckFormatArguments(ArrayRef<const Expr *> Args,
	                                bool HasVAListArg, unsigned format_idx,
	                                unsigned firstDataArg, FormatStringType Type,
	                                VariadicCallType CallType,
	                                SourceLocation Loc, SourceRange Range,
	                                llvm::SmallBitVector &CheckedVarArgs) {
	  // CHECK: printf/scanf-like function is called with no format string.
	  if (format_idx >= Args.size()) {
	    Diag(Loc, diag::warn_missing_format_string) << Range;
	    return false;
	  }

	  const Expr *OrigFormatExpr = Args[format_idx]->IgnoreParenCasts();

	  // CHECK: format string is not a string literal.
	  //
	  // Dynamically generated format strings are difficult to
	  // automatically vet at compile time.  Requiring that format strings
	  // are string literals: (1) permits the checking of format strings by
	  // the compiler and thereby (2) can practically remove the source of
	  // many format string exploits.

	  // Format string can be either ObjC string (e.g. @"%d") or
	  // C string (e.g. "%d")
	  // ObjC string uses the same format specifiers as C string, so we can use
	  // the same format string checking logic for both ObjC and C strings.
	  UncoveredArgHandler UncoveredArg;
	  StringLiteralCheckType CT =
	      checkFormatStringExpr(*this, OrigFormatExpr, Args, HasVAListArg,
	                            format_idx, firstDataArg, Type, CallType,
	                            /*IsFunctionCall*/ true, CheckedVarArgs,
	                            UncoveredArg,
	                            /*no string offset*/ llvm::APSInt(64, false) = 0);

	  // Generate a diagnostic where an uncovered argument is detected.
	  if (UncoveredArg.hasUncoveredArg()) {
	    unsigned ArgIdx = UncoveredArg.getUncoveredArg() + firstDataArg;
	    assert(ArgIdx < Args.size() && "ArgIdx outside bounds");
	    UncoveredArg.Diagnose(*this, /*IsFunctionCall*/true, Args[ArgIdx]);
	  }

	  if (CT != SLCT_NotALiteral)
	    // Literal format string found, check done!
	    return CT == SLCT_CheckedLiteral;

	  // Strftime is particular as it always uses a single 'time' argument,
	  // so it is safe to pass a non-literal string.
	  if (Type == FST_Strftime)
	    return false;

	  // Do not emit diag when the string param is a macro expansion and the
	  // format is either NSString or CFString. This is a hack to prevent
	  // diag when using the NSLocalizedString and CFCopyLocalizedString macros
	  // which are usually used in place of NS and CF string literals.
	  SourceLocation FormatLoc = Args[format_idx]->getLocStart();
	  if (Type == FST_NSString && SourceMgr.isInSystemMacro(FormatLoc))
	    return false;

	  // If there are no arguments specified, warn with -Wformat-security, otherwise
	  // warn only with -Wformat-nonliteral.
	  if (Args.size() == firstDataArg) {
	    Diag(FormatLoc, diag::warn_format_nonliteral_noargs)
	      << OrigFormatExpr->getSourceRange();
	    // Offer a fix-it that inserts a literal format in front of the
	    // non-literal argument, for the string kinds where that is known.
	    switch (Type) {
	    default:
	      break;
	    case FST_Kprintf:
	    case FST_FreeBSDKPrintf:
	    case FST_Printf:
	      Diag(FormatLoc, diag::note_format_security_fixit)
	        << FixItHint::CreateInsertion(FormatLoc, "\"%s\", ");
	      break;
	    case FST_NSString:
	      Diag(FormatLoc, diag::note_format_security_fixit)
	        << FixItHint::CreateInsertion(FormatLoc, "@\"%@\", ");
	      break;
	    }
	  } else {
	    Diag(FormatLoc, diag::warn_format_nonliteral)
	      << OrigFormatExpr->getSourceRange();
	  }
	  return false;
	}

	namespace {

	// Format-string callback handler that carries the state shared by the
	// diagnostics in this file: the Sema instance, the (possibly offset) format
	// literal, the call's arguments and the per-argument coverage bits.
	// The protected members suggest it is meant to be subclassed; no subclass is
	// visible in this section.
	// NOTE: the constructor's initializer list follows the member declaration
	// order below; keep the two in sync when adding members.
	class CheckFormatHandler : public analyze_format_string::FormatStringHandler {
	protected:
	Sema &S;
	// Format literal being checked (with any offset applied).
	const FormatStringLiteral *FExpr;
	// Expression the format string came from; used for source ranges.
	const Expr *OrigFormatExpr;
	const Sema::FormatStringType FSType;
	// Index into Args of the first data argument (see getDataArg).
	const unsigned FirstDataArg;
	// Number of data arguments; also the size of CoveredArgs.
	const unsigned NumDataArgs;
	const char *Beg; // Start of format string.
	const bool HasVAListArg;
	ArrayRef<const Expr *> Args;
	// Index into Args of the format-string argument.
	unsigned FormatIdx;
	// One bit per data argument. Bits are set as specifiers consume arguments;
	// DoneProcessing flips the vector to find the first uncovered one.
	llvm::SmallBitVector CoveredArgs;
	bool usesPositionalArgs = false;
	bool atFirstArg = true;
	bool inFunctionCall;
	Sema::VariadicCallType CallType;
	llvm::SmallBitVector &CheckedVarArgs;
	// Shared with the caller so results from several candidate strings merge.
	UncoveredArgHandler &UncoveredArg;

	public:
	// Stores every argument in the corresponding member and sizes CoveredArgs
	// to numDataArgs with all bits cleared.
	CheckFormatHandler(Sema &s, const FormatStringLiteral *fexpr,
	const Expr *origFormatExpr,
	const Sema::FormatStringType type, unsigned firstDataArg,
	unsigned numDataArgs, const char *beg, bool hasVAListArg,
	ArrayRef<const Expr *> Args, unsigned formatIdx,
	bool inFunctionCall, Sema::VariadicCallType callType,
	llvm::SmallBitVector &CheckedVarArgs,
	UncoveredArgHandler &UncoveredArg)
	: S(s), FExpr(fexpr), OrigFormatExpr(origFormatExpr), FSType(type),
	FirstDataArg(firstDataArg), NumDataArgs(numDataArgs), Beg(beg),
	HasVAListArg(hasVAListArg), Args(Args), FormatIdx(formatIdx),
	inFunctionCall(inFunctionCall), CallType(callType),
	CheckedVarArgs(CheckedVarArgs), UncoveredArg(UncoveredArg) {
	CoveredArgs.resize(numDataArgs);
	CoveredArgs.reset();
	}

	// Reports the first uncovered data argument (if any) to UncoveredArg.
	void DoneProcessing();

	void HandleIncompleteSpecifier(const char *startSpecifier,
	unsigned specifierLen) override;

	// The three handlers below are not marked 'override' in this declaration.
	void HandleInvalidLengthModifier(
	const analyze_format_string::FormatSpecifier &FS,
	const analyze_format_string::ConversionSpecifier &CS,
	const char *startSpecifier, unsigned specifierLen,
	unsigned DiagID);

	void HandleNonStandardLengthModifier(
	const analyze_format_string::FormatSpecifier &FS,
	const char *startSpecifier, unsigned specifierLen);

	void HandleNonStandardConversionSpecifier(
	const analyze_format_string::ConversionSpecifier &CS,
	const char *startSpecifier, unsigned specifierLen);

	void HandlePosition(const char *startPos, unsigned posLen) override;

	void HandleInvalidPosition(const char *startSpecifier,
	unsigned specifierLen,
	analyze_format_string::PositionContext p) override;

	void HandleZeroPosition(const char *startPos, unsigned posLen) override;

	void HandleNullChar(const char *nullCharacter) override;

	// Static form usable without a handler instance (UncoveredArgHandler's
	// Diagnose calls it directly).
	template <typename Range>
	static void
	EmitFormatDiagnostic(Sema &S, bool inFunctionCall, const Expr *ArgumentExpr,
	const PartialDiagnostic &PDiag, SourceLocation StringLoc,
	bool IsStringLocation, Range StringRange,
	ArrayRef<FixItHint> Fixit = None);

	protected:
	// Returns false when processing of the format string should stop.
	bool HandleInvalidConversionSpecifier(unsigned argIndex, SourceLocation Loc,
	const char *startSpec,
	unsigned specifierLen,
	const char *csStart, unsigned csLen);

	void HandlePositionalNonpositionalArgs(SourceLocation Loc,
	const char *startSpec,
	unsigned specifierLen);

	// Source-location helpers; pointer arguments point into the format string
	// buffer that starts at Beg.
	SourceRange getFormatStringRange();
	CharSourceRange getSpecifierRange(const char *startSpecifier,
	unsigned specifierLen);
	SourceLocation getLocationOfByte(const char *x);

	// Returns Args[FirstDataArg + i].
	const Expr *getDataArg(unsigned i) const;

	// Returns false (after diagnosing) when argIndex >= NumDataArgs.
	bool CheckNumArgs(const analyze_format_string::FormatSpecifier &FS,
	const analyze_format_string::ConversionSpecifier &CS,
	const char *startSpecifier, unsigned specifierLen,
	unsigned argIndex);

	// Instance form: forwards to the static overload using this handler's S,
	// inFunctionCall and Args[FormatIdx].
	template <typename Range>
	void EmitFormatDiagnostic(PartialDiagnostic PDiag, SourceLocation StringLoc,
	bool IsStringLocation, Range StringRange,
	ArrayRef<FixItHint> Fixit = None);
	};

	} // namespace

	/// Source range of the expression the format string originated from.
	SourceRange CheckFormatHandler::getFormatStringRange() {
	  const Expr *FormatExpr = OrigFormatExpr;
	  return FormatExpr->getSourceRange();
	}

	/// Character range covering \p specifierLen bytes of the format string
	/// starting at \p startSpecifier.
	CharSourceRange
	CheckFormatHandler::getSpecifierRange(const char *startSpecifier,
	                                      unsigned specifierLen) {
	  const char *LastByte = startSpecifier + specifierLen - 1;

	  SourceLocation Begin = getLocationOfByte(startSpecifier);
	  // Char ranges are half-open, so the end is one past the last byte.
	  SourceLocation PastEnd = getLocationOfByte(LastByte).getLocWithOffset(1);

	  return CharSourceRange::getCharRange(Begin, PastEnd);
	}

	/// Translate a pointer into the format string buffer (which starts at Beg)
	/// into a source location inside the format literal.
	SourceLocation CheckFormatHandler::getLocationOfByte(const char *x) {
	  const unsigned ByteNo = x - Beg;
	  const SourceManager &SM = S.getSourceManager();
	  return FExpr->getLocationOfByte(ByteNo, SM, S.getLangOpts(),
	                                  S.Context.getTargetInfo());
	}

	/// Warn about a conversion specifier that ends before it is complete.
	/// The diagnostic is placed at the start of the specifier and highlights
	/// its \p specifierLen bytes.
	void CheckFormatHandler::HandleIncompleteSpecifier(const char *startSpecifier,
	                                                   unsigned specifierLen){
	  EmitFormatDiagnostic(S.PDiag(diag::warn_printf_incomplete_specifier),
	                       getLocationOfByte(startSpecifier),
	                       /*IsStringLocation*/true,
	                       getSpecifierRange(startSpecifier, specifierLen));
	}

	/// Diagnose a length modifier that is invalid for its conversion
	/// specifier, using diagnostic \p DiagID.
	///
	/// When a corrected modifier is known, a note with a replacement fix-it is
	/// attached. Otherwise, for diag::warn_format_nonsensical_length only, the
	/// warning itself carries a fix-it that removes the modifier.
	void CheckFormatHandler::HandleInvalidLengthModifier(
	    const analyze_format_string::FormatSpecifier &FS,
	    const analyze_format_string::ConversionSpecifier &CS,
	    const char *startSpecifier, unsigned specifierLen, unsigned DiagID) {
	  using namespace analyze_format_string;

	  const LengthModifier &LM = FS.getLengthModifier();
	  CharSourceRange LMRange = getSpecifierRange(LM.getStart(), LM.getLength());

	  // See if we know how to fix this length modifier.
	  Optional<LengthModifier> FixedLM = FS.getCorrectedLengthModifier();
	  if (FixedLM) {
	    EmitFormatDiagnostic(S.PDiag(DiagID) << LM.toString() << CS.toString(),
	                         getLocationOfByte(LM.getStart()),
	                         /*IsStringLocation*/true,
	                         getSpecifierRange(startSpecifier, specifierLen));

	    S.Diag(getLocationOfByte(LM.getStart()), diag::note_format_fix_specifier)
	      << FixedLM->toString()
	      << FixItHint::CreateReplacement(LMRange, FixedLM->toString());

	  } else {
	    // Default-constructed hint; only filled in for the nonsensical case.
	    FixItHint Hint;
	    if (DiagID == diag::warn_format_nonsensical_length)
	      Hint = FixItHint::CreateRemoval(LMRange);

	    EmitFormatDiagnostic(S.PDiag(DiagID) << LM.toString() << CS.toString(),
	                         getLocationOfByte(LM.getStart()),
	                         /*IsStringLocation*/true,
	                         getSpecifierRange(startSpecifier, specifierLen),
	                         Hint);
	  }
	}

	/// Warn that the length modifier of \p FS is non-standard. If a corrected
	/// modifier is known, also emit a note with a replacement fix-it.
	void CheckFormatHandler::HandleNonStandardLengthModifier(
	    const analyze_format_string::FormatSpecifier &FS,
	    const char *startSpecifier, unsigned specifierLen) {
	  using namespace analyze_format_string;

	  const LengthModifier &LM = FS.getLengthModifier();
	  CharSourceRange LMRange = getSpecifierRange(LM.getStart(), LM.getLength());

	  // See if we know how to fix this length modifier.
	  Optional<LengthModifier> FixedLM = FS.getCorrectedLengthModifier();
	  if (FixedLM) {
	    // The trailing 0 is the diagnostic's selector argument; the conversion
	    // specifier variant of this warning passes 1 instead.
	    EmitFormatDiagnostic(S.PDiag(diag::warn_format_non_standard)
	                           << LM.toString() << 0,
	                         getLocationOfByte(LM.getStart()),
	                         /*IsStringLocation*/true,
	                         getSpecifierRange(startSpecifier, specifierLen));

	    S.Diag(getLocationOfByte(LM.getStart()), diag::note_format_fix_specifier)
	      << FixedLM->toString()
	      << FixItHint::CreateReplacement(LMRange, FixedLM->toString());

	  } else {
	    EmitFormatDiagnostic(S.PDiag(diag::warn_format_non_standard)
	                           << LM.toString() << 0,
	                         getLocationOfByte(LM.getStart()),
	                         /*IsStringLocation*/true,
	                         getSpecifierRange(startSpecifier, specifierLen));
	  }
	}

	/// Warn that conversion specifier \p CS is non-standard. If a standard
	/// equivalent is known, also emit a note with a replacement fix-it.
	void CheckFormatHandler::HandleNonStandardConversionSpecifier(
	    const analyze_format_string::ConversionSpecifier &CS,
	    const char *startSpecifier, unsigned specifierLen) {
	  using namespace analyze_format_string;

	  // See if we know how to fix this conversion specifier.
	  Optional<ConversionSpecifier> FixedCS = CS.getStandardSpecifier();
	  if (FixedCS) {
	    EmitFormatDiagnostic(S.PDiag(diag::warn_format_non_standard)
	                          << CS.toString() << /*conversion specifier*/1,
	                         getLocationOfByte(CS.getStart()),
	                         /*IsStringLocation*/true,
	                         getSpecifierRange(startSpecifier, specifierLen));

	    CharSourceRange CSRange = getSpecifierRange(CS.getStart(), CS.getLength());
	    S.Diag(getLocationOfByte(CS.getStart()), diag::note_format_fix_specifier)
	      << FixedCS->toString()
	      << FixItHint::CreateReplacement(CSRange, FixedCS->toString());
	  } else {
	    EmitFormatDiagnostic(S.PDiag(diag::warn_format_non_standard)
	                          << CS.toString() << /*conversion specifier*/1,
	                         getLocationOfByte(CS.getStart()),
	                         /*IsStringLocation*/true,
	                         getSpecifierRange(startSpecifier, specifierLen));
	  }
	}

	/// Warn that a positional argument, which is non-standard, is used at
	/// \p startPos (spanning \p posLen bytes).
	void CheckFormatHandler::HandlePosition(const char *startPos,
	                                        unsigned posLen) {
	  EmitFormatDiagnostic(S.PDiag(diag::warn_format_non_standard_positional_arg),
	                       getLocationOfByte(startPos),
	                       /*IsStringLocation*/true,
	                       getSpecifierRange(startPos, posLen));
	}

	/// Warn about an invalid positional specifier at \p startPos; \p p is
	/// passed to the diagnostic as an unsigned selector.
	void
	CheckFormatHandler::HandleInvalidPosition(const char *startPos, unsigned posLen,
	                                     analyze_format_string::PositionContext p) {
	  EmitFormatDiagnostic(S.PDiag(diag::warn_format_invalid_positional_specifier)
	                         << (unsigned) p,
	                       getLocationOfByte(startPos), /*IsStringLocation*/true,
	                       getSpecifierRange(startPos, posLen));
	}

	/// Warn about a positional specifier whose position is zero, located at
	/// \p startPos and spanning \p posLen bytes.
	void CheckFormatHandler::HandleZeroPosition(const char *startPos,
	                                            unsigned posLen) {
	  EmitFormatDiagnostic(S.PDiag(diag::warn_format_zero_positional_specifier),
	                       getLocationOfByte(startPos),
	                       /*IsStringLocation*/true,
	                       getSpecifierRange(startPos, posLen));
	}

	/// Warn about a null character embedded in the format string. No warning
	/// is emitted when the original format expression is an Objective-C
	/// string literal.
	void CheckFormatHandler::HandleNullChar(const char *nullCharacter) {
	  if (!isa<ObjCStringLiteral>(OrigFormatExpr)) {
	    // The presence of a null character is likely an error.
	    EmitFormatDiagnostic(
	      S.PDiag(diag::warn_printf_format_string_contains_null_char),
	      getLocationOfByte(nullCharacter), /*IsStringLocation*/true,
	      getFormatStringRange());
	  }
	}

	// Fetch the i-th data argument of the call, i.e. the argument at position
	// FirstDataArg + i. The result may be NULL if there was an error parsing
	// or building one of the argument expressions.
	const Expr *CheckFormatHandler::getDataArg(unsigned i) const {
	  const unsigned ArgNo = FirstDataArg + i;
	  return Args[ArgNo];
	}

	void CheckFormatHandler::DoneProcessing() {
	// Does the number of data arguments exceed the number of
	// format conversions in the format string?
	if (!HasVAListArg) {
	// Find any arguments that weren't covered.
	CoveredArgs.flip();
	signed notCoveredArg = CoveredArgs.find_first();
	if (notCoveredArg >= 0) {
	assert((unsigned)notCoveredArg < NumDataArgs);
	UncoveredArg.Update(notCoveredArg, OrigFormatExpr);
	} else {
	UncoveredArg.setAllCovered();
	}
	}
	}

	/// Emit the "data argument not used" warning for \p ArgExpr, attaching the
	/// source range of every format string recorded for the uncovered index.
	///
	/// Nothing is emitted when \p ArgExpr is null or when its start location
	/// lies in a system macro.
	void UncoveredArgHandler::Diagnose(Sema &S, bool IsFunctionCall,
	                                   const Expr *ArgExpr) {
	  assert(hasUncoveredArg() && DiagnosticExprs.size() > 0 &&
	         "Invalid state");

	  if (!ArgExpr)
	    return;

	  SourceLocation Loc = ArgExpr->getLocStart();

	  if (S.getSourceManager().isInSystemMacro(Loc))
	    return;

	  PartialDiagnostic PDiag = S.PDiag(diag::warn_printf_data_arg_not_used);
	  for (auto E : DiagnosticExprs)
	    PDiag << E->getSourceRange();

	  // The first recorded format string stands in as the format argument.
	  CheckFormatHandler::EmitFormatDiagnostic(
	                                  S, IsFunctionCall, DiagnosticExprs[0],
	                                  PDiag, Loc, /*IsStringLocation*/false,
	                                  DiagnosticExprs[0]->getSourceRange());
	}

	/// Diagnose an invalid conversion specifier.
	///
	/// \param argIndex     index of the data argument the specifier would
	///                     consume; it is marked covered when in range.
	/// \param Loc          location for the diagnostic.
	/// \param startSpec    start of the whole specifier in the format string.
	/// \param specifierLen length in bytes of the whole specifier.
	/// \param csStart      start of the conversion character(s).
	/// \param csLen        length in bytes of the conversion character(s).
	///
	/// \returns true if processing of the format string should continue,
	/// false if \p argIndex was past the last data argument.
	bool
	CheckFormatHandler::HandleInvalidConversionSpecifier(unsigned argIndex,
	                                                     SourceLocation Loc,
	                                                     const char *startSpec,
	                                                     unsigned specifierLen,
	                                                     const char *csStart,
	                                                     unsigned csLen) {
	  bool keepGoing = true;
	  if (argIndex < NumDataArgs) {
	    // Consider the argument covered, even though the specifier doesn't
	    // make sense.
	    CoveredArgs.set(argIndex);
	  }
	  else {
	    // If argIndex exceeds the number of data arguments we
	    // don't issue a warning because that is just a cascade of warnings (and
	    // they may have intended '%%' anyway). We don't want to continue processing
	    // the format string after this point, however, as we will likely just get
	    // gibberish when trying to match arguments.
	    keepGoing = false;
	  }

	  StringRef Specifier(csStart, csLen);

	  // If the specifier is non-printable, it could be the first byte of a UTF-8
	  // sequence. In that case, print the UTF-8 code point. If not, print the byte
	  // hex value.
	  std::string CodePointStr;
	  if (!llvm::sys::locale::isPrint(*csStart)) {
	    llvm::UTF32 CodePoint;
	    // convertUTF8Sequence takes the source as pointer-to-pointer, hence the
	    // address of csStart.
	    const llvm::UTF8 **B = reinterpret_cast<const llvm::UTF8 **>(&csStart);
	    const llvm::UTF8 *E =
	        reinterpret_cast<const llvm::UTF8 *>(csStart + csLen);
	    llvm::ConversionResult Result =
	        llvm::convertUTF8Sequence(B, E, &CodePoint, llvm::strictConversion);

	    // Not a valid UTF-8 sequence: fall back to the raw first byte.
	    if (Result != llvm::conversionOK) {
	      unsigned char FirstChar = *csStart;
	      CodePoint = (llvm::UTF32)FirstChar;
	    }

	    llvm::raw_string_ostream OS(CodePointStr);
	    if (CodePoint < 256)
	      OS << "\\x" << llvm::format("%02x", CodePoint);
	    else if (CodePoint <= 0xFFFF)
	      OS << "\\u" << llvm::format("%04x", CodePoint);
	    else
	      OS << "\\U" << llvm::format("%08x", CodePoint);
	    OS.flush();
	    Specifier = CodePointStr;
	  }

	  EmitFormatDiagnostic(
	      S.PDiag(diag::warn_format_invalid_conversion) << Specifier, Loc,
	      /*IsStringLocation*/ true, getSpecifierRange(startSpec, specifierLen));

	  return keepGoing;
	}

	/// Warn that the format string mixes positional and non-positional
	/// arguments. The diagnostic is emitted at \p Loc and highlights the
	/// specifier starting at \p startSpec.
	void
	CheckFormatHandler::HandlePositionalNonpositionalArgs(SourceLocation Loc,
	                                                      const char *startSpec,
	                                                      unsigned specifierLen) {
	  EmitFormatDiagnostic(
	    S.PDiag(diag::warn_format_mix_positional_nonpositional_args),
	    Loc, /*isStringLoc*/true, getSpecifierRange(startSpec, specifierLen));
	}

	/// Verify that \p argIndex refers to one of the NumDataArgs data arguments.
	///
	/// When it does not, a diagnostic is emitted at the conversion specifier:
	/// warn_printf_positional_arg_exceeds_data_args if \p FS uses a positional
	/// argument, warn_printf_insufficient_data_args otherwise.
	///
	/// \returns true if \p argIndex is in range, false after diagnosing.
	bool
	CheckFormatHandler::CheckNumArgs(
	    const analyze_format_string::FormatSpecifier &FS,
	    const analyze_format_string::ConversionSpecifier &CS,
	    const char *startSpecifier, unsigned specifierLen, unsigned argIndex) {

	  if (argIndex >= NumDataArgs) {
	    PartialDiagnostic PDiag = FS.usesPositionalArg()
	      ? (S.PDiag(diag::warn_printf_positional_arg_exceeds_data_args)
	           << (argIndex+1) << NumDataArgs)
	      : S.PDiag(diag::warn_printf_insufficient_data_args);
	    EmitFormatDiagnostic(
	        PDiag, getLocationOfByte(CS.getStart()), /*IsStringLocation*/true,
	        getSpecifierRange(startSpecifier, specifierLen));

	    // The format string refers to an argument index at or beyond the number
	    // of data arguments supplied, so every supplied argument is treated as
	    // covered; record that on the uncovered-argument tracker.
	    UncoveredArg.setAllCovered();
	    return false;
	  }
	  return true;
	}

	/// Convenience overload that forwards to the Sema-taking
	/// EmitFormatDiagnostic, supplying this handler's Sema, its inFunctionCall
	/// flag and the format-string argument expression (Args[FormatIdx]).
	template <typename Range>
	void CheckFormatHandler::EmitFormatDiagnostic(PartialDiagnostic PDiag,
	                                              SourceLocation Loc,
	                                              bool IsStringLocation,
	                                              Range StringRange,
	                                              ArrayRef<FixItHint> FixIt) {
	  const Expr *FormatArg = Args[FormatIdx];
	  EmitFormatDiagnostic(S, inFunctionCall, FormatArg, PDiag, Loc,
	                       IsStringLocation, StringRange, FixIt);
	}

	/// \brief Emit a format-string diagnostic, adding a note when the format
	/// string is not written inside the function call itself.
	///
	/// \param InFunctionCall true when the format string appears within the
	/// function call; a single diagnostic is produced. When false, the
	/// diagnostic is followed by a note_format_string_defined note.
	///
	/// \param ArgumentExpr the expression passed as the format string argument
	/// of the call. It supplies locations when two diagnostics are emitted.
	///
	/// \param PDiag the diagnostic, with its message arguments already filled in
	/// by the caller. This function only attaches locations, ranges and fix-its.
	///
	/// \param Loc the primary location for the diagnostic. When two diagnostics
	/// are needed, one of them is placed at Loc and the other at a location
	/// derived from ArgumentExpr or StringRange.
	///
	/// \param IsStringLocation true when Loc points into the format string, in
	/// which case Loc is used for the note and the warning is placed on
	/// ArgumentExpr. Otherwise Loc points into the argument list and is used
	/// for PDiag, and the note is placed at the start of StringRange.
	///
	/// \param StringRange some or all of the format string to highlight.
	/// Templated so that either a CharSourceRange or a SourceRange is accepted.
	///
	/// \param FixIt optional fix-it hints for the format string.
	template <typename Range>
	void CheckFormatHandler::EmitFormatDiagnostic(
	    Sema &S, bool InFunctionCall, const Expr *ArgumentExpr,
	    const PartialDiagnostic &PDiag, SourceLocation Loc, bool IsStringLocation,
	    Range StringRange, ArrayRef<FixItHint> FixIt) {
	  // Simple case: one diagnostic carrying the range and the fix-its.
	  if (InFunctionCall) {
	    const Sema::SemaDiagnosticBuilder &Builder = S.Diag(Loc, PDiag);
	    Builder << StringRange;
	    Builder << FixIt;
	    return;
	  }

	  // Two-part case: the warning first ...
	  SourceLocation WarnLoc = Loc;
	  if (IsStringLocation)
	    WarnLoc = ArgumentExpr->getExprLoc();
	  S.Diag(WarnLoc, PDiag) << ArgumentExpr->getSourceRange();

	  // ... then the note pointing at the format string definition.
	  SourceLocation NoteLoc = IsStringLocation ? Loc : StringRange.getBegin();
	  const Sema::SemaDiagnosticBuilder &NoteBuilder =
	      S.Diag(NoteLoc, diag::note_format_string_defined);
	  NoteBuilder << StringRange;
	  NoteBuilder << FixIt;
	}

	//===--- CHECK: Printf format string checking ------------------------------===//

	namespace {

	/// Format-string handler for printf-like format strings. It only adds
	/// printf-specific callbacks on top of CheckFormatHandler; all constructor
	/// arguments except \c isObjC are forwarded unchanged to the base class.
	class CheckPrintfHandler : public CheckFormatHandler {
	public:
	  // NOTE: 'isObjC' is accepted but not used by this constructor.
	  CheckPrintfHandler(Sema &s, const FormatStringLiteral *fexpr,
	                     const Expr *origFormatExpr,
	                     const Sema::FormatStringType type, unsigned firstDataArg,
	                     unsigned numDataArgs, bool isObjC, const char *beg,
	                     bool hasVAListArg, ArrayRef<const Expr *> Args,
	                     unsigned formatIdx, bool inFunctionCall,
	                     Sema::VariadicCallType CallType,
	                     llvm::SmallBitVector &CheckedVarArgs,
	                     UncoveredArgHandler &UncoveredArg)
	      : CheckFormatHandler(s, fexpr, origFormatExpr, type, firstDataArg,
	                           numDataArgs, beg, hasVAListArg, Args, formatIdx,
	                           inFunctionCall, CallType, CheckedVarArgs,
	                           UncoveredArg) {}

	  /// Returns true if the format string type is FST_NSString.
	  bool isObjCContext() const { return FSType == Sema::FST_NSString; }

	  /// Returns true if '%@' specifiers are allowed in the format string.
	  bool allowsObjCArg() const {
	    return FSType == Sema::FST_NSString || FSType == Sema::FST_OSLog ||
	           FSType == Sema::FST_OSTrace;
	  }

	  bool HandleInvalidPrintfConversionSpecifier(
	                                      const analyze_printf::PrintfSpecifier &FS,
	                                      const char *startSpecifier,
	                                      unsigned specifierLen) override;

	  bool HandlePrintfSpecifier(const analyze_printf::PrintfSpecifier &FS,
	                             const char *startSpecifier,
	                             unsigned specifierLen) override;

	  /// Type check the data argument \p E against the specifier \p FS.
	  bool checkFormatExpr(const analyze_printf::PrintfSpecifier &FS,
	                       const char *StartSpecifier,
	                       unsigned SpecifierLen,
	                       const Expr *E);

	  /// Check a field width (k == 0) or precision (k == 1) amount.
	  bool HandleAmount(const analyze_format_string::OptionalAmount &Amt, unsigned k,
	                    const char *startSpecifier, unsigned specifierLen);
	  void HandleInvalidAmount(const analyze_printf::PrintfSpecifier &FS,
	                           const analyze_printf::OptionalAmount &Amt,
	                           unsigned type,
	                           const char *startSpecifier, unsigned specifierLen);
	  void HandleFlag(const analyze_printf::PrintfSpecifier &FS,
	                  const analyze_printf::OptionalFlag &flag,
	                  const char *startSpecifier, unsigned specifierLen);
	  void HandleIgnoredFlag(const analyze_printf::PrintfSpecifier &FS,
	                         const analyze_printf::OptionalFlag &ignoredFlag,
	                         const analyze_printf::OptionalFlag &flag,
	                         const char *startSpecifier, unsigned specifierLen);
	  bool checkForCStrMembers(const analyze_printf::ArgType &AT,
	                           const Expr *E);

	  void HandleEmptyObjCModifierFlag(const char *startFlag,
	                                   unsigned flagLen) override;

	  void HandleInvalidObjCModifierFlag(const char *startFlag,
	                                     unsigned flagLen) override;

	  void HandleObjCFlagsWithNonObjCConversion(const char *flagsStart,
	                                            const char *flagsEnd,
	                                            const char *conversionPosition)
	                                              override;
	};

	} // namespace

	/// Report an invalid printf conversion specifier by delegating to
	/// HandleInvalidConversionSpecifier with the argument index of \p FS and the
	/// start and length of its conversion specifier.
	///
	/// \returns whatever HandleInvalidConversionSpecifier returns.
	bool CheckPrintfHandler::HandleInvalidPrintfConversionSpecifier(
	    const analyze_printf::PrintfSpecifier &FS,
	    const char *startSpecifier,
	    unsigned specifierLen) {
	  const analyze_printf::PrintfConversionSpecifier &ConvSpec =
	      FS.getConversionSpecifier();
	  const char *ConvStart = ConvSpec.getStart();

	  return HandleInvalidConversionSpecifier(
	      FS.getArgIndex(), getLocationOfByte(ConvStart), startSpecifier,
	      specifierLen, ConvStart, ConvSpec.getLength());
	}

	/// Check a field width or precision that is taken from a data argument.
	///
	/// \param Amt the amount to check.
	/// \param k selector streamed into the diagnostics; callers in this file
	/// pass 0 for a field width and 1 for a precision.
	/// \param startSpecifier start of the enclosing format specifier.
	/// \param specifierLen length in bytes of the enclosing format specifier.
	///
	/// \returns false if a diagnostic was emitted or the data argument is
	/// unavailable, so that callers stop checking this specifier; true
	/// otherwise.
	bool CheckPrintfHandler::HandleAmount(
	    const analyze_format_string::OptionalAmount &Amt,
	    unsigned k, const char *startSpecifier,
	    unsigned specifierLen) {
	  // Nothing to check unless the amount consumes a data argument and the
	  // arguments are not passed through a va_list.
	  if (!Amt.hasDataArgument() || HasVAListArg)
	    return true;

	  unsigned argIndex = Amt.getArgIndex();
	  if (argIndex >= NumDataArgs) {
	    EmitFormatDiagnostic(S.PDiag(diag::warn_printf_asterisk_missing_arg)
	                           << k,
	                         getLocationOfByte(Amt.getStart()),
	                         /*IsStringLocation*/true,
	                         getSpecifierRange(startSpecifier, specifierLen));
	    // Don't do any more checking.  We will just emit
	    // spurious errors.
	    return false;
	  }

	  // Type check the data argument.  It should be an 'int'.
	  // Although not in conformance with C99, we also allow the argument to be
	  // an 'unsigned int' as that is a reasonably safe case.  GCC also
	  // doesn't emit a warning for that case.
	  CoveredArgs.set(argIndex);
	  const Expr *Arg = getDataArg(argIndex);
	  if (!Arg)
	    return false;

	  QualType T = Arg->getType();

	  const analyze_printf::ArgType &AT = Amt.getArgType(S.Context);
	  assert(AT.isValid());

	  if (!AT.matchesType(S.Context, T)) {
	    EmitFormatDiagnostic(S.PDiag(diag::warn_printf_asterisk_wrong_type)
	                           << k << AT.getRepresentativeTypeName(S.Context)
	                           << T << Arg->getSourceRange(),
	                         getLocationOfByte(Amt.getStart()),
	                         /*IsStringLocation*/true,
	                         getSpecifierRange(startSpecifier, specifierLen));
	    // Don't do any more checking.  We will just emit
	    // spurious errors.
	    return false;
	  }

	  return true;
	}

	/// Warn (warn_printf_nonsensical_optional_amount) about a field width or
	/// precision that makes no sense for the conversion specifier of \p FS.
	///
	/// \param type selector streamed into the diagnostic; callers in this file
	/// pass 0 for a field width and 1 for a precision.
	///
	/// A removal fix-it is attached only when the amount was written as a
	/// constant; otherwise an empty FixItHint is passed.
	void CheckPrintfHandler::HandleInvalidAmount(
	    const analyze_printf::PrintfSpecifier &FS,
	    const analyze_printf::OptionalAmount &Amt,
	    unsigned type,
	    const char *startSpecifier,
	    unsigned specifierLen) {
	  const analyze_printf::PrintfConversionSpecifier &CS =
	      FS.getConversionSpecifier();

	  FixItHint fixit =
	      Amt.getHowSpecified() == analyze_printf::OptionalAmount::Constant
	          ? FixItHint::CreateRemoval(getSpecifierRange(Amt.getStart(),
	                                                       Amt.getConstantLength()))
	          : FixItHint();

	  EmitFormatDiagnostic(S.PDiag(diag::warn_printf_nonsensical_optional_amount)
	                         << type << CS.toString(),
	                       getLocationOfByte(Amt.getStart()),
	                       /*IsStringLocation*/true,
	                       getSpecifierRange(startSpecifier, specifierLen),
	                       fixit);
	}

	/// Warn (warn_printf_nonsensical_flag) that \p flag is pointless with the
	/// conversion specifier of \p FS, and attach a fix-it that removes the
	/// single character at the flag's position.
	void CheckPrintfHandler::HandleFlag(const analyze_printf::PrintfSpecifier &FS,
	                                    const analyze_printf::OptionalFlag &flag,
	                                    const char *startSpecifier,
	                                    unsigned specifierLen) {
	  // Warn about pointless flag with a fixit removal.
	  const analyze_printf::PrintfConversionSpecifier &CS =
	      FS.getConversionSpecifier();
	  EmitFormatDiagnostic(S.PDiag(diag::warn_printf_nonsensical_flag)
	                         << flag.toString() << CS.toString(),
	                       getLocationOfByte(flag.getPosition()),
	                       /*IsStringLocation*/true,
	                       getSpecifierRange(startSpecifier, specifierLen),
	                       FixItHint::CreateRemoval(
	                         getSpecifierRange(flag.getPosition(), 1)));
	}

	/// Warn (warn_printf_ignored_flag) that \p ignoredFlag has no effect
	/// because \p flag is also present, and attach a fix-it that removes the
	/// single character at the ignored flag's position.
	void CheckPrintfHandler::HandleIgnoredFlag(
	    const analyze_printf::PrintfSpecifier &FS,
	    const analyze_printf::OptionalFlag &ignoredFlag,
	    const analyze_printf::OptionalFlag &flag,
	    const char *startSpecifier,
	    unsigned specifierLen) {
	  // Warn about ignored flag with a fixit removal.
	  EmitFormatDiagnostic(S.PDiag(diag::warn_printf_ignored_flag)
	                         << ignoredFlag.toString() << flag.toString(),
	                       getLocationOfByte(ignoredFlag.getPosition()),
	                       /*IsStringLocation*/true,
	                       getSpecifierRange(startSpecifier, specifierLen),
	                       FixItHint::CreateRemoval(
	                         getSpecifierRange(ignoredFlag.getPosition(), 1)));
	}

	/// Warn (warn_printf_empty_objc_flag) about an empty Objective-C modifier
	/// flag that starts at \p startFlag and spans \p flagLen bytes.
	void CheckPrintfHandler::HandleEmptyObjCModifierFlag(const char *startFlag,
	                                                     unsigned flagLen) {
	  // Warn about an empty flag.
	  EmitFormatDiagnostic(S.PDiag(diag::warn_printf_empty_objc_flag),
	                       getLocationOfByte(startFlag),
	                       /*IsStringLocation*/true,
	                       getSpecifierRange(startFlag, flagLen));
	}

	/// Warn (warn_printf_invalid_objc_flag) about an invalid Objective-C
	/// modifier flag, naming the flag text in the diagnostic and attaching a
	/// fix-it that removes the whole flag range.
	void CheckPrintfHandler::HandleInvalidObjCModifierFlag(const char *startFlag,
	                                                       unsigned flagLen) {
	  // Warn about an invalid flag.
	  auto Range = getSpecifierRange(startFlag, flagLen);
	  StringRef flag(startFlag, flagLen);
	  EmitFormatDiagnostic(S.PDiag(diag::warn_printf_invalid_objc_flag) << flag,
	                       getLocationOfByte(startFlag),
	                       /*IsStringLocation*/true,
	                       Range, FixItHint::CreateRemoval(Range));
	}

	/// Warn (warn_printf_ObjCflags_without_ObjCConversion) about Objective-C
	/// modifier flags used with a conversion other than '@'.
	///
	/// \param flagsStart first byte of the flags.
	/// \param flagsEnd last byte of the flags (inclusive: the highlighted range
	/// has length flagsEnd - flagsStart + 1).
	/// \param conversionPosition position of the conversion character; that
	/// single character is named in the diagnostic.
	void CheckPrintfHandler::HandleObjCFlagsWithNonObjCConversion(
	    const char *flagsStart, const char *flagsEnd, const char *conversionPosition) {
	  // Warn about using '[...]' without a '@' conversion.
	  auto Range = getSpecifierRange(flagsStart, flagsEnd - flagsStart + 1);
	  auto diag = diag::warn_printf_ObjCflags_without_ObjCConversion;
	  EmitFormatDiagnostic(S.PDiag(diag) << StringRef(conversionPosition, 1),
	                       getLocationOfByte(conversionPosition),
	                       /*IsStringLocation*/true,
	                       Range, FixItHint::CreateRemoval(Range));
	}

	// Collects the members of a C++ class or struct type that have the
	// specified name and kind (e.g. a CXXMethodDecl named "c_str").
	//
	// Returns an empty set if Ty is not a record type, is not a C++ record, or
	// the record has no definition.
	template<typename MemberKind>
	static llvm::SmallPtrSet<MemberKind*, 1>
	CXXRecordMembersNamed(StringRef Name, Sema &S, QualType Ty) {
	  const RecordType *RT = Ty->getAs<RecordType>();
	  llvm::SmallPtrSet<MemberKind*, 1> Results;

	  if (!RT)
	    return Results;
	  const CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(RT->getDecl());
	  if (!RD || !RD->getDefinition())
	    return Results;

	  LookupResult R(S, &S.Context.Idents.get(Name), SourceLocation(),
	                 Sema::LookupMemberName);
	  R.suppressDiagnostics();

	  // We just need to include all members of the right kind turned up by the
	  // filter, at this point.
	  if (S.LookupQualifiedName(R, RT->getDecl()))
	    for (LookupResult::iterator I = R.begin(), E = R.end(); I != E; ++I) {
	      NamedDecl *decl = (*I)->getUnderlyingDecl();
	      if (MemberKind *FK = dyn_cast<MemberKind>(decl))
	        Results.insert(FK);
	    }
	  return Results;
	}

	/// Check if we could call '.c_str()' on an object.
	///
	/// FIXME: This returns the wrong results in some cases (if cv-qualifiers don't
	/// allow the call, or if it would be ambiguous).
	bool Sema::hasCStrMethod(const Expr *E) {
	using MethodSet = llvm::SmallPtrSet<CXXMethodDecl *, 1>;

	MethodSet Results =
	CXXRecordMembersNamed<CXXMethodDecl>("c_str", *this, E->getType());
	for (MethodSet::iterator MI = Results.begin(), ME = Results.end();
	MI != ME; ++MI)
	if ((*MI)->getMinRequiredArguments() == 0)
	return true;
	return false;
	}

	// Check if a (w)string was passed when a (w)char* was needed, and offer a
	// better diagnostic if so.  AT is assumed to be valid.
	//
	// For the first 'c_str' method of E's type that takes no required arguments
	// and whose return type matches AT, emits note_printf_c_str with a fix-it
	// inserting ".c_str()" after E.
	//
	// Returns true when a c_str() conversion method is found.
	bool CheckPrintfHandler::checkForCStrMembers(
	    const analyze_printf::ArgType &AT, const Expr *E) {
	  using MethodSet = llvm::SmallPtrSet<CXXMethodDecl *, 1>;

	  MethodSet Results =
	      CXXRecordMembersNamed<CXXMethodDecl>("c_str", S, E->getType());

	  for (MethodSet::iterator MI = Results.begin(), ME = Results.end();
	       MI != ME; ++MI) {
	    const CXXMethodDecl *Method = *MI;
	    if (Method->getMinRequiredArguments() == 0 &&
	        AT.matchesType(S.Context, Method->getReturnType())) {
	      // FIXME: Suggest parens if the expression needs them.
	      SourceLocation EndLoc = S.getLocForEndOfToken(E->getLocEnd());
	      S.Diag(E->getLocStart(), diag::note_printf_c_str)
	          << "c_str()"
	          << FixItHint::CreateInsertion(EndLoc, ".c_str()");
	      return true;
	    }
	  }

	  return false;
	}

	/// Check a single printf format specifier.
	///
	/// In order, this checks: consistent use of positional arguments, the field
	/// width and precision amounts, the FreeBSD kernel %b/%D extensions,
	/// specifiers that are invalid for the current format string type, flag and
	/// length-modifier validity, and finally (unless the arguments come from a
	/// va_list) the number and type of the matching data argument.
	///
	/// \param FS the parsed specifier.
	/// \param startSpecifier start of the specifier within the format string.
	/// \param specifierLen length in bytes of the specifier.
	///
	/// \returns false when an error was diagnosed after which further checks
	/// would only be spurious; true otherwise. NOTE(review): presumably false
	/// makes the caller stop processing the format string; confirm against the
	/// format-string parser.
	bool
	CheckPrintfHandler::HandlePrintfSpecifier(const analyze_printf::PrintfSpecifier
	                                            &FS,
	                                          const char *startSpecifier,
	                                          unsigned specifierLen) {
	  using namespace analyze_format_string;
	  using namespace analyze_printf;

	  const PrintfConversionSpecifier &CS = FS.getConversionSpecifier();

	  if (FS.consumesDataArgument()) {
	    if (atFirstArg) {
	        atFirstArg = false;
	        usesPositionalArgs = FS.usesPositionalArg();
	    }
	    else if (usesPositionalArgs != FS.usesPositionalArg()) {
	      HandlePositionalNonpositionalArgs(getLocationOfByte(CS.getStart()),
	                                        startSpecifier, specifierLen);
	      return false;
	    }
	  }

	  // First check if the field width, precision, and conversion specifier
	  // have matching data arguments.
	  if (!HandleAmount(FS.getFieldWidth(), /* field width */ 0,
	                    startSpecifier, specifierLen)) {
	    return false;
	  }

	  if (!HandleAmount(FS.getPrecision(), /* precision */ 1,
	                    startSpecifier, specifierLen)) {
	    return false;
	  }

	  if (!CS.consumesDataArgument()) {
	    // FIXME: Technically specifying a precision or field width here
	    // makes no sense.  Worth issuing a warning at some point.
	    return true;
	  }

	  // Consume the argument.
	  unsigned argIndex = FS.getArgIndex();
	  if (argIndex < NumDataArgs) {
	    // The check to see if the argIndex is valid will come later.
	    // We set the bit here because we may exit early from this
	    // function if we encounter some other error.
	    CoveredArgs.set(argIndex);
	  }

	  // FreeBSD kernel extensions.
	  if (CS.getKind() == ConversionSpecifier::FreeBSDbArg ||
	      CS.getKind() == ConversionSpecifier::FreeBSDDArg) {
	    // We need at least two arguments.
	    if (!CheckNumArgs(FS, CS, startSpecifier, specifierLen, argIndex + 1))
	      return false;

	    // Claim the second argument.
	    CoveredArgs.set(argIndex + 1);

	    // Type check the first argument (int for %b, pointer for %D)
	    const Expr *Ex = getDataArg(argIndex);
	    const analyze_printf::ArgType &AT =
	      (CS.getKind() == ConversionSpecifier::FreeBSDbArg) ?
	        ArgType(S.Context.IntTy) : ArgType::CPointerTy;
	    if (AT.isValid() && !AT.matchesType(S.Context, Ex->getType()))
	      EmitFormatDiagnostic(
	        S.PDiag(diag::warn_format_conversion_argument_type_mismatch)
	          << AT.getRepresentativeTypeName(S.Context) << Ex->getType()
	          << false << Ex->getSourceRange(),
	        Ex->getLocStart(), /*IsStringLocation*/false,
	        getSpecifierRange(startSpecifier, specifierLen));

	    // Type check the second argument (char * for both %b and %D)
	    Ex = getDataArg(argIndex + 1);
	    const analyze_printf::ArgType &AT2 = ArgType::CStrTy;
	    if (AT2.isValid() && !AT2.matchesType(S.Context, Ex->getType()))
	      EmitFormatDiagnostic(
	        S.PDiag(diag::warn_format_conversion_argument_type_mismatch)
	          << AT2.getRepresentativeTypeName(S.Context) << Ex->getType()
	          << false << Ex->getSourceRange(),
	        Ex->getLocStart(), /*IsStringLocation*/false,
	        getSpecifierRange(startSpecifier, specifierLen));

	     return true;
	  }

	  // Check for using an Objective-C specific conversion specifier
	  // in a non-ObjC literal.
	  if (!allowsObjCArg() && CS.isObjCArg()) {
	    return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier,
	                                                  specifierLen);
	  }

	  // %P can only be used with os_log.
	  if (FSType != Sema::FST_OSLog && CS.getKind() == ConversionSpecifier::PArg) {
	    return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier,
	                                                  specifierLen);
	  }

	  // %n is not allowed with os_log.
	  if (FSType == Sema::FST_OSLog && CS.getKind() == ConversionSpecifier::nArg) {
	    EmitFormatDiagnostic(S.PDiag(diag::warn_os_log_format_narg),
	                         getLocationOfByte(CS.getStart()),
	                         /*IsStringLocation*/ false,
	                         getSpecifierRange(startSpecifier, specifierLen));

	    return true;
	  }

	  // Only scalars are allowed for os_trace.
	  if (FSType == Sema::FST_OSTrace &&
	      (CS.getKind() == ConversionSpecifier::PArg ||
	       CS.getKind() == ConversionSpecifier::sArg ||
	       CS.getKind() == ConversionSpecifier::ObjCObjArg)) {
	    return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier,
	                                                  specifierLen);
	  }

	  // Check for use of public/private annotation outside of os_log().
	  if (FSType != Sema::FST_OSLog) {
	    if (FS.isPublic().isSet()) {
	      EmitFormatDiagnostic(S.PDiag(diag::warn_format_invalid_annotation)
	                               << "public",
	                           getLocationOfByte(FS.isPublic().getPosition()),
	                           /*IsStringLocation*/ false,
	                           getSpecifierRange(startSpecifier, specifierLen));
	    }
	    if (FS.isPrivate().isSet()) {
	      EmitFormatDiagnostic(S.PDiag(diag::warn_format_invalid_annotation)
	                               << "private",
	                           getLocationOfByte(FS.isPrivate().getPosition()),
	                           /*IsStringLocation*/ false,
	                           getSpecifierRange(startSpecifier, specifierLen));
	    }
	  }

	  // Check for invalid use of field width
	  if (!FS.hasValidFieldWidth()) {
	    HandleInvalidAmount(FS, FS.getFieldWidth(), /* field width */ 0,
	        startSpecifier, specifierLen);
	  }

	  // Check for invalid use of precision
	  if (!FS.hasValidPrecision()) {
	    HandleInvalidAmount(FS, FS.getPrecision(), /* precision */ 1,
	        startSpecifier, specifierLen);
	  }

	  // Precision is mandatory for %P specifier.
	  if (CS.getKind() == ConversionSpecifier::PArg &&
	      FS.getPrecision().getHowSpecified() == OptionalAmount::NotSpecified) {
	    EmitFormatDiagnostic(S.PDiag(diag::warn_format_P_no_precision),
	                         getLocationOfByte(startSpecifier),
	                         /*IsStringLocation*/ false,
	                         getSpecifierRange(startSpecifier, specifierLen));
	  }

	  // Check each flag does not conflict with any other component.
	  if (!FS.hasValidThousandsGroupingPrefix())
	    HandleFlag(FS, FS.hasThousandsGrouping(), startSpecifier, specifierLen);
	  if (!FS.hasValidLeadingZeros())
	    HandleFlag(FS, FS.hasLeadingZeros(), startSpecifier, specifierLen);
	  if (!FS.hasValidPlusPrefix())
	    HandleFlag(FS, FS.hasPlusPrefix(), startSpecifier, specifierLen);
	  if (!FS.hasValidSpacePrefix())
	    HandleFlag(FS, FS.hasSpacePrefix(), startSpecifier, specifierLen);
	  if (!FS.hasValidAlternativeForm())
	    HandleFlag(FS, FS.hasAlternativeForm(), startSpecifier, specifierLen);
	  if (!FS.hasValidLeftJustified())
	    HandleFlag(FS, FS.isLeftJustified(), startSpecifier, specifierLen);

	  // Check that flags are not ignored by another flag
	  if (FS.hasSpacePrefix() && FS.hasPlusPrefix()) // ' ' ignored by '+'
	    HandleIgnoredFlag(FS, FS.hasSpacePrefix(), FS.hasPlusPrefix(),
	        startSpecifier, specifierLen);
	  if (FS.hasLeadingZeros() && FS.isLeftJustified()) // '0' ignored by '-'
	    HandleIgnoredFlag(FS, FS.hasLeadingZeros(), FS.isLeftJustified(),
	            startSpecifier, specifierLen);

	  // Check the length modifier is valid with the given conversion specifier.
	  if (!FS.hasValidLengthModifier(S.getASTContext().getTargetInfo()))
	    HandleInvalidLengthModifier(FS, CS, startSpecifier, specifierLen,
	                                diag::warn_format_nonsensical_length);
	  else if (!FS.hasStandardLengthModifier())
	    HandleNonStandardLengthModifier(FS, startSpecifier, specifierLen);
	  else if (!FS.hasStandardLengthConversionCombination())
	    HandleInvalidLengthModifier(FS, CS, startSpecifier, specifierLen,
	                                diag::warn_format_non_standard_conversion_spec);

	  if (!FS.hasStandardConversionSpecifier(S.getLangOpts()))
	    HandleNonStandardConversionSpecifier(CS, startSpecifier, specifierLen);

	  // The remaining checks depend on the data arguments.
	  if (HasVAListArg)
	    return true;

	  if (!CheckNumArgs(FS, CS, startSpecifier, specifierLen, argIndex))
	    return false;

	  const Expr *Arg = getDataArg(argIndex);
	  if (!Arg)
	    return true;

	  return checkFormatExpr(FS, startSpecifier, specifierLen, Arg);
	}

	static bool requiresParensToAddCast(const Expr *E) {
	// FIXME: We should have a general way to reason about operator
	// precedence and whether parens are actually needed here.
	// Take care of a few common cases where they aren't.
	const Expr *Inside = E->IgnoreImpCasts();
	if (const PseudoObjectExpr *POE = dyn_cast<PseudoObjectExpr>(Inside))
	Inside = POE->getSyntacticForm()->IgnoreImpCasts();

	switch (Inside->getStmtClass()) {
	case Stmt::ArraySubscriptExprClass:
	case Stmt::CallExprClass:
	case Stmt::CharacterLiteralClass:
	case Stmt::CXXBoolLiteralExprClass:
	case Stmt::DeclRefExprClass:
	case Stmt::FloatingLiteralClass:
	case Stmt::IntegerLiteralClass:
	case Stmt::MemberExprClass:
	case Stmt::ObjCArrayLiteralClass:
	case Stmt::ObjCBoolLiteralExprClass:
	case Stmt::ObjCBoxedExprClass:
	case Stmt::ObjCDictionaryLiteralClass:
	case Stmt::ObjCEncodeExprClass:
	case Stmt::ObjCIvarRefExprClass:
	case Stmt::ObjCMessageExprClass:
	case Stmt::ObjCPropertyRefExprClass:
	case Stmt::ObjCStringLiteralClass:
	case Stmt::ObjCSubscriptRefExprClass:
	case Stmt::ParenExprClass:
	case Stmt::StringLiteralClass:
	case Stmt::UnaryOperatorClass:
	return false;
	default:
	return true;
	}
	}

	static std::pair<QualType, StringRef>
	shouldNotPrintDirectly(const ASTContext &Context,
	QualType IntendedTy,
	const Expr *E) {
	// Use a 'while' to peel off layers of typedefs.
	QualType TyTy = IntendedTy;
	while (const TypedefType *UserTy = TyTy->getAs<TypedefType>()) {
	StringRef Name = UserTy->getDecl()->getName();
	QualType CastTy = llvm::StringSwitch<QualType>(Name)
	.Case("CFIndex", Context.getNSIntegerType())
	.Case("NSInteger", Context.getNSIntegerType())
	.Case("NSUInteger", Context.getNSUIntegerType())
	.Case("SInt32", Context.IntTy)
	.Case("UInt32", Context.UnsignedIntTy)
	.Default(QualType());

	if (!CastTy.isNull())
	return std::make_pair(CastTy, Name);

	TyTy = UserTy->desugar();
	}

	// Strip parens if necessary.
	if (const ParenExpr *PE = dyn_cast<ParenExpr>(E))
	return shouldNotPrintDirectly(Context,
	PE->getSubExpr()->getType(),
	PE->getSubExpr());

	// If this is a conditional expression, then its result type is constructed
	// via usual arithmetic conversions and thus there might be no necessary
	// typedef sugar there. Recurse to operands to check for NSInteger &
	// Co. usage condition.
	if (const ConditionalOperator *CO = dyn_cast<ConditionalOperator>(E)) {
	QualType TrueTy, FalseTy;
	StringRef TrueName, FalseName;

	std::tie(TrueTy, TrueName) =
	shouldNotPrintDirectly(Context,
	CO->getTrueExpr()->getType(),
	CO->getTrueExpr());
	std::tie(FalseTy, FalseName) =
	shouldNotPrintDirectly(Context,
	CO->getFalseExpr()->getType(),
	CO->getFalseExpr());

	if (TrueTy == FalseTy)
	return std::make_pair(TrueTy, TrueName);
	else if (TrueTy.isNull())
	return std::make_pair(FalseTy, FalseName);
	else if (FalseTy.isNull())
	return std::make_pair(TrueTy, TrueName);
	}

	return std::make_pair(QualType(), StringRef());
	}

	/// Type check the data argument \p E against the printf specifier \p FS and
	/// emit a diagnostic on mismatch.
	///
	/// When the specifier can be rewritten to fit the argument (fixType
	/// succeeds), the diagnostic carries a fix-it that either replaces the
	/// specifier or inserts/replaces a cast on the argument. Otherwise the
	/// diagnostic depends on whether the argument type is valid for a variadic
	/// call, and the argument is recorded in CheckedVarArgs.
	///
	/// \param FS the parsed specifier.
	/// \param StartSpecifier start of the specifier within the format string.
	/// \param SpecifierLen length in bytes of the specifier.
	/// \param E the data argument expression matched to the specifier.
	///
	/// \returns true on every path.
	bool
	CheckPrintfHandler::checkFormatExpr(const analyze_printf::PrintfSpecifier &FS,
	                                    const char *StartSpecifier,
	                                    unsigned SpecifierLen,
	                                    const Expr *E) {
	  using namespace analyze_format_string;
	  using namespace analyze_printf;

	  // Now type check the data expression that matches the
	  // format specifier.
	  const analyze_printf::ArgType &AT = FS.getArgType(S.Context, isObjCContext());
	  if (!AT.isValid())
	    return true;

	  QualType ExprTy = E->getType();
	  while (const TypeOfExprType *TET = dyn_cast<TypeOfExprType>(ExprTy)) {
	    ExprTy = TET->getUnderlyingExpr()->getType();
	  }

	  analyze_printf::ArgType::MatchKind match = AT.matchesType(S.Context, ExprTy);

	  if (match == analyze_printf::ArgType::Match) {
	    return true;
	  }

	  // Look through argument promotions for our error message's reported type.
	  // This includes the integral and floating promotions, but excludes array
	  // and function pointer decay; seeing that an argument intended to be a
	  // string has type 'char [6]' is probably more confusing than 'char *'.
	  if (const ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(E)) {
	    if (ICE->getCastKind() == CK_IntegralCast ||
	        ICE->getCastKind() == CK_FloatingCast) {
	      E = ICE->getSubExpr();
	      ExprTy = E->getType();

	      // Check if we didn't match because of an implicit cast from a 'char'
	      // or 'short' to an 'int'.  This is done because printf is a varargs
	      // function.
	      if (ICE->getType() == S.Context.IntTy ||
	          ICE->getType() == S.Context.UnsignedIntTy) {
	        // All further checking is done on the subexpression.
	        if (AT.matchesType(S.Context, ExprTy))
	          return true;
	      }
	    }
	  } else if (const CharacterLiteral *CL = dyn_cast<CharacterLiteral>(E)) {
	    // Special case for 'a', which has type 'int' in C.
	    // Note, however, that we do /not/ want to treat multibyte constants like
	    // 'MooV' as characters! This form is deprecated but still exists.
	    if (ExprTy == S.Context.IntTy)
	      if (llvm::isUIntN(S.Context.getCharWidth(), CL->getValue()))
	        ExprTy = S.Context.CharTy;
	  }

	  // Look through enums to their underlying type.
	  bool IsEnum = false;
	  if (auto EnumTy = ExprTy->getAs<EnumType>()) {
	    ExprTy = EnumTy->getDecl()->getIntegerType();
	    IsEnum = true;
	  }

	  // %C in an Objective-C context prints a unichar, not a wchar_t.
	  // If the argument is an integer of some kind, believe the %C and suggest
	  // a cast instead of changing the conversion specifier.
	  QualType IntendedTy = ExprTy;
	  if (isObjCContext() &&
	      FS.getConversionSpecifier().getKind() == ConversionSpecifier::CArg) {
	    if (ExprTy->isIntegralOrUnscopedEnumerationType() &&
	        !ExprTy->isCharType()) {
	      // 'unichar' is defined as a typedef of unsigned short, but we should
	      // prefer using the typedef if it is visible.
	      IntendedTy = S.Context.UnsignedShortTy;

	      // While we are here, check if the value is an IntegerLiteral that happens
	      // to be within the valid range.
	      if (const IntegerLiteral *IL = dyn_cast<IntegerLiteral>(E)) {
	        const llvm::APInt &V = IL->getValue();
	        if (V.getActiveBits() <= S.Context.getTypeSize(IntendedTy))
	          return true;
	      }

	      LookupResult Result(S, &S.Context.Idents.get("unichar"), E->getLocStart(),
	                          Sema::LookupOrdinaryName);
	      if (S.LookupName(Result, S.getCurScope())) {
	        NamedDecl *ND = Result.getFoundDecl();
	        if (TypedefNameDecl *TD = dyn_cast<TypedefNameDecl>(ND))
	          if (TD->getUnderlyingType() == IntendedTy)
	            IntendedTy = S.Context.getTypedefType(TD);
	      }
	    }
	  }

	  // Special-case some of Darwin's platform-independence types by suggesting
	  // casts to primitive types that are known to be large enough.
	  bool ShouldNotPrintDirectly = false; StringRef CastTyName;
	  if (S.Context.getTargetInfo().getTriple().isOSDarwin()) {
	    QualType CastTy;
	    std::tie(CastTy, CastTyName) = shouldNotPrintDirectly(S.Context, IntendedTy, E);
	    if (!CastTy.isNull()) {
	      IntendedTy = CastTy;
	      ShouldNotPrintDirectly = true;
	    }
	  }

	  // We may be able to offer a FixItHint if it is a supported type.
	  PrintfSpecifier fixedFS = FS;
	  bool success =
	      fixedFS.fixType(IntendedTy, S.getLangOpts(), S.Context, isObjCContext());

	  if (success) {
	    // Get the fix string from the fixed format specifier
	    SmallString<16> buf;
	    llvm::raw_svector_ostream os(buf);
	    fixedFS.toString(os);

	    CharSourceRange SpecRange = getSpecifierRange(StartSpecifier, SpecifierLen);

	    if (IntendedTy == ExprTy && !ShouldNotPrintDirectly) {
	      unsigned diag = diag::warn_format_conversion_argument_type_mismatch;
	      if (match == analyze_format_string::ArgType::NoMatchPedantic) {
	        diag = diag::warn_format_conversion_argument_type_mismatch_pedantic;
	      }
	      // In this case, the specifier is wrong and should be changed to match
	      // the argument.
	      EmitFormatDiagnostic(S.PDiag(diag)
	                               << AT.getRepresentativeTypeName(S.Context)
	                               << IntendedTy << IsEnum << E->getSourceRange(),
	                           E->getLocStart(),
	                           /*IsStringLocation*/ false, SpecRange,
	                           FixItHint::CreateReplacement(SpecRange, os.str()));
	    } else {
	      // The canonical type for formatting this value is different from the
	      // actual type of the expression. (This occurs, for example, with Darwin's
	      // NSInteger on 32-bit platforms, where it is typedef'd as 'int', but
	      // should be printed as 'long' for 64-bit compatibility.)
	      // Rather than emitting a normal format/argument mismatch, we want to
	      // add a cast to the recommended type (and correct the format string
	      // if necessary).
	      SmallString<16> CastBuf;
	      llvm::raw_svector_ostream CastFix(CastBuf);
	      CastFix << "(";
	      IntendedTy.print(CastFix, S.Context.getPrintingPolicy());
	      CastFix << ")";

	      SmallVector<FixItHint,4> Hints;
	      if (!AT.matchesType(S.Context, IntendedTy) || ShouldNotPrintDirectly)
	        Hints.push_back(FixItHint::CreateReplacement(SpecRange, os.str()));

	      if (const CStyleCastExpr *CCast = dyn_cast<CStyleCastExpr>(E)) {
	        // If there's already a cast present, just replace it.
	        SourceRange CastRange(CCast->getLParenLoc(), CCast->getRParenLoc());
	        Hints.push_back(FixItHint::CreateReplacement(CastRange, CastFix.str()));

	      } else if (!requiresParensToAddCast(E)) {
	        // If the expression has high enough precedence,
	        // just write the C-style cast.
	        Hints.push_back(FixItHint::CreateInsertion(E->getLocStart(),
	                                                   CastFix.str()));
	      } else {
	        // Otherwise, add parens around the expression as well as the cast.
	        CastFix << "(";
	        Hints.push_back(FixItHint::CreateInsertion(E->getLocStart(),
	                                                   CastFix.str()));

	        SourceLocation After = S.getLocForEndOfToken(E->getLocEnd());
	        Hints.push_back(FixItHint::CreateInsertion(After, ")"));
	      }

	      if (ShouldNotPrintDirectly) {
	        // The expression has a type that should not be printed directly.
	        // We extract the name from the typedef because we don't want to show
	        // the underlying type in the diagnostic.
	        StringRef Name;
	        if (const TypedefType *TypedefTy = dyn_cast<TypedefType>(ExprTy))
	          Name = TypedefTy->getDecl()->getName();
	        else
	          Name = CastTyName;
	        EmitFormatDiagnostic(S.PDiag(diag::warn_format_argument_needs_cast)
	                               << Name << IntendedTy << IsEnum
	                               << E->getSourceRange(),
	                             E->getLocStart(), /*IsStringLocation=*/false,
	                             SpecRange, Hints);
	      } else {
	        // In this case, the expression could be printed using a different
	        // specifier, but we've decided that the specifier is probably correct
	        // and we should cast instead. Just use the normal warning message.
	        EmitFormatDiagnostic(
	          S.PDiag(diag::warn_format_conversion_argument_type_mismatch)
	            << AT.getRepresentativeTypeName(S.Context) << ExprTy << IsEnum
	            << E->getSourceRange(),
	          E->getLocStart(), /*IsStringLocation*/false,
	          SpecRange, Hints);
	      }
	    }
	  } else {
	    const CharSourceRange &CSR = getSpecifierRange(StartSpecifier,
	                                                   SpecifierLen);
	    // Since the warning for passing non-POD types to variadic functions
	    // was deferred until now, we emit a warning for non-POD
	    // arguments here.
	    switch (S.isValidVarArgType(ExprTy)) {
	    case Sema::VAK_Valid:
	    case Sema::VAK_ValidInCXX11: {
	      unsigned diag = diag::warn_format_conversion_argument_type_mismatch;
	      if (match == analyze_printf::ArgType::NoMatchPedantic) {
	        diag = diag::warn_format_conversion_argument_type_mismatch_pedantic;
	      }

	      EmitFormatDiagnostic(
	          S.PDiag(diag) << AT.getRepresentativeTypeName(S.Context) << ExprTy
	                        << IsEnum << CSR << E->getSourceRange(),
	          E->getLocStart(), /*IsStringLocation*/ false, CSR);
	      break;
	    }
	    case Sema::VAK_Undefined:
	    case Sema::VAK_MSVCUndefined:
	      EmitFormatDiagnostic(
	        S.PDiag(diag::warn_non_pod_vararg_with_format_string)
	          << S.getLangOpts().CPlusPlus11
	          << ExprTy
	          << CallType
	          << AT.getRepresentativeTypeName(S.Context)
	          << CSR
	          << E->getSourceRange(),
	        E->getLocStart(), /*IsStringLocation*/false, CSR);
	      checkForCStrMembers(AT, E);
	      break;

	    case Sema::VAK_Invalid:
	      if (ExprTy->isObjCObjectType())
	        EmitFormatDiagnostic(
	          S.PDiag(diag::err_cannot_pass_objc_interface_to_vararg_format)
	            << S.getLangOpts().CPlusPlus11
	            << ExprTy
	            << CallType
	            << AT.getRepresentativeTypeName(S.Context)
	            << CSR
	            << E->getSourceRange(),
	          E->getLocStart(), /*IsStringLocation*/false, CSR);
	      else
	        // FIXME: If this is an initializer list, suggest removing the braces
	        // or inserting a cast to the target type.
	        S.Diag(E->getLocStart(), diag::err_cannot_pass_to_vararg_format)
	          << isa<InitListExpr>(E) << ExprTy << CallType
	          << AT.getRepresentativeTypeName(S.Context)
	          << E->getSourceRange();
	      break;
	    }

	    assert(FirstDataArg + FS.getArgIndex() < CheckedVarArgs.size() &&
	           "format string specifier index out of range");
	    CheckedVarArgs[FirstDataArg + FS.getArgIndex()] = true;
	  }

	  return true;
	}

	//===--- CHECK: Scanf format string checking ------------------------------===//

	namespace {

	class CheckScanfHandler : public CheckFormatHandler {
	public:
	CheckScanfHandler(Sema &s, const FormatStringLiteral *fexpr,
	const Expr *origFormatExpr, Sema::FormatStringType type,
	unsigned firstDataArg, unsigned numDataArgs,
	const char *beg, bool hasVAListArg,
	ArrayRef<const Expr *> Args, unsigned formatIdx,
	bool inFunctionCall, Sema::VariadicCallType CallType,
	llvm::SmallBitVector &CheckedVarArgs,
	UncoveredArgHandler &UncoveredArg)
	: CheckFormatHandler(s, fexpr, origFormatExpr, type, firstDataArg,
	numDataArgs, beg, hasVAListArg, Args, formatIdx,
	inFunctionCall, CallType, CheckedVarArgs,
	UncoveredArg) {}

	bool HandleScanfSpecifier(const analyze_scanf::ScanfSpecifier &FS,
	const char *startSpecifier,
	unsigned specifierLen) override;

	bool HandleInvalidScanfConversionSpecifier(
	const analyze_scanf::ScanfSpecifier &FS,
	const char *startSpecifier,
	unsigned specifierLen) override;

	void HandleIncompleteScanList(const char start, const char end) override;
	};

	} // namespace

	/// Emits warn_scanf_scanlist_incomplete for an incomplete scan list.
	///
	/// \param start Pointer into the format string where the reported range
	///              begins.
	/// \param end   Pointer into the format string where the diagnostic is
	///              located; the reported range has length (end - start).
	void CheckScanfHandler::HandleIncompleteScanList(const char *start,
	                                                 const char *end) {
	  EmitFormatDiagnostic(S.PDiag(diag::warn_scanf_scanlist_incomplete),
	                       getLocationOfByte(end), /*IsStringLocation*/true,
	                       getSpecifierRange(start, end - start));
	}

	/// Reports a scanf conversion specifier that was flagged as invalid.
	///
	/// Extracts the conversion specifier from \p FS and forwards its position and
	/// length, together with the enclosing specifier's start and length, to
	/// HandleInvalidConversionSpecifier.
	///
	/// \returns whatever HandleInvalidConversionSpecifier returns.
	bool CheckScanfHandler::HandleInvalidScanfConversionSpecifier(
	    const analyze_scanf::ScanfSpecifier &FS,
	    const char *startSpecifier,
	    unsigned specifierLen) {
	  const analyze_scanf::ScanfConversionSpecifier &ConvSpec =
	      FS.getConversionSpecifier();
	  const char *ConvStart = ConvSpec.getStart();

	  return HandleInvalidConversionSpecifier(
	      FS.getArgIndex(), getLocationOfByte(ConvStart), startSpecifier,
	      specifierLen, ConvStart, ConvSpec.getLength());
	}

	/// Validates one parsed scanf conversion specifier against the call.
	///
	/// Checks are performed in this order: consistent use of positional
	/// arguments, a constant field width of zero, validity of the length modifier
	/// and conversion specifier, the number of data arguments, and finally whether
	/// the argument's type matches the specifier.  When a corrected specifier can
	/// be computed, the type-mismatch diagnostic carries a replacement fix-it.
	///
	/// \param FS             The parsed specifier.
	/// \param startSpecifier Pointer to the start of the specifier in the format
	///                       string.
	/// \param specifierLen   Length of the specifier.
	/// \returns false when positional and non-positional arguments are mixed or
	///          when CheckNumArgs fails; true in every other case.
	bool CheckScanfHandler::HandleScanfSpecifier(
	    const analyze_scanf::ScanfSpecifier &FS,
	    const char *startSpecifier,
	    unsigned specifierLen) {
	  using namespace analyze_scanf;
	  using namespace analyze_format_string;

	  const ScanfConversionSpecifier &CS = FS.getConversionSpecifier();

	  // Handle case where '%' and '*' don't consume an argument.  These shouldn't
	  // be used to decide if we are using positional arguments consistently.
	  if (FS.consumesDataArgument()) {
	    if (atFirstArg) {
	      atFirstArg = false;
	      usesPositionalArgs = FS.usesPositionalArg();
	    }
	    else if (usesPositionalArgs != FS.usesPositionalArg()) {
	      HandlePositionalNonpositionalArgs(getLocationOfByte(CS.getStart()),
	                                        startSpecifier, specifierLen);
	      return false;
	    }
	  }

	  // Check if the field width is non-zero.
	  const OptionalAmount &Amt = FS.getFieldWidth();
	  if (Amt.getHowSpecified() == OptionalAmount::Constant) {
	    if (Amt.getConstantAmount() == 0) {
	      const CharSourceRange &R = getSpecifierRange(Amt.getStart(),
	                                                   Amt.getConstantLength());
	      EmitFormatDiagnostic(S.PDiag(diag::warn_scanf_nonzero_width),
	                           getLocationOfByte(Amt.getStart()),
	                           /*IsStringLocation*/true, R,
	                           FixItHint::CreateRemoval(R));
	    }
	  }

	  if (!FS.consumesDataArgument()) {
	    // FIXME: Technically specifying a precision or field width here
	    // makes no sense.  Worth issuing a warning at some point.
	    return true;
	  }

	  // Consume the argument.
	  unsigned argIndex = FS.getArgIndex();
	  if (argIndex < NumDataArgs) {
	    // The check to see if the argIndex is valid will come later.
	    // We set the bit here because we may exit early from this
	    // function if we encounter some other error.
	    CoveredArgs.set(argIndex);
	  }

	  // Check the length modifier is valid with the given conversion specifier.
	  if (!FS.hasValidLengthModifier(S.getASTContext().getTargetInfo()))
	    HandleInvalidLengthModifier(FS, CS, startSpecifier, specifierLen,
	                                diag::warn_format_nonsensical_length);
	  else if (!FS.hasStandardLengthModifier())
	    HandleNonStandardLengthModifier(FS, startSpecifier, specifierLen);
	  else if (!FS.hasStandardLengthConversionCombination())
	    HandleInvalidLengthModifier(FS, CS, startSpecifier, specifierLen,
	                                diag::warn_format_non_standard_conversion_spec);

	  if (!FS.hasStandardConversionSpecifier(S.getLangOpts()))
	    HandleNonStandardConversionSpecifier(CS, startSpecifier, specifierLen);

	  // The remaining checks depend on the data arguments.
	  if (HasVAListArg)
	    return true;

	  if (!CheckNumArgs(FS, CS, startSpecifier, specifierLen, argIndex))
	    return false;

	  // Check that the argument type matches the format specifier.
	  const Expr *Ex = getDataArg(argIndex);
	  if (!Ex)
	    return true;

	  const analyze_format_string::ArgType &AT = FS.getArgType(S.Context);

	  if (!AT.isValid()) {
	    return true;
	  }

	  analyze_format_string::ArgType::MatchKind match =
	      AT.matchesType(S.Context, Ex->getType());
	  if (match == analyze_format_string::ArgType::Match) {
	    return true;
	  }

	  // Try to compute a specifier that would match the argument, so that a
	  // replacement fix-it can be offered.
	  ScanfSpecifier fixedFS = FS;
	  bool success = fixedFS.fixType(Ex->getType(), Ex->IgnoreImpCasts()->getType(),
	                                 S.getLangOpts(), S.Context);

	  unsigned diag = diag::warn_format_conversion_argument_type_mismatch;
	  if (match == analyze_format_string::ArgType::NoMatchPedantic) {
	    diag = diag::warn_format_conversion_argument_type_mismatch_pedantic;
	  }

	  if (success) {
	    // Get the fix string from the fixed format specifier.
	    SmallString<128> buf;
	    llvm::raw_svector_ostream os(buf);
	    fixedFS.toString(os);

	    EmitFormatDiagnostic(
	        S.PDiag(diag) << AT.getRepresentativeTypeName(S.Context)
	                      << Ex->getType() << false << Ex->getSourceRange(),
	        Ex->getLocStart(),
	        /*IsStringLocation*/ false,
	        getSpecifierRange(startSpecifier, specifierLen),
	        FixItHint::CreateReplacement(
	            getSpecifierRange(startSpecifier, specifierLen), os.str()));
	  } else {
	    EmitFormatDiagnostic(S.PDiag(diag)
	                             << AT.getRepresentativeTypeName(S.Context)
	                             << Ex->getType() << false << Ex->getSourceRange(),
	                         Ex->getLocStart(),
	                         /*IsStringLocation*/ false,
	                         getSpecifierRange(startSpecifier, specifierLen));
	  }

	  return true;
	}

	/// Checks a format string literal against the arguments of a call.
	///
	/// Three whole-string problems are diagnosed first, each ending the check:
	/// a literal that is neither ASCII nor UTF-8, a literal truncated by its
	/// array type with no embedded null, and an empty format string passed
	/// together with data arguments.  Otherwise the string is parsed with a
	/// printf-family or scanf handler chosen from \p Type; other format kinds are
	/// not handled yet.
	///
	/// \param FExpr          The format string literal being checked.
	/// \param OrigFormatExpr Expression whose source range is attached to the
	///                       whole-string diagnostics.
	/// \param Args           The call's arguments.
	/// \param format_idx     Index of the format string within \p Args.
	/// \param firstDataArg   Index of the first data argument within \p Args.
	static void CheckFormatString(Sema &S, const FormatStringLiteral *FExpr,
	                              const Expr *OrigFormatExpr,
	                              ArrayRef<const Expr *> Args,
	                              bool HasVAListArg, unsigned format_idx,
	                              unsigned firstDataArg,
	                              Sema::FormatStringType Type,
	                              bool inFunctionCall,
	                              Sema::VariadicCallType CallType,
	                              llvm::SmallBitVector &CheckedVarArgs,
	                              UncoveredArgHandler &UncoveredArg) {
	  // CHECK: is the format string a wide literal?
	  if (!FExpr->isAscii() && !FExpr->isUTF8()) {
	    CheckFormatHandler::EmitFormatDiagnostic(
	        S, inFunctionCall, Args[format_idx],
	        S.PDiag(diag::warn_format_string_is_wide_literal), FExpr->getLocStart(),
	        /*IsStringLocation*/true, OrigFormatExpr->getSourceRange());
	    return;
	  }

	  // Str - The format string.  NOTE: this is NOT null-terminated!
	  StringRef StrRef = FExpr->getString();
	  const char *Str = StrRef.data();
	  // Account for cases where the string literal is truncated in a declaration.
	  const ConstantArrayType *T =
	      S.Context.getAsConstantArrayType(FExpr->getType());
	  assert(T && "String literal not of constant array type!");
	  size_t TypeSize = T->getSize().getZExtValue();
	  size_t StrLen = std::min(std::max(TypeSize, size_t(1)) - 1, StrRef.size());
	  const unsigned numDataArgs = Args.size() - firstDataArg;

	  // Emit a warning if the string literal is truncated and does not contain an
	  // embedded null character.
	  if (TypeSize <= StrRef.size() &&
	      StrRef.substr(0, TypeSize).find('\0') == StringRef::npos) {
	    CheckFormatHandler::EmitFormatDiagnostic(
	        S, inFunctionCall, Args[format_idx],
	        S.PDiag(diag::warn_printf_format_string_not_null_terminated),
	        FExpr->getLocStart(),
	        /*IsStringLocation=*/true, OrigFormatExpr->getSourceRange());
	    return;
	  }

	  // CHECK: empty format string?
	  if (StrLen == 0 && numDataArgs > 0) {
	    CheckFormatHandler::EmitFormatDiagnostic(
	        S, inFunctionCall, Args[format_idx],
	        S.PDiag(diag::warn_empty_format_string), FExpr->getLocStart(),
	        /*IsStringLocation*/true, OrigFormatExpr->getSourceRange());
	    return;
	  }

	  if (Type == Sema::FST_Printf || Type == Sema::FST_NSString ||
	      Type == Sema::FST_FreeBSDKPrintf || Type == Sema::FST_OSLog ||
	      Type == Sema::FST_OSTrace) {
	    CheckPrintfHandler H(
	        S, FExpr, OrigFormatExpr, Type, firstDataArg, numDataArgs,
	        (Type == Sema::FST_NSString || Type == Sema::FST_OSTrace), Str,
	        HasVAListArg, Args, format_idx, inFunctionCall, CallType,
	        CheckedVarArgs, UncoveredArg);

	    // DoneProcessing only runs when the parser returns false.
	    if (!analyze_format_string::ParsePrintfString(H, Str, Str + StrLen,
	                                                  S.getLangOpts(),
	                                                  S.Context.getTargetInfo(),
	                                            Type == Sema::FST_FreeBSDKPrintf))
	      H.DoneProcessing();
	  } else if (Type == Sema::FST_Scanf) {
	    CheckScanfHandler H(S, FExpr, OrigFormatExpr, Type, firstDataArg,
	                        numDataArgs, Str, HasVAListArg, Args, format_idx,
	                        inFunctionCall, CallType, CheckedVarArgs, UncoveredArg);

	    if (!analyze_format_string::ParseScanfString(H, Str, Str + StrLen,
	                                                 S.getLangOpts(),
	                                                 S.Context.getTargetInfo()))
	      H.DoneProcessing();
	  } // TODO: handle other formats
	}

	bool Sema::FormatStringHasSArg(const StringLiteral *FExpr) {
	// Str - The format string. NOTE: this is NOT null-terminated!
	StringRef StrRef = FExpr->getString();
	const char *Str = StrRef.data();
	// Account for cases where the string literal is truncated in a declaration.
	const ConstantArrayType *T = Context.getAsConstantArrayType(FExpr->getType());
	assert(T && "String literal not of constant array type!");
	size_t TypeSize = T->getSize().getZExtValue();
	size_t StrLen = std::min(std::max(TypeSize, size_t(1)) - 1, StrRef.size());
	return analyze_format_string::ParseFormatStringHasSArg(Str, Str + StrLen,
	getLangOpts(),
	Context.getTargetInfo());
	}

	//===--- CHECK: Warn on use of wrong absolute value function. -------------===//

	// Returns the next larger absolute value function in the same family, or 0
	// if one does not exist.
	//
	// Each family is a three-step chain (for example abs -> labs -> llabs), kept
	// separately for the __builtin_ spellings and the library spellings.  The
	// last function of each chain, and any value that is not one of the listed
	// functions, yields 0.
	static unsigned getLargerAbsoluteValueFunction(unsigned AbsFunction) {
	  switch (AbsFunction) {
	  // Integer family, builtin spelling.
	  case Builtin::BI__builtin_abs:
	    return Builtin::BI__builtin_labs;
	  case Builtin::BI__builtin_labs:
	    return Builtin::BI__builtin_llabs;

	  // Floating-point family, builtin spelling.
	  case Builtin::BI__builtin_fabsf:
	    return Builtin::BI__builtin_fabs;
	  case Builtin::BI__builtin_fabs:
	    return Builtin::BI__builtin_fabsl;

	  // Complex family, builtin spelling.
	  case Builtin::BI__builtin_cabsf:
	    return Builtin::BI__builtin_cabs;
	  case Builtin::BI__builtin_cabs:
	    return Builtin::BI__builtin_cabsl;

	  // Integer family, library spelling.
	  case Builtin::BIabs:
	    return Builtin::BIlabs;
	  case Builtin::BIlabs:
	    return Builtin::BIllabs;

	  // Floating-point family, library spelling.
	  case Builtin::BIfabsf:
	    return Builtin::BIfabs;
	  case Builtin::BIfabs:
	    return Builtin::BIfabsl;

	  // Complex family, library spelling.
	  case Builtin::BIcabsf:
	    return Builtin::BIcabs;
	  case Builtin::BIcabs:
	    return Builtin::BIcabsl;

	  // llabs / fabsl / cabsl (either spelling) end their chain; everything
	  // else is not an absolute value function handled here.
	  default:
	    return 0;
	  }
	}

	// Returns the type of the single parameter of the absolute value function
	// identified by AbsType.
	//
	// A null QualType is returned when AbsType is 0, when the builtin's type
	// cannot be obtained without error, when that type is not a function
	// prototype, or when the prototype does not take exactly one parameter.
	static QualType getAbsoluteValueArgumentType(ASTContext &Context,
	                                             unsigned AbsType) {
	  QualType Result;
	  if (AbsType != 0) {
	    ASTContext::GetBuiltinTypeError Err = ASTContext::GE_None;
	    QualType FnType = Context.GetBuiltinType(AbsType, Err);
	    if (Err == ASTContext::GE_None) {
	      if (const FunctionProtoType *Proto = FnType->getAs<FunctionProtoType>()) {
	        if (Proto->getNumParams() == 1)
	          Result = Proto->getParamType(0);
	      }
	    }
	  }
	  return Result;
	}

	// Returns the best absolute value function for ArgType, or zero if none is
	// found, starting the search at AbsFunctionKind.
	//
	// Candidates are visited from AbsFunctionKind through successively larger
	// functions.  The first candidate whose parameter is at least as wide as
	// ArgType is remembered; if a later, also wide enough, candidate has exactly
	// ArgType as its parameter type, that candidate is returned instead.
	static unsigned getBestAbsFunction(ASTContext &Context, QualType ArgType,
	                                   unsigned AbsFunctionKind) {
	  const uint64_t NeededBits = Context.getTypeSize(ArgType);
	  unsigned Chosen = 0;

	  unsigned Candidate = AbsFunctionKind;
	  while (Candidate != 0) {
	    QualType CandidateParam = getAbsoluteValueArgumentType(Context, Candidate);
	    if (Context.getTypeSize(CandidateParam) >= NeededBits) {
	      if (Chosen == 0) {
	        // First function that is wide enough.
	        Chosen = Candidate;
	      } else if (Context.hasSameType(CandidateParam, ArgType)) {
	        // Exact type match among the larger functions wins outright.
	        return Candidate;
	      }
	    }
	    Candidate = getLargerAbsoluteValueFunction(Candidate);
	  }

	  return Chosen;
	}

	// Category of the value an absolute value function operates on.  The
	// enumerators are streamed into a diagnostic as integers by
	// CheckAbsoluteValueFunction, so their numeric values must stay as they are.
	enum AbsoluteValueKind {
	  AVK_Integer = 0,  // isIntegralOrEnumerationType()
	  AVK_Floating = 1, // isRealFloatingType()
	  AVK_Complex = 2   // isAnyComplexType()
	};

	// Classifies T as an integer, floating-point, or complex type.  The
	// predicates are tried in that order; a type matching none of them is a
	// programming error (llvm_unreachable).
	static AbsoluteValueKind getAbsoluteValueKind(QualType T) {
	  if (T->isIntegralOrEnumerationType()) {
	    return AVK_Integer;
	  } else if (T->isRealFloatingType()) {
	    return AVK_Floating;
	  } else if (T->isAnyComplexType()) {
	    return AVK_Complex;
	  }

	  llvm_unreachable("Type not integer, floating, or complex");
	}

	// Changes the absolute value function to a different type. Preserves whether
	// the function is a builtin.
	static unsigned changeAbsFunction(unsigned AbsKind,
	AbsoluteValueKind ValueKind) {
	switch (ValueKind) {
	case AVK_Integer:
	switch (AbsKind) {
	default:
	return 0;
	case Builtin::BI__builtin_fabsf:
	case Builtin::BI__builtin_fabs:
	case Builtin::BI__builtin_fabsl:
	case Builtin::BI__builtin_cabsf:
	case Builtin::BI__builtin_cabs:
	case Builtin::BI__builtin_cabsl:
	return Builtin::BI__builtin_abs;
	case Builtin::BIfabsf:
	case Builtin::BIfabs:
	case Builtin::BIfabsl:
	case Builtin::BIcabsf:
	case Builtin::BIcabs:
	case Builtin::BIcabsl:
	return Builtin::BIabs;
	}
	case AVK_Floating:
	switch (AbsKind) {
	default:
	return 0;
	case Builtin::BI__builtin_abs:
	case Builtin::BI__builtin_labs:
	case Builtin::BI__builtin_llabs:
	case Builtin::BI__builtin_cabsf:
	case Builtin::BI__builtin_cabs:
	case Builtin::BI__builtin_cabsl:
	return Builtin::BI__builtin_fabsf;
	case Builtin::BIabs:
	case Builtin::BIlabs:
	case Builtin::BIllabs:
	case Builtin::BIcabsf:
	case Builtin::BIcabs:
	case Builtin::BIcabsl:
	return Builtin::BIfabsf;
	}
	case AVK_Complex:
	switch (AbsKind) {
	default:
	return 0;
	case Builtin::BI__builtin_abs:
	case Builtin::BI__builtin_labs:
	case Builtin::BI__builtin_llabs:
	case Builtin::BI__builtin_fabsf:
	case Builtin::BI__builtin_fabs:
	case Builtin::BI__builtin_fabsl:
	return Builtin::BI__builtin_cabsf;
	case Builtin::BIabs:
	case Builtin::BIlabs:
	case Builtin::BIllabs:
	case Builtin::BIfabsf:
	case Builtin::BIfabs:
	case Builtin::BIfabsl:
	return Builtin::BIcabsf;
	}
	}
	llvm_unreachable("Unable to convert function");
	}

	static unsigned getAbsoluteValueFunctionKind(const FunctionDecl *FDecl) {
	const IdentifierInfo *FnInfo = FDecl->getIdentifier();
	if (!FnInfo)
	return 0;

	switch (FDecl->getBuiltinID()) {
	default:
	return 0;
	case Builtin::BI__builtin_abs:
	case Builtin::BI__builtin_fabs:
	case Builtin::BI__builtin_fabsf:
	case Builtin::BI__builtin_fabsl:
	case Builtin::BI__builtin_labs:
	case Builtin::BI__builtin_llabs:
	case Builtin::BI__builtin_cabs:
	case Builtin::BI__builtin_cabsf:
	case Builtin::BI__builtin_cabsl:
	case Builtin::BIabs:
	case Builtin::BIlabs:
	case Builtin::BIllabs:
	case Builtin::BIfabs:
	case Builtin::BIfabsf:
	case Builtin::BIfabsl:
	case Builtin::BIcabs:
	case Builtin::BIcabsf:
	case Builtin::BIcabsl:
	return FDecl->getBuiltinID();
	}
	llvm_unreachable("Unknown Builtin type");
	}

	// If the replacement is valid, emit a note with replacement function.
	// Additionally, suggest including the proper header if not already included.
	//
	// Loc is where both notes are reported, Range is the source range replaced
	// by the fix-it, AbsKind is the builtin ID of the suggested function, and
	// ArgType is the type of the call's argument.
	//
	// Nothing at all is emitted when an unrelated declaration already uses the
	// suggested name in the current scope (see the early returns below).
	static void emitReplacement(Sema &S, SourceLocation Loc, SourceRange Range,
	unsigned AbsKind, QualType ArgType) {
	bool EmitHeaderHint = true;
	const char *HeaderName = nullptr;
	const char *FunctionName = nullptr;
	// C++ with a non-complex argument: always suggest std::abs, which lives in
	// <cstdlib> for integers and <cmath> for floating-point values.
	if (S.getLangOpts().CPlusPlus && !ArgType->isAnyComplexType()) {
	FunctionName = "std::abs";
	if (ArgType->isIntegralOrEnumerationType()) {
	HeaderName = "cstdlib";
	} else if (ArgType->isRealFloatingType()) {
	HeaderName = "cmath";
	} else {
	llvm_unreachable("Invalid Type");
	}

	// Lookup all std::abs
	if (NamespaceDecl *Std = S.getStdNamespace()) {
	LookupResult R(S, &S.Context.Idents.get("abs"), Loc, Sema::LookupAnyName);
	R.suppressDiagnostics();
	S.LookupQualifiedName(R, Std);

	for (const auto *I : R) {
	// Look through using-declarations to the function they name.
	const FunctionDecl *FDecl = nullptr;
	if (const UsingShadowDecl *UsingD = dyn_cast<UsingShadowDecl>(I)) {
	FDecl = dyn_cast<FunctionDecl>(UsingD->getTargetDecl());
	} else {
	FDecl = dyn_cast<FunctionDecl>(I);
	}
	if (!FDecl)
	continue;

	// Found std::abs(), check that they are the right ones.
	if (FDecl->getNumParams() != 1)
	continue;

	// Check that the parameter type can handle the argument.
	QualType ParamType = FDecl->getParamDecl(0)->getType();
	if (getAbsoluteValueKind(ArgType) == getAbsoluteValueKind(ParamType) &&
	S.Context.getTypeSize(ArgType) <=
	S.Context.getTypeSize(ParamType)) {
	// Found a function, don't need the header hint.
	EmitHeaderHint = false;
	break;
	}
	}
	}
	} else {
	// C, or a complex argument: suggest the function identified by AbsKind,
	// using the name and header recorded for that builtin.
	FunctionName = S.Context.BuiltinInfo.getName(AbsKind);
	HeaderName = S.Context.BuiltinInfo.getHeaderName(AbsKind);

	if (HeaderName) {
	DeclarationName DN(&S.Context.Idents.get(FunctionName));
	LookupResult R(S, DN, Loc, Sema::LookupAnyName);
	R.suppressDiagnostics();
	S.LookupName(R, S.getCurScope());

	if (R.isSingleResult()) {
	FunctionDecl *FD = dyn_cast<FunctionDecl>(R.getFoundDecl());
	if (FD && FD->getBuiltinID() == AbsKind) {
	// The suggested function is already declared; skip the header hint.
	EmitHeaderHint = false;
	} else {
	// The name refers to something other than the suggested builtin, so
	// no replacement is offered.
	return;
	}
	} else if (!R.empty()) {
	// The lookup found something but not a single result; offer nothing.
	return;
	}
	}
	}

	S.Diag(Loc, diag::note_replace_abs_function)
	<< FunctionName << FixItHint::CreateReplacement(Range, FunctionName);

	// The header note is only emitted when a header is known and the function
	// was not found to be declared already.
	if (!HeaderName)
	return;

	if (!EmitHeaderHint)
	return;

	S.Diag(Loc, diag::note_include_header_or_declare) << HeaderName
	<< FunctionName;
	}

	/// Returns true if \p FDecl is a function named \p Str that is declared in
	/// namespace std.
	///
	/// \param FDecl The declaration to test; a null pointer yields false.
	/// \param Str   The expected name, passed as a string literal (character
	///              array) so that it can be handed to IdentifierInfo::isStr.
	template <std::size_t StrLen>
	static bool IsStdFunction(const FunctionDecl *FDecl,
	                          const char (&Str)[StrLen]) {
	  if (!FDecl)
	    return false;
	  // Declarations without an identifier can never match a plain name.
	  if (!FDecl->getIdentifier() || !FDecl->getIdentifier()->isStr(Str))
	    return false;
	  if (!FDecl->isInStdNamespace())
	    return false;

	  return true;
	}

	// Warn when using the wrong abs() function.
	//
	// Only single-argument calls to a recognized absolute value builtin or to
	// std::abs are examined.  In order, this diagnoses: an unsigned argument
	// (with a fix-it removing the call), a pointer/array/function argument, an
	// argument wider than the parameter of the same kind, and an argument of a
	// different kind (integer / floating / complex) than the parameter.  The last
	// two checks are skipped for std::abs.
	void Sema::CheckAbsoluteValueFunction(const CallExpr *Call,
	                                      const FunctionDecl *FDecl) {
	  if (Call->getNumArgs() != 1)
	    return;

	  unsigned AbsKind = getAbsoluteValueFunctionKind(FDecl);
	  bool IsStdAbs = IsStdFunction(FDecl, "abs");
	  if (AbsKind == 0 && !IsStdAbs)
	    return;

	  // ArgType is the type as written (implicit casts stripped); ParamType is
	  // the argument's type after implicit conversion.
	  QualType ArgType = Call->getArg(0)->IgnoreParenImpCasts()->getType();
	  QualType ParamType = Call->getArg(0)->getType();

	  // Unsigned types cannot be negative.  Suggest removing the absolute value
	  // function call.
	  if (ArgType->isUnsignedIntegerType()) {
	    const char *FunctionName =
	        IsStdAbs ? "std::abs" : Context.BuiltinInfo.getName(AbsKind);
	    Diag(Call->getExprLoc(), diag::warn_unsigned_abs) << ArgType << ParamType;
	    Diag(Call->getExprLoc(), diag::note_remove_abs)
	        << FunctionName
	        << FixItHint::CreateRemoval(Call->getCallee()->getSourceRange());
	    return;
	  }

	  // Taking the absolute value of a pointer is very suspicious, they probably
	  // wanted to index into an array, dereference a pointer, call a function, etc.
	  if (ArgType->isPointerType() || ArgType->canDecayToPointerType()) {
	    // Diagnostic selector: 0 = pointer, 1 = function, 2 = array.
	    unsigned DiagType = 0;
	    if (ArgType->isFunctionType())
	      DiagType = 1;
	    else if (ArgType->isArrayType())
	      DiagType = 2;

	    Diag(Call->getExprLoc(), diag::warn_pointer_abs) << DiagType << ArgType;
	    return;
	  }

	  // std::abs has overloads which prevent most of the absolute value problems
	  // from occurring.
	  if (IsStdAbs)
	    return;

	  AbsoluteValueKind ArgValueKind = getAbsoluteValueKind(ArgType);
	  AbsoluteValueKind ParamValueKind = getAbsoluteValueKind(ParamType);

	  // The argument and parameter are the same kind.  Check if they are the right
	  // size.
	  if (ArgValueKind == ParamValueKind) {
	    if (Context.getTypeSize(ArgType) <= Context.getTypeSize(ParamType))
	      return;

	    unsigned NewAbsKind = getBestAbsFunction(Context, ArgType, AbsKind);
	    Diag(Call->getExprLoc(), diag::warn_abs_too_small)
	        << FDecl << ArgType << ParamType;

	    // The warning is emitted even when no better function exists; only the
	    // replacement note is skipped.
	    if (NewAbsKind == 0)
	      return;

	    emitReplacement(*this, Call->getExprLoc(),
	                    Call->getCallee()->getSourceRange(), NewAbsKind, ArgType);
	    return;
	  }

	  // ArgValueKind != ParamValueKind
	  // The wrong type of absolute value function was used.  Attempt to find the
	  // proper one.
	  unsigned NewAbsKind = changeAbsFunction(AbsKind, ArgValueKind);
	  NewAbsKind = getBestAbsFunction(Context, ArgType, NewAbsKind);
	  if (NewAbsKind == 0)
	    return;

	  Diag(Call->getExprLoc(), diag::warn_wrong_absolute_value_type)
	      << FDecl << ParamValueKind << ArgValueKind;

	  emitReplacement(*this, Call->getExprLoc(),
	                  Call->getCallee()->getSourceRange(), NewAbsKind, ArgType);
	}

	//===--- CHECK: Warn on use of std::max and unsigned zero. ----------------===//

	/// Warns about std::max<unsigned T>(0, x) / std::max<unsigned T>(x, 0).
	///
	/// Fires only for a two-argument call to a std::max specialization with one
	/// template argument that is an unsigned integer type, where exactly one of
	/// the arguments is a materialized integer literal zero.  Calls inside
	/// template instantiations and calls whose location is in a macro are
	/// ignored.  A note offers fix-its that remove the callee and the zero
	/// argument.
	void Sema::CheckMaxUnsignedZero(const CallExpr *Call,
	                                const FunctionDecl *FDecl) {
	  if (!Call || !FDecl) return;

	  // Ignore template specializations and macros.
	  if (inTemplateInstantiation()) return;
	  if (Call->getExprLoc().isMacroID()) return;

	  // Only care about the one template argument, two function parameter std::max
	  if (Call->getNumArgs() != 2) return;
	  if (!IsStdFunction(FDecl, "max")) return;
	  const auto * ArgList = FDecl->getTemplateSpecializationArgs();
	  if (!ArgList) return;
	  if (ArgList->size() != 1) return;

	  // Check that template type argument is unsigned integer.
	  const auto& TA = ArgList->get(0);
	  if (TA.getKind() != TemplateArgument::Type) return;
	  QualType ArgType = TA.getAsType();
	  if (!ArgType->isUnsignedIntegerType()) return;

	  // See if either argument is a literal zero.
	  auto IsLiteralZeroArg = [](const Expr* E) -> bool {
	    const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(E);
	    if (!MTE) return false;
	    const auto *Num = dyn_cast<IntegerLiteral>(MTE->GetTemporaryExpr());
	    if (!Num) return false;
	    if (Num->getValue() != 0) return false;
	    return true;
	  };

	  const Expr *FirstArg = Call->getArg(0);
	  const Expr *SecondArg = Call->getArg(1);
	  const bool IsFirstArgZero = IsLiteralZeroArg(FirstArg);
	  const bool IsSecondArgZero = IsLiteralZeroArg(SecondArg);

	  // Only warn when exactly one argument is zero.
	  if (IsFirstArgZero == IsSecondArgZero) return;

	  SourceRange FirstRange = FirstArg->getSourceRange();
	  SourceRange SecondRange = SecondArg->getSourceRange();

	  SourceRange ZeroRange = IsFirstArgZero ? FirstRange : SecondRange;

	  Diag(Call->getExprLoc(), diag::warn_max_unsigned_zero)
	      << IsFirstArgZero << Call->getCallee()->getSourceRange() << ZeroRange;

	  // Deduce what parts to remove so that "std::max(0u, foo)" becomes "(foo)".
	  SourceRange RemovalRange;
	  if (IsFirstArgZero) {
	    // Remove from the zero up to just before the second argument.
	    RemovalRange = SourceRange(FirstRange.getBegin(),
	                               SecondRange.getBegin().getLocWithOffset(-1));
	  } else {
	    // Remove from the end of the first argument through the zero.
	    RemovalRange = SourceRange(getLocForEndOfToken(FirstRange.getEnd()),
	                               SecondRange.getEnd());
	  }

	  Diag(Call->getExprLoc(), diag::note_remove_max_call)
	      << FixItHint::CreateRemoval(Call->getCallee()->getSourceRange())
	      << FixItHint::CreateRemoval(RemovalRange);
	}

	//===--- CHECK: Standard memory functions ---------------------------------===//

	/// \brief Warns when the size argument of a memory function (memcmp,
	/// strncat, ...) is itself a comparison or logical expression.
	///
	/// This is to catch typos like `if (memcmp(&a, &b, sizeof(a) > 0))`.
	///
	/// Three diagnostics are produced: the warning at the operator, a note with
	/// fix-its that move the call's closing parenthesis to just after the
	/// operator's left-hand side, and a note with fix-its that wrap the whole
	/// expression in a `(size_t)(...)` cast to silence the warning.
	///
	/// \returns true if \p E was diagnosed, false if it is not a comparison or
	/// logical binary operator.
	static bool CheckMemorySizeofForComparison(Sema &S, const Expr *E,
	                                           IdentifierInfo *FnName,
	                                           SourceLocation FnLoc,
	                                           SourceLocation RParenLoc) {
	  // Only binary operators that are comparisons (<=>, >, <, >=, <=, ==, ...)
	  // or logical operators (&&, ||) are of interest.
	  const auto *BinOp = dyn_cast<BinaryOperator>(E);
	  if (!BinOp || (!BinOp->isComparisonOp() && !BinOp->isLogicalOp()))
	    return false;

	  SourceRange WholeRange = BinOp->getSourceRange();

	  S.Diag(BinOp->getOperatorLoc(), diag::warn_memsize_comparison)
	      << WholeRange << FnName;

	  S.Diag(FnLoc, diag::note_memsize_comparison_paren)
	      << FnName
	      << FixItHint::CreateInsertion(
	             S.getLocForEndOfToken(BinOp->getLHS()->getLocEnd()), ")")
	      << FixItHint::CreateRemoval(RParenLoc);

	  S.Diag(WholeRange.getBegin(), diag::note_memsize_comparison_cast_silence)
	      << FixItHint::CreateInsertion(WholeRange.getBegin(), "(size_t)(")
	      << FixItHint::CreateInsertion(
	             S.getLocForEndOfToken(WholeRange.getEnd()), ")");

	  return true;
	}

	/// \brief Determine whether the given type is or contains a dynamic class type
	/// (e.g., whether it has a vtable).
	///
	/// \param T           The type to inspect; arrays are looked through to their
	///                    element type.
	/// \param IsContained Set to true when the dynamic class was found through a
	///                    field rather than being the type itself, false
	///                    otherwise.
	/// \returns the dynamic class found, or nullptr if there is none (including
	///          when the type is not a C++ record, has no definition, or is an
	///          invalid declaration).
	static const CXXRecordDecl *getContainedDynamicClass(QualType T,
	                                                     bool &IsContained) {
	  // Look through array types while ignoring qualifiers.
	  const Type *Ty = T->getBaseElementTypeUnsafe();
	  IsContained = false;

	  const CXXRecordDecl *RD = Ty->getAsCXXRecordDecl();
	  RD = RD ? RD->getDefinition() : nullptr;
	  if (!RD || RD->isInvalidDecl())
	    return nullptr;

	  if (RD->isDynamicClass())
	    return RD;

	  // Check all the fields.  If any bases were dynamic, the class is dynamic.
	  // It's impossible for a class to transitively contain itself by value, so
	  // infinite recursion is impossible.
	  for (auto *FD : RD->fields()) {
	    bool SubContained;
	    if (const CXXRecordDecl *ContainedRD =
	            getContainedDynamicClass(FD->getType(), SubContained)) {
	      IsContained = true;
	      return ContainedRD;
	    }
	  }

	  return nullptr;
	}

	/// \brief If E is a sizeof expression, returns its argument expression,
	/// otherwise returns NULL.
	///
	/// Only the expression form `sizeof expr` qualifies; `sizeof(type)` yields
	/// NULL.  Parentheses and implicit casts are stripped from the returned
	/// expression.
	static const Expr *getSizeOfExprArg(const Expr *E) {
	  if (const UnaryExprOrTypeTraitExpr *SizeOf =
	          dyn_cast<UnaryExprOrTypeTraitExpr>(E))
	    if (SizeOf->getKind() == UETT_SizeOf && !SizeOf->isArgumentType())
	      return SizeOf->getArgumentExpr()->IgnoreParenImpCasts();

	  return nullptr;
	}

	/// \brief Returns the type that a sizeof expression measures.
	///
	/// Yields a null QualType when \p E is not a sizeof expression.
	static QualType getSizeOfArgType(const Expr *E) {
	  const auto *Trait = dyn_cast<UnaryExprOrTypeTraitExpr>(E);
	  if (!Trait || Trait->getKind() != UETT_SizeOf)
	    return QualType();

	  return Trait->getTypeOfArgument();
	}

	/// \brief Check for dangerous or invalid arguments to memset().
	///
	/// This issues warnings on known problematic, dangerous or unspecified
	/// arguments to the standard 'memset', 'memcpy', 'memmove', and 'memcmp'
	/// function calls.
	///
	/// \param Call The call expression to diagnose.
	void Sema::CheckMemaccessArguments(const CallExpr *Call,
	unsigned BId,
	IdentifierInfo *FnName) {
	assert(BId != 0);

	// It is possible to have a non-standard definition of memset. Validate
	// we have enough arguments, and if not, abort further checking.
	unsigned ExpectedNumArgs =
	(BId == Builtin::BIstrndup \|\| BId == Builtin::BIbzero ? 2 : 3);
	if (Call->getNumArgs() < ExpectedNumArgs)
	return;

	unsigned LastArg = (BId == Builtin::BImemset \|\| BId == Builtin::BIbzero \|\|
	BId == Builtin::BIstrndup ? 1 : 2);
	unsigned LenArg =
	(BId == Builtin::BIbzero \|\| BId == Builtin::BIstrndup ? 1 : 2);
	const Expr *LenExpr = Call->getArg(LenArg)->IgnoreParenImpCasts();

	if (CheckMemorySizeofForComparison(*this, LenExpr, FnName,
	Call->getLocStart(), Call->getRParenLoc()))
	return;

	// We have special checking when the length is a sizeof expression.
	QualType SizeOfArgTy = getSizeOfArgType(LenExpr);
	const Expr *SizeOfArg = getSizeOfExprArg(LenExpr);
	llvm::FoldingSetNodeID SizeOfArgID;

	// Although widely used, 'bzero' is not a standard function. Be more strict
	// with the argument types before allowing diagnostics and only allow the
	// form bzero(ptr, sizeof(...)).
	QualType FirstArgTy = Call->getArg(0)->IgnoreParenImpCasts()->getType();
	if (BId == Builtin::BIbzero && !FirstArgTy->getAs<PointerType>())
	return;

	for (unsigned ArgIdx = 0; ArgIdx != LastArg; ++ArgIdx) {
	const Expr *Dest = Call->getArg(ArgIdx)->IgnoreParenImpCasts();
	SourceRange ArgRange = Call->getArg(ArgIdx)->getSourceRange();

	QualType DestTy = Dest->getType();
	QualType PointeeTy;
	if (const PointerType *DestPtrTy = DestTy->getAs<PointerType>()) {
	PointeeTy = DestPtrTy->getPointeeType();

	// Never warn about void type pointers. This can be used to suppress
	// false positives.
	if (PointeeTy->isVoidType())
	continue;

	// Catch "memset(p, 0, sizeof(p))" -- needs to be sizeof(*p). Do this by
	// actually comparing the expressions for equality. Because computing the
	// expression IDs can be expensive, we only do this if the diagnostic is
	// enabled.
	if (SizeOfArg &&
	!Diags.isIgnored(diag::warn_sizeof_pointer_expr_memaccess,
	SizeOfArg->getExprLoc())) {
	// We only compute IDs for expressions if the warning is enabled, and
	// cache the sizeof arg's ID.
	if (SizeOfArgID == llvm::FoldingSetNodeID())
	SizeOfArg->Profile(SizeOfArgID, Context, true);
	llvm::FoldingSetNodeID DestID;
	Dest->Profile(DestID, Context, true);
	if (DestID == SizeOfArgID) {
	// TODO: For strncpy() and friends, this could suggest sizeof(dst)
	// over sizeof(src) as well.
	unsigned ActionIdx = 0; // Default is to suggest dereferencing.
	StringRef ReadableName = FnName->getName();

	if (const UnaryOperator *UnaryOp = dyn_cast<UnaryOperator>(Dest))
	if (UnaryOp->getOpcode() == UO_AddrOf)
	ActionIdx = 1; // If its an address-of operator, just remove it.
	if (!PointeeTy->isIncompleteType() &&
	(Context.getTypeSize(PointeeTy) == Context.getCharWidth()))
	ActionIdx = 2; // If the pointee's size is sizeof(char),
	// suggest an explicit length.

	// If the function is defined as a builtin macro, do not show macro
	// expansion.
	SourceLocation SL = SizeOfArg->getExprLoc();
	SourceRange DSR = Dest->getSourceRange();
	SourceRange SSR = SizeOfArg->getSourceRange();
	SourceManager &SM = getSourceManager();

	if (SM.isMacroArgExpansion(SL)) {
	ReadableName = Lexer::getImmediateMacroName(SL, SM, LangOpts);
	SL = SM.getSpellingLoc(SL);
	DSR = SourceRange(SM.getSpellingLoc(DSR.getBegin()),
	SM.getSpellingLoc(DSR.getEnd()));
	SSR = SourceRange(SM.getSpellingLoc(SSR.getBegin()),
	SM.getSpellingLoc(SSR.getEnd()));
	}

	DiagRuntimeBehavior(SL, SizeOfArg,
	PDiag(diag::warn_sizeof_pointer_expr_memaccess)
	<< ReadableName
	<< PointeeTy
	<< DestTy
	<< DSR
	<< SSR);
	DiagRuntimeBehavior(SL, SizeOfArg,
	PDiag(diag::warn_sizeof_pointer_expr_memaccess_note)
	<< ActionIdx
	<< SSR);

	break;
	}
	}

	// Also check for cases where the sizeof argument is the exact same
	// type as the memory argument, and where it points to a user-defined
	// record type.
	if (SizeOfArgTy != QualType()) {
	if (PointeeTy->isRecordType() &&
	Context.typesAreCompatible(SizeOfArgTy, DestTy)) {
	DiagRuntimeBehavior(LenExpr->getExprLoc(), Dest,
	PDiag(diag::warn_sizeof_pointer_type_memaccess)
	<< FnName << SizeOfArgTy << ArgIdx
	<< PointeeTy << Dest->getSourceRange()
	<< LenExpr->getSourceRange());
	break;
	}
	}
	} else if (DestTy->isArrayType()) {
	PointeeTy = DestTy;
	}

	if (PointeeTy == QualType())
	continue;

	// Always complain about dynamic classes.
	bool IsContained;
	if (const CXXRecordDecl *ContainedRD =
	getContainedDynamicClass(PointeeTy, IsContained)) {

	unsigned OperationType = 0;
	// "overwritten" if we're warning about the destination for any call
	// but memcmp; otherwise a verb appropriate to the call.
	if (ArgIdx != 0 \|\| BId == Builtin::BImemcmp) {
	if (BId == Builtin::BImemcpy)
	OperationType = 1;
	else if(BId == Builtin::BImemmove)
	OperationType = 2;
	else if (BId == Builtin::BImemcmp)
	OperationType = 3;
	}

	DiagRuntimeBehavior(
	Dest->getExprLoc(), Dest,
	PDiag(diag::warn_dyn_class_memaccess)
	<< (BId == Builtin::BImemcmp ? ArgIdx + 2 : ArgIdx)
	<< FnName << IsContained << ContainedRD << OperationType
	<< Call->getCallee()->getSourceRange());
	} else if (PointeeTy.hasNonTrivialObjCLifetime() &&
	BId != Builtin::BImemset)
	DiagRuntimeBehavior(
	Dest->getExprLoc(), Dest,
	PDiag(diag::warn_arc_object_memaccess)
	<< ArgIdx << FnName << PointeeTy
	<< Call->getCallee()->getSourceRange());
	else
	continue;

	DiagRuntimeBehavior(
	Dest->getExprLoc(), Dest,
	PDiag(diag::note_bad_memaccess_silence)
	<< FixItHint::CreateInsertion(ArgRange.getBegin(), "(void*)"));
	break;
	}
	}

	// A little helper routine: ignore addition and subtraction of integer literals.
	// This intentionally does not ignore all integer constant expressions because
	// we don't want to remove sizeof().
	//
	// Parentheses and casts are stripped first.  Then, for as long as the
	// expression is an additive operator with an integer literal on one side,
	// it is replaced by the other operand.  Ctx is currently unused.
	static const Expr *ignoreLiteralAdditions(const Expr *Ex, ASTContext &Ctx) {
	  Ex = Ex->IgnoreParenCasts();

	  while (true) {
	    const BinaryOperator * BO = dyn_cast<BinaryOperator>(Ex);
	    if (!BO || !BO->isAdditiveOp())
	      break;

	    const Expr *RHS = BO->getRHS()->IgnoreParenCasts();
	    const Expr *LHS = BO->getLHS()->IgnoreParenCasts();

	    if (isa<IntegerLiteral>(RHS))
	      Ex = LHS;
	    else if (isa<IntegerLiteral>(LHS))
	      Ex = RHS;
	    else
	      break;
	  }

	  return Ex;
	}

	/// Returns true when \p Ty is a constant-size array with more than one
	/// element, or a variable-length array. Any other type yields false.
	static bool isConstantSizeArrayWithMoreThanOneElement(QualType Ty,
	                                                      ASTContext &Context) {
	  // Constant-size arrays qualify only when they hold at least two elements,
	  // because the FIXIT is only issued for arrays of size > 1.
	  if (const ConstantArrayType *ConstArr = Context.getAsConstantArrayType(Ty))
	    return ConstArr->getSize().getSExtValue() > 1;

	  // Everything that is not a constant-size array is accepted only if it is
	  // a VLA.
	  return Ty->isVariableArrayType();
	}

	// Warn if the user has made the 'size' argument to strlcpy or strlcat
	// be the size of the source, instead of the destination.
	//
	// \param Call   the call expression being checked.
	// \param FnName the name of the called function, used in the diagnostics.
	void Sema::CheckStrlcpycatArguments(const CallExpr *Call,
	                                    IdentifierInfo *FnName) {

	  // Don't crash if the user has the wrong number of arguments
	  unsigned NumArgs = Call->getNumArgs();
	  if ((NumArgs != 3) && (NumArgs != 4))
	    return;

	  const Expr *SrcArg = ignoreLiteralAdditions(Call->getArg(1), Context);
	  const Expr *SizeArg = ignoreLiteralAdditions(Call->getArg(2), Context);
	  const Expr *CompareWithSrc = nullptr;

	  if (CheckMemorySizeofForComparison(*this, SizeArg, FnName,
	                                     Call->getLocStart(), Call->getRParenLoc()))
	    return;

	  // Look for 'strlcpy(dst, x, sizeof(x))'
	  if (const Expr *Ex = getSizeOfExprArg(SizeArg))
	    CompareWithSrc = Ex;
	  else {
	    // Look for 'strlcpy(dst, x, strlen(x))'
	    if (const CallExpr *SizeCall = dyn_cast<CallExpr>(SizeArg)) {
	      if (SizeCall->getBuiltinCallee() == Builtin::BIstrlen &&
	          SizeCall->getNumArgs() == 1)
	        CompareWithSrc = ignoreLiteralAdditions(SizeCall->getArg(0), Context);
	    }
	  }

	  if (!CompareWithSrc)
	    return;

	  // Determine if the argument to sizeof/strlen is equal to the source
	  // argument.  In principle there's all kinds of things you could do
	  // here, for instance creating an == expression and evaluating it with
	  // EvaluateAsBooleanCondition, but this uses a more direct technique:
	  // both expressions must be plain references to the same declaration.
	  const DeclRefExpr *SrcArgDRE = dyn_cast<DeclRefExpr>(SrcArg);
	  if (!SrcArgDRE)
	    return;

	  const DeclRefExpr *CompareWithSrcDRE = dyn_cast<DeclRefExpr>(CompareWithSrc);
	  if (!CompareWithSrcDRE ||
	      SrcArgDRE->getDecl() != CompareWithSrcDRE->getDecl())
	    return;

	  // The warning highlights the size argument exactly as the user wrote it,
	  // not the version with literal additions stripped.
	  const Expr *OriginalSizeArg = Call->getArg(2);
	  Diag(CompareWithSrcDRE->getLocStart(), diag::warn_strlcpycat_wrong_size)
	      << OriginalSizeArg->getSourceRange() << FnName;

	  // Output a FIXIT hint if the destination is an array (rather than a
	  // pointer to an array).  This could be enhanced to handle some
	  // pointers if we know the actual size, like if DstArg is 'array+2'
	  // we could say 'sizeof(array)-2'.
	  const Expr *DstArg = Call->getArg(0)->IgnoreParenImpCasts();
	  if (!isConstantSizeArrayWithMoreThanOneElement(DstArg->getType(), Context))
	    return;

	  // Build the replacement text 'sizeof(<dst>)'.
	  SmallString<128> sizeString;
	  llvm::raw_svector_ostream OS(sizeString);
	  OS << "sizeof(";
	  DstArg->printPretty(OS, nullptr, getPrintingPolicy());
	  OS << ")";

	  Diag(OriginalSizeArg->getLocStart(), diag::note_strlcpycat_wrong_size)
	      << FixItHint::CreateReplacement(OriginalSizeArg->getSourceRange(),
	                                      OS.str());
	}

	/// Check if two expressions refer to the same declaration.
	///
	/// Returns true only when both \p E1 and \p E2 are non-null DeclRefExprs
	/// naming the same declaration; null or non-DeclRefExpr inputs yield false.
	static bool referToTheSameDecl(const Expr *E1, const Expr *E2) {
	  if (const DeclRefExpr *D1 = dyn_cast_or_null<DeclRefExpr>(E1))
	    if (const DeclRefExpr *D2 = dyn_cast_or_null<DeclRefExpr>(E2))
	      return D1->getDecl() == D2->getDecl();
	  return false;
	}

	/// If \p E is a direct call to strlen, return its first argument with
	/// parentheses and casts stripped; otherwise return nullptr.
	static const Expr *getStrlenExprArg(const Expr *E) {
	  if (const CallExpr *CE = dyn_cast<CallExpr>(E)) {
	    const FunctionDecl *FD = CE->getDirectCallee();
	    if (!FD || FD->getMemoryFunctionKind() != Builtin::BIstrlen)
	      return nullptr;
	    // Don't index past the argument list of a malformed strlen call.
	    if (CE->getNumArgs() < 1)
	      return nullptr;
	    return CE->getArg(0)->IgnoreParenCasts();
	  }
	  return nullptr;
	}

	// Warn on anti-patterns as the 'size' argument to strncat.
	// The correct size argument should look like following:
	// strncat(dst, src, sizeof(dst) - strlen(dest) - 1);
	void Sema::CheckStrncatArguments(const CallExpr *CE,
	IdentifierInfo *FnName) {
	// Don't crash if the user has the wrong number of arguments.
	if (CE->getNumArgs() < 3)
	return;
	const Expr *DstArg = CE->getArg(0)->IgnoreParenCasts();
	const Expr *SrcArg = CE->getArg(1)->IgnoreParenCasts();
	const Expr *LenArg = CE->getArg(2)->IgnoreParenCasts();

	if (CheckMemorySizeofForComparison(*this, LenArg, FnName, CE->getLocStart(),
	CE->getRParenLoc()))
	return;

	// Identify common expressions, which are wrongly used as the size argument
	// to strncat and may lead to buffer overflows.
	unsigned PatternType = 0;
	if (const Expr *SizeOfArg = getSizeOfExprArg(LenArg)) {
	// - sizeof(dst)
	if (referToTheSameDecl(SizeOfArg, DstArg))
	PatternType = 1;
	// - sizeof(src)
	else if (referToTheSameDecl(SizeOfArg, SrcArg))
	PatternType = 2;
	} else if (const BinaryOperator *BE = dyn_cast<BinaryOperator>(LenArg)) {
	if (BE->getOpcode() == BO_Sub) {
	const Expr *L = BE->getLHS()->IgnoreParenCasts();
	const Expr *R = BE->getRHS()->IgnoreParenCasts();
	// - sizeof(dst) - strlen(dst)
	if (referToTheSameDecl(DstArg, getSizeOfExprArg(L)) &&
	referToTheSameDecl(DstArg, getStrlenExprArg(R)))
	PatternType = 1;
	// - sizeof(src) - (anything)
	else if (referToTheSameDecl(SrcArg, getSizeOfExprArg(L)))
	PatternType = 2;
	}
	}

	if (PatternType == 0)
	return;

	// Generate the diagnostic.
	SourceLocation SL = LenArg->getLocStart();
	SourceRange SR = LenArg->getSourceRange();
	SourceManager &SM = getSourceManager();

	// If the function is defined as a builtin macro, do not show macro expansion.
	if (SM.isMacroArgExpansion(SL)) {
	SL = SM.getSpellingLoc(SL);
	SR = SourceRange(SM.getSpellingLoc(SR.getBegin()),
	SM.getSpellingLoc(SR.getEnd()));
	}

	// Check if the destination is an array (rather than a pointer to an array).
	QualType DstTy = DstArg->getType();
	bool isKnownSizeArray = isConstantSizeArrayWithMoreThanOneElement(DstTy,
	Context);
	if (!isKnownSizeArray) {
	if (PatternType == 1)
	Diag(SL, diag::warn_strncat_wrong_size) << SR;
	else
	Diag(SL, diag::warn_strncat_src_size) << SR;
	return;
	}

	if (PatternType == 1)
	Diag(SL, diag::warn_strncat_large_size) << SR;
	else
	Diag(SL, diag::warn_strncat_src_size) << SR;

	SmallString<128> sizeString;
	llvm::raw_svector_ostream OS(sizeString);
	OS << "sizeof(";
	DstArg->printPretty(OS, nullptr, getPrintingPolicy());
	OS << ") - ";
	OS << "strlen(";
	DstArg->printPretty(OS, nullptr, getPrintingPolicy());
	OS << ") - 1";

	Diag(SL, diag::note_strncat_wrong_size)
	<< FixItHint::CreateReplacement(SR, OS.str());
	}

	//===--- CHECK: Return Address of Stack Variable --------------------------===//

	// Forward declarations: EvalVal and EvalAddr are mutually recursive and are
	// both used by CheckReturnStackAddr before their definitions.
	static const Expr *EvalVal(const Expr *E,
	                           SmallVectorImpl<const DeclRefExpr *> &refVars,
	                           const Decl *ParentDecl);
	static const Expr *EvalAddr(const Expr *E,
	                            SmallVectorImpl<const DeclRefExpr *> &refVars,
	                            const Decl *ParentDecl);

	/// CheckReturnStackAddr - Check if a return statement returns the address
	/// of a stack variable.
	///
	/// \param S         the Sema instance used to emit diagnostics.
	/// \param RetValExp the expression being returned.
	/// \param lhsType   the type the returned value is converted to; it selects
	///                  between address analysis (pointers/block pointers) and
	///                  value analysis (references).
	/// \param ReturnLoc location of the return statement (not used by the body).
	static void
	CheckReturnStackAddr(Sema &S, Expr *RetValExp, QualType lhsType,
	                     SourceLocation ReturnLoc) {
	  const Expr *stackE = nullptr;
	  SmallVector<const DeclRefExpr *, 8> refVars;

	  // Perform checking for returned stack addresses, local blocks,
	  // label addresses or references to temporaries.
	  if (lhsType->isPointerType() ||
	      (!S.getLangOpts().ObjCAutoRefCount && lhsType->isBlockPointerType())) {
	    stackE = EvalAddr(RetValExp, refVars, /*ParentDecl=*/nullptr);
	  } else if (lhsType->isReferenceType()) {
	    stackE = EvalVal(RetValExp, refVars, /*ParentDecl=*/nullptr);
	  }

	  if (!stackE)
	    return; // Nothing suspicious was found.

	  // Parameters are initialized in the calling scope, so taking the address
	  // of a parameter reference doesn't need a warning.
	  for (auto *DRE : refVars)
	    if (isa<ParmVarDecl>(DRE->getDecl()))
	      return;

	  SourceLocation diagLoc;
	  SourceRange diagRange;
	  if (refVars.empty()) {
	    diagLoc = stackE->getLocStart();
	    diagRange = stackE->getSourceRange();
	  } else {
	    // We followed through a reference variable. 'stackE' contains the
	    // problematic expression but we will warn at the return statement pointing
	    // at the reference variable. We will later display the "trail" of
	    // reference variables using notes.
	    diagLoc = refVars[0]->getLocStart();
	    diagRange = refVars[0]->getSourceRange();
	  }

	  if (const DeclRefExpr *DR = dyn_cast<DeclRefExpr>(stackE)) {
	    // address of local var
	    S.Diag(diagLoc, diag::warn_ret_stack_addr_ref) << lhsType->isReferenceType()
	      << DR->getDecl()->getDeclName() << diagRange;
	  } else if (isa<BlockExpr>(stackE)) { // local block.
	    S.Diag(diagLoc, diag::err_ret_local_block) << diagRange;
	  } else if (isa<AddrLabelExpr>(stackE)) { // address of label.
	    S.Diag(diagLoc, diag::warn_ret_addr_label) << diagRange;
	  } else { // local temporary.
	    // If there is an LValue->RValue conversion, then the value of the
	    // reference type is used, not the reference.
	    if (auto *ICE = dyn_cast<ImplicitCastExpr>(RetValExp)) {
	      if (ICE->getCastKind() == CK_LValueToRValue) {
	        return;
	      }
	    }
	    S.Diag(diagLoc, diag::warn_ret_local_temp_addr_ref)
	      << lhsType->isReferenceType() << diagRange;
	  }

	  // Display the "trail" of reference variables that we followed until we
	  // found the problematic expression using notes.
	  for (unsigned i = 0, e = refVars.size(); i != e; ++i) {
	    const VarDecl *VD = cast<VarDecl>(refVars[i]->getDecl());
	    // If this var binds to another reference var, show the range of the next
	    // var, otherwise the var binds to the problematic expression, in which case
	    // show the range of the expression.
	    SourceRange range = (i < e - 1) ? refVars[i + 1]->getSourceRange()
	                                    : stackE->getSourceRange();
	    S.Diag(VD->getLocation(), diag::note_ref_var_local_bind)
	      << VD->getDeclName() << range;
	  }
	}

	/// EvalAddr - EvalAddr and EvalVal are mutually recursive functions that
	///  check if the expression in a return statement evaluates to an address
	///  to a location on the stack, a local block, an address of a label, or a
	///  reference to local temporary. The recursion is used to traverse the
	///  AST of the return expression, with recursion backtracking when we
	///  encounter a subexpression that (1) clearly does not lead to one of the
	///  above problematic expressions (2) is something we cannot determine leads to
	///  a problematic expression based on such local checking.
	///
	///  Both EvalAddr and EvalVal follow through reference variables to evaluate
	///  the expression that they point to. Such variables are added to the
	///  'refVars' vector so that we know what the reference variable "trail" was.
	///
	///  EvalAddr processes expressions that are pointers that are used as
	///  references (and not L-values).  EvalVal handles all other values.
	///  At the base case of the recursion is a check for the above problematic
	///  expressions.
	///
	///  This implementation handles:
	///
	///   * pointer-to-pointer casts
	///   * implicit conversions from array references to pointers
	///   * taking the address of fields
	///   * arbitrary interplay between "&" and "*" operators
	///   * pointer arithmetic from an address of a stack variable
	///   * taking the address of an array element where the array is on the stack
	///
	///  Returns the problematic expression, or nullptr when nothing suspicious
	///  was found or the expression cannot be reasoned about.
	static const Expr *EvalAddr(const Expr *E,
	                            SmallVectorImpl<const DeclRefExpr *> &refVars,
	                            const Decl *ParentDecl) {
	  if (E->isTypeDependent())
	    return nullptr;

	  // We should only be called for evaluating pointer expressions.
	  assert((E->getType()->isAnyPointerType() ||
	          E->getType()->isBlockPointerType() ||
	          E->getType()->isObjCQualifiedIdType()) &&
	         "EvalAddr only works on pointers");

	  E = E->IgnoreParens();

	  // Our "symbolic interpreter" is just a dispatch off the currently
	  // viewed AST node.  We then recursively traverse the AST by calling
	  // EvalAddr and EvalVal appropriately.
	  switch (E->getStmtClass()) {
	  case Stmt::DeclRefExprClass: {
	    const DeclRefExpr *DR = cast<DeclRefExpr>(E);

	    // If we leave the immediate function, the lifetime isn't about to end.
	    if (DR->refersToEnclosingVariableOrCapture())
	      return nullptr;

	    if (const VarDecl *V = dyn_cast<VarDecl>(DR->getDecl()))
	      // If this is a reference variable, follow through to the expression that
	      // it points to.
	      if (V->hasLocalStorage() &&
	          V->getType()->isReferenceType() && V->hasInit()) {
	        // Add the reference variable to the "trail".
	        refVars.push_back(DR);
	        return EvalAddr(V->getInit(), refVars, ParentDecl);
	      }

	    return nullptr;
	  }

	  case Stmt::UnaryOperatorClass: {
	    // The only unary operator that make sense to handle here
	    // is AddrOf.  All others don't make sense as pointers.
	    const UnaryOperator *U = cast<UnaryOperator>(E);

	    if (U->getOpcode() == UO_AddrOf)
	      return EvalVal(U->getSubExpr(), refVars, ParentDecl);
	    return nullptr;
	  }

	  case Stmt::BinaryOperatorClass: {
	    // Handle pointer arithmetic.  All other binary operators are not valid
	    // in this context.
	    const BinaryOperator *B = cast<BinaryOperator>(E);
	    BinaryOperatorKind op = B->getOpcode();

	    if (op != BO_Add && op != BO_Sub)
	      return nullptr;

	    const Expr *Base = B->getLHS();

	    // Determine which argument is the real pointer base.  It could be
	    // the RHS argument instead of the LHS.
	    if (!Base->getType()->isPointerType())
	      Base = B->getRHS();

	    assert(Base->getType()->isPointerType());
	    return EvalAddr(Base, refVars, ParentDecl);
	  }

	  // For conditional operators we need to see if either the LHS or RHS are
	  // valid DeclRefExpr*s.  If one of them is valid, we return it.
	  case Stmt::ConditionalOperatorClass: {
	    const ConditionalOperator *C = cast<ConditionalOperator>(E);

	    // Handle the GNU extension for missing LHS.
	    // FIXME: That isn't a ConditionalOperator, so doesn't get here.
	    if (const Expr *LHSExpr = C->getLHS()) {
	      // In C++, we can have a throw-expression, which has 'void' type.
	      if (!LHSExpr->getType()->isVoidType())
	        if (const Expr *LHS = EvalAddr(LHSExpr, refVars, ParentDecl))
	          return LHS;
	    }

	    // In C++, we can have a throw-expression, which has 'void' type.
	    if (C->getRHS()->getType()->isVoidType())
	      return nullptr;

	    return EvalAddr(C->getRHS(), refVars, ParentDecl);
	  }

	  case Stmt::BlockExprClass:
	    if (cast<BlockExpr>(E)->getBlockDecl()->hasCaptures())
	      return E; // local block.
	    return nullptr;

	  case Stmt::AddrLabelExprClass:
	    return E; // address of label.

	  case Stmt::ExprWithCleanupsClass:
	    return EvalAddr(cast<ExprWithCleanups>(E)->getSubExpr(), refVars,
	                    ParentDecl);

	  // For casts, we need to handle conversions from arrays to
	  // pointer values, and pointer-to-pointer conversions.
	  case Stmt::ImplicitCastExprClass:
	  case Stmt::CStyleCastExprClass:
	  case Stmt::CXXFunctionalCastExprClass:
	  case Stmt::ObjCBridgedCastExprClass:
	  case Stmt::CXXStaticCastExprClass:
	  case Stmt::CXXDynamicCastExprClass:
	  case Stmt::CXXConstCastExprClass:
	  case Stmt::CXXReinterpretCastExprClass: {
	    const Expr* SubExpr = cast<CastExpr>(E)->getSubExpr();
	    switch (cast<CastExpr>(E)->getCastKind()) {
	    case CK_LValueToRValue:
	    case CK_NoOp:
	    case CK_BaseToDerived:
	    case CK_DerivedToBase:
	    case CK_UncheckedDerivedToBase:
	    case CK_Dynamic:
	    case CK_CPointerToObjCPointerCast:
	    case CK_BlockPointerToObjCPointerCast:
	    case CK_AnyPointerToBlockPointerCast:
	      return EvalAddr(SubExpr, refVars, ParentDecl);

	    case CK_ArrayToPointerDecay:
	      return EvalVal(SubExpr, refVars, ParentDecl);

	    case CK_BitCast:
	      // Only keep following the operand while it is still pointer-like,
	      // which is what EvalAddr requires of its input.
	      if (SubExpr->getType()->isAnyPointerType() ||
	          SubExpr->getType()->isBlockPointerType() ||
	          SubExpr->getType()->isObjCQualifiedIdType())
	        return EvalAddr(SubExpr, refVars, ParentDecl);
	      else
	        return nullptr;

	    default:
	      return nullptr;
	    }
	  }

	  case Stmt::MaterializeTemporaryExprClass:
	    // Prefer a more specific culprit inside the temporary; otherwise the
	    // temporary itself is reported.
	    if (const Expr *Result =
	            EvalAddr(cast<MaterializeTemporaryExpr>(E)->GetTemporaryExpr(),
	                     refVars, ParentDecl))
	      return Result;
	    return E;

	  // Everything else: we simply don't reason about them.
	  default:
	    return nullptr;
	  }
	}

	/// EvalVal - This function complements EvalAddr in the mutual recursion.
	///  See the comments for EvalAddr for more details.
	///
	///  \param E          the expression to inspect.
	///  \param refVars    receives every reference variable that was followed.
	///  \param ParentDecl the reference variable whose initializer is currently
	///                    being followed (nullptr at the top level); used to
	///                    detect self-references such as "int& i = i;".
	///  \returns the problematic expression, or nullptr if none was found.
	static const Expr *EvalVal(const Expr *E,
	                           SmallVectorImpl<const DeclRefExpr *> &refVars,
	                           const Decl *ParentDecl) {
	  do {
	    // We should only be called for evaluating non-pointer expressions, or
	    // expressions with a pointer type that are not used as references but
	    // instead
	    // are l-values (e.g., DeclRefExpr with a pointer type).

	    // Our "symbolic interpreter" is just a dispatch off the currently
	    // viewed AST node.  We then recursively traverse the AST by calling
	    // EvalAddr and EvalVal appropriately.

	    E = E->IgnoreParens();
	    switch (E->getStmtClass()) {
	    case Stmt::ImplicitCastExprClass: {
	      const ImplicitCastExpr *IE = cast<ImplicitCastExpr>(E);
	      // Look through implicit casts that still yield an lvalue by looping
	      // again on the operand.
	      if (IE->getValueKind() == VK_LValue) {
	        E = IE->getSubExpr();
	        continue;
	      }
	      return nullptr;
	    }

	    case Stmt::ExprWithCleanupsClass:
	      return EvalVal(cast<ExprWithCleanups>(E)->getSubExpr(), refVars,
	                     ParentDecl);

	    case Stmt::DeclRefExprClass: {
	      // When we hit a DeclRefExpr we are looking at code that refers to a
	      // variable's name. If it's not a reference variable we check if it has
	      // local storage within the function, and if so, return the expression.
	      const DeclRefExpr *DR = cast<DeclRefExpr>(E);

	      // If we leave the immediate function, the lifetime isn't about to end.
	      if (DR->refersToEnclosingVariableOrCapture())
	        return nullptr;

	      if (const VarDecl *V = dyn_cast<VarDecl>(DR->getDecl())) {
	        // Check if it refers to itself, e.g. "int& i = i;".
	        if (V == ParentDecl)
	          return DR;

	        if (V->hasLocalStorage()) {
	          if (!V->getType()->isReferenceType())
	            return DR;

	          // Reference variable, follow through to the expression that
	          // it points to.
	          if (V->hasInit()) {
	            // Add the reference variable to the "trail".
	            refVars.push_back(DR);
	            return EvalVal(V->getInit(), refVars, V);
	          }
	        }
	      }

	      return nullptr;
	    }

	    case Stmt::UnaryOperatorClass: {
	      // The only unary operator that make sense to handle here
	      // is Deref.  All others don't resolve to a "name."  This includes
	      // handling all sorts of rvalues passed to a unary operator.
	      const UnaryOperator *U = cast<UnaryOperator>(E);

	      if (U->getOpcode() == UO_Deref)
	        return EvalAddr(U->getSubExpr(), refVars, ParentDecl);

	      return nullptr;
	    }

	    case Stmt::ArraySubscriptExprClass: {
	      // Array subscripts are potential references to data on the stack.  We
	      // retrieve the DeclRefExpr* for the array variable if it indeed
	      // has local storage.
	      const auto *ASE = cast<ArraySubscriptExpr>(E);
	      if (ASE->isTypeDependent())
	        return nullptr;
	      return EvalAddr(ASE->getBase(), refVars, ParentDecl);
	    }

	    case Stmt::OMPArraySectionExprClass: {
	      return EvalAddr(cast<OMPArraySectionExpr>(E)->getBase(), refVars,
	                      ParentDecl);
	    }

	    case Stmt::ConditionalOperatorClass: {
	      // For conditional operators we need to see if either the LHS or RHS are
	      // non-NULL Expr's.  If one is non-NULL, we return it.
	      const ConditionalOperator *C = cast<ConditionalOperator>(E);

	      // Handle the GNU extension for missing LHS.
	      if (const Expr *LHSExpr = C->getLHS()) {
	        // In C++, we can have a throw-expression, which has 'void' type.
	        if (!LHSExpr->getType()->isVoidType())
	          if (const Expr *LHS = EvalVal(LHSExpr, refVars, ParentDecl))
	            return LHS;
	      }

	      // In C++, we can have a throw-expression, which has 'void' type.
	      if (C->getRHS()->getType()->isVoidType())
	        return nullptr;

	      return EvalVal(C->getRHS(), refVars, ParentDecl);
	    }

	    // Accesses to members are potential references to data on the stack.
	    case Stmt::MemberExprClass: {
	      const MemberExpr *M = cast<MemberExpr>(E);

	      // Check for indirect access.  We only want direct field accesses.
	      if (M->isArrow())
	        return nullptr;

	      // Check whether the member type is itself a reference, in which case
	      // we're not going to refer to the member, but to what the member refers
	      // to.
	      if (M->getMemberDecl()->getType()->isReferenceType())
	        return nullptr;

	      return EvalVal(M->getBase(), refVars, ParentDecl);
	    }

	    case Stmt::MaterializeTemporaryExprClass:
	      // Prefer a more specific culprit inside the temporary; otherwise the
	      // temporary itself is reported.
	      if (const Expr *Result =
	              EvalVal(cast<MaterializeTemporaryExpr>(E)->GetTemporaryExpr(),
	                      refVars, ParentDecl))
	        return Result;
	      return E;

	    default:
	      // Check that we don't return or take the address of a reference to a
	      // temporary. This is only useful in C++.
	      if (!E->isTypeDependent() && E->isRValue())
	        return E;

	      // Everything else: we simply don't reason about them.
	      return nullptr;
	    }
	  } while (true);
	}

	/// Run the return-value checks visible here: returned stack addresses, null
	/// returned where a non-null value is required, and null returned from a
	/// throwing 'operator new' / 'operator new[]'.
	///
	/// \param RetValExp    the returned expression.
	/// \param lhsType      the type the returned value is converted to.
	/// \param ReturnLoc    location used for the null-return diagnostics.
	/// \param isObjCMethod selects the diagnostic wording and disables the
	///                     type-based non-null check.
	/// \param Attrs        optional attribute list, searched for
	///                     ReturnsNonNullAttr.
	/// \param FD           optional enclosing function, inspected for being an
	///                     allocation operator.
	void
	Sema::CheckReturnValExpr(Expr *RetValExp, QualType lhsType,
	                         SourceLocation ReturnLoc,
	                         bool isObjCMethod,
	                         const AttrVec *Attrs,
	                         const FunctionDecl *FD) {
	  CheckReturnStackAddr(*this, RetValExp, lhsType, ReturnLoc);

	  // Check if the return value is null but should not be.
	  if (((Attrs && hasSpecificAttr<ReturnsNonNullAttr>(*Attrs)) ||
	       (!isObjCMethod && isNonNullType(Context, lhsType))) &&
	      CheckNonNullExpr(*this, RetValExp))
	    Diag(ReturnLoc, diag::warn_null_ret)
	      << (isObjCMethod ? 1 : 0) << RetValExp->getSourceRange();

	  // C++11 [basic.stc.dynamic.allocation]p4:
	  //   If an allocation function declared with a non-throwing
	  //   exception-specification fails to allocate storage, it shall return
	  //   a null pointer. Any other allocation function that fails to allocate
	  //   storage shall indicate failure only by throwing an exception [...]
	  if (FD) {
	    OverloadedOperatorKind Op = FD->getOverloadedOperator();
	    if (Op == OO_New || Op == OO_Array_New) {
	      const FunctionProtoType *Proto
	        = FD->getType()->castAs<FunctionProtoType>();
	      if (!Proto->isNothrow(Context, /*ResultIfDependent*/true) &&
	          CheckNonNullExpr(*this, RetValExp))
	        Diag(ReturnLoc, diag::warn_operator_new_returns_null)
	          << FD << getLangOpts().CPlusPlus11;
	    }
	  }
	}

	//===--- CHECK: Floating-Point comparisons (-Wfloat-equal) ---------------===//

	/// Check for comparisons of floating point operands using != and ==.
	/// Emits a warning unless the comparison matches one of the exempted forms
	/// below (self-comparison, exact literal, builtin call), since such
	/// comparisons are not likely to do what the programmer intended.
	void Sema::CheckFloatComparison(SourceLocation Loc, Expr* LHS, Expr *RHS) {
	  Expr *StrippedLHS = LHS->IgnoreParenImpCasts();
	  Expr *StrippedRHS = RHS->IgnoreParenImpCasts();

	  // Exemption: 'x == x' where both sides name the same declaration.
	  auto *LeftRef = dyn_cast<DeclRefExpr>(StrippedLHS);
	  auto *RightRef = LeftRef ? dyn_cast<DeclRefExpr>(StrippedRHS) : nullptr;
	  if (RightRef && LeftRef->getDecl() == RightRef->getDecl())
	    return;

	  // Exemption: comparison against a literal that can be exactly represented
	  // by APFloat. This is a heuristic: often comparison against such literals
	  // are used to detect if a value in a variable has not changed, and it
	  // clearly can lead to false negatives. The right operand is only
	  // considered when the left one is not a floating literal at all.
	  if (auto *LeftLit = dyn_cast<FloatingLiteral>(StrippedLHS)) {
	    if (LeftLit->isExact())
	      return;
	  } else if (auto *RightLit = dyn_cast<FloatingLiteral>(StrippedRHS)) {
	    if (RightLit->isExact())
	      return;
	  }

	  // Exemption: either operand is a call to a builtin.
	  auto IsBuiltinCall = [](const Expr *Operand) {
	    const auto *Call = dyn_cast<CallExpr>(Operand);
	    return Call && Call->getBuiltinCallee();
	  };
	  if (IsBuiltinCall(StrippedLHS) || IsBuiltinCall(StrippedRHS))
	    return;

	  // Nothing exempted the comparison: emit the diagnostic.
	  Diag(Loc, diag::warn_floatingpoint_eq)
	    << LHS->getSourceRange() << RHS->getSourceRange();
	}

	//===--- CHECK: Integer mixed-sign comparisons (-Wsign-compare) --------===//
	//===--- CHECK: Lossy implicit conversions (-Wconversion) --------------===//

	namespace {

	/// Structure recording the 'active' range of an integer-valued
	/// expression.
	struct IntRange {
	/// The number of bits active in the int.
	unsigned Width;

	/// True if the int is known not to have negative values.
	bool NonNegative;

	IntRange(unsigned Width, bool NonNegative)
	: Width(Width), NonNegative(NonNegative) {}

	/// Returns the range of the bool type.
	static IntRange forBoolType() {
	return IntRange(1, true);
	}

	/// Returns the range of an opaque value of the given integral type.
	static IntRange forValueOfType(ASTContext &C, QualType T) {
	return forValueOfCanonicalType(C,
	T->getCanonicalTypeInternal().getTypePtr());
	}

	/// Returns the range of an opaque value of a canonical integral type.
	static IntRange forValueOfCanonicalType(ASTContext &C, const Type *T) {
	assert(T->isCanonicalUnqualified());

	if (const VectorType *VT = dyn_cast<VectorType>(T))
	T = VT->getElementType().getTypePtr();
	if (const ComplexType *CT = dyn_cast<ComplexType>(T))
	T = CT->getElementType().getTypePtr();
	if (const AtomicType *AT = dyn_cast<AtomicType>(T))
	T = AT->getValueType().getTypePtr();

	if (!C.getLangOpts().CPlusPlus) {
	// For enum types in C code, use the underlying datatype.
	if (const EnumType *ET = dyn_cast<EnumType>(T))
	T = ET->getDecl()->getIntegerType().getDesugaredType(C).getTypePtr();
	} else if (const EnumType *ET = dyn_cast<EnumType>(T)) {
	// For enum types in C++, use the known bit width of the enumerators.
	EnumDecl *Enum = ET->getDecl();
	// In C++11, enums can have a fixed underlying type. Use this type to
	// compute the range.
	if (Enum->isFixed()) {
	return IntRange(C.getIntWidth(QualType(T, 0)),
	!ET->isSignedIntegerOrEnumerationType());
	}

	unsigned NumPositive = Enum->getNumPositiveBits();
	unsigned NumNegative = Enum->getNumNegativeBits();

	if (NumNegative == 0)
	return IntRange(NumPositive, true/NonNegative/);
	else
	return IntRange(std::max(NumPositive + 1, NumNegative),
	false/NonNegative/);
	}

	const BuiltinType *BT = cast<BuiltinType>(T);
	assert(BT->isInteger());

	return IntRange(C.getIntWidth(QualType(T, 0)), BT->isUnsignedInteger());
	}

	/// Returns the "target" range of a canonical integral type, i.e.
	/// the range of values expressible in the type.
	///
	/// This matches forValueOfCanonicalType except that enums have the
	/// full range of their type, not the range of their enumerators.
	static IntRange forTargetOfCanonicalType(ASTContext &C, const Type *T) {
	assert(T->isCanonicalUnqualified());

	if (const VectorType *VT = dyn_cast<VectorType>(T))
	T = VT->getElementType().getTypePtr();
	if (const ComplexType *CT = dyn_cast<ComplexType>(T))
	T = CT->getElementType().getTypePtr();
	if (const AtomicType *AT = dyn_cast<AtomicType>(T))
	T = AT->getValueType().getTypePtr();
	if (const EnumType *ET = dyn_cast<EnumType>(T))
	T = C.getCanonicalType(ET->getDecl()->getIntegerType()).getTypePtr();

	const BuiltinType *BT = cast<BuiltinType>(T);
	assert(BT->isInteger());

	return IntRange(C.getIntWidth(QualType(T, 0)), BT->isUnsignedInteger());
	}

	/// Returns the supremum of two ranges: i.e. their conservative merge.
	static IntRange join(IntRange L, IntRange R) {
	return IntRange(std::max(L.Width, R.Width),
	L.NonNegative && R.NonNegative);
	}

	/// Returns the infinum of two ranges: i.e. their aggressive merge.
	static IntRange meet(IntRange L, IntRange R) {
	return IntRange(std::min(L.Width, R.Width),
	L.NonNegative \|\| R.NonNegative);
	}
	};

	} // namespace

	/// Compute the active range of a single integer constant.
	///
	/// Negative signed values report their minimal signed width and are marked
	/// as possibly negative. Any other value is first truncated in place to
	/// \p MaxWidth bits when it is wider (so \p value may be modified), and its
	/// active bit count is then reported as non-negative. \p C is not used.
	static IntRange GetValueRange(ASTContext &C, llvm::APSInt &value,
	                              unsigned MaxWidth) {
	  const bool IsNegative = value.isSigned() && value.isNegative();
	  if (IsNegative)
	    return IntRange(value.getMinSignedBits(), /*NonNegative=*/false);

	  if (MaxWidth < value.getBitWidth())
	    value = value.trunc(MaxWidth);

	  // isNonNegative() just checks the sign bit without considering
	  // signedness, so the value is simply reported as non-negative here.
	  return IntRange(value.getActiveBits(), /*NonNegative=*/true);
	}

	/// Compute the active range of an evaluated constant.
	///
	/// Integers are measured directly; vectors and complex integers yield the
	/// join of the ranges of their components. Anything else must be an lvalue
	/// or an address-label difference and is assumed to use all \p MaxWidth
	/// bits, with the sign taken from \p Ty.
	static IntRange GetValueRange(ASTContext &C, APValue &result, QualType Ty,
	                              unsigned MaxWidth) {
	  if (result.isInt())
	    return GetValueRange(C, result.getInt(), MaxWidth);

	  if (result.isVector()) {
	    IntRange R = GetValueRange(C, result.getVectorElt(0), Ty, MaxWidth);
	    for (unsigned i = 1, e = result.getVectorLength(); i != e; ++i) {
	      IntRange El = GetValueRange(C, result.getVectorElt(i), Ty, MaxWidth);
	      R = IntRange::join(R, El);
	    }
	    return R;
	  }

	  if (result.isComplexInt()) {
	    IntRange R = GetValueRange(C, result.getComplexIntReal(), MaxWidth);
	    IntRange I = GetValueRange(C, result.getComplexIntImag(), MaxWidth);
	    return IntRange::join(R, I);
	  }

	  // This can happen with lossless casts to intptr_t of "based" lvalues.
	  // Assume it might use arbitrary bits.
	  // FIXME: The only reason we need to pass the type in here is to get
	  // the sign right on this one case.  It would be nice if APValue
	  // preserved this.
	  assert(result.isLValue() || result.isAddrLabelDiff());
	  return IntRange(MaxWidth, Ty->isUnsignedIntegerOrEnumerationType());
	}

	/// Returns the type of \p E, replacing an atomic type by the value type it
	/// wraps.
	static QualType GetExprType(const Expr *E) {
	  const QualType ExprTy = E->getType();
	  const auto *Atomic = ExprTy->getAs<AtomicType>();
	  return Atomic ? Atomic->getValueType() : ExprTy;
	}

	/// Pseudo-evaluate the given integer expression, estimating the
	/// range of values it might take.
	///
	/// \param MaxWidth - the width to which the value will be truncated
	static IntRange GetExprRange(ASTContext &C, const Expr *E, unsigned MaxWidth) {
	E = E->IgnoreParens();

	// Try a full evaluation first.
	Expr::EvalResult result;
	if (E->EvaluateAsRValue(result, C))
	return GetValueRange(C, result.Val, GetExprType(E), MaxWidth);

	// I think we only want to look through implicit casts here; if the
	// user has an explicit widening cast, we should treat the value as
	// being of the new, wider type.
	if (const auto *CE = dyn_cast<ImplicitCastExpr>(E)) {
	if (CE->getCastKind() == CK_NoOp \|\| CE->getCastKind() == CK_LValueToRValue)
	return GetExprRange(C, CE->getSubExpr(), MaxWidth);

	IntRange OutputTypeRange = IntRange::forValueOfType(C, GetExprType(CE));

	bool isIntegerCast = CE->getCastKind() == CK_IntegralCast \|\|
	CE->getCastKind() == CK_BooleanToSignedIntegral;

	// Assume that non-integer casts can span the full range of the type.
	if (!isIntegerCast)
	return OutputTypeRange;

	IntRange SubRange
	= GetExprRange(C, CE->getSubExpr(),
	std::min(MaxWidth, OutputTypeRange.Width));

	// Bail out if the subexpr's range is as wide as the cast type.
	if (SubRange.Width >= OutputTypeRange.Width)
	return OutputTypeRange;

	// Otherwise, we take the smaller width, and we're non-negative if
	// either the output type or the subexpr is.
	return IntRange(SubRange.Width,
	SubRange.NonNegative \|\| OutputTypeRange.NonNegative);
	}

	if (const auto *CO = dyn_cast<ConditionalOperator>(E)) {
	// If we can fold the condition, just take that operand.
	bool CondResult;
	if (CO->getCond()->EvaluateAsBooleanCondition(CondResult, C))
	return GetExprRange(C, CondResult ? CO->getTrueExpr()
	: CO->getFalseExpr(),
	MaxWidth);

	// Otherwise, conservatively merge.
	IntRange L = GetExprRange(C, CO->getTrueExpr(), MaxWidth);
	IntRange R = GetExprRange(C, CO->getFalseExpr(), MaxWidth);
	return IntRange::join(L, R);
	}

	if (const auto *BO = dyn_cast<BinaryOperator>(E)) {
	switch (BO->getOpcode()) {
	case BO_Cmp:
	llvm_unreachable("builtin <=> should have class type");

	// Boolean-valued operations are single-bit and positive.
	case BO_LAnd:
	case BO_LOr:
	case BO_LT:
	case BO_GT:
	case BO_LE:
	case BO_GE:
	case BO_EQ:
	case BO_NE:
	return IntRange::forBoolType();

	// The type of the assignments is the type of the LHS, so the RHS
	// is not necessarily the same type.
	case BO_MulAssign:
	case BO_DivAssign:
	case BO_RemAssign:
	case BO_AddAssign:
	case BO_SubAssign:
	case BO_XorAssign:
	case BO_OrAssign:
	// TODO: bitfields?
	return IntRange::forValueOfType(C, GetExprType(E));

	// Simple assignments just pass through the RHS, which will have
	// been coerced to the LHS type.
	case BO_Assign:
	// TODO: bitfields?
	return GetExprRange(C, BO->getRHS(), MaxWidth);

	// Operations with opaque sources are black-listed.
	case BO_PtrMemD:
	case BO_PtrMemI:
	return IntRange::forValueOfType(C, GetExprType(E));

	// Bitwise-and uses the infinum of the two source ranges.
	case BO_And:
	case BO_AndAssign:
	return IntRange::meet(GetExprRange(C, BO->getLHS(), MaxWidth),
	GetExprRange(C, BO->getRHS(), MaxWidth));

	// Left shift gets black-listed based on a judgement call.
	case BO_Shl:
	// ...except that we want to treat '1 << (blah)' as logically
	// positive. It's an important idiom.
	if (IntegerLiteral *I
	= dyn_cast<IntegerLiteral>(BO->getLHS()->IgnoreParenCasts())) {
	if (I->getValue() == 1) {
	IntRange R = IntRange::forValueOfType(C, GetExprType(E));
	return IntRange(R.Width, /NonNegative/ true);
	}
	}
	LLVM_FALLTHROUGH;

	case BO_ShlAssign:
	return IntRange::forValueOfType(C, GetExprType(E));

	// Right shift by a constant can narrow its left argument.
	case BO_Shr:
	case BO_ShrAssign: {
	IntRange L = GetExprRange(C, BO->getLHS(), MaxWidth);

	// If the shift amount is a positive constant, drop the width by
	// that much.
	llvm::APSInt shift;
	if (BO->getRHS()->isIntegerConstantExpr(shift, C) &&
	shift.isNonNegative()) {
	unsigned zext = shift.getZExtValue();
	if (zext >= L.Width)
	L.Width = (L.NonNegative ? 0 : 1);
	else
	L.Width -= zext;
	}

	return L;
	}

	// Comma acts as its right operand.
	case BO_Comma:
	return GetExprRange(C, BO->getRHS(), MaxWidth);

	// Black-list pointer subtractions.
	case BO_Sub:
	if (BO->getLHS()->getType()->isPointerType())
	return IntRange::forValueOfType(C, GetExprType(E));
	break;

	// The width of a division result is mostly determined by the size
	// of the LHS.
	case BO_Div: {
	// Don't 'pre-truncate' the operands.
	unsigned opWidth = C.getIntWidth(GetExprType(E));
	IntRange L = GetExprRange(C, BO->getLHS(), opWidth);

	// If the divisor is constant, use that.
	llvm::APSInt divisor;
	if (BO->getRHS()->isIntegerConstantExpr(divisor, C)) {
	unsigned log2 = divisor.logBase2(); // floor(log_2(divisor))
	if (log2 >= L.Width)
	L.Width = (L.NonNegative ? 0 : 1);
	else
	L.Width = std::min(L.Width - log2, MaxWidth);
	return L;
	}

	// Otherwise, just use the LHS's width.
	IntRange R = GetExprRange(C, BO->getRHS(), opWidth);
	return IntRange(L.Width, L.NonNegative && R.NonNegative);
	}

	// The result of a remainder can't be larger than the result of
	// either side.
	case BO_Rem: {
	// Don't 'pre-truncate' the operands.
	unsigned opWidth = C.getIntWidth(GetExprType(E));
	IntRange L = GetExprRange(C, BO->getLHS(), opWidth);
	IntRange R = GetExprRange(C, BO->getRHS(), opWidth);

	IntRange meet = IntRange::meet(L, R);
	meet.Width = std::min(meet.Width, MaxWidth);
	return meet;
	}

	// The default behavior is okay for these.
	case BO_Mul:
	case BO_Add:
	case BO_Xor:
	case BO_Or:
	break;
	}

	// The default case is to treat the operation as if it were closed
	// on the narrowest type that encompasses both operands.
	IntRange L = GetExprRange(C, BO->getLHS(), MaxWidth);
	IntRange R = GetExprRange(C, BO->getRHS(), MaxWidth);
	return IntRange::join(L, R);
	}

	if (const auto *UO = dyn_cast<UnaryOperator>(E)) {
	switch (UO->getOpcode()) {
	// Boolean-valued operations are white-listed.
	case UO_LNot:
	return IntRange::forBoolType();

	// Operations with opaque sources are black-listed.
	case UO_Deref:
	case UO_AddrOf: // should be impossible
	return IntRange::forValueOfType(C, GetExprType(E));

	default:
	return GetExprRange(C, UO->getSubExpr(), MaxWidth);
	}
	}

	if (const auto *OVE = dyn_cast<OpaqueValueExpr>(E))
	return GetExprRange(C, OVE->getSourceExpr(), MaxWidth);

	if (const auto *BitField = E->getSourceBitField())
	return IntRange(BitField->getBitWidthValue(C),
	BitField->getType()->isUnsignedIntegerOrEnumerationType());

	return IntRange::forValueOfType(C, GetExprType(E));
	}

	/// Convenience overload of GetExprRange that caps the computed range at the
	/// integer width of the expression's own type (as reported by GetExprType).
	static IntRange GetExprRange(ASTContext &C, const Expr *E) {
	  const unsigned TypeWidth = C.getIntWidth(GetExprType(E));
	  return GetExprRange(C, E, TypeWidth);
	}

	/// Converts a copy of \p value first to the \p Src semantics and then to the
	/// \p Tgt semantics (both with round-to-nearest-even) and reports whether the
	/// converted copy is still bitwise equal to \p value.
	static bool IsSameFloatAfterCast(const llvm::APFloat &value,
	                                 const llvm::fltSemantics &Src,
	                                 const llvm::fltSemantics &Tgt) {
	  const auto Rounding = llvm::APFloat::rmNearestTiesToEven;

	  // convert() requires an out-parameter; its value is never inspected here.
	  bool LosesInfo;

	  llvm::APFloat Converted(value);
	  Converted.convert(Src, Rounding, &LosesInfo);
	  Converted.convert(Tgt, Rounding, &LosesInfo);

	  return Converted.bitwiseIsEqual(value);
	}

	/// APValue flavour of IsSameFloatAfterCast.
	///
	/// Accepts a scalar float, a vector of floats, or a complex float and
	/// returns true only if every floating-point component passes the APFloat
	/// overload of this check for the given \p Src / \p Tgt semantics.
	static bool IsSameFloatAfterCast(const APValue &value,
	                                 const llvm::fltSemantics &Src,
	                                 const llvm::fltSemantics &Tgt) {
	  // Scalar: defer directly to the APFloat overload.
	  if (value.isFloat())
	    return IsSameFloatAfterCast(value.getFloat(), Src, Tgt);

	  // Anything that is neither scalar nor vector must be a complex float;
	  // both of its components have to pass.
	  if (!value.isVector()) {
	    assert(value.isComplexFloat());
	    if (!IsSameFloatAfterCast(value.getComplexFloatReal(), Src, Tgt))
	      return false;
	    return IsSameFloatAfterCast(value.getComplexFloatImag(), Src, Tgt);
	  }

	  // Vector: stop at the first element that does not pass.
	  const unsigned NumElts = value.getVectorLength();
	  unsigned Idx = 0;
	  while (Idx != NumElts) {
	    if (!IsSameFloatAfterCast(value.getVectorElt(Idx), Src, Tgt))
	      return false;
	    ++Idx;
	  }
	  return true;
	}

	static void AnalyzeImplicitConversions(Sema &S, Expr *E, SourceLocation CC);

	/// Returns true when \p E (ignoring parentheses and implicit casts) names an
	/// enumerator, or when the start location of \p E is a macro location.
	/// Callers use this to suppress comparison warnings for such constants.
	static bool IsEnumConstOrFromMacro(Sema &S, Expr *E) {
	  // A direct reference to an enum constant is always suppressed.
	  const Expr *Stripped = E->IgnoreParenImpCasts();
	  if (const auto *Ref = dyn_cast<DeclRefExpr>(Stripped)) {
	    if (isa<EnumConstantDecl>(Ref->getDecl()))
	      return true;
	  }

	  // Otherwise suppress only if the value was expanded from a macro.
	  return E->getLocStart().isMacroID();
	}

	/// Returns true if \p E has integer type and either that type is not a
	/// signed integer type, or the type of \p E with parentheses and implicit
	/// casts stripped is not a signed integer type.
	static bool isKnownToHaveUnsignedValue(Expr *E) {
	  QualType Ty = E->getType();
	  if (!Ty->isIntegerType())
	    return false;

	  // The expression's own type is already not signed.
	  if (!Ty->isSignedIntegerType())
	    return true;

	  // The visible type is signed; check the type underneath implicit casts.
	  return !E->IgnoreParenImpCasts()->getType()->isSignedIntegerType();
	}

	namespace {
	/// The promoted range of values of a type. In general this has the
	/// following structure:
	///
	/// \|-----------\| . . . \|-----------\|
	/// ^ ^ ^ ^
	/// Min HoleMin HoleMax Max
	///
	/// ... where there is only a hole if a signed type is promoted to unsigned
	/// (in which case Min and Max are the smallest and largest representable
	/// values).
	///
	/// Only two bounds are stored. For a contiguous range they are Min and Max;
	/// for a range with a hole they are HoleMax and HoleMin respectively, which
	/// is why isContiguous() can be decided by comparing the two.
	struct PromotedRange {
	// Min, or HoleMax if there is a hole.
	llvm::APSInt PromotedMin;
	// Max, or HoleMin if there is a hole.
	llvm::APSInt PromotedMax;

	/// Builds the promoted range for a value whose unpromoted range is \p R,
	/// after promotion to an integer of \p BitWidth bits whose signedness is
	/// given by \p Unsigned.
	PromotedRange(IntRange R, unsigned BitWidth, bool Unsigned) {
	// A zero-width range: both bounds are the same freshly constructed APSInt
	// of the promoted width and signedness.
	if (R.Width == 0)
	PromotedMin = PromotedMax = llvm::APSInt(BitWidth, Unsigned);
	else if (R.Width >= BitWidth && !Unsigned) {
	// Promotion made the type narrower. This happens when promoting
	// a < 32-bit unsigned / <= 32-bit signed bit-field to 'signed int'.
	// Treat all values of 'signed int' as being in range for now.
	PromotedMin = llvm::APSInt::getMinValue(BitWidth, Unsigned);
	PromotedMax = llvm::APSInt::getMaxValue(BitWidth, Unsigned);
	} else {
	// Take the extreme values of the source range, resize them to the
	// promoted width, and then relabel them with the promoted signedness.
	PromotedMin = llvm::APSInt::getMinValue(R.Width, R.NonNegative)
	.extOrTrunc(BitWidth);
	PromotedMin.setIsUnsigned(Unsigned);

	PromotedMax = llvm::APSInt::getMaxValue(R.Width, R.NonNegative)
	.extOrTrunc(BitWidth);
	PromotedMax.setIsUnsigned(Unsigned);
	}
	}

	// Determine whether this range is contiguous (has no hole).
	bool isContiguous() const { return PromotedMin <= PromotedMax; }

	// Where a constant value is within the range.
	//
	// The first six enumerators are individual flag bits naming the comparison
	// operators; InRangeFlag marks that the constant lies inside the range. The
	// remaining enumerators are the combinations that compare() returns, and
	// constantValue() tests them bit by bit.
	enum ComparisonResult {
	LT = 0x1,
	LE = 0x2,
	GT = 0x4,
	GE = 0x8,
	EQ = 0x10,
	NE = 0x20,
	InRangeFlag = 0x40,

	Less = LE \| LT \| NE,
	Min = LE \| InRangeFlag,
	InRange = InRangeFlag,
	Max = GE \| InRangeFlag,
	Greater = GE \| GT \| NE,

	OnlyValue = LE \| GE \| EQ \| InRangeFlag,
	InHole = NE
	};

	/// Classifies \p Value relative to this range.
	///
	/// \p Value must have the same bit width and signedness as the stored
	/// bounds (asserted).
	ComparisonResult compare(const llvm::APSInt &Value) const {
	assert(Value.getBitWidth() == PromotedMin.getBitWidth() &&
	Value.isUnsigned() == PromotedMin.isUnsigned());
	if (!isContiguous()) {
	// Range with a hole: PromotedMin holds HoleMax and PromotedMax holds
	// HoleMin, so anything >= PromotedMin or <= PromotedMax is in range.
	assert(Value.isUnsigned() && "discontiguous range for signed compare");
	if (Value.isMinValue()) return Min;
	if (Value.isMaxValue()) return Max;
	if (Value >= PromotedMin) return InRange;
	if (Value <= PromotedMax) return InRange;
	return InHole;
	}

	// Contiguous range: compare against the lower bound first, and only look
	// at the upper bound when the value is strictly above the lower bound.
	switch (llvm::APSInt::compareValues(Value, PromotedMin)) {
	case -1: return Less;
	case 0: return PromotedMin == PromotedMax ? OnlyValue : Min;
	case 1:
	switch (llvm::APSInt::compareValues(Value, PromotedMax)) {
	case -1: return InRange;
	case 0: return Max;
	case 1: return Greater;
	}
	}

	llvm_unreachable("impossible compare result");
	}

	/// Maps a comparison opcode and a compare() result to the text of the
	/// value the comparison always produces, or llvm::None when the flags in
	/// \p R do not determine the outcome.
	///
	/// \p ConstantOnRHS says on which side of the operator the constant
	/// appears; it is used to mirror the less-than / greater-than flags.
	static llvm::Optional<StringRef>
	constantValue(BinaryOperatorKind Op, ComparisonResult R, bool ConstantOnRHS) {
	if (Op == BO_Cmp) {
	ComparisonResult LTFlag = LT, GTFlag = GT;
	if (ConstantOnRHS) std::swap(LTFlag, GTFlag);

	if (R & EQ) return StringRef("'std::strong_ordering::equal'");
	if (R & LTFlag) return StringRef("'std::strong_ordering::less'");
	if (R & GTFlag) return StringRef("'std::strong_ordering::greater'");
	return llvm::None;
	}

	// Pick the flag that makes the comparison true and the one that makes it
	// false for this operator.
	ComparisonResult TrueFlag, FalseFlag;
	if (Op == BO_EQ) {
	TrueFlag = EQ;
	FalseFlag = NE;
	} else if (Op == BO_NE) {
	TrueFlag = NE;
	FalseFlag = EQ;
	} else {
	if ((Op == BO_LT \|\| Op == BO_GE) ^ ConstantOnRHS) {
	TrueFlag = LT;
	FalseFlag = GE;
	} else {
	TrueFlag = GT;
	FalseFlag = LE;
	}
	// The flags above were chosen for the strict operator; the non-strict
	// operators are handled by exchanging the two flags.
	if (Op == BO_GE \|\| Op == BO_LE)
	std::swap(TrueFlag, FalseFlag);
	}
	if (R & TrueFlag)
	return StringRef("true");
	if (R & FalseFlag)
	return StringRef("false");
	return llvm::None;
	}
	};
	}

	/// Returns true if \p E has enumeral type once any outer chain of implicit
	/// casts of kind CK_IntegralCast or CK_NoOp has been peeled off.
	static bool HasEnumType(Expr *E) {
	  // Walk inwards through implicit integral / no-op casts only.
	  for (;;) {
	    auto *Cast = dyn_cast<ImplicitCastExpr>(E);
	    if (!Cast)
	      break;

	    const auto Kind = Cast->getCastKind();
	    if (Kind != CK_IntegralCast && Kind != CK_NoOp)
	      break;

	    E = Cast->getSubExpr();
	  }

	  return E->getType()->isEnumeralType();
	}

	/// Classifies \p Constant for diagnostic selection: 1 for a C++ 'true'
	/// literal, 2 for a C++ 'false' literal, 0 for anything else.
	static int classifyConstantValue(Expr *Constant) {
	  // These numeric values are consumed by diag::warn_out_of_range_compare and
	  // diag::warn_tautological_bool_compare, so they must not be renumbered.
	  enum ConstantValueKind { Miscellaneous = 0, LiteralTrue, LiteralFalse };

	  const auto *BoolLit = dyn_cast<CXXBoolLiteralExpr>(Constant);
	  if (!BoolLit)
	    return Miscellaneous;
	  if (BoolLit->getValue())
	    return LiteralTrue;
	  return LiteralFalse;
	}

	static bool CheckTautologicalComparison(Sema &S, BinaryOperator *E,
	Expr Constant, Expr Other,
	const llvm::APSInt &Value,
	bool RhsConstant) {
	if (S.inTemplateInstantiation())
	return false;

	Expr *OriginalOther = Other;

	Constant = Constant->IgnoreParenImpCasts();
	Other = Other->IgnoreParenImpCasts();

	// Suppress warnings on tautological comparisons between values of the same
	// enumeration type. There are only two ways we could warn on this:
	// - If the constant is outside the range of representable values of
	// the enumeration. In such a case, we should warn about the cast
	// to enumeration type, not about the comparison.
	// - If the constant is the maximum / minimum in-range value. For an
	// enumeratin type, such comparisons can be meaningful and useful.
	if (Constant->getType()->isEnumeralType() &&
	S.Context.hasSameUnqualifiedType(Constant->getType(), Other->getType()))
	return false;

	// TODO: Investigate using GetExprRange() to get tighter bounds
	// on the bit ranges.
	QualType OtherT = Other->getType();
	if (const auto *AT = OtherT->getAs<AtomicType>())
	OtherT = AT->getValueType();
	IntRange OtherRange = IntRange::forValueOfType(S.Context, OtherT);

	// Whether we're treating Other as being a bool because of the form of
	// expression despite it having another type (typically 'int' in C).
	bool OtherIsBooleanDespiteType =
	!OtherT->isBooleanType() && Other->isKnownToHaveBooleanValue();
	if (OtherIsBooleanDespiteType)
	OtherRange = IntRange::forBoolType();

	// Determine the promoted range of the other type and see if a comparison of
	// the constant against that range is tautological.
	PromotedRange OtherPromotedRange(OtherRange, Value.getBitWidth(),
	Value.isUnsigned());
	auto Cmp = OtherPromotedRange.compare(Value);
	auto Result = PromotedRange::constantValue(E->getOpcode(), Cmp, RhsConstant);
	if (!Result)
	return false;

	// Suppress the diagnostic for an in-range comparison if the constant comes
	// from a macro or enumerator. We don't want to diagnose
	//
	// some_long_value <= INT_MAX
	//
	// when sizeof(int) == sizeof(long).
	bool InRange = Cmp & PromotedRange::InRangeFlag;
	if (InRange && IsEnumConstOrFromMacro(S, Constant))
	return false;

	// If this is a comparison to an enum constant, include that
	// constant in the diagnostic.
	const EnumConstantDecl *ED = nullptr;
	if (const DeclRefExpr *DR = dyn_cast<DeclRefExpr>(Constant))
	ED = dyn_cast<EnumConstantDecl>(DR->getDecl());

	// Should be enough for uint128 (39 decimal digits)
	SmallString<64> PrettySourceValue;
	llvm::raw_svector_ostream OS(PrettySourceValue);
	if (ED)
	OS << '\'' << *ED << "' (" << Value << ")";
	else
	OS << Value;

	// FIXME: We use a somewhat different formatting for the in-range cases and
	// cases involving boolean values for historical reasons. We should pick a
	// consistent way of presenting these diagnostics.
	if (!InRange \|\| Other->isKnownToHaveBooleanValue()) {
	S.DiagRuntimeBehavior(
	E->getOperatorLoc(), E,
	S.PDiag(!InRange ? diag::warn_out_of_range_compare
	: diag::warn_tautological_bool_compare)
	<< OS.str() << classifyConstantValue(Constant)
	<< OtherT << OtherIsBooleanDespiteType << *Result
	<< E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange());
	} else {
	unsigned Diag = (isKnownToHaveUnsignedValue(OriginalOther) && Value == 0)
	? (HasEnumType(OriginalOther)
	? diag::warn_unsigned_enum_always_true_comparison
	: diag::warn_unsigned_always_true_comparison)
	: diag::warn_tautological_constant_compare;

	S.Diag(E->getOperatorLoc(), Diag)
	<< RhsConstant << OtherT << E->getOpcodeStr() << OS.str() << *Result
	<< E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange();
	}

	return true;
	}

	/// Fallback used by AnalyzeComparison: runs the implicit-conversion analysis
	/// on the left and then the right operand of \p E, using the operator
	/// location as the conversion context.
	static void AnalyzeImpConvsInComparison(Sema &S, BinaryOperator *E) {
	  Expr *const Operands[] = {E->getLHS(), E->getRHS()};
	  for (Expr *Operand : Operands)
	    AnalyzeImplicitConversions(S, Operand, E->getOperatorLoc());
	}

	/// \brief Implements -Wsign-compare.
	///
	/// Also runs the tautological-constant-comparison check when exactly one
	/// operand is an integer constant expression. Every exit path analyzes the
	/// implicit conversions of the operands, either through
	/// AnalyzeImpConvsInComparison or directly on the cast-stripped operands.
	///
	/// \param E the binary operator to check for warnings
	static void AnalyzeComparison(Sema &S, BinaryOperator *E) {
	// The type the comparison is being performed in.
	QualType T = E->getLHS()->getType();

	// Only analyze comparison operators where both sides have been converted to
	// the same type.
	if (!S.Context.hasSameUnqualifiedType(T, E->getRHS()->getType()))
	return AnalyzeImpConvsInComparison(S, E);

	// Don't analyze value-dependent comparisons directly.
	if (E->isValueDependent())
	return AnalyzeImpConvsInComparison(S, E);

	Expr *LHS = E->getLHS();
	Expr *RHS = E->getRHS();

	if (T->isIntegralType(S.Context)) {
	llvm::APSInt RHSValue;
	llvm::APSInt LHSValue;

	bool IsRHSIntegralLiteral = RHS->isIntegerConstantExpr(RHSValue, S.Context);
	bool IsLHSIntegralLiteral = LHS->isIntegerConstantExpr(LHSValue, S.Context);

	// We don't care about expressions whose result is a constant.
	if (IsRHSIntegralLiteral && IsLHSIntegralLiteral)
	return AnalyzeImpConvsInComparison(S, E);

	// We only care about expressions where just one side is literal
	if (IsRHSIntegralLiteral ^ IsLHSIntegralLiteral) {
	// Is the constant on the RHS or LHS?
	const bool RhsConstant = IsRHSIntegralLiteral;
	Expr *Const = RhsConstant ? RHS : LHS;
	Expr *Other = RhsConstant ? LHS : RHS;
	const llvm::APSInt &Value = RhsConstant ? RHSValue : LHSValue;

	// Check whether an integer constant comparison results in a value
	// of 'true' or 'false'.
	if (CheckTautologicalComparison(S, E, Const, Other, Value, RhsConstant))
	return AnalyzeImpConvsInComparison(S, E);
	}
	}

	if (!T->hasUnsignedIntegerRepresentation()) {
	// We don't do anything special if this isn't an unsigned integral
	// comparison: we're only interested in integral comparisons, and
	// signed comparisons only happen in cases we don't care to warn about.
	return AnalyzeImpConvsInComparison(S, E);
	}

	LHS = LHS->IgnoreParenImpCasts();
	RHS = RHS->IgnoreParenImpCasts();

	+ if (!S.getLangOpts().CPlusPlus) {
	+ // Avoid warning about comparison of integers with different signs when
	+ // RHS/LHS has a `typeof(E)` type whose sign is different from the sign of
	+ // the type of `E`.
	+ if (const auto *TET = dyn_cast<TypeOfExprType>(LHS->getType()))
	+ LHS = TET->getUnderlyingExpr()->IgnoreParenImpCasts();
	+ if (const auto *TET = dyn_cast<TypeOfExprType>(RHS->getType()))
	+ RHS = TET->getUnderlyingExpr()->IgnoreParenImpCasts();
	+ }
	+
	// Check to see if one of the (unmodified) operands is of different
	// signedness.
	Expr *signedOperand, *unsignedOperand;
	if (LHS->getType()->hasSignedIntegerRepresentation()) {
	assert(!RHS->getType()->hasSignedIntegerRepresentation() &&
	"unsigned comparison between two signed integer expressions?");
	signedOperand = LHS;
	unsignedOperand = RHS;
	} else if (RHS->getType()->hasSignedIntegerRepresentation()) {
	signedOperand = RHS;
	unsignedOperand = LHS;
	} else {
	// Neither operand is signed, so there is no sign mismatch to report.
	return AnalyzeImpConvsInComparison(S, E);
	}

	// Otherwise, calculate the effective range of the signed operand.
	IntRange signedRange = GetExprRange(S.Context, signedOperand);

	// Go ahead and analyze implicit conversions in the operands. Note
	// that we skip the implicit conversions on both sides.
	AnalyzeImplicitConversions(S, LHS, E->getOperatorLoc());
	AnalyzeImplicitConversions(S, RHS, E->getOperatorLoc());

	// If the signed range is non-negative, -Wsign-compare won't fire.
	if (signedRange.NonNegative)
	return;

	// For (in)equality comparisons, if the unsigned operand is a
	// constant which cannot collide with a overflowed signed operand,
	// then reinterpreting the signed operand as unsigned will not
	// change the result of the comparison.
	if (E->isEqualityOp()) {
	unsigned comparisonWidth = S.Context.getIntWidth(T);
	IntRange unsignedRange = GetExprRange(S.Context, unsignedOperand);

	// We should never be unable to prove that the unsigned operand is
	// non-negative.
	assert(unsignedRange.NonNegative && "unsigned range includes negative?");

	if (unsignedRange.Width < comparisonWidth)
	return;
	}

	S.DiagRuntimeBehavior(E->getOperatorLoc(), E,
	S.PDiag(diag::warn_mixed_sign_comparison)
	<< LHS->getType() << RHS->getType()
	<< LHS->getSourceRange() << RHS->getSourceRange());
	}

	/// Analyzes an attempt to assign the given value to a bitfield.
	///
	/// Returns true if there was something fishy about the attempt.
	static bool AnalyzeBitFieldAssignment(Sema &S, FieldDecl Bitfield, Expr Init,
	SourceLocation InitLoc) {
	assert(Bitfield->isBitField());
	if (Bitfield->isInvalidDecl())
	return false;

	// White-list bool bitfields.
	QualType BitfieldType = Bitfield->getType();
	if (BitfieldType->isBooleanType())
	return false;

	if (BitfieldType->isEnumeralType()) {
	EnumDecl *BitfieldEnumDecl = BitfieldType->getAs<EnumType>()->getDecl();
	// If the underlying enum type was not explicitly specified as an unsigned
	// type and the enum contain only positive values, MSVC++ will cause an
	// inconsistency by storing this as a signed type.
	if (S.getLangOpts().CPlusPlus11 &&
	!BitfieldEnumDecl->getIntegerTypeSourceInfo() &&
	BitfieldEnumDecl->getNumPositiveBits() > 0 &&
	BitfieldEnumDecl->getNumNegativeBits() == 0) {
	S.Diag(InitLoc, diag::warn_no_underlying_type_specified_for_enum_bitfield)
	<< BitfieldEnumDecl->getNameAsString();
	}
	}

	if (Bitfield->getType()->isBooleanType())
	return false;

	// Ignore value- or type-dependent expressions.
	if (Bitfield->getBitWidth()->isValueDependent() \|\|
	Bitfield->getBitWidth()->isTypeDependent() \|\|
	Init->isValueDependent() \|\|
	Init->isTypeDependent())
	return false;

	Expr *OriginalInit = Init->IgnoreParenImpCasts();
	unsigned FieldWidth = Bitfield->getBitWidthValue(S.Context);

	llvm::APSInt Value;
	if (!OriginalInit->EvaluateAsInt(Value, S.Context,
	Expr::SE_AllowSideEffects)) {
	// The RHS is not constant. If the RHS has an enum type, make sure the
	// bitfield is wide enough to hold all the values of the enum without
	// truncation.
	if (const auto *EnumTy = OriginalInit->getType()->getAs<EnumType>()) {
	EnumDecl *ED = EnumTy->getDecl();
	bool SignedBitfield = BitfieldType->isSignedIntegerType();

	// Enum types are implicitly signed on Windows, so check if there are any
	// negative enumerators to see if the enum was intended to be signed or
	// not.
	bool SignedEnum = ED->getNumNegativeBits() > 0;

	// Check for surprising sign changes when assigning enum values to a
	// bitfield of different signedness. If the bitfield is signed and we
	// have exactly the right number of bits to store this unsigned enum,
	// suggest changing the enum to an unsigned type. This typically happens
	// on Windows where unfixed enums always use an underlying type of 'int'.
	unsigned DiagID = 0;
	if (SignedEnum && !SignedBitfield) {
	DiagID = diag::warn_unsigned_bitfield_assigned_signed_enum;
	} else if (SignedBitfield && !SignedEnum &&
	ED->getNumPositiveBits() == FieldWidth) {
	DiagID = diag::warn_signed_bitfield_enum_conversion;
	}

	if (DiagID) {
	S.Diag(InitLoc, DiagID) << Bitfield << ED;
	TypeSourceInfo *TSI = Bitfield->getTypeSourceInfo();
	SourceRange TypeRange =
	TSI ? TSI->getTypeLoc().getSourceRange() : SourceRange();
	S.Diag(Bitfield->getTypeSpecStartLoc(), diag::note_change_bitfield_sign)
	<< SignedEnum << TypeRange;
	}

	// Compute the required bitwidth. If the enum has negative values, we need
	// one more bit than the normal number of positive bits to represent the
	// sign bit.
	unsigned BitsNeeded = SignedEnum ? std::max(ED->getNumPositiveBits() + 1,
	ED->getNumNegativeBits())
	: ED->getNumPositiveBits();

	// Check the bitwidth.
	if (BitsNeeded > FieldWidth) {
	Expr *WidthExpr = Bitfield->getBitWidth();
	S.Diag(InitLoc, diag::warn_bitfield_too_small_for_enum)
	<< Bitfield << ED;
	S.Diag(WidthExpr->getExprLoc(), diag::note_widen_bitfield)
	<< BitsNeeded << ED << WidthExpr->getSourceRange();
	}
	}

	return false;
	}

	unsigned OriginalWidth = Value.getBitWidth();

	if (!Value.isSigned() \|\| Value.isNegative())
	if (UnaryOperator *UO = dyn_cast<UnaryOperator>(OriginalInit))
	if (UO->getOpcode() == UO_Minus \|\| UO->getOpcode() == UO_Not)
	OriginalWidth = Value.getMinSignedBits();

	if (OriginalWidth <= FieldWidth)
	return false;

	// Compute the value which the bitfield will contain.
	llvm::APSInt TruncatedValue = Value.trunc(FieldWidth);
	TruncatedValue.setIsSigned(BitfieldType->isSignedIntegerType());

	// Check whether the stored value is equal to the original value.
	TruncatedValue = TruncatedValue.extend(OriginalWidth);
	if (llvm::APSInt::isSameValue(Value, TruncatedValue))
	return false;

	// Special-case bitfields of width 1: booleans are naturally 0/1, and
	// therefore don't strictly fit into a signed bitfield of width 1.
	if (FieldWidth == 1 && Value == 1)
	return false;

	std::string PrettyValue = Value.toString(10);
	std::string PrettyTrunc = TruncatedValue.toString(10);

	S.Diag(InitLoc, diag::warn_impcast_bitfield_precision_constant)
	<< PrettyValue << PrettyTrunc << OriginalInit->getType()
	<< Init->getSourceRange();

	return true;
	}

	/// Checks a simple or compound assignment for warning-worthy implicit
	/// conversions.
	///
	/// The left-hand side is always analyzed as-is. The right-hand side is
	/// analyzed as-is too, except when the target is a bit-field and
	/// AnalyzeBitFieldAssignment reports a problem: in that case the implicit
	/// conversions on the right-hand side are stripped before recursing.
	static void AnalyzeAssignment(Sema &S, BinaryOperator *E) {
	  Expr *Target = E->getLHS();
	  AnalyzeImplicitConversions(S, Target, E->getOperatorLoc());

	  Expr *Source = E->getRHS();
	  FieldDecl *Bitfield = Target->getSourceBitField();
	  if (Bitfield &&
	      AnalyzeBitFieldAssignment(S, Bitfield, Source, E->getOperatorLoc())) {
	    // Already diagnosed: recurse past the implicit conversions on the RHS.
	    Source = Source->IgnoreParenImpCasts();
	  }

	  AnalyzeImplicitConversions(S, Source, E->getOperatorLoc());
	}

	/// Helper for CheckImplicitConversion: emits diagnostic \p diag at the
	/// location of \p E, streaming the source type, the target type \p T, the
	/// source range of \p E and a range built from \p CContext.
	///
	/// When \p pruneControlFlow is set the diagnostic is routed through
	/// Sema::DiagRuntimeBehavior instead of being emitted directly.
	static void DiagnoseImpCast(Sema &S, Expr *E, QualType SourceType, QualType T,
	                            SourceLocation CContext, unsigned diag,
	                            bool pruneControlFlow = false) {
	  if (!pruneControlFlow) {
	    S.Diag(E->getExprLoc(), diag)
	        << SourceType << T << E->getSourceRange() << SourceRange(CContext);
	    return;
	  }

	  S.DiagRuntimeBehavior(E->getExprLoc(), E,
	                        S.PDiag(diag) << SourceType << T
	                                      << E->getSourceRange()
	                                      << SourceRange(CContext));
	}

	/// Helper for CheckImplicitConversion: same as the overload taking an
	/// explicit source type, with the source type taken from \p E itself.
	static void DiagnoseImpCast(Sema &S, Expr *E, QualType T,
	                            SourceLocation CContext,
	                            unsigned diag, bool pruneControlFlow = false) {
	  const QualType SourceType = E->getType();
	  DiagnoseImpCast(S, E, SourceType, T, CContext, diag, pruneControlFlow);
	}


	/// Diagnose an implicit cast from a floating point value to an integer value.
	///
	/// \param E the floating-point expression being converted
	/// \param T the integer (or bool) target type
	/// \param CContext location of the conversion context, added to the
	/// diagnostic as a source range
	///
	/// If \p E cannot be evaluated as a constant, the generic
	/// warn_impcast_float_integer diagnostic is used. Otherwise a more specific
	/// diagnostic that prints both the source and the converted value is chosen
	/// where one applies.
	static void DiagnoseFloatingImpCast(Sema &S, Expr *E, QualType T,
	SourceLocation CContext) {
	const bool IsBool = T->isSpecificBuiltinType(BuiltinType::Bool);
	// Inside a template instantiation, diagnostics go through
	// DiagRuntimeBehavior instead of being emitted directly.
	const bool PruneWarnings = S.inTemplateInstantiation();

	Expr *InnerE = E->IgnoreParenImpCasts();
	// We also want to warn on, e.g., "int i = -1.234"
	if (UnaryOperator *UOp = dyn_cast<UnaryOperator>(InnerE))
	if (UOp->getOpcode() == UO_Minus \|\| UOp->getOpcode() == UO_Plus)
	InnerE = UOp->getSubExpr()->IgnoreParenImpCasts();

	const bool IsLiteral =
	isa<FloatingLiteral>(E) \|\| isa<FloatingLiteral>(InnerE);

	llvm::APFloat Value(0.0);
	bool IsConstant =
	E->EvaluateAsFloat(Value, S.Context, Expr::SE_AllowSideEffects);
	if (!IsConstant) {
	return DiagnoseImpCast(S, E, T, CContext,
	diag::warn_impcast_float_integer, PruneWarnings);
	}

	bool isExact = false;

	// Convert the constant to an integer of the target's width and signedness,
	// rounding toward zero.
	llvm::APSInt IntegerValue(S.Context.getIntWidth(T),
	T->hasUnsignedIntegerRepresentation());
	if (Value.convertToInteger(IntegerValue, llvm::APFloat::rmTowardZero,
	&isExact) == llvm::APFloat::opOK &&
	isExact) {
	// The conversion is exact: stay silent for literals, otherwise fall back
	// to the generic diagnostic.
	if (IsLiteral) return;
	return DiagnoseImpCast(S, E, T, CContext, diag::warn_impcast_float_integer,
	PruneWarnings);
	}

	// The conversion is inexact (or not opOK): pick the specific diagnostic.
	unsigned DiagID = 0;
	if (IsLiteral) {
	// Warn on floating point literal to integer.
	DiagID = diag::warn_impcast_literal_float_to_integer;
	} else if (IntegerValue == 0) {
	if (Value.isZero()) { // Skip -0.0 to 0 conversion.
	return DiagnoseImpCast(S, E, T, CContext,
	diag::warn_impcast_float_integer, PruneWarnings);
	}
	// Warn on non-zero to zero conversion.
	DiagID = diag::warn_impcast_float_to_integer_zero;
	} else {
	// Only keep the specific diagnostic when the converted value sits at an
	// extreme of the target type; otherwise use the generic one.
	if (IntegerValue.isUnsigned()) {
	if (!IntegerValue.isMaxValue()) {
	return DiagnoseImpCast(S, E, T, CContext,
	diag::warn_impcast_float_integer, PruneWarnings);
	}
	} else { // IntegerValue.isSigned()
	if (!IntegerValue.isMaxSignedValue() &&
	!IntegerValue.isMinSignedValue()) {
	return DiagnoseImpCast(S, E, T, CContext,
	diag::warn_impcast_float_integer, PruneWarnings);
	}
	}
	// Warn on evaluatable floating point expression to integer conversion.
	DiagID = diag::warn_impcast_float_to_integer;
	}

	// FIXME: Force the precision of the source value down so we don't print
	// digits which are usually useless (we don't really care here if we
	// truncate a digit by accident in edge cases). Ideally, APFloat::toString
	// would automatically print the shortest representation, but it's a bit
	// tricky to implement.
	SmallString<16> PrettySourceValue;
	unsigned precision = llvm::APFloat::semanticsPrecision(Value.getSemantics());
	// Scale the semantics' precision by 59/196, rounding up, to get the number
	// of digits requested from toString.
	precision = (precision * 59 + 195) / 196;
	Value.toString(PrettySourceValue, precision);

	// For a bool target print "true"/"false" rather than the integer value.
	SmallString<16> PrettyTargetValue;
	if (IsBool)
	PrettyTargetValue = Value.isZero() ? "false" : "true";
	else
	IntegerValue.toString(PrettyTargetValue);

	if (PruneWarnings) {
	S.DiagRuntimeBehavior(E->getExprLoc(), E,
	S.PDiag(DiagID)
	<< E->getType() << T.getUnqualifiedType()
	<< PrettySourceValue << PrettyTargetValue
	<< E->getSourceRange() << SourceRange(CContext));
	} else {
	S.Diag(E->getExprLoc(), DiagID)
	<< E->getType() << T.getUnqualifiedType() << PrettySourceValue
	<< PrettyTargetValue << E->getSourceRange() << SourceRange(CContext);
	}
	}

	/// Renders \p Value in decimal as it appears once reinterpreted with the
	/// signedness of \p Range and truncated to \p Range's width. A zero-width
	/// range always prints as "0".
	static std::string PrettyPrintInRange(const llvm::APSInt &Value,
	                                      IntRange Range) {
	  if (Range.Width == 0)
	    return "0";

	  llvm::APSInt Narrowed(Value);
	  Narrowed.setIsUnsigned(Range.NonNegative);
	  Narrowed = Narrowed.trunc(Range.Width);
	  return Narrowed.toString(10);
	}

	/// Returns true if \p Ex is an implicit cast between bool and a builtin
	/// floating-point type.
	///
	/// With \p ToBool set, the cast must go from floating point to bool;
	/// otherwise from bool to floating point. The source type is that of \p Ex
	/// with parentheses and implicit casts stripped. Dependent target types
	/// never match.
	static bool IsImplicitBoolFloatConversion(Sema &S, Expr *Ex, bool ToBool) {
	  if (!isa<ImplicitCastExpr>(Ex))
	    return false;

	  const Type *CastTo = S.Context.getCanonicalType(Ex->getType()).getTypePtr();
	  if (CastTo->isDependentType())
	    return false;

	  const Type *CastFrom =
	      S.Context.getCanonicalType(Ex->IgnoreParenImpCasts()->getType())
	          .getTypePtr();

	  // Decide which end of the cast has to be bool and which floating point.
	  const Type *BoolSide = ToBool ? CastTo : CastFrom;
	  const Type *FloatSide = ToBool ? CastFrom : CastTo;

	  if (!BoolSide->isSpecificBuiltinType(BuiltinType::Bool))
	    return false;

	  const auto *FloatBT = dyn_cast<BuiltinType>(FloatSide);
	  return FloatBT && FloatBT->isFloatingPoint();
	}

	/// Scans the arguments of \p TheCall for an implicit floating-point-to-bool
	/// conversion that sits directly next to an implicit bool-to-floating-point
	/// conversion, and warns on the former: such a pair suggests the two
	/// arguments were passed in swapped order.
	static void CheckImplicitArgumentConversions(Sema &S, CallExpr *TheCall,
	                                             SourceLocation CC) {
	  const unsigned NumArgs = TheCall->getNumArgs();
	  for (unsigned Idx = 0; Idx != NumArgs; ++Idx) {
	    Expr *Arg = TheCall->getArg(Idx);
	    if (!IsImplicitBoolFloatConversion(S, Arg, true))
	      continue;

	    // Inspect both neighbours (where they exist) for the opposite conversion.
	    const bool PrevIsBoolToFloat =
	        Idx > 0 &&
	        IsImplicitBoolFloatConversion(S, TheCall->getArg(Idx - 1), false);
	    const bool NextIsBoolToFloat =
	        Idx + 1 < NumArgs &&
	        IsImplicitBoolFloatConversion(S, TheCall->getArg(Idx + 1), false);
	    if (!PrevIsBoolToFloat && !NextIsBoolToFloat)
	      continue;

	    // Warn on this floating-point to bool conversion.
	    DiagnoseImpCast(S, Arg->IgnoreParenImpCasts(), Arg->getType(), CC,
	                    diag::warn_impcast_floating_point_to_bool);
	  }
	}

	/// Warns when a GNU __null or C++11 nullptr constant \p E is implicitly
	/// converted to a type \p T for which that conversion is not considered
	/// safe, and attaches a fix-it replacing the null constant with a zero
	/// literal for \p T.
	///
	/// \param CC location of the conversion context; the warning is only issued
	/// when it resolves to the same file ID as the null constant.
	static void DiagnoseNullConversion(Sema &S, Expr *E, QualType T,
	SourceLocation CC) {
	// Nothing to do if the warning is disabled at this location.
	if (S.Diags.isIgnored(diag::warn_impcast_null_pointer_to_integer,
	E->getExprLoc()))
	return;

	// Don't warn on functions which have return type nullptr_t.
	if (isa<CallExpr>(E))
	return;

	// Check for NULL (GNUNull) or nullptr (CXX11_nullptr).
	const Expr::NullPointerConstantKind NullKind =
	E->isNullPointerConstant(S.Context, Expr::NPC_ValueDependentIsNotNull);
	if (NullKind != Expr::NPCK_GNUNull && NullKind != Expr::NPCK_CXX11_nullptr)
	return;

	// Return if target type is a safe conversion.
	if (T->isAnyPointerType() \|\| T->isBlockPointerType() \|\|
	T->isMemberPointerType() \|\| !T->isScalarType() \|\| T->isNullPtrType())
	return;

	SourceLocation Loc = E->getSourceRange().getBegin();

	// Venture through the macro stacks to get to the source of macro arguments.
	// The new location is a better location than the complete location that was
	// passed in.
	while (S.SourceMgr.isMacroArgExpansion(Loc))
	Loc = S.SourceMgr.getImmediateMacroCallerLoc(Loc);

	while (S.SourceMgr.isMacroArgExpansion(CC))
	CC = S.SourceMgr.getImmediateMacroCallerLoc(CC);

	// __null is usually wrapped in a macro. Go up a macro if that is the case.
	if (NullKind == Expr::NPCK_GNUNull && Loc.isMacroID()) {
	StringRef MacroName = Lexer::getImmediateMacroNameForDiagnostics(
	Loc, S.SourceMgr, S.getLangOpts());
	if (MacroName == "NULL")
	Loc = S.SourceMgr.getImmediateExpansionRange(Loc).first;
	}

	// Only warn if the null and context location are in the same macro expansion.
	if (S.SourceMgr.getFileID(Loc) != S.SourceMgr.getFileID(CC))
	return;

	// The first streamed argument selects between the nullptr and NULL wording.
	S.Diag(Loc, diag::warn_impcast_null_pointer_to_integer)
	<< (NullKind == Expr::NPCK_CXX11_nullptr) << T << SourceRange(CC)
	<< FixItHint::CreateReplacement(Loc,
	S.getFixItZeroLiteralForType(T, Loc));
	}

	static void checkObjCArrayLiteral(Sema &S, QualType TargetType,
	ObjCArrayLiteral *ArrayLiteral);

	static void
	checkObjCDictionaryLiteral(Sema &S, QualType TargetType,
	ObjCDictionaryLiteral *DictionaryLiteral);

	/// Checks one element of an Objective-C collection literal against the
	/// element type the collection is being converted to.
	///
	/// Warns when the element is an Objective-C object pointer that is not
	/// assignment-compatible with \p TargetElementType, then recurses into
	/// nested array and dictionary literals. \p ElementKind is forwarded to the
	/// diagnostic to say which kind of element is being described.
	static void checkObjCCollectionLiteralElement(Sema &S,
	                                              QualType TargetElementType,
	                                              Expr *Element,
	                                              unsigned ElementKind) {
	  // Look through an implicit bitcast (e.g. to 'id' or qualified 'id') whose
	  // operand is itself an Objective-C object pointer.
	  if (auto *Cast = dyn_cast<ImplicitCastExpr>(Element)) {
	    Expr *Inner = Cast->getSubExpr();
	    if (Cast->getCastKind() == CK_BitCast &&
	        Inner->getType()->getAs<ObjCObjectPointerType>())
	      Element = Inner;
	  }

	  QualType ElementType = Element->getType();
	  if (ElementType->getAs<ObjCObjectPointerType>()) {
	    ExprResult Converted(Element);
	    const bool IsCompatible =
	        S.CheckSingleAssignmentConstraints(TargetElementType, Converted,
	                                           false, false) == Sema::Compatible;
	    if (!IsCompatible) {
	      S.Diag(Element->getLocStart(),
	             diag::warn_objc_collection_literal_element)
	          << ElementType << ElementKind << TargetElementType
	          << Element->getSourceRange();
	    }
	  }

	  // Nested collection literals are checked against the same target type.
	  if (auto *NestedArray = dyn_cast<ObjCArrayLiteral>(Element)) {
	    checkObjCArrayLiteral(S, TargetElementType, NestedArray);
	    return;
	  }
	  if (auto *NestedDict = dyn_cast<ObjCDictionaryLiteral>(Element))
	    checkObjCDictionaryLiteral(S, TargetElementType, NestedDict);
	}

	/// Check an Objective-C array literal being converted to the given
	/// target type.
	///
	/// Nothing is checked unless \p TargetType is an Objective-C object pointer
	/// to a specialization of the declaration recorded in S.NSArrayDecl with
	/// exactly one type argument. In that case every element of the literal is
	/// checked against that type argument.
	///
	/// \param S the semantic analyzer used for lookups and diagnostics
	/// \param TargetType the type the literal is being converted to
	/// \param ArrayLiteral the literal whose elements are checked
	static void checkObjCArrayLiteral(Sema &S, QualType TargetType,
	                                  ObjCArrayLiteral *ArrayLiteral) {
	  if (!S.NSArrayDecl)
	    return;

	  const auto *TargetObjCPtr = TargetType->getAs<ObjCObjectPointerType>();
	  if (!TargetObjCPtr)
	    return;

	  // Unspecialized targets carry no element type to check against, and
	  // anything whose interface is not the NSArray declaration is not our
	  // business.
	  if (TargetObjCPtr->isUnspecialized() ||
	      TargetObjCPtr->getInterfaceDecl()->getCanonicalDecl()
	        != S.NSArrayDecl->getCanonicalDecl())
	    return;

	  auto TypeArgs = TargetObjCPtr->getTypeArgs();
	  if (TypeArgs.size() != 1)
	    return;

	  QualType TargetElementType = TypeArgs[0];
	  for (unsigned I = 0, N = ArrayLiteral->getNumElements(); I != N; ++I) {
	    // ElementKind 0 selects the "array element" wording of the diagnostic.
	    checkObjCCollectionLiteralElement(S, TargetElementType,
	                                      ArrayLiteral->getElement(I),
	                                      0);
	  }
	}

	/// Check an Objective-C dictionary literal being converted to the given
	/// target type.
	///
	/// Nothing is checked unless \p TargetType is an Objective-C object pointer
	/// to a specialization of the declaration recorded in S.NSDictionaryDecl
	/// with exactly two type arguments. In that case each key is checked against
	/// the first type argument and each value against the second.
	///
	/// \param S the semantic analyzer used for lookups and diagnostics
	/// \param TargetType the type the literal is being converted to
	/// \param DictionaryLiteral the literal whose keys and values are checked
	static void
	checkObjCDictionaryLiteral(Sema &S, QualType TargetType,
	                           ObjCDictionaryLiteral *DictionaryLiteral) {
	  if (!S.NSDictionaryDecl)
	    return;

	  const auto *TargetObjCPtr = TargetType->getAs<ObjCObjectPointerType>();
	  if (!TargetObjCPtr)
	    return;

	  // Unspecialized targets carry no key/value types to check against, and
	  // anything whose interface is not the NSDictionary declaration is skipped.
	  if (TargetObjCPtr->isUnspecialized() ||
	      TargetObjCPtr->getInterfaceDecl()->getCanonicalDecl()
	        != S.NSDictionaryDecl->getCanonicalDecl())
	    return;

	  auto TypeArgs = TargetObjCPtr->getTypeArgs();
	  if (TypeArgs.size() != 2)
	    return;

	  QualType TargetKeyType = TypeArgs[0];
	  QualType TargetObjectType = TypeArgs[1];
	  for (unsigned I = 0, N = DictionaryLiteral->getNumElements(); I != N; ++I) {
	    auto Element = DictionaryLiteral->getKeyValueElement(I);
	    // ElementKind 1 = dictionary key, 2 = dictionary value.
	    checkObjCCollectionLiteralElement(S, TargetKeyType, Element.Key, 1);
	    checkObjCCollectionLiteralElement(S, TargetObjectType, Element.Value, 2);
	  }
	}

	// Filter for the same-width constant conversion warning.
	// Returns false (i.e. "do not warn") for integer literals spelled with a
	// leading '0' and for what looks like a char array initialization; returns
	// true in every other case.
	static bool isSameWidthConstantConversion(Sema &S, Expr *E, QualType T,
	                                          SourceLocation CC) {
	  const SourceManager &SM = S.getSourceManager();

	  // A literal whose spelling starts with '0' is binary, octal or
	  // hexadecimal. Such constants may fill all the bits, even if that
	  // changes the sign.
	  auto *Literal = dyn_cast<IntegerLiteral>(E->IgnoreParenImpCasts());
	  if (Literal && SM.getCharacterData(Literal->getLocStart())[0] == '0')
	    return false;

	  // If the context location points at a '{' and the target is a character
	  // type, assume this is an array initialization.
	  const bool LooksLikeCharArrayInit =
	      CC.isValid() && T->isCharType() && SM.getCharacterData(CC)[0] == '{';
	  return !LooksLikeCharArrayInit;
	}

	/// Diagnose a single implicit conversion of expression \p E to type \p T.
	///
	/// The checks run in a fixed order and most of them return as soon as a
	/// diagnostic has been issued: conversions to bool, Objective-C collection
	/// literals, vector and complex stripping, floating-point conversions, null
	/// conversions, integer precision loss, sign changes, and finally
	/// conversions between different enumeration types.
	///
	/// \param S the semantic analyzer used for type queries and diagnostics
	/// \param E the expression being converted
	/// \param T the type \p E is converted to
	/// \param CC the "context" location of the conversion; nothing is diagnosed
	/// when it is invalid, and most checks are suppressed when it lies in a
	/// system macro
	/// \param ICContext optional out-flag. When non-null and a sign-conversion
	/// is found, the diagnostic used is warn_impcast_integer_sign_conditional
	/// and \c *ICContext is set to true.
	static void
	CheckImplicitConversion(Sema &S, Expr *E, QualType T, SourceLocation CC,
	                        bool *ICContext = nullptr) {
	  if (E->isTypeDependent() || E->isValueDependent()) return;

	  const Type *Source = S.Context.getCanonicalType(E->getType()).getTypePtr();
	  const Type *Target = S.Context.getCanonicalType(T).getTypePtr();
	  if (Source == Target) return;
	  if (Target->isDependentType()) return;

	  // If the conversion context location is invalid don't complain. We also
	  // don't want to emit a warning if the issue occurs from the expansion of
	  // a system macro. The problem is that 'getSpellingLoc()' is slow, so we
	  // delay this check as long as possible. Once we detect we are in that
	  // scenario, we just return.
	  if (CC.isInvalid())
	    return;

	  // Diagnose implicit casts to bool.
	  if (Target->isSpecificBuiltinType(BuiltinType::Bool)) {
	    if (isa<StringLiteral>(E))
	      // Warn on string literal to bool. Checks for string literals in logical
	      // and expressions, for instance, assert(0 && "error here"), are
	      // prevented by a check in AnalyzeImplicitConversions().
	      return DiagnoseImpCast(S, E, T, CC,
	                             diag::warn_impcast_string_literal_to_bool);
	    if (isa<ObjCStringLiteral>(E) || isa<ObjCArrayLiteral>(E) ||
	        isa<ObjCDictionaryLiteral>(E) || isa<ObjCBoxedExpr>(E)) {
	      // This covers the literal expressions that evaluate to Objective-C
	      // objects.
	      return DiagnoseImpCast(S, E, T, CC,
	                             diag::warn_impcast_objective_c_literal_to_bool);
	    }
	    if (Source->isPointerType() || Source->canDecayToPointerType()) {
	      // Warn on pointer to bool conversion that is always true.
	      S.DiagnoseAlwaysNonNullPointer(E, Expr::NPCK_NotNull, /*IsEqual*/ false,
	                                     SourceRange(CC));
	    }
	  }

	  // Check implicit casts from Objective-C collection literals to specialized
	  // collection types, e.g., NSArray<NSString *> *.
	  if (auto *ArrayLiteral = dyn_cast<ObjCArrayLiteral>(E))
	    checkObjCArrayLiteral(S, QualType(Target, 0), ArrayLiteral);
	  else if (auto *DictionaryLiteral = dyn_cast<ObjCDictionaryLiteral>(E))
	    checkObjCDictionaryLiteral(S, QualType(Target, 0), DictionaryLiteral);

	  // Strip vector types.
	  if (isa<VectorType>(Source)) {
	    if (!isa<VectorType>(Target)) {
	      if (S.SourceMgr.isInSystemMacro(CC))
	        return;
	      return DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_vector_scalar);
	    }

	    // If the vector cast is cast between two vectors of the same size, it is
	    // a bitcast, not a conversion.
	    if (S.Context.getTypeSize(Source) == S.Context.getTypeSize(Target))
	      return;

	    Source = cast<VectorType>(Source)->getElementType().getTypePtr();
	    Target = cast<VectorType>(Target)->getElementType().getTypePtr();
	  }
	  // A non-vector source converted to a vector target is checked against the
	  // vector's element type.
	  if (auto VecTy = dyn_cast<VectorType>(Target))
	    Target = VecTy->getElementType().getTypePtr();

	  // Strip complex types.
	  if (isa<ComplexType>(Source)) {
	    if (!isa<ComplexType>(Target)) {
	      if (S.SourceMgr.isInSystemMacro(CC) || Target->isBooleanType())
	        return;

	      // Complex-to-scalar is an error in C++ and only a warning otherwise.
	      return DiagnoseImpCast(S, E, T, CC,
	                             S.getLangOpts().CPlusPlus
	                                 ? diag::err_impcast_complex_scalar
	                                 : diag::warn_impcast_complex_scalar);
	    }

	    Source = cast<ComplexType>(Source)->getElementType().getTypePtr();
	    Target = cast<ComplexType>(Target)->getElementType().getTypePtr();
	  }

	  const BuiltinType *SourceBT = dyn_cast<BuiltinType>(Source);
	  const BuiltinType *TargetBT = dyn_cast<BuiltinType>(Target);

	  // If the source is floating point...
	  if (SourceBT && SourceBT->isFloatingPoint()) {
	    // ...and the target is floating point...
	    if (TargetBT && TargetBT->isFloatingPoint()) {
	      // ...then warn if we're dropping FP rank.

	      // Builtin FP kinds are ordered by increasing FP rank.
	      if (SourceBT->getKind() > TargetBT->getKind()) {
	        // Don't warn about float constants that are precisely
	        // representable in the target type.
	        Expr::EvalResult result;
	        if (E->EvaluateAsRValue(result, S.Context)) {
	          // Value might be a float, a float vector, or a float complex.
	          if (IsSameFloatAfterCast(result.Val,
	                   S.Context.getFloatTypeSemantics(QualType(TargetBT, 0)),
	                   S.Context.getFloatTypeSemantics(QualType(SourceBT, 0))))
	            return;
	        }

	        if (S.SourceMgr.isInSystemMacro(CC))
	          return;

	        DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_float_precision);
	      }
	      // ... or possibly if we're increasing rank, too
	      else if (TargetBT->getKind() > SourceBT->getKind()) {
	        if (S.SourceMgr.isInSystemMacro(CC))
	          return;

	        DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_double_promotion);
	      }
	      return;
	    }

	    // If the target is integral, always warn.
	    if (TargetBT && TargetBT->isInteger()) {
	      if (S.SourceMgr.isInSystemMacro(CC))
	        return;

	      DiagnoseFloatingImpCast(S, E, T, CC);
	    }

	    // Detect the case where a call result is converted from floating-point
	    // to bool, and the final argument to the call is converted from bool, to
	    // discover this typo:
	    //
	    //    bool b = fabs(x < 1.0);  // should be "bool b = fabs(x) < 1.0;"
	    //
	    // FIXME: This is an incredibly special case; is there some more general
	    // way to detect this class of misplaced-parentheses bug?
	    if (Target->isBooleanType() && isa<CallExpr>(E)) {
	      // Check last argument of function call to see if it is an
	      // implicit cast from a type matching the type the result
	      // is being cast to.
	      CallExpr *CEx = cast<CallExpr>(E);
	      if (unsigned NumArgs = CEx->getNumArgs()) {
	        Expr *LastA = CEx->getArg(NumArgs - 1);
	        Expr *InnerE = LastA->IgnoreParenImpCasts();
	        if (isa<ImplicitCastExpr>(LastA) &&
	            InnerE->getType()->isBooleanType()) {
	          // Warn on this floating-point to bool conversion
	          DiagnoseImpCast(S, E, T, CC,
	                          diag::warn_impcast_floating_point_to_bool);
	        }
	      }
	    }
	    return;
	  }

	  DiagnoseNullConversion(S, E, T, CC);

	  S.DiscardMisalignedMemberAddress(Target, E);

	  // Everything below concerns integer-to-integer conversions only.
	  if (!Source->isIntegerType() || !Target->isIntegerType())
	    return;

	  // TODO: remove this early return once the false positives for constant->bool
	  // in templates, macros, etc, are reduced or removed.
	  if (Target->isSpecificBuiltinType(BuiltinType::Bool))
	    return;

	  // NOTE: from here on the local 'SourceRange' shadows clang::SourceRange,
	  // which is why the qualified name is used below.
	  IntRange SourceRange = GetExprRange(S.Context, E);
	  IntRange TargetRange = IntRange::forTargetOfCanonicalType(S.Context, Target);

	  if (SourceRange.Width > TargetRange.Width) {
	    // If the source is a constant, use a default-on diagnostic.
	    // TODO: this should happen for bitfield stores, too.
	    llvm::APSInt Value(32);
	    if (E->EvaluateAsInt(Value, S.Context, Expr::SE_AllowSideEffects)) {
	      if (S.SourceMgr.isInSystemMacro(CC))
	        return;

	      std::string PrettySourceValue = Value.toString(10);
	      std::string PrettyTargetValue = PrettyPrintInRange(Value, TargetRange);

	      S.DiagRuntimeBehavior(E->getExprLoc(), E,
	        S.PDiag(diag::warn_impcast_integer_precision_constant)
	            << PrettySourceValue << PrettyTargetValue
	            << E->getType() << T << E->getSourceRange()
	            << clang::SourceRange(CC));
	      return;
	    }

	    // People want to build with -Wshorten-64-to-32 and not -Wconversion.
	    if (S.SourceMgr.isInSystemMacro(CC))
	      return;

	    if (TargetRange.Width == 32 && S.Context.getIntWidth(E->getType()) == 64)
	      return DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_integer_64_32,
	                             /* pruneControlFlow */ true);
	    return DiagnoseImpCast(S, E, T, CC, diag::warn_impcast_integer_precision);
	  }

	  if (TargetRange.Width == SourceRange.Width && !TargetRange.NonNegative &&
	      SourceRange.NonNegative && Source->isSignedIntegerType()) {
	    // Signed-to-signed conversion: warn if the non-negative source value
	    // needs exactly as many bits as the target type has, because the value
	    // stored will then be negative.

	    llvm::APSInt Value;
	    if (E->EvaluateAsInt(Value, S.Context, Expr::SE_AllowSideEffects) &&
	        !S.SourceMgr.isInSystemMacro(CC)) {
	      if (isSameWidthConstantConversion(S, E, T, CC)) {
	        std::string PrettySourceValue = Value.toString(10);
	        std::string PrettyTargetValue = PrettyPrintInRange(Value, TargetRange);

	        S.DiagRuntimeBehavior(
	            E->getExprLoc(), E,
	            S.PDiag(diag::warn_impcast_integer_precision_constant)
	                << PrettySourceValue << PrettyTargetValue << E->getType() << T
	                << E->getSourceRange() << clang::SourceRange(CC));
	        return;
	      }
	    }

	    // Fall through for non-constants to give a sign conversion warning.
	  }

	  if ((TargetRange.NonNegative && !SourceRange.NonNegative) ||
	      (!TargetRange.NonNegative && SourceRange.NonNegative &&
	       SourceRange.Width == TargetRange.Width)) {
	    if (S.SourceMgr.isInSystemMacro(CC))
	      return;

	    unsigned DiagID = diag::warn_impcast_integer_sign;

	    // Traditionally, gcc has warned about this under -Wsign-compare.
	    // We also want to warn about it in -Wconversion.
	    // So if -Wconversion is off, use a completely identical diagnostic
	    // in the sign-compare group.
	    // The caller that supplied ICContext learns through the flag that a
	    // sign conversion was found here.
	    if (ICContext) {
	      DiagID = diag::warn_impcast_integer_sign_conditional;
	      *ICContext = true;
	    }

	    return DiagnoseImpCast(S, E, T, CC, DiagID);
	  }

	  // Diagnose conversions between different enumeration types.
	  // In C, we pretend that the type of an EnumConstantDecl is its enumeration
	  // type, to give us better diagnostics.
	  QualType SourceType = E->getType();
	  if (!S.getLangOpts().CPlusPlus) {
	    if (DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E))
	      if (EnumConstantDecl *ECD = dyn_cast<EnumConstantDecl>(DRE->getDecl())) {
	        EnumDecl *Enum = cast<EnumDecl>(ECD->getDeclContext());
	        SourceType = S.Context.getTypeDeclType(Enum);
	        Source = S.Context.getCanonicalType(SourceType).getTypePtr();
	      }
	  }

	  if (const EnumType *SourceEnum = Source->getAs<EnumType>())
	    if (const EnumType *TargetEnum = Target->getAs<EnumType>())
	      if (SourceEnum->getDecl()->hasNameForLinkage() &&
	          TargetEnum->getDecl()->hasNameForLinkage() &&
	          SourceEnum != TargetEnum) {
	        if (S.SourceMgr.isInSystemMacro(CC))
	          return;

	        return DiagnoseImpCast(S, E, SourceType, T, CC,
	                               diag::warn_impcast_different_enum_types);
	      }
	}

	static void CheckConditionalOperator(Sema &S, ConditionalOperator *E,
	SourceLocation CC, QualType T);

	/// Check one arm of a conditional operator as if it were converted directly
	/// to \p T.
	///
	/// Parentheses and implicit casts are stripped first. A nested conditional
	/// operator is handed to CheckConditionalOperator; any other expression is
	/// analyzed recursively and, if its type differs from \p T, checked for an
	/// implicit conversion with \p ICContext passed as the out-flag.
	static void CheckConditionalOperand(Sema &S, Expr *E, QualType T,
	                                    SourceLocation CC, bool &ICContext) {
	  Expr *Operand = E->IgnoreParenImpCasts();

	  if (auto *Nested = dyn_cast<ConditionalOperator>(Operand)) {
	    CheckConditionalOperator(S, Nested, CC, T);
	    return;
	  }

	  AnalyzeImplicitConversions(S, Operand, CC);
	  if (Operand->getType() == T)
	    return;

	  CheckImplicitConversion(S, Operand, T, CC, &ICContext);
	}

	/// Check a conditional operator whose result is converted to \p T.
	///
	/// The condition is analyzed on its own, using the '?' location as context.
	/// Both arms are then checked against \p T. If either arm was flagged for a
	/// sign conversion while the conditional-specific warning is ignored at
	/// \p CC, the arms are re-checked against the conditional's own type.
	static void CheckConditionalOperator(Sema &S, ConditionalOperator *E,
	                                     SourceLocation CC, QualType T) {
	  AnalyzeImplicitConversions(S, E->getCond(), E->getQuestionLoc());

	  Expr *TrueArm = E->getTrueExpr();
	  Expr *FalseArm = E->getFalseExpr();

	  bool FlaggedForContext = false;
	  CheckConditionalOperand(S, TrueArm, T, CC, FlaggedForContext);
	  CheckConditionalOperand(S, FalseArm, T, CC, FlaggedForContext);

	  // Continue only if -Wconversion would have warned about a signedness
	  // conversion to the context type, that warning is currently ignored, and
	  // the conditional's own type differs from the context type.
	  if (!FlaggedForContext ||
	      !S.Diags.isIgnored(diag::warn_impcast_integer_sign_conditional, CC) ||
	      E->getType() == T)
	    return;

	  // Check whether either arm would have warned about a signedness
	  // conversion to the condition type; stop after the first arm that does.
	  QualType CondType = E->getType();
	  bool FlaggedForCondType = false;
	  CheckImplicitConversion(S, TrueArm->IgnoreParenImpCasts(), CondType, CC,
	                          &FlaggedForCondType);
	  if (FlaggedForCondType)
	    return;
	  CheckImplicitConversion(S, FalseArm->IgnoreParenImpCasts(), CondType, CC,
	                          &FlaggedForCondType);
	}

	/// CheckBoolLikeConversion - Check the conversion of a logical expression
	/// \p E to boolean, treating it as an implicit conversion to
	/// S.Context.BoolTy at context location \p CC. Nothing is checked when the
	/// 'Bool' language option is enabled.
	static void CheckBoolLikeConversion(Sema &S, Expr *E, SourceLocation CC) {
	  if (!S.getLangOpts().Bool)
	    CheckImplicitConversion(S, E->IgnoreParenImpCasts(), S.Context.BoolTy, CC);
	}

	/// AnalyzeImplicitConversions - Find and report any interesting
	/// implicit conversions in the given expression. There are a couple
	/// of competing diagnostics here, -Wconversion and -Wsign-compare.
	///
	/// \param S the semantic analyzer used for diagnostics
	/// \param OrigE the expression to analyze; its own type is the conversion
	/// target for the expression found beneath its parentheses and implicit
	/// casts
	/// \param CC context location for conversions at this level; children are
	/// analyzed with the location of the stripped expression instead
	static void AnalyzeImplicitConversions(Sema &S, Expr *OrigE,
	                                       SourceLocation CC) {
	  QualType T = OrigE->getType();
	  Expr *E = OrigE->IgnoreParenImpCasts();

	  if (E->isTypeDependent() || E->isValueDependent())
	    return;

	  // For conditional operators, we analyze the arguments as if they
	  // were being fed directly into the output.
	  if (isa<ConditionalOperator>(E)) {
	    ConditionalOperator *CO = cast<ConditionalOperator>(E);
	    CheckConditionalOperator(S, CO, CC, T);
	    return;
	  }

	  // Check implicit argument conversions for function calls.
	  if (CallExpr *Call = dyn_cast<CallExpr>(E))
	    CheckImplicitArgumentConversions(S, Call, CC);

	  // Go ahead and check any implicit conversions we might have skipped.
	  // The non-canonical typecheck is just an optimization;
	  // CheckImplicitConversion will filter out dead implicit conversions.
	  if (E->getType() != T)
	    CheckImplicitConversion(S, E, T, CC);

	  // Now continue drilling into this expression.

	  if (PseudoObjectExpr *POE = dyn_cast<PseudoObjectExpr>(E)) {
	    // The bound subexpressions in a PseudoObjectExpr are not reachable
	    // as transitive children.
	    // FIXME: Use a more uniform representation for this.
	    for (auto *SE : POE->semantics())
	      if (auto *OVE = dyn_cast<OpaqueValueExpr>(SE))
	        AnalyzeImplicitConversions(S, OVE->getSourceExpr(), CC);
	  }

	  // Skip past explicit casts.
	  if (isa<ExplicitCastExpr>(E)) {
	    E = cast<ExplicitCastExpr>(E)->getSubExpr()->IgnoreParenImpCasts();
	    return AnalyzeImplicitConversions(S, E, CC);
	  }

	  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(E)) {
	    // Do a somewhat different check with comparison operators.
	    if (BO->isComparisonOp())
	      return AnalyzeComparison(S, BO);

	    // And with simple assignments.
	    if (BO->getOpcode() == BO_Assign)
	      return AnalyzeAssignment(S, BO);
	  }

	  // These break the otherwise-useful invariant below.  Fortunately,
	  // we don't really need to recurse into them, because any internal
	  // expressions should have been analyzed already when they were
	  // built into statements.
	  if (isa<StmtExpr>(E)) return;

	  // Don't descend into unevaluated contexts.
	  if (isa<UnaryExprOrTypeTraitExpr>(E)) return;

	  // Now just recurse over the expression's children.
	  CC = E->getExprLoc();
	  BinaryOperator *BO = dyn_cast<BinaryOperator>(E);
	  bool IsLogicalAndOperator = BO && BO->getOpcode() == BO_LAnd;
	  for (Stmt *SubStmt : E->children()) {
	    Expr *ChildExpr = dyn_cast_or_null<Expr>(SubStmt);
	    if (!ChildExpr)
	      continue;

	    if (IsLogicalAndOperator &&
	        isa<StringLiteral>(ChildExpr->IgnoreParenImpCasts()))
	      // Ignore checking string literals that are in logical and operators.
	      // This is a common pattern for asserts.
	      continue;
	    AnalyzeImplicitConversions(S, ChildExpr, CC);
	  }

	  // Operands of '&&' and '||' are additionally checked as conversions to
	  // bool, except for string literals under '&&' (the assert idiom above).
	  if (BO && BO->isLogicalOp()) {
	    Expr *SubExpr = BO->getLHS()->IgnoreParenImpCasts();
	    if (!IsLogicalAndOperator || !isa<StringLiteral>(SubExpr))
	      ::CheckBoolLikeConversion(S, SubExpr, BO->getExprLoc());

	    SubExpr = BO->getRHS()->IgnoreParenImpCasts();
	    if (!IsLogicalAndOperator || !isa<StringLiteral>(SubExpr))
	      ::CheckBoolLikeConversion(S, SubExpr, BO->getExprLoc());
	  }

	  // The operand of logical negation is checked the same way.
	  if (const UnaryOperator *U = dyn_cast<UnaryOperator>(E))
	    if (U->getOpcode() == UO_LNot)
	      ::CheckBoolLikeConversion(S, U->getSubExpr(), CC);
	}

	/// Diagnose a non-integer argument, and any implicit conversion of an
	/// integer argument to \p IntT.
	///
	/// \returns true if \p E does not have integer type (an error has been
	/// emitted); false otherwise.
	static bool checkOpenCLEnqueueIntType(Sema &S, Expr *E, const QualType &IntT) {
	  // Any integer type is accepted, taking implicit conversions into account.
	  if (E->getType()->isIntegerType()) {
	    // Potentially emit the standard implicit-conversion warnings if they
	    // are enabled using -Wconversion.
	    CheckImplicitConversion(S, E, IntT, E->getLocStart());
	    return false;
	  }

	  S.Diag(E->getLocStart(),
	         diag::err_opencl_enqueue_kernel_invalid_local_size_type);
	  return true;
	}

	// Helper for Sema::DiagnoseAlwaysNonNullPointer.
	// Emits PD and returns true when E (ignoring parentheses and implicit casts)
	// is a declaration reference, a member access or a call whose type is a
	// reference type; returns false without diagnosing otherwise. For a call
	// with a direct callee, a note pointing at that function is added.
	static bool CheckForReference(Sema &SemaRef, const Expr *E,
	                              const PartialDiagnostic &PD) {
	  const Expr *Stripped = E->IgnoreParenImpCasts();

	  const FunctionDecl *Callee = nullptr;
	  bool YieldsReference = false;

	  if (const auto *DRE = dyn_cast<DeclRefExpr>(Stripped)) {
	    YieldsReference = DRE->getDecl()->getType()->isReferenceType();
	  } else if (const auto *Member = dyn_cast<MemberExpr>(Stripped)) {
	    YieldsReference = Member->getMemberDecl()->getType()->isReferenceType();
	  } else if (const auto *Call = dyn_cast<CallExpr>(Stripped)) {
	    YieldsReference =
	        Call->getCallReturnType(SemaRef.Context)->isReferenceType();
	    if (YieldsReference)
	      Callee = Call->getDirectCallee();
	  }

	  if (!YieldsReference)
	    return false;

	  SemaRef.Diag(Stripped->getExprLoc(), PD);

	  // If possible, point to the location of the function.
	  if (Callee)
	    SemaRef.Diag(Callee->getLocation(), diag::note_reference_is_return_value)
	        << Callee;

	  return true;
	}

	// Returns true if the SourceLocation is expanded from any macro body.
	// Returns false if the SourceLocation is invalid, is not in a macro
	// expansion, or is expanded from a top-level macro argument.
	static bool IsInAnyMacroBody(const SourceManager &SM, SourceLocation Loc) {
	  if (Loc.isInvalid())
	    return false;

	  // Climb one macro-caller level per iteration until the location is no
	  // longer a macro ID.
	  for (; Loc.isMacroID(); Loc = SM.getImmediateMacroCallerLoc(Loc))
	    if (SM.isMacroBodyExpansion(Loc))
	      return true;

	  return false;
	}

	/// \brief Diagnose pointers that are always non-null.
	///
	/// Handles, in order: 'this', the address of a reference, calls to
	/// functions marked returns_nonnull, parameters covered by a nonnull
	/// attribute, and finally addresses of declarations, functions and arrays.
	/// Nothing is diagnosed for expressions inside macro bodies or for weak
	/// declarations.
	///
	/// \param E the expression containing the pointer
	/// \param NullKind NPCK_NotNull if E is a cast to bool, otherwise, E is
	/// compared to a null pointer
	/// \param IsEqual True when the comparison is equal to a null pointer
	/// \param Range Extra SourceRange to highlight in the diagnostic
	void Sema::DiagnoseAlwaysNonNullPointer(Expr *E,
	                                        Expr::NullPointerConstantKind NullKind,
	                                        bool IsEqual, SourceRange Range) {
	  if (!E)
	    return;

	  // Don't warn inside macros.
	  if (E->getExprLoc().isMacroID()) {
	    const SourceManager &SM = getSourceManager();
	    if (IsInAnyMacroBody(SM, E->getExprLoc()) ||
	        IsInAnyMacroBody(SM, Range.getBegin()))
	      return;
	  }
	  E = E->IgnoreImpCasts();

	  const bool IsCompare = NullKind != Expr::NPCK_NotNull;

	  if (isa<CXXThisExpr>(E)) {
	    unsigned DiagID = IsCompare ? diag::warn_this_null_compare
	                                : diag::warn_this_bool_conversion;
	    Diag(E->getExprLoc(), DiagID) << E->getSourceRange() << Range << IsEqual;
	    return;
	  }

	  bool IsAddressOf = false;

	  // The only unary operator of interest is address-of; look at its operand.
	  if (UnaryOperator *UO = dyn_cast<UnaryOperator>(E)) {
	    if (UO->getOpcode() != UO_AddrOf)
	      return;
	    IsAddressOf = true;
	    E = UO->getSubExpr();
	  }

	  if (IsAddressOf) {
	    unsigned DiagID = IsCompare
	                          ? diag::warn_address_of_reference_null_compare
	                          : diag::warn_address_of_reference_bool_conversion;
	    PartialDiagnostic PD = PDiag(DiagID) << E->getSourceRange() << Range
	                                         << IsEqual;
	    if (CheckForReference(*this, E, PD)) {
	      return;
	    }
	  }

	  // Shared reporting for nonnull parameters and returns_nonnull calls: the
	  // warning on the expression, then a note at the attribute.
	  auto ComplainAboutNonnullParamOrCall = [&](const Attr *NonnullAttr) {
	    bool IsParam = isa<NonNullAttr>(NonnullAttr);
	    std::string Str;
	    llvm::raw_string_ostream S(Str);
	    E->printPretty(S, nullptr, getPrintingPolicy());
	    unsigned DiagID = IsCompare ? diag::warn_nonnull_expr_compare
	                                : diag::warn_cast_nonnull_to_bool;
	    Diag(E->getExprLoc(), DiagID) << IsParam << S.str()
	      << E->getSourceRange() << Range << IsEqual;
	    Diag(NonnullAttr->getLocation(), diag::note_declared_nonnull) << IsParam;
	  };

	  // If we have a CallExpr that is tagged with returns_nonnull, we can complain.
	  if (auto *Call = dyn_cast<CallExpr>(E->IgnoreParenImpCasts())) {
	    if (auto *Callee = Call->getDirectCallee()) {
	      if (const Attr *A = Callee->getAttr<ReturnsNonNullAttr>()) {
	        ComplainAboutNonnullParamOrCall(A);
	        return;
	      }
	    }
	  }

	  // Expect to find a single Decl.  Skip anything more complicated.
	  ValueDecl *D = nullptr;
	  if (DeclRefExpr *R = dyn_cast<DeclRefExpr>(E)) {
	    D = R->getDecl();
	  } else if (MemberExpr *M = dyn_cast<MemberExpr>(E)) {
	    D = M->getMemberDecl();
	  }

	  // Weak Decls can be null.
	  if (!D || D->isWeak())
	    return;

	  // Check for parameter decl with nonnull attribute
	  if (const auto* PV = dyn_cast<ParmVarDecl>(D)) {
	    if (getCurFunction() &&
	        !getCurFunction()->ModifiedNonNullParams.count(PV)) {
	      // Attribute placed on the parameter itself.
	      if (const Attr *A = PV->getAttr<NonNullAttr>()) {
	        ComplainAboutNonnullParamOrCall(A);
	        return;
	      }

	      // Attribute placed on the enclosing function: it applies either to
	      // all parameters (no arguments) or to the listed parameter indices.
	      if (const auto *FD = dyn_cast<FunctionDecl>(PV->getDeclContext())) {
	        auto ParamIter = llvm::find(FD->parameters(), PV);
	        assert(ParamIter != FD->param_end());
	        unsigned ParamNo = std::distance(FD->param_begin(), ParamIter);

	        for (const auto *NonNull : FD->specific_attrs<NonNullAttr>()) {
	          if (!NonNull->args_size()) {
	              ComplainAboutNonnullParamOrCall(NonNull);
	              return;
	          }

	          for (unsigned ArgNo : NonNull->args()) {
	            if (ArgNo == ParamNo) {
	              ComplainAboutNonnullParamOrCall(NonNull);
	              return;
	            }
	          }
	        }
	      }
	    }
	  }

	  QualType T = D->getType();
	  const bool IsArray = T->isArrayType();
	  const bool IsFunction = T->isFunctionType();

	  // Address of function is used to silence the function warning.
	  if (IsAddressOf && IsFunction) {
	    return;
	  }

	  // Found nothing.
	  if (!IsAddressOf && !IsFunction && !IsArray)
	    return;

	  // Pretty print the expression for the diagnostic.
	  std::string Str;
	  llvm::raw_string_ostream S(Str);
	  E->printPretty(S, nullptr, getPrintingPolicy());

	  unsigned DiagID = IsCompare ? diag::warn_null_pointer_compare
	                              : diag::warn_impcast_pointer_to_bool;
	  enum {
	    AddressOf,
	    FunctionPointer,
	    ArrayPointer
	  } DiagType;
	  if (IsAddressOf)
	    DiagType = AddressOf;
	  else if (IsFunction)
	    DiagType = FunctionPointer;
	  else if (IsArray)
	    DiagType = ArrayPointer;
	  else
	    llvm_unreachable("Could not determine diagnostic.");
	  Diag(E->getExprLoc(), DiagID) << DiagType << S.str() << E->getSourceRange()
	                                << Range << IsEqual;

	  // The fix-it notes below apply to functions only.
	  if (!IsFunction)
	    return;

	  // Suggest '&' to silence the function warning.
	  Diag(E->getExprLoc(), diag::note_function_warning_silence)
	      << FixItHint::CreateInsertion(E->getLocStart(), "&");

	  // Check to see if '()' fixit should be emitted.
	  QualType ReturnType;
	  UnresolvedSet<4> NonTemplateOverloads;
	  tryExprAsCall(*E, ReturnType, NonTemplateOverloads);
	  if (ReturnType.isNull())
	    return;

	  if (IsCompare) {
	    // There are two cases here.  For a comparison against a zero expression
	    // or zero literal, suggest the call if the return type is a pointer or
	    // an integer type.  For any other kind of null constant, suggest it
	    // only for a pointer return type.
	    if (!ReturnType->isPointerType()) {
	      if (NullKind == Expr::NPCK_ZeroExpression ||
	          NullKind == Expr::NPCK_ZeroLiteral) {
	        if (!ReturnType->isIntegerType())
	          return;
	      } else {
	        return;
	      }
	    }
	  } else { // !IsCompare
	    // For function to bool, only suggest if the function pointer has bool
	    // return type.
	    if (!ReturnType->isSpecificBuiltinType(BuiltinType::Bool))
	      return;
	  }
	  Diag(E->getExprLoc(), diag::note_function_to_function_call)
	      << FixItHint::CreateInsertion(getLocForEndOfToken(E->getLocEnd()), "()");
	}

	/// Diagnoses "dangerous" implicit conversions within the given
	/// expression (which is a full expression).  Implements -Wconversion
	/// and -Wsign-compare.
	///
	/// Nothing is diagnosed in unevaluated contexts or for type- or
	/// value-dependent expressions. An array-bounds check of \p E is performed
	/// before the conversion analysis.
	///
	/// \param E the full expression to analyze
	/// \param CC the "context" location of the implicit conversion, i.e.
	///   the location of the syntactic entity requiring the implicit
	///   conversion
	void Sema::CheckImplicitConversions(Expr *E, SourceLocation CC) {
	  // Don't diagnose in unevaluated contexts.
	  if (isUnevaluatedContext())
	    return;

	  // Don't diagnose for value- or type-dependent expressions.
	  if (E->isTypeDependent() || E->isValueDependent())
	    return;

	  // Check for array bounds violations in cases where the check isn't triggered
	  // elsewhere for other Expr types (like BinaryOperators), e.g. when an
	  // ArraySubscriptExpr is on the RHS of a variable initialization.
	  CheckArrayAccess(E);

	  // This is not the right CC for (e.g.) a variable initialization.
	  AnalyzeImplicitConversions(*this, E, CC);
	}

	/// CheckBoolLikeConversion - Member entry point for checking the conversion
	/// of a logical expression \p E to boolean. Forwards \p E and \p CC,
	/// together with this Sema, to the file-local function of the same name.
	void Sema::CheckBoolLikeConversion(Expr *E, SourceLocation CC) {
	  Sema &Self = *this;
	  ::CheckBoolLikeConversion(Self, E, CC);
	}

	/// Diagnose integer overflow when evaluating \p E, looking inside nested
	/// initializer lists.
	///
	/// An expression is evaluated for overflow when, ignoring parentheses and
	/// casts, it is a binary operator, or when it is an Objective-C boxed
	/// expression. Initializer lists contribute their initializers to the work
	/// list instead.
	void Sema::CheckForIntOverflow (Expr *E) {
	  // A work list is used to deal with nested struct initializers.
	  SmallVector<Expr *, 2> Pending(1, E);

	  while (!Pending.empty()) {
	    Expr *Cur = Pending.pop_back_val();
	    Expr *Stripped = Cur->IgnoreParenCasts();

	    if (isa<BinaryOperator>(Stripped)) {
	      Stripped->EvaluateForOverflow(Context);
	      continue;
	    }

	    if (auto *Init = dyn_cast<InitListExpr>(Cur)) {
	      auto Inits = Init->inits();
	      Pending.append(Inits.begin(), Inits.end());
	    } else if (isa<ObjCBoxedExpr>(Cur)) {
	      Stripped->EvaluateForOverflow(Context);
	    }
	  }
	}

	namespace {

	/// \brief Visitor for expressions which looks for unsequenced operations on the
	/// same object.
	class SequenceChecker : public EvaluatedExprVisitor<SequenceChecker> {
	using Base = EvaluatedExprVisitor<SequenceChecker>;

	/// \brief A tree of sequenced regions within an expression. Two regions are
	/// unsequenced if one is an ancestor or a descendent of the other. When we
	/// finish processing an expression with sequencing, such as a comma
	/// expression, we fold its tree nodes into its parent, since they are
	/// unsequenced with respect to nodes we will visit later.
	class SequenceTree {
	struct Value {
	explicit Value(unsigned Parent) : Parent(Parent), Merged(false) {}
	unsigned Parent : 31;
	unsigned Merged : 1;
	};
	SmallVector<Value, 8> Values;

	public:
	/// \brief A region within an expression which may be sequenced with respect
	/// to some other region.
	class Seq {
	friend class SequenceTree;

	unsigned Index = 0;

	explicit Seq(unsigned N) : Index(N) {}

	public:
	Seq() = default;
	};

	SequenceTree() { Values.push_back(Value(0)); }
	Seq root() const { return Seq(0); }

	/// \brief Create a new sequence of operations, which is an unsequenced
	/// subset of \p Parent. This sequence of operations is sequenced with
	/// respect to other children of \p Parent.
	Seq allocate(Seq Parent) {
	Values.push_back(Value(Parent.Index));
	return Seq(Values.size() - 1);
	}

	/// \brief Merge a sequence of operations into its parent.
	void merge(Seq S) {
	Values[S.Index].Merged = true;
	}

	/// \brief Determine whether two operations are unsequenced. This operation
	/// is asymmetric: \p Cur should be the more recent sequence, and \p Old
	/// should have been merged into its parent as appropriate.
	bool isUnsequenced(Seq Cur, Seq Old) {
	unsigned C = representative(Cur.Index);
	unsigned Target = representative(Old.Index);
	while (C >= Target) {
	if (C == Target)
	return true;
	C = Values[C].Parent;
	}
	return false;
	}

	private:
	/// \brief Pick a representative for a sequence.
	unsigned representative(unsigned K) {
	if (Values[K].Merged)
	// Perform path compression as we go.
	return Values[K].Parent = representative(Values[K].Parent);
	return K;
	}
	};

	/// An object for which we can track unsequenced uses.
	using Object = NamedDecl *;

	/// Different flavors of object usage which we track. We only track the
	/// least-sequenced usage of each kind.
	///
	/// The enumerators double as indices into UsageInfo::Uses.
	enum UsageKind {
	/// A read of an object. Multiple unsequenced reads are OK.
	UK_Use,

	/// A modification of an object which is sequenced before the value
	/// computation of the expression, such as ++n in C++.
	UK_ModAsValue,

	/// A modification of an object which is not sequenced before the value
	/// computation of the expression, such as n++.
	UK_ModAsSideEffect,

	/// Number of usage kinds above; used as the size of UsageInfo::Uses.
	UK_Count = UK_ModAsSideEffect + 1
	};

	/// A recorded usage of an object: the expression that performed it and the
	/// sequence region that was current when it was recorded (see addUsage).
	struct Usage {
	/// The expression performing the usage; null while nothing is recorded.
	Expr *Use = nullptr;
	/// The region the usage was recorded in.
	SequenceTree::Seq Seq;

	Usage() = default;
	};

	/// Per-object bookkeeping: one recorded Usage for each UsageKind.
	struct UsageInfo {
	/// Recorded usage for each kind, indexed by UsageKind.
	Usage Uses[UK_Count];

	/// Have we issued a diagnostic for this variable already?
	bool Diagnosed = false;

	UsageInfo() = default;
	};
	using UsageInfoMap = llvm::SmallDenseMap<Object, UsageInfo, 16>;

	Sema &SemaRef;

	/// Sequenced regions within the expression.
	SequenceTree Tree;

	/// Declaration modifications and references which we have seen.
	UsageInfoMap UsageMap;

	/// The region we are currently within.
	SequenceTree::Seq Region;

	/// Filled in with declarations which were modified as a side-effect
	/// (that is, post-increment operations).
	SmallVectorImpl<std::pair<Object, Usage>> *ModAsSideEffect = nullptr;

	/// Expressions to check later. We defer checking these to reduce
	/// stack usage.
	SmallVectorImpl<Expr *> &WorkList;

	/// RAII object wrapping the visitation of a sequenced subexpression of an
	/// expression. At the end of this process, the side-effects of the evaluation
	/// become sequenced with respect to the value computation of the result, so
	/// we downgrade any UK_ModAsSideEffect within the evaluation to
	/// UK_ModAsValue.
	struct SequencedSubexpression {
	SequencedSubexpression(SequenceChecker &Self)
	: Self(Self), OldModAsSideEffect(Self.ModAsSideEffect) {
	Self.ModAsSideEffect = &ModAsSideEffect;
	}

	~SequencedSubexpression() {
	for (auto &M : llvm::reverse(ModAsSideEffect)) {
	UsageInfo &U = Self.UsageMap[M.first];
	auto &SideEffectUsage = U.Uses[UK_ModAsSideEffect];
	Self.addUsage(U, M.first, SideEffectUsage.Use, UK_ModAsValue);
	SideEffectUsage = M.second;
	}
	Self.ModAsSideEffect = OldModAsSideEffect;
	}

	SequenceChecker &Self;
	SmallVector<std::pair<Object, Usage>, 4> ModAsSideEffect;
	SmallVectorImpl<std::pair<Object, Usage>> *OldModAsSideEffect;
	};

	/// RAII object wrapping the visitation of a subexpression which we might
	/// choose to evaluate as a constant. If any subexpression is evaluated and
	/// found to be non-constant, this allows us to suppress the evaluation of
	/// the outer expression.
	///
	/// Trackers form a stack through \c Prev; the checker's \c EvalTracker
	/// member (declared together with this class) points at the innermost one.
	class EvaluationTracker {
	public:
	  /// Push this tracker on top of the checker's current one.
	  EvaluationTracker(SequenceChecker &Self)
	      : Self(Self), Prev(Self.EvalTracker) {
	    Self.EvalTracker = this;
	  }

	  /// Pop this tracker and propagate a failed evaluation to the enclosing
	  /// tracker, if there is one.
	  ~EvaluationTracker() {
	    Self.EvalTracker = Prev;
	    if (Prev)
	      Prev->EvalOK &= EvalOK;
	  }

	  /// Try to evaluate \p E as a boolean condition.
	  ///
	  /// \param E the expression to evaluate
	  /// \param Result receives the value on success
	  /// \returns true if \p E was evaluated; false if an earlier evaluation
	  /// already failed, \p E is value-dependent, or this evaluation failed.
	  bool evaluate(const Expr *E, bool &Result) {
	    if (!EvalOK || E->isValueDependent())
	      return false;
	    EvalOK = E->EvaluateAsBooleanCondition(Result, Self.SemaRef.Context);
	    return EvalOK;
	  }

	private:
	  SequenceChecker &Self;
	  EvaluationTracker *Prev;
	  /// Cleared once an evaluation fails, here or in a nested tracker.
	  bool EvalOK = true;
	} *EvalTracker = nullptr;

	/// \brief Find the object which is produced by the specified expression,
	/// if any.
	Object getObject(Expr *E, bool Mod) const {
	  // \param E   expression to inspect; parentheses and casts are skipped.
	  // \param Mod true when looking for the object being modified, which
	  //            additionally looks through prefix ++/-- and assignments.
	  // \returns the designated declaration, or nullptr if none is recognized.
	  E = E->IgnoreParenCasts();

	  if (UnaryOperator *UO = dyn_cast<UnaryOperator>(E)) {
	    if (Mod && (UO->getOpcode() == UO_PreInc || UO->getOpcode() == UO_PreDec))
	      return getObject(UO->getSubExpr(), Mod);
	    return nullptr;
	  }

	  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(E)) {
	    // A comma expression designates whatever its right operand designates.
	    if (BO->getOpcode() == BO_Comma)
	      return getObject(BO->getRHS(), Mod);
	    if (Mod && BO->isAssignmentOp())
	      return getObject(BO->getLHS(), Mod);
	    return nullptr;
	  }

	  if (MemberExpr *ME = dyn_cast<MemberExpr>(E)) {
	    // FIXME: Check for more interesting cases, like "x.n = ++x.n".
	    if (isa<CXXThisExpr>(ME->getBase()->IgnoreParenCasts()))
	      return ME->getMemberDecl();
	    return nullptr;
	  }

	  if (DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E)) {
	    // FIXME: If this is a reference, map through to its value.
	    return DRE->getDecl();
	  }

	  return nullptr;
	}

	/// \brief Note that an object was modified or used by an expression.
	void addUsage(UsageInfo &UI, Object O, Expr *Ref, UsageKind UK) {
	  // \param UI  usage record of \p O (an entry of UsageMap).
	  // \param Ref expression performing the use or modification.
	  // \param UK  which usage slot of \p UI to update.
	  Usage &U = UI.Uses[UK];

	  // An existing entry that is unsequenced relative to the current region is
	  // kept; otherwise the slot is (re)filled with this usage.
	  if (!U.Use || !Tree.isUnsequenced(Region, U.Seq)) {
	    // Log the value about to be overwritten so that whoever installed the
	    // ModAsSideEffect log can restore it later.
	    if (UK == UK_ModAsSideEffect && ModAsSideEffect)
	      ModAsSideEffect->push_back(std::make_pair(O, U));
	    U.Use = Ref;
	    U.Seq = Region;
	  }
	}

	/// \brief Check whether a modification or use conflicts with a prior usage.
	void checkUsage(Object O, UsageInfo &UI, Expr *Ref, UsageKind OtherKind,
	                bool IsModMod) {
	  // \param O         object being used or modified by \p Ref.
	  // \param UI        usage record of \p O.
	  // \param OtherKind which previously recorded usage to compare against.
	  // \param IsModMod  selects the "multiple unsequenced modifications"
	  //                  diagnostic instead of the modification/use one.

	  // Report at most once per object.
	  if (UI.Diagnosed)
	    return;

	  // Nothing to report unless a prior usage exists and is unsequenced with
	  // respect to the current region.
	  const Usage &U = UI.Uses[OtherKind];
	  if (!U.Use || !Tree.isUnsequenced(Region, U.Seq))
	    return;

	  // The diagnostic is located at the modification. When the recorded usage
	  // is a plain use, the modification is \p Ref, so the roles are swapped.
	  Expr *Mod = U.Use;
	  Expr *ModOrUse = Ref;
	  if (OtherKind == UK_Use)
	    std::swap(Mod, ModOrUse);

	  SemaRef.Diag(Mod->getExprLoc(),
	               IsModMod ? diag::warn_unsequenced_mod_mod
	                        : diag::warn_unsequenced_mod_use)
	      << O << SourceRange(ModOrUse->getExprLoc());
	  UI.Diagnosed = true;
	}

	/// Called before visiting the operands of a use of \p O: a use conflicts
	/// with any unsequenced modification recorded as UK_ModAsValue.
	void notePreUse(Object O, Expr *Use) {
	  UsageInfo &Info = UsageMap[O];
	  checkUsage(O, Info, Use, UK_ModAsValue, /*IsModMod=*/false);
	}

	/// Called after visiting the operands of a use of \p O: checks against
	/// recorded side-effect modifications, then records the use itself.
	void notePostUse(Object O, Expr *Use) {
	  UsageInfo &Info = UsageMap[O];
	  checkUsage(O, Info, Use, UK_ModAsSideEffect, /*IsModMod=*/false);
	  addUsage(Info, O, Use, UK_Use);
	}

	/// Called before visiting the operands of a modification of \p O: a
	/// modification conflicts both with other modifications and with uses.
	void notePreMod(Object O, Expr *Mod) {
	  UsageInfo &Info = UsageMap[O];
	  checkUsage(O, Info, Mod, UK_ModAsValue, /*IsModMod=*/true);
	  checkUsage(O, Info, Mod, UK_Use, /*IsModMod=*/false);
	}

	/// Called after visiting the operands of a modification of \p O: checks
	/// against recorded side-effect modifications, then records this
	/// modification under the usage kind \p UK.
	void notePostMod(Object O, Expr *Use, UsageKind UK) {
	  UsageInfo &Info = UsageMap[O];
	  checkUsage(O, Info, Use, UK_ModAsSideEffect, /*IsModMod=*/true);
	  addUsage(Info, O, Use, UK);
	}

	public:
	/// Builds a checker rooted at the tree's root region and immediately
	/// visits \p E.
	///
	/// \param S        semantic analyzer used for diagnostics and context.
	/// \param E        expression to check.
	/// \param WorkList receives subexpressions whose checking is deferred.
	SequenceChecker(Sema &S, Expr *E, SmallVectorImpl<Expr *> &WorkList)
	    : Base(S.Context), SemaRef(S), Region(Tree.root()), WorkList(WorkList) {
	  Visit(E);
	}

	/// Statements that are not expressions are deliberately ignored for now.
	void VisitStmt(Stmt *S) {}

	/// Default handling for expressions: defer to the base visitor, which
	/// recurses into the evaluated subexpressions.
	void VisitExpr(Expr *E) { Base::VisitStmt(E); }

	/// An lvalue-to-rvalue conversion reads the object designated by its
	/// operand, so it is recorded as a use of that object (if one is found).
	void VisitCastExpr(CastExpr *E) {
	  Object Loaded = E->getCastKind() == CK_LValueToRValue
	                      ? getObject(E->getSubExpr(), false)
	                      : Object();

	  // No tracked object: just recurse.
	  if (!Loaded) {
	    VisitExpr(E);
	    return;
	  }

	  notePreUse(Loaded, E);
	  VisitExpr(E);
	  notePostUse(Loaded, E);
	}

	void VisitBinComma(BinaryOperator *BO) {
	  // C++11 [expr.comma]p1:
	  //   Every value computation and side effect associated with the left
	  //   expression is sequenced before every value computation and side
	  //   effect associated with the right expression.
	  // Each operand therefore gets its own child region of the current one.
	  SequenceTree::Seq LeftRegion = Tree.allocate(Region);
	  SequenceTree::Seq RightRegion = Tree.allocate(Region);
	  SequenceTree::Seq EnclosingRegion = Region;

	  {
	    // The left operand is visited as a sequenced subexpression.
	    SequencedSubexpression LeftScope(*this);
	    Region = LeftRegion;
	    Visit(BO->getLHS());
	  }

	  Region = RightRegion;
	  Visit(BO->getRHS());

	  Region = EnclosingRegion;

	  // From the outside, both operands are unsequenced with respect to other
	  // operations, so forget the ordering between them.
	  Tree.merge(LeftRegion);
	  Tree.merge(RightRegion);
	}

	void VisitBinAssign(BinaryOperator *BO) {
	  // The store happens after the value computations of both operands, so
	  // conflicts are checked before looking at the operands and the usage map
	  // is updated afterwards.
	  Object Target = getObject(BO->getLHS(), true);
	  if (!Target) {
	    VisitExpr(BO);
	    return;
	  }

	  // C++11 [expr.ass]p7:
	  //   E1 op= E2 is equivalent to E1 = E1 op E2, except that E1 is evaluated
	  //   only once.
	  // Hence a compound assignment also counts as a use of the target
	  // everywhere except within the evaluation of E1 itself.
	  const bool IsCompound = isa<CompoundAssignOperator>(BO);

	  notePreMod(Target, BO);
	  if (IsCompound)
	    notePreUse(Target, BO);

	  Visit(BO->getLHS());

	  if (IsCompound)
	    notePostUse(Target, BO);

	  Visit(BO->getRHS());

	  // C++11 [expr.ass]p1:
	  //   the assignment is sequenced [...] before the value computation of the
	  //   assignment expression.
	  // C11 6.5.16/3 has no such rule.
	  if (SemaRef.getLangOpts().CPlusPlus)
	    notePostMod(Target, BO, UK_ModAsValue);
	  else
	    notePostMod(Target, BO, UK_ModAsSideEffect);
	}

	/// Compound assignments share the plain-assignment logic, which itself
	/// detects the compound case.
	void VisitCompoundAssignOperator(CompoundAssignOperator *CAO) { VisitBinAssign(CAO); }

	void VisitUnaryPreInc(UnaryOperator *UO) { VisitUnaryPreIncDec(UO); }
	void VisitUnaryPreDec(UnaryOperator *UO) { VisitUnaryPreIncDec(UO); }

	/// Shared handling of prefix ++ and --: a modification of the operand's
	/// object, if one can be identified.
	void VisitUnaryPreIncDec(UnaryOperator *UO) {
	  Object Target = getObject(UO->getSubExpr(), true);
	  if (!Target) {
	    VisitExpr(UO);
	    return;
	  }

	  notePreMod(Target, UO);
	  Visit(UO->getSubExpr());

	  // C++11 [expr.pre.incr]p1:
	  //   the expression ++x is equivalent to x+=1
	  if (SemaRef.getLangOpts().CPlusPlus)
	    notePostMod(Target, UO, UK_ModAsValue);
	  else
	    notePostMod(Target, UO, UK_ModAsSideEffect);
	}

	void VisitUnaryPostInc(UnaryOperator *UO) { VisitUnaryPostIncDec(UO); }
	void VisitUnaryPostDec(UnaryOperator *UO) { VisitUnaryPostIncDec(UO); }

	/// Shared handling of postfix ++ and --: the modification is always
	/// recorded as a side effect.
	void VisitUnaryPostIncDec(UnaryOperator *UO) {
	  Object Target = getObject(UO->getSubExpr(), true);
	  if (!Target) {
	    VisitExpr(UO);
	    return;
	  }

	  notePreMod(Target, UO);
	  Visit(UO->getSubExpr());
	  notePostMod(Target, UO, UK_ModAsSideEffect);
	}

	/// Don't visit the RHS of '&&' or '||' if it might not be evaluated.
	void VisitBinLOr(BinaryOperator *BO) {
	  // The side effects of the LHS of an '||' are sequenced before the value
	  // computation of the RHS whenever the RHS is evaluated. We treat them as
	  // if they were unconditionally sequenced.
	  EvaluationTracker Eval(*this);
	  {
	    SequencedSubexpression Sequenced(*this);
	    Visit(BO->getLHS());
	  }

	  bool LHSValue;
	  if (!Eval.evaluate(BO->getLHS(), LHSValue)) {
	    // The LHS is not a known constant, so check the RHS later as an
	    // entirely separate evaluation.
	    //
	    // FIXME: If there are operations in the RHS which are unsequenced
	    // with respect to operations outside the RHS, and those operations
	    // are unconditionally evaluated, diagnose them.
	    WorkList.push_back(BO->getRHS());
	    return;
	  }

	  // Known LHS: the RHS is only evaluated when the LHS is false.
	  if (!LHSValue)
	    Visit(BO->getRHS());
	}
	/// Handles '&&': the LHS is visited as a sequenced subexpression; the RHS
	/// is visited in place only when the LHS is known to be true, and is
	/// deferred to the work list when the LHS cannot be evaluated.
	void VisitBinLAnd(BinaryOperator *BO) {
	  EvaluationTracker Eval(*this);
	  {
	    SequencedSubexpression Sequenced(*this);
	    Visit(BO->getLHS());
	  }

	  bool LHSValue;
	  if (!Eval.evaluate(BO->getLHS(), LHSValue)) {
	    WorkList.push_back(BO->getRHS());
	    return;
	  }

	  if (LHSValue)
	    Visit(BO->getRHS());
	}

	// Only visit the condition, unless we can be sure which subexpression will
	// be chosen.
	void VisitAbstractConditionalOperator(AbstractConditionalOperator *CO) {
	  EvaluationTracker Eval(*this);
	  {
	    // The condition is visited as a sequenced subexpression.
	    SequencedSubexpression Sequenced(*this);
	    Visit(CO->getCond());
	  }

	  bool CondValue;
	  if (!Eval.evaluate(CO->getCond(), CondValue)) {
	    // Unknown condition: check both arms later, each on its own.
	    WorkList.push_back(CO->getTrueExpr());
	    WorkList.push_back(CO->getFalseExpr());
	    return;
	  }

	  // Known condition: only the selected arm is evaluated.
	  if (CondValue)
	    Visit(CO->getTrueExpr());
	  else
	    Visit(CO->getFalseExpr());
	}

	void VisitCallExpr(CallExpr *CE) {
	  // C++11 [intro.execution]p15:
	  //   When calling a function [...], every value computation and side effect
	  //   associated with any argument expression, or with the postfix expression
	  //   designating the called function, is sequenced before execution of every
	  //   expression or statement in the body of the function [and thus before
	  //   the value computation of its result].
	  // The whole call is therefore visited as a sequenced subexpression.
	  SequencedSubexpression CallScope(*this);
	  Base::VisitCallExpr(CE);

	  // FIXME: CXXNewExpr and CXXDeleteExpr implicitly call functions.
	}

	void VisitCXXConstructExpr(CXXConstructExpr *CCE) {
	  // A construction is a call: all subexpressions are sequenced before the
	  // result.
	  SequencedSubexpression CallScope(*this);

	  if (!CCE->isListInitialization()) {
	    VisitExpr(CCE);
	    return;
	  }

	  // In C++11, list initializations are sequenced: visit every argument in
	  // a fresh child region of the enclosing one.
	  SequenceTree::Seq Enclosing = Region;
	  SmallVector<SequenceTree::Seq, 32> ArgRegions;
	  CXXConstructExpr::arg_iterator Arg = CCE->arg_begin();
	  CXXConstructExpr::arg_iterator ArgEnd = CCE->arg_end();
	  while (Arg != ArgEnd) {
	    Region = Tree.allocate(Enclosing);
	    ArgRegions.push_back(Region);
	    Visit(*Arg);
	    ++Arg;
	  }

	  // Forget that the initializers are sequenced.
	  Region = Enclosing;
	  for (SequenceTree::Seq ArgRegion : ArgRegions)
	    Tree.merge(ArgRegion);
	}

	void VisitInitListExpr(InitListExpr *ILE) {
	  // Without C++11 semantics the initializers are treated like ordinary
	  // subexpressions.
	  if (!SemaRef.getLangOpts().CPlusPlus11) {
	    VisitExpr(ILE);
	    return;
	  }

	  // In C++11, list initializations are sequenced: each present initializer
	  // is visited in a fresh child region of the enclosing one.
	  SequenceTree::Seq Enclosing = Region;
	  SmallVector<SequenceTree::Seq, 32> InitRegions;
	  for (unsigned Idx = 0; Idx < ILE->getNumInits(); ++Idx) {
	    if (Expr *Init = ILE->getInit(Idx)) {
	      Region = Tree.allocate(Enclosing);
	      InitRegions.push_back(Region);
	      Visit(Init);
	    }
	  }

	  // Forget that the initializers are sequenced.
	  Region = Enclosing;
	  for (SequenceTree::Seq InitRegion : InitRegions)
	    Tree.merge(InitRegion);
	}
	};

	} // namespace

	/// Runs the sequence checker over \p E and over every subexpression the
	/// checker defers onto the work list, until the list is drained.
	void Sema::CheckUnsequencedOperations(Expr *E) {
	  SmallVector<Expr *, 8> Pending;
	  Pending.push_back(E);
	  // The list starts non-empty, so a do/while is equivalent to while here.
	  do {
	    Expr *Next = Pending.pop_back_val();
	    // Constructing the checker performs the visit; it may push more work.
	    SequenceChecker(*this, Next, Pending);
	  } while (!Pending.empty());
	}

	/// Runs the checks performed once an expression is complete: implicit
	/// conversions, unsequenced operations (skipped for instantiation-dependent
	/// expressions), integer overflow (skipped for constexpr or value-dependent
	/// expressions), and misaligned member diagnostics.
	void Sema::CheckCompletedExpr(Expr *E, SourceLocation CheckLoc,
	                              bool IsConstexpr) {
	  CheckImplicitConversions(E, CheckLoc);

	  if (!E->isInstantiationDependent()) {
	    CheckUnsequencedOperations(E);
	  }

	  if (!(IsConstexpr || E->isValueDependent())) {
	    CheckForIntOverflow(E);
	  }

	  DiagnoseMisalignedMembers();
	}

	/// Analyzes the initialization of \p BitField with \p Init as a bit-field
	/// assignment; the analysis result is intentionally discarded.
	void Sema::CheckBitFieldInitialization(SourceLocation InitLoc,
	                                       FieldDecl *BitField,
	                                       Expr *Init) {
	  static_cast<void>(AnalyzeBitFieldAssignment(*this, BitField, Init, InitLoc));
	}

	/// Emits err_array_star_in_function_definition at \p Loc if \p PType
	/// contains an array with the '[*]' size modifier. Pointer, reference,
	/// paren and array layers are peeled one at a time.
	static void diagnoseArrayStarInParamType(Sema &S, QualType PType,
	                                         SourceLocation Loc) {
	  for (;;) {
	    // Only variably modified types can contain '[*]'.
	    if (!PType->isVariablyModifiedType())
	      return;

	    if (const auto *PointerTy = dyn_cast<PointerType>(PType)) {
	      PType = PointerTy->getPointeeType();
	      continue;
	    }
	    if (const auto *ReferenceTy = dyn_cast<ReferenceType>(PType)) {
	      PType = ReferenceTy->getPointeeType();
	      continue;
	    }
	    if (const auto *ParenTy = dyn_cast<ParenType>(PType)) {
	      PType = ParenTy->getInnerType();
	      continue;
	    }

	    const ArrayType *AT = S.Context.getAsArrayType(PType);
	    if (!AT)
	      return;

	    // Found the '[*]' array: fall out of the loop and diagnose.
	    if (AT->getSizeModifier() == ArrayType::Star)
	      break;

	    PType = AT->getElementType();
	  }

	  S.Diag(Loc, diag::err_array_star_in_function_definition);
	}

	/// CheckParmsForFunctionDef - Check that the parameters of the given
	/// function are appropriate for the definition of a function. This
	/// takes care of any checks that cannot be performed on the
	/// declaration itself, e.g., that the types of each of the function
	/// parameters are complete.
	bool Sema::CheckParmsForFunctionDef(ArrayRef<ParmVarDecl *> Parameters,
	                                    bool CheckParameterNames) {
	  // Becomes true as soon as one parameter is marked invalid here.
	  bool AnyInvalid = false;

	  for (ParmVarDecl *Param : Parameters) {
	    const SourceLocation ParamLoc = Param->getLocation();

	    // C99 6.7.5.3p4: the parameters in a parameter type list in a
	    // function declarator that is part of a function definition of
	    // that function shall not have incomplete type.
	    //
	    // This is also C++ [dcl.fct]p6.
	    if (!Param->isInvalidDecl() &&
	        RequireCompleteType(ParamLoc, Param->getType(),
	                            diag::err_typecheck_decl_incomplete_type)) {
	      Param->setInvalidDecl();
	      AnyInvalid = true;
	    }

	    // C99 6.9.1p5: If the declarator includes a parameter type list, the
	    // declaration of each parameter shall include an identifier.
	    if (CheckParameterNames && Param->getIdentifier() == nullptr &&
	        !Param->isImplicit() && !getLangOpts().CPlusPlus) {
	      Diag(ParamLoc, diag::err_parameter_name_omitted);
	    }

	    // C99 6.7.5.3p12:
	    //   If the function declarator is not part of a definition of that
	    //   function, parameters may have incomplete type and may use the [*]
	    //   notation in their sequences of declarator specifiers to specify
	    //   variable length array types.
	    // FIXME: This diagnostic should point the '[*]' if source-location
	    // information is added for it.
	    diagnoseArrayStarInParamType(*this, Param->getOriginalType(), ParamLoc);

	    // MSVC destroys objects passed by value in the callee. Therefore a
	    // function definition which takes such a parameter must be able to call
	    // the object's destructor. However, we don't perform any direct access
	    // check on the dtor.
	    if (getLangOpts().CPlusPlus &&
	        Context.getTargetInfo()
	            .getCXXABI()
	            .areArgsDestroyedLeftToRightInCallee() &&
	        !Param->isInvalidDecl()) {
	      if (const RecordType *RT = Param->getType()->getAs<RecordType>()) {
	        CXXRecordDecl *ClassDecl = cast<CXXRecordDecl>(RT->getDecl());
	        const bool NeedsDestructor = !ClassDecl->isInvalidDecl() &&
	                                     !ClassDecl->hasIrrelevantDestructor() &&
	                                     !ClassDecl->isDependentContext();
	        if (NeedsDestructor) {
	          CXXDestructorDecl *Destructor = LookupDestructor(ClassDecl);
	          MarkFunctionReferenced(ParamLoc, Destructor);
	          DiagnoseUseOfDecl(Destructor, ParamLoc);
	        }
	      }
	    }

	    // Parameters with the pass_object_size attribute only need to be marked
	    // constant at function definitions. Because we lack information about
	    // whether we're on a declaration or definition when we're instantiating
	    // the attribute, we need to check for constness here.
	    const auto *Attr = Param->getAttr<PassObjectSizeAttr>();
	    if (Attr && !Param->getType().isConstQualified()) {
	      Diag(ParamLoc, diag::err_attribute_pointers_only)
	          << Attr->getSpelling() << 1;
	    }
	  }

	  return AnyInvalid;
	}

	/// A helper function to get the alignment of a Decl referred to by DeclRefExpr
	/// or MemberExpr.
	static CharUnits getDeclAlign(Expr *E, CharUnits TypeAlign,
	                              ASTContext &Context) {
	  // A member access or a direct declaration reference uses the alignment
	  // of the referenced declaration; anything else falls back to TypeAlign.
	  if (isa<MemberExpr>(E)) {
	    const auto *Member = cast<MemberExpr>(E);
	    return Context.getDeclAlign(Member->getMemberDecl());
	  }

	  if (isa<DeclRefExpr>(E)) {
	    const auto *DeclRef = cast<DeclRefExpr>(E);
	    return Context.getDeclAlign(DeclRef->getDecl());
	  }

	  return TypeAlign;
	}

	/// CheckCastAlign - Implements -Wcast-align, which warns when a
	/// pointer cast increases the alignment requirements.
	/// \param Op     the expression being cast.
	/// \param T      the destination type of the cast.
	/// \param TRange source range used as the diagnostic location.
	void Sema::CheckCastAlign(Expr *Op, QualType T, SourceRange TRange) {
	  // This is actually a lot of work to potentially be doing on every
	  // cast; don't do it if we're ignoring -Wcast_align (as is the default).
	  if (getDiagnostics().isIgnored(diag::warn_cast_align, TRange.getBegin()))
	    return;

	  // Ignore dependent types.
	  if (T->isDependentType() || Op->getType()->isDependentType())
	    return;

	  // Require that the destination be a pointer type.
	  const PointerType *DestPtr = T->getAs<PointerType>();
	  if (!DestPtr)
	    return;

	  // If the destination has alignment 1, we're done.
	  QualType DestPointee = DestPtr->getPointeeType();
	  if (DestPointee->isIncompleteType())
	    return;
	  CharUnits DestAlign = Context.getTypeAlignInChars(DestPointee);
	  if (DestAlign.isOne())
	    return;

	  // Require that the source be a pointer type.
	  const PointerType *SrcPtr = Op->getType()->getAs<PointerType>();
	  if (!SrcPtr)
	    return;
	  QualType SrcPointee = SrcPtr->getPointeeType();

	  // Whitelist casts from cv void*.  We already implicitly
	  // whitelisted casts to cv void*, since they have alignment 1.
	  // Also whitelist casts involving incomplete types, which implicitly
	  // includes 'void'.
	  if (SrcPointee->isIncompleteType())
	    return;

	  CharUnits SrcAlign = Context.getTypeAlignInChars(SrcPointee);

	  // When the source is a decayed array or the address of something, use
	  // the alignment of the referenced declaration (if any) instead of the
	  // alignment of the pointee type.
	  if (auto *CE = dyn_cast<CastExpr>(Op)) {
	    if (CE->getCastKind() == CK_ArrayToPointerDecay)
	      SrcAlign = getDeclAlign(CE->getSubExpr(), SrcAlign, Context);
	  } else if (auto *UO = dyn_cast<UnaryOperator>(Op)) {
	    if (UO->getOpcode() == UO_AddrOf)
	      SrcAlign = getDeclAlign(UO->getSubExpr(), SrcAlign, Context);
	  }

	  // The cast does not increase the alignment requirement.
	  if (SrcAlign >= DestAlign)
	    return;

	  Diag(TRange.getBegin(), diag::warn_cast_align)
	      << Op->getType() << T
	      << static_cast<unsigned>(SrcAlign.getQuantity())
	      << static_cast<unsigned>(DestAlign.getQuantity())
	      << TRange << Op->getSourceRange();
	}

	/// \brief Check whether this array fits the idiom of a size-one tail padded
	/// array member of a struct.
	///
	/// We avoid emitting out-of-bounds access warnings for such arrays as they are
	/// commonly used to emulate flexible arrays in C89 code.
	/// \param Size the array's element count; only a size of 1 qualifies.
	/// \param ND   the declaration of the array, may be null.
	/// \returns true if \p ND is the last field of a non-union (and, for C++
	/// classes, standard-layout) record and has size 1 not spelled via a macro
	/// or a non-literal size expression.
	static bool IsTailPaddedMemberArray(Sema &S, const llvm::APInt &Size,
	                                    const NamedDecl *ND) {
	  if (Size != 1 || !ND)
	    return false;

	  const FieldDecl *FD = dyn_cast<FieldDecl>(ND);
	  if (!FD)
	    return false;

	  // Don't consider sizes resulting from macro expansions or template argument
	  // substitution to form C89 tail-padded arrays.

	  TypeSourceInfo *TInfo = FD->getTypeSourceInfo();
	  while (TInfo) {
	    TypeLoc TL = TInfo->getTypeLoc();
	    // Look through typedefs.
	    if (TypedefTypeLoc TTL = TL.getAs<TypedefTypeLoc>()) {
	      const TypedefNameDecl *TDL = TTL.getTypedefNameDecl();
	      TInfo = TDL->getTypeSourceInfo();
	      continue;
	    }
	    if (ConstantArrayTypeLoc CTL = TL.getAs<ConstantArrayTypeLoc>()) {
	      // The size must be written as a plain integer literal outside a macro.
	      const Expr *SizeExpr = dyn_cast<IntegerLiteral>(CTL.getSizeExpr());
	      if (!SizeExpr || SizeExpr->getExprLoc().isMacroID())
	        return false;
	    }
	    break;
	  }

	  const RecordDecl *RD = dyn_cast<RecordDecl>(FD->getDeclContext());
	  if (!RD)
	    return false;
	  if (RD->isUnion())
	    return false;
	  if (const CXXRecordDecl *CRD = dyn_cast<CXXRecordDecl>(RD)) {
	    if (!CRD->isStandardLayout())
	      return false;
	  }

	  // See if this is the last field decl in the record.
	  const Decl *D = FD;
	  while ((D = D->getNextDeclInContext()))
	    if (isa<FieldDecl>(D))
	      return false;
	  return true;
	}

	void Sema::CheckArrayAccess(const Expr BaseExpr, const Expr IndexExpr,
	const ArraySubscriptExpr *ASE,
	bool AllowOnePastEnd, bool IndexNegated) {
	IndexExpr = IndexExpr->IgnoreParenImpCasts();
	if (IndexExpr->isValueDependent())
	return;

	const Type *EffectiveType =
	BaseExpr->getType()->getPointeeOrArrayElementType();
	BaseExpr = BaseExpr->IgnoreParenCasts();
	const ConstantArrayType *ArrayTy =
	Context.getAsConstantArrayType(BaseExpr->getType());
	if (!ArrayTy)
	return;

	llvm::APSInt index;
	if (!IndexExpr->EvaluateAsInt(index, Context, Expr::SE_AllowSideEffects))
	return;
	if (IndexNegated)
	index = -index;

	const NamedDecl *ND = nullptr;
	if (const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(BaseExpr))
	ND = dyn_cast<NamedDecl>(DRE->getDecl());
	if (const MemberExpr *ME = dyn_cast<MemberExpr>(BaseExpr))
	ND = dyn_cast<NamedDecl>(ME->getMemberDecl());

	if (index.isUnsigned() \|\| !index.isNegative()) {
	llvm::APInt size = ArrayTy->getSize();
	if (!size.isStrictlyPositive())
	return;

	const Type *BaseType = BaseExpr->getType()->getPointeeOrArrayElementType();
	if (BaseType != EffectiveType) {
	// Make sure we're comparing apples to apples when comparing index to size
	uint64_t ptrarith_typesize = Context.getTypeSize(EffectiveType);
	uint64_t array_typesize = Context.getTypeSize(BaseType);
	// Handle ptrarith_typesize being zero, such as when casting to void*
	if (!ptrarith_typesize) ptrarith_typesize = 1;
	if (ptrarith_typesize != array_typesize) {
	// There's a cast to a different size type involved
	uint64_t ratio = array_typesize / ptrarith_typesize;
	// TODO: Be smarter about handling cases where array_typesize is not a
	// multiple of ptrarith_typesize
	if (ptrarith_typesize * ratio == array_typesize)
	size *= llvm::APInt(size.getBitWidth(), ratio);
	}
	}

	if (size.getBitWidth() > index.getBitWidth())
	index = index.zext(size.getBitWidth());
	else if (size.getBitWidth() < index.getBitWidth())
	size = size.zext(index.getBitWidth());

	// For array subscripting the index must be less than size, but for pointer
	// arithmetic also allow the index (offset) to be equal to size since
	// computing the next address after the end of the array is legal and
	// commonly done e.g. in C++ iterators and range-based for loops.
	if (AllowOnePastEnd ? index.ule(size) : index.ult(size))
	return;

	// Also don't warn for arrays of size 1 which are members of some
	// structure. These are often used to approximate flexible arrays in C89
	// code.
	if (IsTailPaddedMemberArray(*this, size, ND))
	return;

	// Suppress the warning if the subscript expression (as identified by the
	// ']' location) and the index expression are both from macro expansions
	// within a system header.
	if (ASE) {
	SourceLocation RBracketLoc = SourceMgr.getSpellingLoc(
	ASE->getRBracketLoc());
	if (SourceMgr.isInSystemHeader(RBracketLoc)) {
	SourceLocation IndexLoc = SourceMgr.getSpellingLoc(
	IndexExpr->getLocStart());
	if (SourceMgr.isWrittenInSameFile(RBracketLoc, IndexLoc))
	return;
	}
	}

	unsigned DiagID = diag::warn_ptr_arith_exceeds_bounds;
	if (ASE)
	DiagID = diag::warn_array_index_exceeds_bounds;

	DiagRuntimeBehavior(BaseExpr->getLocStart(), BaseExpr,
	PDiag(DiagID) << index.toString(10, true)
	<< size.toString(10, true)
	<< (unsigned)size.getLimitedValue(~0U)
	<< IndexExpr->getSourceRange());
	} else {
	unsigned DiagID = diag::warn_array_index_precedes_bounds;
	if (!ASE) {
	DiagID = diag::warn_ptr_arith_precedes_bounds;
	if (index.isNegative()) index = -index;
	}

	DiagRuntimeBehavior(BaseExpr->getLocStart(), BaseExpr,
	PDiag(DiagID) << index.toString(10, true)
	<< IndexExpr->getSourceRange());
	}

	if (!ND) {
	// Try harder to find a NamedDecl to point at in the note.
	while (const ArraySubscriptExpr *ASE =
	dyn_cast<ArraySubscriptExpr>(BaseExpr))
	BaseExpr = ASE->getBase()->IgnoreParenCasts();
	if (const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(BaseExpr))
	ND = dyn_cast<NamedDecl>(DRE->getDecl());
	if (const MemberExpr *ME = dyn_cast<MemberExpr>(BaseExpr))
	ND = dyn_cast<NamedDecl>(ME->getMemberDecl());
	}

	if (ND)
	DiagRuntimeBehavior(ND->getLocStart(), BaseExpr,
	PDiag(diag::note_array_index_out_of_bounds)
	<< ND->getDeclName());
	}

	/// Looks through parentheses, implicit casts and the unary '&' / '*'
	/// operators of \p expr for an array subscript or OpenMP array section and
	/// bounds-checks it. Both arms of a conditional operator and all arguments
	/// of an overloaded operator call are checked recursively.
	void Sema::CheckArrayAccess(const Expr *expr) {
	  // Net count of '&' minus '*' seen so far; a positive value means the
	  // element's address is taken, so an index one past the end is allowed.
	  int AllowOnePastEnd = 0;
	  while (expr) {
	    expr = expr->IgnoreParenImpCasts();
	    switch (expr->getStmtClass()) {
	      case Stmt::ArraySubscriptExprClass: {
	        const ArraySubscriptExpr *ASE = cast<ArraySubscriptExpr>(expr);
	        CheckArrayAccess(ASE->getBase(), ASE->getIdx(), ASE,
	                         AllowOnePastEnd > 0);
	        return;
	      }
	      case Stmt::OMPArraySectionExprClass: {
	        const OMPArraySectionExpr *ASE = cast<OMPArraySectionExpr>(expr);
	        // Only the lower bound of the section, when present, is checked.
	        if (ASE->getLowerBound())
	          CheckArrayAccess(ASE->getBase(), ASE->getLowerBound(),
	                           /*ASE=*/nullptr, AllowOnePastEnd > 0);
	        return;
	      }
	      case Stmt::UnaryOperatorClass: {
	        // Only unwrap the * and & unary operators
	        const UnaryOperator *UO = cast<UnaryOperator>(expr);
	        expr = UO->getSubExpr();
	        switch (UO->getOpcode()) {
	          case UO_AddrOf:
	            AllowOnePastEnd++;
	            break;
	          case UO_Deref:
	            AllowOnePastEnd--;
	            break;
	          default:
	            return;
	        }
	        break;
	      }
	      case Stmt::ConditionalOperatorClass: {
	        const ConditionalOperator *cond = cast<ConditionalOperator>(expr);
	        if (const Expr *lhs = cond->getLHS())
	          CheckArrayAccess(lhs);
	        if (const Expr *rhs = cond->getRHS())
	          CheckArrayAccess(rhs);
	        return;
	      }
	      case Stmt::CXXOperatorCallExprClass: {
	        const auto *OCE = cast<CXXOperatorCallExpr>(expr);
	        for (const auto *Arg : OCE->arguments())
	          CheckArrayAccess(Arg);
	        return;
	      }
	      default:
	        return;
	    }
	  }
	}

	//===--- CHECK: Objective-C retain cycles ----------------------------------//

	namespace {

	/// Describes the variable found to own an object in a potential retain
	/// cycle, together with the source position used when reporting it.
	struct RetainCycleOwner {
	  /// The owning variable; null until one has been found.
	  VarDecl *Variable = nullptr;
	  /// Source range of the owning expression.
	  SourceRange Range;
	  /// Location of the owning expression.
	  SourceLocation Loc;
	  /// Whether the ownership goes through an intermediate (ivar/property).
	  bool Indirect = false;

	  RetainCycleOwner() = default;

	  /// Copies both the location and the source range from \p e.
	  void setLocsFrom(Expr *e) {
	    Range = e->getSourceRange();
	    Loc = e->getExprLoc();
	  }
	};

	} // namespace

	/// Consider whether capturing the given variable can possibly lead to
	/// a retain cycle.
	static bool considerVariable(VarDecl var, Expr ref, RetainCycleOwner &owner) {
	// In ARC, it's captured strongly iff the variable has __strong
	// lifetime. In MRR, it's captured strongly if the variable is
	// __block and has an appropriate type.
	if (var->getType().getObjCLifetime() != Qualifiers::OCL_Strong)
	return false;

	owner.Variable = var;
	if (ref)
	owner.setLocsFrom(ref);
	return true;
	}

	/// Walks down the expression \p e looking for a variable that strongly owns
	/// the designated object. On success the variable and its source position
	/// are stored in \p owner and true is returned; owner.Indirect is set when
	/// the path goes through an ivar or a property.
	static bool findRetainCycleOwner(Sema &S, Expr *e, RetainCycleOwner &owner) {
	// Each iteration peels one layer off `e`; every branch either returns or
	// replaces `e` and continues.
	while (true) {
	e = e->IgnoreParens();
	if (CastExpr *cast = dyn_cast<CastExpr>(e)) {
	// Only the cast kinds listed here are looked through; any other cast
	// ends the search.
	switch (cast->getCastKind()) {
	case CK_BitCast:
	case CK_LValueBitCast:
	case CK_LValueToRValue:
	case CK_ARCReclaimReturnedObject:
	e = cast->getSubExpr();
	continue;

	default:
	return false;
	}
	}

	if (ObjCIvarRefExpr *ref = dyn_cast<ObjCIvarRefExpr>(e)) {
	// An ivar only matters if it holds its value strongly.
	ObjCIvarDecl *ivar = ref->getDecl();
	if (ivar->getType().getObjCLifetime() != Qualifiers::OCL_Strong)
	return false;

	// Try to find a retain cycle in the base.
	if (!findRetainCycleOwner(S, ref->getBase(), owner))
	return false;

	// For a free ivar reference, report the ivar expression's own location.
	if (ref->isFreeIvar()) owner.setLocsFrom(ref);
	owner.Indirect = true;
	return true;
	}

	if (DeclRefExpr *ref = dyn_cast<DeclRefExpr>(e)) {
	// Only references to variables can be owners.
	VarDecl *var = dyn_cast<VarDecl>(ref->getDecl());
	if (!var) return false;
	return considerVariable(var, ref, owner);
	}

	if (MemberExpr *member = dyn_cast<MemberExpr>(e)) {
	// Member access through a pointer ('->') ends the search.
	if (member->isArrow()) return false;

	// Don't count this as an indirect ownership.
	e = member->getBase();
	continue;
	}

	if (PseudoObjectExpr *pseudo = dyn_cast<PseudoObjectExpr>(e)) {
	// Only pay attention to pseudo-objects on property references.
	ObjCPropertyRefExpr *pre
	= dyn_cast<ObjCPropertyRefExpr>(pseudo->getSyntacticForm()
	->IgnoreParens());
	if (!pre) return false;
	if (pre->isImplicitProperty()) return false;
	// The property must be retaining, or be backed by an ivar with strong
	// lifetime.
	ObjCPropertyDecl *property = pre->getExplicitProperty();
	if (!property->isRetaining() &&
	!(property->getPropertyIvarDecl() &&
	property->getPropertyIvarDecl()->getType()
	.getObjCLifetime() == Qualifiers::OCL_Strong))
	return false;

	owner.Indirect = true;
	if (pre->isSuperReceiver()) {
	// With a 'super' receiver the owner is the current method's self
	// declaration, if there is one.
	owner.Variable = S.getCurMethodDecl()->getSelfDecl();
	if (!owner.Variable)
	return false;
	owner.Loc = pre->getLocation();
	owner.Range = pre->getSourceRange();
	return true;
	}
	// Otherwise continue with the source expression of the property's base.
	e = const_cast<Expr*>(cast<OpaqueValueExpr>(pre->getBase())
	->getSourceExpr());
	continue;
	}

	// Array ivars?

	return false;
	}
	}

	namespace {

	/// Visitor that searches an expression tree for a reference to a given
	/// variable and notes whether the variable is assigned zero along the way.
	struct FindCaptureVisitor : EvaluatedExprVisitor<FindCaptureVisitor> {
	  ASTContext &Context;
	  /// The variable being searched for.
	  VarDecl *Variable;
	  /// First expression found to refer to Variable; null if none yet.
	  Expr *Capturer = nullptr;
	  /// Set once an assignment of the integer constant 0 to Variable is seen.
	  bool VarWillBeReased = false;

	  FindCaptureVisitor(ASTContext &Context, VarDecl *variable)
	      : EvaluatedExprVisitor<FindCaptureVisitor>(Context),
	        Context(Context), Variable(variable) {}

	  /// Records the first direct reference to the variable.
	  void VisitDeclRefExpr(DeclRefExpr *ref) {
	    if (ref->getDecl() == Variable && !Capturer)
	      Capturer = ref;
	  }

	  /// Searches the base of an ivar reference; a free ivar reference whose
	  /// base captured the variable becomes the capturer itself.
	  void VisitObjCIvarRefExpr(ObjCIvarRefExpr *ref) {
	    if (Capturer) return;
	    Visit(ref->getBase());
	    if (Capturer && ref->isFreeIvar())
	      Capturer = ref;
	  }

	  void VisitBlockExpr(BlockExpr *block) {
	    // Look inside nested blocks
	    if (block->getBlockDecl()->capturesVariable(Variable))
	      Visit(block->getBlockDecl()->getBody());
	  }

	  /// Looks through an opaque value to its source expression, if any.
	  void VisitOpaqueValueExpr(OpaqueValueExpr *OVE) {
	    if (Capturer) return;
	    if (OVE->getSourceExpr())
	      Visit(OVE->getSourceExpr());
	  }

	  /// Detects a plain assignment 'Variable = 0'. Operands of binary
	  /// operators are not visited from here.
	  void VisitBinaryOperator(BinaryOperator *BinOp) {
	    if (!Variable || VarWillBeReased || BinOp->getOpcode() != BO_Assign)
	      return;
	    Expr *LHS = BinOp->getLHS();
	    if (const DeclRefExpr *DRE = dyn_cast_or_null<DeclRefExpr>(LHS)) {
	      if (DRE->getDecl() != Variable)
	        return;
	      if (Expr *RHS = BinOp->getRHS()) {
	        RHS = RHS->IgnoreParenCasts();
	        llvm::APSInt Value;
	        VarWillBeReased =
	            (RHS && RHS->isIntegerConstantExpr(Value, Context) && Value == 0);
	      }
	    }
	  }
	};

	} // namespace

	/// Check whether the given argument is a block which captures a
	/// variable.
	/// \param e     argument expression to inspect.
	/// \param owner the owner found earlier; its Variable and Loc must be set.
	/// \returns the expression inside the block that refers to owner.Variable,
	/// or nullptr if \p e is not a block capturing that variable, if no
	/// reference was found, or if the block assigns zero to the variable.
	static Expr *findCapturingExpr(Sema &S, Expr *e, RetainCycleOwner &owner) {
	  assert(owner.Variable && owner.Loc.isValid());

	  e = e->IgnoreParenCasts();

	  // Look through [^{...} copy] and Block_copy(^{...}).
	  if (ObjCMessageExpr *ME = dyn_cast<ObjCMessageExpr>(e)) {
	    Selector Cmd = ME->getSelector();
	    if (Cmd.isUnarySelector() && Cmd.getNameForSlot(0) == "copy") {
	      e = ME->getInstanceReceiver();
	      if (!e)
	        return nullptr;
	      e = e->IgnoreParenCasts();
	    }
	  } else if (CallExpr *CE = dyn_cast<CallExpr>(e)) {
	    if (CE->getNumArgs() == 1) {
	      FunctionDecl *Fn = dyn_cast_or_null<FunctionDecl>(CE->getCalleeDecl());
	      if (Fn) {
	        const IdentifierInfo *FnI = Fn->getIdentifier();
	        if (FnI && FnI->isStr("_Block_copy")) {
	          e = CE->getArg(0)->IgnoreParenCasts();
	        }
	      }
	    }
	  }

	  // Only a block literal that captures the owning variable is of interest.
	  BlockExpr *block = dyn_cast<BlockExpr>(e);
	  if (!block || !block->getBlockDecl()->capturesVariable(owner.Variable))
	    return nullptr;

	  FindCaptureVisitor visitor(S.Context, owner.Variable);
	  visitor.Visit(block->getBlockDecl()->getBody());
	  return visitor.VarWillBeReased ? nullptr : visitor.Capturer;
	}

	/// Emits the retain-cycle warning at \p capturer, followed by a note at
	/// the owner's location.
	static void diagnoseRetainCycle(Sema &S, Expr *capturer,
	                                RetainCycleOwner &owner) {
	  assert(capturer);
	  assert(owner.Variable && owner.Loc.isValid());

	  // Warning: where the block refers to the owning variable.
	  const SourceLocation CaptureLoc = capturer->getExprLoc();
	  S.Diag(CaptureLoc, diag::warn_arc_retain_cycle)
	      << owner.Variable << capturer->getSourceRange();

	  // Note: where the block is retained by the owner.
	  S.Diag(owner.Loc, diag::note_arc_retain_cycle_owner)
	      << owner.Indirect << owner.Range;
	}

	/// Check for a keyword selector that starts with the word 'add' or
	/// 'set'.
	static bool isSetterLikeSelector(Selector sel) {
	  // Unary selectors take no argument and so cannot be setters.
	  if (sel.isUnarySelector())
	    return false;

	  // Leading underscores of the first selector piece are ignored.
	  StringRef str = sel.getNameForSlot(0);
	  while (!str.empty() && str.front() == '_')
	    str = str.substr(1);

	  if (!str.startswith("set")) {
	    if (!str.startswith("add"))
	      return false;
	    // Specially whitelist 'addOperationWithBlock:'.
	    if (sel.getNumArgs() == 1 && str.startswith("addOperationWithBlock"))
	      return false;
	  }

	  // Drop the three-letter prefix; what follows must be empty or must not
	  // start with a lowercase letter (so e.g. a longer lowercase word that
	  // merely begins with the prefix is rejected).
	  str = str.substr(3);
	  return str.empty() || !isLowercase(str.front());
	}

	/// For a message whose receiver interface is reported by
	/// isSubclassOfNSClass as an NSMutableArray, returns the index of the
	/// argument holding the object being stored; None for any other receiver
	/// or selector.
	static Optional<int> GetNSMutableArrayArgumentIndex(Sema &S,
	                                                    ObjCMessageExpr *Message) {
	  if (!S.NSAPIObj->isSubclassOfNSClass(Message->getReceiverInterface(),
	                                       NSAPI::ClassId_NSMutableArray))
	    return None;

	  Optional<NSAPI::NSArrayMethodKind> Kind =
	      S.NSAPIObj->getNSArrayMethodKind(Message->getSelector());
	  if (!Kind)
	    return None;

	  switch (*Kind) {
	    case NSAPI::NSMutableArr_addObject:
	    case NSAPI::NSMutableArr_insertObjectAtIndex:
	    case NSAPI::NSMutableArr_setObjectAtIndexedSubscript:
	      return 0;
	    case NSAPI::NSMutableArr_replaceObjectAtIndex:
	      return 1;
	    default:
	      break;
	  }

	  return None;
	}

	/// For a message whose receiver interface is reported by
	/// isSubclassOfNSClass as an NSMutableDictionary, returns the index of the
	/// argument holding the object being stored; None for any other receiver
	/// or selector.
	static Optional<int>
	GetNSMutableDictionaryArgumentIndex(Sema &S, ObjCMessageExpr *Message) {
	  if (!S.NSAPIObj->isSubclassOfNSClass(Message->getReceiverInterface(),
	                                       NSAPI::ClassId_NSMutableDictionary))
	    return None;

	  Optional<NSAPI::NSDictionaryMethodKind> Kind =
	      S.NSAPIObj->getNSDictionaryMethodKind(Message->getSelector());
	  if (!Kind)
	    return None;

	  switch (*Kind) {
	    case NSAPI::NSMutableDict_setObjectForKey:
	    case NSAPI::NSMutableDict_setValueForKey:
	    case NSAPI::NSMutableDict_setObjectForKeyedSubscript:
	      return 0;
	    default:
	      break;
	  }

	  return None;
	}

	/// For a message whose receiver interface is reported by
	/// isSubclassOfNSClass as an NSMutableSet or NSMutableOrderedSet, returns
	/// the index of the argument holding the object being stored; None for any
	/// other receiver or selector.
	static Optional<int> GetNSSetArgumentIndex(Sema &S, ObjCMessageExpr *Message) {
	  const bool ReceiverIsSet = S.NSAPIObj->isSubclassOfNSClass(
	      Message->getReceiverInterface(), NSAPI::ClassId_NSMutableSet);
	  const bool ReceiverIsOrderedSet = S.NSAPIObj->isSubclassOfNSClass(
	      Message->getReceiverInterface(), NSAPI::ClassId_NSMutableOrderedSet);
	  if (!(ReceiverIsSet || ReceiverIsOrderedSet))
	    return None;

	  Optional<NSAPI::NSSetMethodKind> Kind =
	      S.NSAPIObj->getNSSetMethodKind(Message->getSelector());
	  if (!Kind)
	    return None;

	  // No default label: selector kinds not listed fall through to None.
	  switch (*Kind) {
	    case NSAPI::NSMutableSet_addObject:
	    case NSAPI::NSOrderedSet_setObjectAtIndex:
	    case NSAPI::NSOrderedSet_setObjectAtIndexedSubscript:
	    case NSAPI::NSOrderedSet_insertObjectAtIndex:
	      return 0;
	    case NSAPI::NSOrderedSet_replaceObjectAtIndexWithObject:
	      return 1;
	  }

	  return None;
	}

	/// Warn when a mutable container is being stored into itself, e.g. the
	/// receiver and the stored argument refer to the same variable or ivar, or
	/// 'self' is stored through a message to 'super'.
	void Sema::CheckObjCCircularContainer(ObjCMessageExpr *Message) {
	  if (!Message->isInstanceMessage())
	    return;

	  // Ask each container family in turn; the first one that recognizes the
	  // message tells us which argument is the stored object.
	  Optional<int> StoredIdx = GetNSMutableArrayArgumentIndex(*this, Message);
	  if (!StoredIdx)
	    StoredIdx = GetNSMutableDictionaryArgumentIndex(*this, Message);
	  if (!StoredIdx)
	    StoredIdx = GetNSSetArgumentIndex(*this, Message);
	  if (!StoredIdx)
	    return;

	  // Look through implicit casts and an opaque-value wrapper on the argument.
	  Expr *Arg = Message->getArg(*StoredIdx)->IgnoreImpCasts();
	  if (auto *Opaque = dyn_cast<OpaqueValueExpr>(Arg))
	    Arg = Opaque->getSourceExpr()->IgnoreImpCasts();

	  if (Message->getReceiverKind() == ObjCMessageExpr::SuperInstance) {
	    // A message to 'super' is circular when the stored object is 'self'.
	    auto *ArgRef = dyn_cast<DeclRefExpr>(Arg);
	    if (ArgRef && ArgRef->isObjCSelfExpr())
	      Diag(Message->getSourceRange().getBegin(),
	           diag::warn_objc_circular_container)
	          << ArgRef->getDecl()->getName() << StringRef("super");
	    return;
	  }

	  // Same unwrapping for the receiver expression.
	  Expr *Receiver = Message->getInstanceReceiver()->IgnoreImpCasts();
	  if (auto *Opaque = dyn_cast<OpaqueValueExpr>(Receiver))
	    Receiver = Opaque->getSourceExpr()->IgnoreImpCasts();

	  if (auto *ReceiverRef = dyn_cast<DeclRefExpr>(Receiver)) {
	    // Receiver is a plain declaration reference: compare declarations.
	    auto *ArgRef = dyn_cast<DeclRefExpr>(Arg);
	    if (!ArgRef || ReceiverRef->getDecl() != ArgRef->getDecl())
	      return;

	    ValueDecl *Var = ReceiverRef->getDecl();
	    Diag(Message->getSourceRange().getBegin(),
	         diag::warn_objc_circular_container)
	        << Var->getName() << Var->getName();
	    // The "declared here" note is omitted when the argument is 'self'.
	    if (!ArgRef->isObjCSelfExpr())
	      Diag(Var->getLocation(),
	           diag::note_objc_circular_container_declared_here)
	          << Var->getName();
	    return;
	  }

	  if (auto *ReceiverIvar = dyn_cast<ObjCIvarRefExpr>(Receiver)) {
	    // Receiver is an instance variable: compare ivar declarations.
	    auto *ArgIvar = dyn_cast<ObjCIvarRefExpr>(Arg);
	    if (!ArgIvar || ReceiverIvar->getDecl() != ArgIvar->getDecl())
	      return;

	    ObjCIvarDecl *Ivar = ReceiverIvar->getDecl();
	    Diag(Message->getSourceRange().getBegin(),
	         diag::warn_objc_circular_container)
	        << Ivar->getName() << Ivar->getName();
	    Diag(Ivar->getLocation(),
	         diag::note_objc_circular_container_declared_here)
	        << Ivar->getName();
	  }
	}

	/// Check a message send to see if it's likely to cause a retain cycle.
	///
	/// Only instance messages whose selector passes isSetterLikeSelector() are
	/// examined. Once an owner for the receiver is known, the arguments are
	/// scanned in order and a retain-cycle diagnostic is emitted for the first
	/// argument that captures the owner, unless the matching parameter is marked
	/// noescape.
	void Sema::checkRetainCycles(ObjCMessageExpr *msg) {
	  // Only check instance methods whose selector looks like a setter.
	  if (!msg->isInstanceMessage() || !isSetterLikeSelector(msg->getSelector()))
	    return;

	  // Try to find a variable that the receiver is strongly owned by.
	  RetainCycleOwner owner;
	  if (msg->getReceiverKind() == ObjCMessageExpr::Instance) {
	    if (!findRetainCycleOwner(*this, msg->getInstanceReceiver(), owner))
	      return;
	  } else {
	    assert(msg->getReceiverKind() == ObjCMessageExpr::SuperInstance);
	    owner.Variable = getCurMethodDecl()->getSelfDecl();
	    owner.Loc = msg->getSuperLoc();
	    owner.Range = msg->getSuperLoc();
	  }

	  // Check whether the receiver is captured by any of the arguments.
	  const ObjCMethodDecl *MD = msg->getMethodDecl();
	  for (unsigned i = 0, e = msg->getNumArgs(); i != e; ++i) {
	    if (Expr *capturer = findCapturingExpr(*this, msg->getArg(i), owner)) {
	      // noescape blocks should not be retained by the method.
	      // The loop runs over call arguments, which may outnumber the declared
	      // parameters (presumably for variadic methods), so bound the index
	      // before looking at the parameter's attributes.
	      if (MD && i < MD->parameters().size() &&
	          MD->parameters()[i]->hasAttr<NoEscapeAttr>())
	        continue;
	      return diagnoseRetainCycle(*this, capturer, owner);
	    }
	  }
	}

	/// Check a property assign to see if it's likely to cause a retain cycle.
	void Sema::checkRetainCycles(Expr receiver, Expr argument) {
	RetainCycleOwner owner;
	if (!findRetainCycleOwner(*this, receiver, owner))
	return;

	if (Expr capturer = findCapturingExpr(this, argument, owner))
	diagnoseRetainCycle(*this, capturer, owner);
	}

	void Sema::checkRetainCycles(VarDecl Var, Expr Init) {
	RetainCycleOwner Owner;
	if (!considerVariable(Var, /DeclRefExpr=/nullptr, Owner))
	return;

	// Because we don't have an expression for the variable, we have to set the
	// location explicitly here.
	Owner.Loc = Var->getLocation();
	Owner.Range = Var->getSourceRange();

	if (Expr Capturer = findCapturingExpr(this, Init, Owner))
	diagnoseRetainCycle(*this, Capturer, Owner);
	}

	/// Warn when an Objective-C object literal other than a string literal is
	/// assigned.
	///
	/// \param Loc Location at which the warning is reported.
	/// \param RHS Assigned expression; parentheses and implicit casts are
	/// stripped before it is classified.
	/// \param isProperty Selects the diagnostic wording (0 for a property, 1
	/// otherwise).
	/// \returns true if a warning was emitted.
	static bool checkUnsafeAssignLiteral(Sema &S, SourceLocation Loc,
	                                     Expr *RHS, bool isProperty) {
	  // Check if RHS is an Objective-C object literal, which also can get
	  // immediately zapped in a weak reference. Note that we explicitly
	  // allow ObjCStringLiterals, since those are designed to never really die.
	  RHS = RHS->IgnoreParenImpCasts();

	  // This enum needs to match with the 'select' in
	  // warn_objc_arc_literal_assign (off-by-1).
	  Sema::ObjCLiteralKind Kind = S.CheckLiteralKind(RHS);
	  if (Kind == Sema::LK_String || Kind == Sema::LK_None)
	    return false;

	  S.Diag(Loc, diag::warn_arc_literal_assign)
	      << (unsigned) Kind
	      << (isProperty ? 0 : 1)
	      << RHS->getSourceRange();

	  return true;
	}

	/// Diagnose an assignment whose right-hand side is a consumed (retained)
	/// object or, for a weak destination, a non-string object literal.
	///
	/// \param LT Lifetime qualifier of the destination.
	/// \param RHS Assigned expression.
	/// \param isProperty Selects the diagnostic wording (0 for a property, 1
	/// otherwise).
	/// \returns true if a warning was emitted.
	static bool checkUnsafeAssignObject(Sema &S, SourceLocation Loc,
	                                    Qualifiers::ObjCLifetime LT,
	                                    Expr *RHS, bool isProperty) {
	  // Peel implicit casts one at a time, stopping at the ARC consume cast.
	  for (ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(RHS); ICE;
	       ICE = dyn_cast<ImplicitCastExpr>(RHS)) {
	    if (ICE->getCastKind() == CK_ARCConsumeObject) {
	      S.Diag(Loc, diag::warn_arc_retained_assign)
	          << (LT == Qualifiers::OCL_ExplicitNone)
	          << (isProperty ? 0 : 1)
	          << RHS->getSourceRange();
	      return true;
	    }
	    RHS = ICE->getSubExpr();
	  }

	  // The literal check only applies to weak destinations.
	  return LT == Qualifiers::OCL_Weak &&
	         checkUnsafeAssignLiteral(S, Loc, RHS, isProperty);
	}

	/// Run the unsafe-assignment checks for a destination of type \p LHS.
	///
	/// Only destinations whose lifetime is OCL_Weak or OCL_ExplicitNone are
	/// checked.
	/// \returns true if a warning was emitted.
	bool Sema::checkUnsafeAssigns(SourceLocation Loc, QualType LHS, Expr *RHS) {
	  const Qualifiers::ObjCLifetime Lifetime = LHS.getObjCLifetime();
	  const bool NeedsCheck = Lifetime == Qualifiers::OCL_Weak ||
	                          Lifetime == Qualifiers::OCL_ExplicitNone;

	  return NeedsCheck &&
	         checkUnsafeAssignObject(*this, Loc, Lifetime, RHS, false);
	}

	void Sema::checkUnsafeExprAssigns(SourceLocation Loc,
	Expr LHS, Expr RHS) {
	QualType LHSType;
	// PropertyRef on LHS type need be directly obtained from
	// its declaration as it has a PseudoType.
	ObjCPropertyRefExpr *PRE
	= dyn_cast<ObjCPropertyRefExpr>(LHS->IgnoreParens());
	if (PRE && !PRE->isImplicitProperty()) {
	const ObjCPropertyDecl *PD = PRE->getExplicitProperty();
	if (PD)
	LHSType = PD->getType();
	}

	if (LHSType.isNull())
	LHSType = LHS->getType();

	Qualifiers::ObjCLifetime LT = LHSType.getObjCLifetime();

	if (LT == Qualifiers::OCL_Weak) {
	if (!Diags.isIgnored(diag::warn_arc_repeated_use_of_weak, Loc))
	getCurFunction()->markSafeWeakUse(LHS);
	}

	if (checkUnsafeAssigns(Loc, LHSType, RHS))
	return;

	// FIXME. Check for other life times.
	if (LT != Qualifiers::OCL_None)
	return;

	if (PRE) {
	if (PRE->isImplicitProperty())
	return;
	const ObjCPropertyDecl *PD = PRE->getExplicitProperty();
	if (!PD)
	return;

	unsigned Attributes = PD->getPropertyAttributes();
	if (Attributes & ObjCPropertyDecl::OBJC_PR_assign) {
	// when 'assign' attribute was not explicitly specified
	// by user, ignore it and rely on property type itself
	// for lifetime info.
	unsigned AsWrittenAttr = PD->getPropertyAttributesAsWritten();
	if (!(AsWrittenAttr & ObjCPropertyDecl::OBJC_PR_assign) &&
	LHSType->isObjCRetainableType())
	return;

	while (ImplicitCastExpr *cast = dyn_cast<ImplicitCastExpr>(RHS)) {
	if (cast->getCastKind() == CK_ARCConsumeObject) {
	Diag(Loc, diag::warn_arc_retained_property_assign)
	<< RHS->getSourceRange();
	return;
	}
	RHS = cast->getSubExpr();
	}
	}
	else if (Attributes & ObjCPropertyDecl::OBJC_PR_weak) {
	if (checkUnsafeAssignObject(*this, Loc, Qualifiers::OCL_Weak, RHS, true))
	return;
	}
	}
	}

	//===--- CHECK: Empty statement body (-Wempty-body) ---------------------===//

	/// Decide whether an empty (null-statement) body is worth a warning.
	///
	/// \param StmtLoc Location in the controlling statement whose line is
	/// compared against the line of the body's semicolon.
	/// \param Body The null statement forming the body.
	/// \returns true only when the body is not produced by an empty macro, both
	/// line numbers are valid, and the two lines are equal.
	static bool ShouldDiagnoseEmptyStmtBody(const SourceManager &SourceMgr,
	                                        SourceLocation StmtLoc,
	                                        const NullStmt *Body) {
	  // A body that is a macro expanding to nothing is intentional, e.g.:
	  //
	  //   #define CALL(x)
	  //   if (condition)
	  //     CALL(0);
	  if (Body->hasLeadingEmptyMacro())
	    return false;

	  // Line of the controlling statement (presumed location).
	  bool StmtLineInvalid;
	  const unsigned StmtLine =
	      SourceMgr.getPresumedLineNumber(StmtLoc, &StmtLineInvalid);
	  if (StmtLineInvalid)
	    return false;

	  // Line of the body's semicolon (spelling location).
	  bool BodyLineInvalid;
	  const unsigned BodyLine =
	      SourceMgr.getSpellingLineNumber(Body->getSemiLoc(), &BodyLineInvalid);
	  if (BodyLineInvalid)
	    return false;

	  // Warn only if the statement and its null body share a line.
	  return StmtLine == BodyLine;
	}

	/// Emit \p DiagID, followed by the "put the semicolon on a separate line"
	/// note, when \p Body is a null statement that ShouldDiagnoseEmptyStmtBody()
	/// accepts.
	void Sema::DiagnoseEmptyStmtBody(SourceLocation StmtLoc,
	                                 const Stmt *Body,
	                                 unsigned DiagID) {
	  // This is a syntactic check; diagnosing it again inside template
	  // instantiations would only add noise.
	  if (CurrentInstantiationScope)
	    return;

	  // Only a null-statement body that passes the usual checks is diagnosed.
	  const auto *EmptyBody = dyn_cast<NullStmt>(Body);
	  if (!EmptyBody ||
	      !ShouldDiagnoseEmptyStmtBody(SourceMgr, StmtLoc, EmptyBody))
	    return;

	  Diag(EmptyBody->getSemiLoc(), DiagID);
	  Diag(EmptyBody->getSemiLoc(), diag::note_empty_body_on_separate_line);
	}

	void Sema::DiagnoseEmptyLoopBody(const Stmt *S,
	const Stmt *PossibleBody) {
	assert(!CurrentInstantiationScope); // Ensured by caller

	SourceLocation StmtLoc;
	const Stmt *Body;
	unsigned DiagID;
	if (const ForStmt *FS = dyn_cast<ForStmt>(S)) {
	StmtLoc = FS->getRParenLoc();
	Body = FS->getBody();
	DiagID = diag::warn_empty_for_body;
	} else if (const WhileStmt *WS = dyn_cast<WhileStmt>(S)) {
	StmtLoc = WS->getCond()->getSourceRange().getEnd();
	Body = WS->getBody();
	DiagID = diag::warn_empty_while_body;
	} else
	return; // Neither `for' nor `while'.

	// The body should be a null statement.
	const NullStmt *NBody = dyn_cast<NullStmt>(Body);
	if (!NBody)
	return;

	// Skip expensive checks if diagnostic is disabled.
	if (Diags.isIgnored(DiagID, NBody->getSemiLoc()))
	return;

	// Do the usual checks.
	if (!ShouldDiagnoseEmptyStmtBody(SourceMgr, StmtLoc, NBody))
	return;

	// `for(...);' and `while(...);' are popular idioms, so in order to keep
	// noise level low, emit diagnostics only if for/while is followed by a
	// CompoundStmt, e.g.:
	// for (int i = 0; i < n; i++);
	// {
	// a(i);
	// }
	// or if for/while is followed by a statement with more indentation
	// than for/while itself:
	// for (int i = 0; i < n; i++);
	// a(i);
	bool ProbableTypo = isa<CompoundStmt>(PossibleBody);
	if (!ProbableTypo) {
	bool BodyColInvalid;
	unsigned BodyCol = SourceMgr.getPresumedColumnNumber(
	PossibleBody->getLocStart(),
	&BodyColInvalid);
	if (BodyColInvalid)
	return;

	bool StmtColInvalid;
	unsigned StmtCol = SourceMgr.getPresumedColumnNumber(
	S->getLocStart(),
	&StmtColInvalid);
	if (StmtColInvalid)
	return;

	if (BodyCol > StmtCol)
	ProbableTypo = true;
	}

	if (ProbableTypo) {
	Diag(NBody->getSemiLoc(), DiagID);
	Diag(NBody->getSemiLoc(), diag::note_empty_body_on_separate_line);
	}
	}

	//===--- CHECK: Warn on self move with std::move. -------------------------===//

	/// DiagnoseSelfMove - Emits a warning if a value is moved to itself.
	void Sema::DiagnoseSelfMove(const Expr LHSExpr, const Expr RHSExpr,
	SourceLocation OpLoc) {
	if (Diags.isIgnored(diag::warn_sizeof_pointer_expr_memaccess, OpLoc))
	return;

	if (inTemplateInstantiation())
	return;

	// Strip parens and casts away.
	LHSExpr = LHSExpr->IgnoreParenImpCasts();
	RHSExpr = RHSExpr->IgnoreParenImpCasts();

	// Check for a call expression
	const CallExpr *CE = dyn_cast<CallExpr>(RHSExpr);
	if (!CE \|\| CE->getNumArgs() != 1)
	return;

	// Check for a call to std::move
	if (!CE->isCallToStdMove())
	return;

	// Get argument from std::move
	RHSExpr = CE->getArg(0);

	const DeclRefExpr *LHSDeclRef = dyn_cast<DeclRefExpr>(LHSExpr);
	const DeclRefExpr *RHSDeclRef = dyn_cast<DeclRefExpr>(RHSExpr);

	// Two DeclRefExpr's, check that the decls are the same.
	if (LHSDeclRef && RHSDeclRef) {
	if (!LHSDeclRef->getDecl() \|\| !RHSDeclRef->getDecl())
	return;
	if (LHSDeclRef->getDecl()->getCanonicalDecl() !=
	RHSDeclRef->getDecl()->getCanonicalDecl())
	return;

	Diag(OpLoc, diag::warn_self_move) << LHSExpr->getType()
	<< LHSExpr->getSourceRange()
	<< RHSExpr->getSourceRange();
	return;
	}

	// Member variables require a different approach to check for self moves.
	// MemberExpr's are the same if every nested MemberExpr refers to the same
	// Decl and that the base Expr's are DeclRefExpr's with the same Decl or
	// the base Expr's are CXXThisExpr's.
	const Expr *LHSBase = LHSExpr;
	const Expr *RHSBase = RHSExpr;
	const MemberExpr *LHSME = dyn_cast<MemberExpr>(LHSExpr);
	const MemberExpr *RHSME = dyn_cast<MemberExpr>(RHSExpr);
	if (!LHSME \|\| !RHSME)
	return;

	while (LHSME && RHSME) {
	if (LHSME->getMemberDecl()->getCanonicalDecl() !=
	RHSME->getMemberDecl()->getCanonicalDecl())
	return;

	LHSBase = LHSME->getBase();
	RHSBase = RHSME->getBase();
	LHSME = dyn_cast<MemberExpr>(LHSBase);
	RHSME = dyn_cast<MemberExpr>(RHSBase);
	}

	LHSDeclRef = dyn_cast<DeclRefExpr>(LHSBase);
	RHSDeclRef = dyn_cast<DeclRefExpr>(RHSBase);
	if (LHSDeclRef && RHSDeclRef) {
	if (!LHSDeclRef->getDecl() \|\| !RHSDeclRef->getDecl())
	return;
	if (LHSDeclRef->getDecl()->getCanonicalDecl() !=
	RHSDeclRef->getDecl()->getCanonicalDecl())
	return;

	Diag(OpLoc, diag::warn_self_move) << LHSExpr->getType()
	<< LHSExpr->getSourceRange()
	<< RHSExpr->getSourceRange();
	return;
	}

	if (isa<CXXThisExpr>(LHSBase) && isa<CXXThisExpr>(RHSBase))
	Diag(OpLoc, diag::warn_self_move) << LHSExpr->getType()
	<< LHSExpr->getSourceRange()
	<< RHSExpr->getSourceRange();
	}

	//===--- Layout compatibility ----------------------------------------------//

	static bool isLayoutCompatible(ASTContext &C, QualType T1, QualType T2);

	/// \brief Check if two enumeration types are layout-compatible.
	static bool isLayoutCompatible(ASTContext &C, EnumDecl ED1, EnumDecl ED2) {
	// C++11 [dcl.enum] p8:
	// Two enumeration types are layout-compatible if they have the same
	// underlying type.
	return ED1->isComplete() && ED2->isComplete() &&
	C.hasSameType(ED1->getIntegerType(), ED2->getIntegerType());
	}

	/// \brief Check if two fields are layout-compatible.
	///
	/// The field types must be layout-compatible, both fields must agree on
	/// being bit-fields, and bit-fields must have equal widths.
	static bool isLayoutCompatible(ASTContext &C, FieldDecl *Field1,
	                               FieldDecl *Field2) {
	  if (!isLayoutCompatible(C, Field1->getType(), Field2->getType()))
	    return false;

	  const bool FirstIsBitField = Field1->isBitField();
	  if (FirstIsBitField != Field2->isBitField())
	    return false;

	  // Two ordinary fields need no further checks.
	  if (!FirstIsBitField)
	    return true;

	  // Make sure that the bit-fields are the same length.
	  const unsigned Width1 = Field1->getBitWidthValue(C);
	  const unsigned Width2 = Field2->getBitWidthValue(C);
	  return Width1 == Width2;
	}

	/// \brief Check if two standard-layout structs are layout-compatible.
	/// (C++11 [class.mem] p17)
	///
	/// Base classes (when the records are C++ classes) and then fields are
	/// compared pairwise, in declaration order; the records must have the same
	/// number of each.
	static bool isLayoutCompatibleStruct(ASTContext &C, RecordDecl *RD1,
	                                     RecordDecl *RD2) {
	  // If both records are C++ classes, check that base classes match.
	  if (const CXXRecordDecl *D1CXX = dyn_cast<CXXRecordDecl>(RD1)) {
	    // If one of records is a CXXRecordDecl we are in C++ mode,
	    // thus the other one is a CXXRecordDecl, too.
	    const CXXRecordDecl *D2CXX = cast<CXXRecordDecl>(RD2);
	    // Check number of base classes.
	    if (D1CXX->getNumBases() != D2CXX->getNumBases())
	      return false;

	    // Check the base classes. Both lists have the same length, so walking
	    // the first to its end keeps the second iterator in range.
	    for (CXXRecordDecl::base_class_const_iterator
	             Base1 = D1CXX->bases_begin(),
	             BaseEnd1 = D1CXX->bases_end(),
	             Base2 = D2CXX->bases_begin();
	         Base1 != BaseEnd1;
	         ++Base1, ++Base2) {
	      if (!isLayoutCompatible(C, Base1->getType(), Base2->getType()))
	        return false;
	    }
	  } else if (const CXXRecordDecl *D2CXX = dyn_cast<CXXRecordDecl>(RD2)) {
	    // If only RD2 is a C++ class, it should have zero base classes.
	    if (D2CXX->getNumBases() > 0)
	      return false;
	  }

	  // Check the fields.
	  RecordDecl::field_iterator Field2 = RD2->field_begin(),
	                             Field2End = RD2->field_end(),
	                             Field1 = RD1->field_begin(),
	                             Field1End = RD1->field_end();
	  for ( ; Field1 != Field1End && Field2 != Field2End; ++Field1, ++Field2) {
	    if (!isLayoutCompatible(C, *Field1, *Field2))
	      return false;
	  }
	  // One record has fields left over: the field counts differ.
	  if (Field1 != Field1End || Field2 != Field2End)
	    return false;

	  return true;
	}

	/// \brief Check if two standard-layout unions are layout-compatible.
	/// (C++11 [class.mem] p18)
	///
	/// Every field of \p RD1 must pair up with a distinct, layout-compatible
	/// field of \p RD2, and no field of \p RD2 may be left unpaired.
	static bool isLayoutCompatibleUnion(ASTContext &C, RecordDecl *RD1,
	                                    RecordDecl *RD2) {
	  // Fields of RD2 that have not been paired with a field of RD1 yet.
	  llvm::SmallPtrSet<FieldDecl *, 8> Unpaired;
	  for (auto *Candidate : RD2->fields())
	    Unpaired.insert(Candidate);

	  for (auto *Field1 : RD1->fields()) {
	    // Find the first still-unpaired field compatible with Field1.
	    FieldDecl *Partner = nullptr;
	    for (FieldDecl *Candidate : Unpaired) {
	      if (isLayoutCompatible(C, Field1, Candidate)) {
	        Partner = Candidate;
	        break;
	      }
	    }
	    if (!Partner)
	      return false;

	    bool Erased = Unpaired.erase(Partner);
	    (void) Erased;
	    assert(Erased);
	  }

	  return Unpaired.empty();
	}

	/// Check if two record declarations are layout-compatible, dispatching to
	/// the union or struct comparison. A union and a non-union never match.
	static bool isLayoutCompatible(ASTContext &C, RecordDecl *RD1,
	                               RecordDecl *RD2) {
	  const bool FirstIsUnion = RD1->isUnion();
	  if (FirstIsUnion != RD2->isUnion())
	    return false;

	  return FirstIsUnion ? isLayoutCompatibleUnion(C, RD1, RD2)
	                      : isLayoutCompatibleStruct(C, RD1, RD2);
	}

	/// \brief Check if two types are layout-compatible in C++11 sense.
	///
	/// Null types are never compatible and identical types always are.
	/// Otherwise only two enum types or two standard-layout record types can be
	/// compatible; every other combination yields false.
	static bool isLayoutCompatible(ASTContext &C, QualType T1, QualType T2) {
	  if (T1.isNull() || T2.isNull())
	    return false;

	  // C++11 [basic.types] p11:
	  // If two types T1 and T2 are the same type, then T1 and T2 are
	  // layout-compatible types.
	  if (C.hasSameType(T1, T2))
	    return true;

	  // Compare the canonical, unqualified forms from here on.
	  T1 = T1.getCanonicalType().getUnqualifiedType();
	  T2 = T2.getCanonicalType().getUnqualifiedType();

	  const Type::TypeClass TC1 = T1->getTypeClass();
	  const Type::TypeClass TC2 = T2->getTypeClass();

	  if (TC1 != TC2)
	    return false;

	  if (TC1 == Type::Enum) {
	    return isLayoutCompatible(C,
	                              cast<EnumType>(T1)->getDecl(),
	                              cast<EnumType>(T2)->getDecl());
	  } else if (TC1 == Type::Record) {
	    if (!T1->isStandardLayoutType() || !T2->isStandardLayoutType())
	      return false;

	    return isLayoutCompatible(C,
	                              cast<RecordType>(T1)->getDecl(),
	                              cast<RecordType>(T2)->getDecl());
	  }

	  return false;
	}

	//===--- CHECK: pointer_with_type_tag attribute: datatypes should match ----//

	/// \brief Given a type tag expression find the type tag itself.
	///
	/// The expression is simplified repeatedly (parentheses and casts, '&' and
	/// '*', constant-condition conditionals, the right operand of a comma) until
	/// a declaration reference or an integer literal is reached.
	///
	/// \param TypeExpr Type tag expression, as it appears in user's code.
	///
	/// \param VD Declaration of an identifier that appears in a type tag. Set
	/// only when the tag is a declaration reference.
	///
	/// \param MagicValue Type tag magic value. Set only when the tag is an
	/// integer literal that fits in 64 bits.
	///
	/// \returns true if one of the two out-parameters was set.
	static bool FindTypeTagExpr(const Expr *TypeExpr, const ASTContext &Ctx,
	                            const ValueDecl **VD, uint64_t *MagicValue) {
	  while(true) {
	    if (!TypeExpr)
	      return false;

	    TypeExpr = TypeExpr->IgnoreParenImpCasts()->IgnoreParenCasts();

	    switch (TypeExpr->getStmtClass()) {
	    case Stmt::UnaryOperatorClass: {
	      // Look through address-of and dereference.
	      const UnaryOperator *UO = cast<UnaryOperator>(TypeExpr);
	      if (UO->getOpcode() == UO_AddrOf || UO->getOpcode() == UO_Deref) {
	        TypeExpr = UO->getSubExpr();
	        continue;
	      }
	      return false;
	    }

	    case Stmt::DeclRefExprClass: {
	      const DeclRefExpr *DRE = cast<DeclRefExpr>(TypeExpr);
	      *VD = DRE->getDecl();
	      return true;
	    }

	    case Stmt::IntegerLiteralClass: {
	      const IntegerLiteral *IL = cast<IntegerLiteral>(TypeExpr);
	      llvm::APInt MagicValueAPInt = IL->getValue();
	      // Literals wider than 64 significant bits cannot be a magic value.
	      if (MagicValueAPInt.getActiveBits() <= 64) {
	        *MagicValue = MagicValueAPInt.getZExtValue();
	        return true;
	      } else
	        return false;
	    }

	    case Stmt::BinaryConditionalOperatorClass:
	    case Stmt::ConditionalOperatorClass: {
	      // Follow the selected arm when the condition can be evaluated.
	      const AbstractConditionalOperator *ACO =
	          cast<AbstractConditionalOperator>(TypeExpr);
	      bool Result;
	      if (ACO->getCond()->EvaluateAsBooleanCondition(Result, Ctx)) {
	        if (Result)
	          TypeExpr = ACO->getTrueExpr();
	        else
	          TypeExpr = ACO->getFalseExpr();
	        continue;
	      }
	      return false;
	    }

	    case Stmt::BinaryOperatorClass: {
	      // Only the comma operator is looked through; its value is the RHS.
	      const BinaryOperator *BO = cast<BinaryOperator>(TypeExpr);
	      if (BO->getOpcode() == BO_Comma) {
	        TypeExpr = BO->getRHS();
	        continue;
	      }
	      return false;
	    }

	    default:
	      return false;
	    }
	  }
	}

	/// \brief Retrieve the C type corresponding to type tag TypeExpr.
	///
	/// \param ArgumentKind Kind identifier the type tag has to be registered
	/// for.
	///
	/// \param TypeExpr Expression that specifies a type tag.
	///
	/// \param MagicValues Registered magic values; may be null.
	///
	/// \param FoundWrongKind Set to true if a type tag was found, but of a wrong
	/// kind.
	///
	/// \param TypeInfo Information about the corresponding C type.
	///
	/// \returns true if the corresponding C type was found.
	static bool GetMatchingCType(
	    const IdentifierInfo *ArgumentKind,
	    const Expr *TypeExpr, const ASTContext &Ctx,
	    const llvm::DenseMap<Sema::TypeTagMagicValue,
	                         Sema::TypeTagData> *MagicValues,
	    bool &FoundWrongKind,
	    Sema::TypeTagData &TypeInfo) {
	  FoundWrongKind = false;

	  // Filled in when the tag names a declaration (expected to carry the
	  // type_tag_for_datatype attribute).
	  const ValueDecl *TagDecl = nullptr;
	  // Filled in when the tag is an integer literal.
	  uint64_t Magic;

	  if (!FindTypeTagExpr(TypeExpr, Ctx, &TagDecl, &Magic))
	    return false;

	  if (TagDecl) {
	    TypeTagForDatatypeAttr *TagAttr =
	        TagDecl->getAttr<TypeTagForDatatypeAttr>();
	    if (!TagAttr)
	      return false;

	    if (TagAttr->getArgumentKind() != ArgumentKind) {
	      FoundWrongKind = true;
	      return false;
	    }

	    TypeInfo.Type = TagAttr->getMatchingCType();
	    TypeInfo.LayoutCompatible = TagAttr->getLayoutCompatible();
	    TypeInfo.MustBeNull = TagAttr->getMustBeNull();
	    return true;
	  }

	  // The tag is a magic number: look it up among the registered values.
	  if (!MagicValues)
	    return false;

	  auto Entry = MagicValues->find(std::make_pair(ArgumentKind, Magic));
	  if (Entry == MagicValues->end())
	    return false;

	  TypeInfo = Entry->second;
	  return true;
	}

	void Sema::RegisterTypeTagForDatatype(const IdentifierInfo *ArgumentKind,
	uint64_t MagicValue, QualType Type,
	bool LayoutCompatible,
	bool MustBeNull) {
	if (!TypeTagForDatatypeMagicValues)
	TypeTagForDatatypeMagicValues.reset(
	new llvm::DenseMap<TypeTagMagicValue, TypeTagData>);

	TypeTagMagicValue Magic(ArgumentKind, MagicValue);
	(*TypeTagForDatatypeMagicValues)[Magic] =
	TypeTagData(Type, LayoutCompatible, MustBeNull);
	}

	/// Check whether two builtin types are plain 'char' and the explicitly
	/// signed/unsigned character type of matching signedness (in either order).
	///
	/// \returns false if either type is not a builtin type or the pair is not
	/// one of SChar/Char_S or UChar/Char_U.
	static bool IsSameCharType(QualType T1, QualType T2) {
	  const BuiltinType *BT1 = T1->getAs<BuiltinType>();
	  if (!BT1)
	    return false;

	  const BuiltinType *BT2 = T2->getAs<BuiltinType>();
	  if (!BT2)
	    return false;

	  BuiltinType::Kind T1Kind = BT1->getKind();
	  BuiltinType::Kind T2Kind = BT2->getKind();

	  return (T1Kind == BuiltinType::SChar  && T2Kind == BuiltinType::Char_S) ||
	         (T1Kind == BuiltinType::UChar  && T2Kind == BuiltinType::Char_U) ||
	         (T1Kind == BuiltinType::Char_U && T2Kind == BuiltinType::UChar) ||
	         (T1Kind == BuiltinType::Char_S && T2Kind == BuiltinType::SChar);
	}

	/// Check a call argument against the type designated by the call's type tag
	/// argument, as requested by \p Attr.
	///
	/// \param Attr Attribute naming the argument kind and the indices of the
	/// type tag argument and of the checked argument.
	/// \param ExprArgs Arguments of the call.
	/// \param CallSiteLoc Location used for index-out-of-range errors.
	void Sema::CheckArgumentWithTypeTag(const ArgumentWithTypeTagAttr *Attr,
	                                    const ArrayRef<const Expr *> ExprArgs,
	                                    SourceLocation CallSiteLoc) {
	  const IdentifierInfo *ArgumentKind = Attr->getArgumentKind();
	  bool IsPointerAttr = Attr->getIsPointer();

	  // Retrieve the argument representing the 'type_tag'.
	  if (Attr->getTypeTagIdx() >= ExprArgs.size()) {
	    // Add 1 to display the user's specified value.
	    Diag(CallSiteLoc, diag::err_tag_index_out_of_range)
	        << 0 << Attr->getTypeTagIdx() + 1;
	    return;
	  }
	  const Expr *TypeTagExpr = ExprArgs[Attr->getTypeTagIdx()];
	  bool FoundWrongKind;
	  TypeTagData TypeInfo;
	  if (!GetMatchingCType(ArgumentKind, TypeTagExpr, Context,
	                        TypeTagForDatatypeMagicValues.get(),
	                        FoundWrongKind, TypeInfo)) {
	    if (FoundWrongKind)
	      Diag(TypeTagExpr->getExprLoc(),
	           diag::warn_type_tag_for_datatype_wrong_kind)
	        << TypeTagExpr->getSourceRange();
	    return;
	  }

	  // Retrieve the argument representing the 'arg_idx'.
	  if (Attr->getArgumentIdx() >= ExprArgs.size()) {
	    // Add 1 to display the user's specified value.
	    Diag(CallSiteLoc, diag::err_tag_index_out_of_range)
	        << 1 << Attr->getArgumentIdx() + 1;
	    return;
	  }
	  const Expr *ArgumentExpr = ExprArgs[Attr->getArgumentIdx()];
	  if (IsPointerAttr) {
	    // Skip implicit cast of pointer to `void *' (as a function argument).
	    if (const ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(ArgumentExpr))
	      if (ICE->getType()->isVoidPointerType() &&
	          ICE->getCastKind() == CK_BitCast)
	        ArgumentExpr = ICE->getSubExpr();
	  }
	  QualType ArgumentType = ArgumentExpr->getType();

	  // Passing a `void*' pointer shouldn't trigger a warning.
	  if (IsPointerAttr && ArgumentType->isVoidPointerType())
	    return;

	  if (TypeInfo.MustBeNull) {
	    // Type tag with matching void type requires a null pointer.
	    if (!ArgumentExpr->isNullPointerConstant(Context,
	                                             Expr::NPC_ValueDependentIsNotNull)) {
	      Diag(ArgumentExpr->getExprLoc(),
	           diag::warn_type_safety_null_pointer_required)
	          << ArgumentKind->getName()
	          << ArgumentExpr->getSourceRange()
	          << TypeTagExpr->getSourceRange();
	    }
	    return;
	  }

	  QualType RequiredType = TypeInfo.Type;
	  if (IsPointerAttr)
	    RequiredType = Context.getPointerType(RequiredType);

	  bool mismatch = false;
	  if (!TypeInfo.LayoutCompatible) {
	    mismatch = !Context.hasSameType(ArgumentType, RequiredType);

	    // C++11 [basic.fundamental] p1:
	    // Plain char, signed char, and unsigned char are three distinct types.
	    //
	    // But we treat plain `char' as equivalent to `signed char' or `unsigned
	    // char' depending on the current char signedness mode.
	    if (mismatch)
	      if ((IsPointerAttr && IsSameCharType(ArgumentType->getPointeeType(),
	                                           RequiredType->getPointeeType())) ||
	          (!IsPointerAttr && IsSameCharType(ArgumentType, RequiredType)))
	        mismatch = false;
	  } else if (IsPointerAttr) {
	    // Layout compatibility is judged on the pointee types.
	    mismatch = !isLayoutCompatible(Context,
	                                   ArgumentType->getPointeeType(),
	                                   RequiredType->getPointeeType());
	  } else {
	    mismatch = !isLayoutCompatible(Context, ArgumentType, RequiredType);
	  }

	  if (mismatch)
	    Diag(ArgumentExpr->getExprLoc(), diag::warn_type_safety_type_mismatch)
	        << ArgumentType << ArgumentKind
	        << TypeInfo.LayoutCompatible << RequiredType
	        << ArgumentExpr->getSourceRange()
	        << TypeTagExpr->getSourceRange();
	}

	void Sema::AddPotentialMisalignedMembers(Expr E, RecordDecl RD, ValueDecl *MD,
	CharUnits Alignment) {
	MisalignedMembers.emplace_back(E, RD, MD, Alignment);
	}

	void Sema::DiagnoseMisalignedMembers() {
	for (MisalignedMember &m : MisalignedMembers) {
	const NamedDecl *ND = m.RD;
	if (ND->getName().empty()) {
	if (const TypedefNameDecl *TD = m.RD->getTypedefNameForAnonDecl())
	ND = TD;
	}
	Diag(m.E->getLocStart(), diag::warn_taking_address_of_packed_member)
	<< m.MD << ND << m.E->getSourceRange();
	}
	MisalignedMembers.clear();
	}

	void Sema::DiscardMisalignedMemberAddress(const Type T, Expr E) {
	E = E->IgnoreParens();
	if (!T->isPointerType() && !T->isIntegerType())
	return;
	if (isa<UnaryOperator>(E) &&
	cast<UnaryOperator>(E)->getOpcode() == UO_AddrOf) {
	auto *Op = cast<UnaryOperator>(E)->getSubExpr()->IgnoreParens();
	if (isa<MemberExpr>(Op)) {
	auto MA = std::find(MisalignedMembers.begin(), MisalignedMembers.end(),
	MisalignedMember(Op));
	if (MA != MisalignedMembers.end() &&
	(T->isIntegerType() \|\|
	(T->isPointerType() && (T->getPointeeType()->isIncompleteType() \|\|
	Context.getTypeAlignInChars(
	T->getPointeeType()) <= MA->Alignment))))
	MisalignedMembers.erase(MA);
	}
	}
	}

	/// Detect a member access whose alignment has been reduced by a packed
	/// attribute somewhere in the access chain, and report it through
	/// \p Action.
	///
	/// \param E Expression to inspect; anything but a MemberExpr is ignored.
	/// \param Action Callback invoked with \p E, the record and field judged
	/// responsible, and the alignment computed for that field.
	void Sema::RefersToMemberWithReducedAlignment(
	    Expr *E,
	    llvm::function_ref<void(Expr *, RecordDecl *, FieldDecl *, CharUnits)>
	        Action) {
	  const auto *ME = dyn_cast<MemberExpr>(E);
	  if (!ME)
	    return;

	  // No need to check expressions with an __unaligned-qualified type.
	  if (E->getType().getQualifiers().hasUnaligned())
	    return;

	  // For a chain of MemberExpr like "a.b.c.d" this list
	  // will keep FieldDecl's like [d, c, b].
	  SmallVector<FieldDecl *, 4> ReverseMemberChain;
	  const MemberExpr *TopME = nullptr;
	  bool AnyIsPacked = false;
	  do {
	    QualType BaseType = ME->getBase()->getType();
	    if (ME->isArrow())
	      BaseType = BaseType->getPointeeType();
	    // NOTE(review): assumes the base of a member access always has record
	    // type here; getAs<RecordType>() would return null otherwise -- confirm.
	    RecordDecl *RD = BaseType->getAs<RecordType>()->getDecl();
	    if (RD->isInvalidDecl())
	      return;

	    ValueDecl *MD = ME->getMemberDecl();
	    auto *FD = dyn_cast<FieldDecl>(MD);
	    // We do not care about non-data members.
	    if (!FD || FD->isInvalidDecl())
	      return;

	    AnyIsPacked =
	        AnyIsPacked || (RD->hasAttr<PackedAttr>() || MD->hasAttr<PackedAttr>());
	    ReverseMemberChain.push_back(FD);

	    TopME = ME;
	    ME = dyn_cast<MemberExpr>(ME->getBase()->IgnoreParens());
	  } while (ME);
	  assert(TopME && "We did not compute a topmost MemberExpr!");

	  // Not the scope of this diagnostic.
	  if (!AnyIsPacked)
	    return;

	  const Expr *TopBase = TopME->getBase()->IgnoreParenImpCasts();
	  const auto *DRE = dyn_cast<DeclRefExpr>(TopBase);
	  // TODO: The innermost base of the member expression may be too complicated.
	  // For now, just disregard these cases. This is left for future
	  // improvement.
	  if (!DRE && !isa<CXXThisExpr>(TopBase))
	    return;

	  // Alignment expected by the whole expression.
	  CharUnits ExpectedAlignment = Context.getTypeAlignInChars(E->getType());

	  // No need to do anything else with this case.
	  if (ExpectedAlignment.isOne())
	    return;

	  // Synthesize offset of the whole access.
	  CharUnits Offset;
	  for (auto I = ReverseMemberChain.rbegin(); I != ReverseMemberChain.rend();
	       I++) {
	    Offset += Context.toCharUnitsFromBits(Context.getFieldOffset(*I));
	  }

	  // Compute the CompleteObjectAlignment as the alignment of the whole chain.
	  CharUnits CompleteObjectAlignment = Context.getTypeAlignInChars(
	      ReverseMemberChain.back()->getParent()->getTypeForDecl());

	  // The base expression of the innermost MemberExpr may give
	  // stronger guarantees than the class containing the member.
	  if (DRE && !TopME->isArrow()) {
	    const ValueDecl *VD = DRE->getDecl();
	    if (!VD->getType()->isReferenceType())
	      CompleteObjectAlignment =
	          std::max(CompleteObjectAlignment, Context.getDeclAlign(VD));
	  }

	  // Check if the synthesized offset fulfills the alignment.
	  if (Offset % ExpectedAlignment != 0 ||
	      // It may fulfill the offset it but the effective alignment may still be
	      // lower than the expected expression alignment.
	      CompleteObjectAlignment < ExpectedAlignment) {
	    // If this happens, we want to determine a sensible culprit of this.
	    // Intuitively, watching the chain of member expressions from right to
	    // left, we start with the required alignment (as required by the field
	    // type) but some packed attribute in that chain has reduced the alignment.
	    // It may happen that another packed structure increases it again. But if
	    // we are here such increase has not been enough. So pointing the first
	    // FieldDecl that either is packed or else its RecordDecl is,
	    // seems reasonable.
	    FieldDecl *FD = nullptr;
	    CharUnits Alignment;
	    for (FieldDecl *FDI : ReverseMemberChain) {
	      if (FDI->hasAttr<PackedAttr>() ||
	          FDI->getParent()->hasAttr<PackedAttr>()) {
	        FD = FDI;
	        Alignment = std::min(
	            Context.getTypeAlignInChars(FD->getType()),
	            Context.getTypeAlignInChars(FD->getParent()->getTypeForDecl()));
	        break;
	      }
	    }
	    assert(FD && "We did not find a packed FieldDecl!");
	    Action(E, FD->getParent(), FD, Alignment);
	  }
	}

	/// Inspect \p rhs with RefersToMemberWithReducedAlignment() and queue every
	/// reported member through AddPotentialMisalignedMembers().
	void Sema::CheckAddressOfPackedMember(Expr *rhs) {
	  // Forward each report to the queueing member function unchanged.
	  RefersToMemberWithReducedAlignment(
	      rhs, [this](Expr *E, RecordDecl *RD, FieldDecl *FD, CharUnits Alignment) {
	        AddPotentialMisalignedMembers(E, RD, FD, Alignment);
	      });
	}
	Index: head/contrib/llvm/tools/clang/lib/Sema/SemaInit.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Sema/SemaInit.cpp (revision 329409)
	+++ head/contrib/llvm/tools/clang/lib/Sema/SemaInit.cpp (revision 329410)
	@@ -1,8674 +1,8699 @@
	//===--- SemaInit.cpp - Semantic Analysis for Initializers ----------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements semantic analysis for initializers.
	//
	//===----------------------------------------------------------------------===//

	#include "clang/AST/ASTContext.h"
	#include "clang/AST/DeclObjC.h"
	#include "clang/AST/ExprCXX.h"
	#include "clang/AST/ExprObjC.h"
	#include "clang/AST/TypeLoc.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Sema/Designator.h"
	#include "clang/Sema/Initialization.h"
	#include "clang/Sema/Lookup.h"
	#include "clang/Sema/SemaInternal.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"

	using namespace clang;

	//===----------------------------------------------------------------------===//
	// Sema Initialization Checking
	//===----------------------------------------------------------------------===//

	/// \brief Check whether T is compatible with a wide character type (wchar_t,
	/// char16_t or char32_t).
	///
	/// Compatibility with the wide character type is always checked;
	/// char16_t and char32_t are only considered when the language options
	/// select C++ or C11.
	///
	/// \param T the type to test.
	/// \param Context the AST context that supplies the character types and the
	///        compatibility check.
	/// \returns true if \p T is compatible with one of the types above.
	static bool IsWideCharCompatible(QualType T, ASTContext &Context) {
	  if (Context.typesAreCompatible(Context.getWideCharType(), T))
	    return true;
	  if (Context.getLangOpts().CPlusPlus || Context.getLangOpts().C11) {
	    return Context.typesAreCompatible(Context.Char16Ty, T) ||
	           Context.typesAreCompatible(Context.Char32Ty, T);
	  }
	  return false;
	}

	/// Result of checking whether an array can be initialized from a string
	/// literal (see IsStringInit).
	enum StringInitFailureKind {
	// The string initialization is valid.
	SIF_None,
	// A narrow (ordinary or UTF-8) literal initializes an array whose element
	// type is compatible with a wide character type.
	SIF_NarrowStringIntoWideChar,
	// A wide, UTF-16 or UTF-32 literal initializes an array of char type.
	SIF_WideStringIntoChar,
	// A wide, UTF-16 or UTF-32 literal initializes an array whose element type
	// is compatible with a wide character type, but not the one matching the
	// literal.
	SIF_IncompatWideStringIntoWideChar,
	// Any other reason (not a suitable array, not a string literal, ...).
	SIF_Other
	};

	/// \brief Check whether the array of type AT can be initialized by the Init
	/// expression by means of string initialization. Returns SIF_None if so,
	/// otherwise returns a StringInitFailureKind that describes why the
	/// initialization would not work.
	///
	/// \param Init the candidate initializer; surrounding parentheses are
	///        ignored.
	/// \param AT the array type being initialized; only constant and incomplete
	///        array types can be string-initialized.
	/// \param Context the AST context used for type queries.
	static StringInitFailureKind IsStringInit(Expr *Init, const ArrayType *AT,
	                                          ASTContext &Context) {
	  if (!isa<ConstantArrayType>(AT) && !isa<IncompleteArrayType>(AT))
	    return SIF_Other;

	  // See if this is a string literal or @encode.
	  Init = Init->IgnoreParens();

	  // Handle @encode, which is a narrow string.
	  if (isa<ObjCEncodeExpr>(Init) && AT->getElementType()->isCharType())
	    return SIF_None;

	  // Otherwise we can only handle string literals.
	  StringLiteral *SL = dyn_cast<StringLiteral>(Init);
	  if (!SL)
	    return SIF_Other;

	  const QualType ElemTy =
	      Context.getCanonicalType(AT->getElementType()).getUnqualifiedType();

	  // The UTF-16, UTF-32 and wide literal kinds are classified identically;
	  // only the character type that makes the initialization valid differs.
	  auto ClassifyWideLiteral =
	      [&](QualType LiteralCharTy) -> StringInitFailureKind {
	    if (Context.typesAreCompatible(LiteralCharTy, ElemTy))
	      return SIF_None;
	    if (ElemTy->isCharType())
	      return SIF_WideStringIntoChar;
	    if (IsWideCharCompatible(ElemTy, Context))
	      return SIF_IncompatWideStringIntoWideChar;
	    return SIF_Other;
	  };

	  switch (SL->getKind()) {
	  case StringLiteral::Ascii:
	  case StringLiteral::UTF8:
	    // char array can be initialized with a narrow string.
	    // Only allow char x[] = "foo"; not char x[] = L"foo";
	    if (ElemTy->isCharType())
	      return SIF_None;
	    if (IsWideCharCompatible(ElemTy, Context))
	      return SIF_NarrowStringIntoWideChar;
	    return SIF_Other;
	  // C99 6.7.8p15 (with correction from DR343), or C11 6.7.9p15:
	  // "An array with element type compatible with a qualified or unqualified
	  // version of wchar_t, char16_t, or char32_t may be initialized by a wide
	  // string literal with the corresponding encoding prefix (L, u, or U,
	  // respectively), optionally enclosed in braces."
	  case StringLiteral::UTF16:
	    return ClassifyWideLiteral(Context.Char16Ty);
	  case StringLiteral::UTF32:
	    return ClassifyWideLiteral(Context.Char32Ty);
	  case StringLiteral::Wide:
	    return ClassifyWideLiteral(Context.getWideCharType());
	  }

	  llvm_unreachable("missed a StringLiteral kind?");
	}

	/// Overload taking the declared type: string initialization is only
	/// considered when \p declType is an array type; otherwise SIF_Other is
	/// returned.
	static StringInitFailureKind IsStringInit(Expr *init, QualType declType,
	                                          ASTContext &Context) {
	  if (const ArrayType *AT = Context.getAsArrayType(declType))
	    return IsStringInit(init, AT, Context);
	  return SIF_Other;
	}

	/// Update the type of a string literal, including any surrounding parentheses,
	/// to match the type of the object which it is initializing.
	///
	/// Starting at \p E, every expression on the path down to the string literal
	/// (or @encode expression) has its type set to \p Ty. The path may pass
	/// through parentheses, unary operators and generic selections; any other
	/// expression kind is treated as unreachable.
	static void updateStringLiteralType(Expr *E, QualType Ty) {
	  for (;;) {
	    E->setType(Ty);

	    // The literal itself ends the walk.
	    if (isa<StringLiteral>(E) || isa<ObjCEncodeExpr>(E))
	      return;

	    if (ParenExpr *PE = dyn_cast<ParenExpr>(E))
	      E = PE->getSubExpr();
	    else if (UnaryOperator *UO = dyn_cast<UnaryOperator>(E))
	      E = UO->getSubExpr();
	    else if (GenericSelectionExpr *GSE = dyn_cast<GenericSelectionExpr>(E))
	      E = GSE->getResultExpr();
	    else
	      llvm_unreachable("unexpected expr in string literal init");
	  }
	}

	static void CheckStringInit(Expr Str, QualType &DeclT, const ArrayType AT,
	Sema &S) {
	// Get the length of the string as parsed.
	auto *ConstantArrayTy =
	cast<ConstantArrayType>(Str->getType()->getAsArrayTypeUnsafe());
	uint64_t StrLength = ConstantArrayTy->getSize().getZExtValue();

	if (const IncompleteArrayType *IAT = dyn_cast<IncompleteArrayType>(AT)) {
	// C99 6.7.8p14. We have an array of character type with unknown size
	// being initialized to a string literal.
	llvm::APInt ConstVal(32, StrLength);
	// Return a new array type (C99 6.7.8p22).
	DeclT = S.Context.getConstantArrayType(IAT->getElementType(),
	ConstVal,
	ArrayType::Normal, 0);
	updateStringLiteralType(Str, DeclT);
	return;
	}

	const ConstantArrayType *CAT = cast<ConstantArrayType>(AT);

	// We have an array of character type with known size. However,
	// the size may be smaller or larger than the string we are initializing.
	// FIXME: Avoid truncation for 64-bit length strings.
	if (S.getLangOpts().CPlusPlus) {
	if (StringLiteral *SL = dyn_cast<StringLiteral>(Str->IgnoreParens())) {
	// For Pascal strings it's OK to strip off the terminating null character,
	// so the example below is valid:
	//
	// unsigned char a[2] = "\pa";
	if (SL->isPascal())
	StrLength--;
	}

	// [dcl.init.string]p2
	if (StrLength > CAT->getSize().getZExtValue())
	S.Diag(Str->getLocStart(),
	diag::err_initializer_string_for_char_array_too_long)
	<< Str->getSourceRange();
	} else {
	// C99 6.7.8p14.
	if (StrLength-1 > CAT->getSize().getZExtValue())
	S.Diag(Str->getLocStart(),
	diag::ext_initializer_string_for_char_array_too_long)
	<< Str->getSourceRange();
	}

	// Set the type to the actual size that we are initializing. If we have
	// something like:
	// char x[1] = "foo";
	// then this will set the string literal's type to char[1].
	updateStringLiteralType(Str, DeclT);
	}

	//===----------------------------------------------------------------------===//
	// Semantic checking for initializer lists.
	//===----------------------------------------------------------------------===//

	namespace {

	/// @brief Semantic checking for initializer lists.
	///
	/// The InitListChecker class contains a set of routines that each
	/// handle the initialization of a certain kind of entity, e.g.,
	/// arrays, vectors, struct/union types, scalars, etc. The
	/// InitListChecker itself performs a recursive walk of the subobject
	/// structure of the type to be initialized, while stepping through
	/// the initializer list one element at a time. The IList and Index
	/// parameters to each of the Check* routines contain the active
	/// (syntactic) initializer list and the index into that initializer
	/// list that represents the current initializer. Each routine is
	/// responsible for moving that Index forward as it consumes elements.
	///
	/// Each Check* routine also has StructuredList/StructuredIndex
	/// arguments, which contain the current "structured" (semantic)
	/// initializer list and the index into that initializer list where we
	/// are copying initializers as we map them over to the semantic
	/// list. Once we have completed our recursive walk of the subobject
	/// structure, we will have constructed a full semantic initializer
	/// list.
	///
	/// C99 designators cause changes in the initializer list traversal,
	/// because they make the initialization "jump" into a specific
	/// subobject and then continue the initialization from that
	/// point. CheckDesignatedInitializer() recursively steps into the
	/// designated subobject and manages backing out the recursion to
	/// initialize the subobjects after the one designated.
	class InitListChecker {
	// The Sema instance used for diagnostics and AST construction.
	Sema &SemaRef;
	// Set once any check fails; exposed through HadError().
	bool hadError;
	bool VerifyOnly; // no diagnostics, no structure building
	bool TreatUnavailableAsInvalid; // Used only in VerifyOnly mode.
	// NOTE(review): presumably maps each syntactic list to the semantic list
	// built for it; its uses are outside this view.
	llvm::DenseMap<InitListExpr , InitListExpr > SyntacticToSemantic;
	// The top-level structured list; returned by getFullyStructuredList().
	InitListExpr *FullyStructuredList;

	void CheckImplicitInitList(const InitializedEntity &Entity,
	InitListExpr *ParentIList, QualType T,
	unsigned &Index, InitListExpr *StructuredList,
	unsigned &StructuredIndex);
	void CheckExplicitInitList(const InitializedEntity &Entity,
	InitListExpr *IList, QualType &T,
	InitListExpr *StructuredList,
	bool TopLevelObject = false);
	void CheckListElementTypes(const InitializedEntity &Entity,
	InitListExpr *IList, QualType &DeclType,
	bool SubobjectIsDesignatorContext,
	unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex,
	bool TopLevelObject = false);
	void CheckSubElementType(const InitializedEntity &Entity,
	InitListExpr *IList, QualType ElemType,
	unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex);
	void CheckComplexType(const InitializedEntity &Entity,
	InitListExpr *IList, QualType DeclType,
	unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex);
	void CheckScalarType(const InitializedEntity &Entity,
	InitListExpr *IList, QualType DeclType,
	unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex);
	void CheckReferenceType(const InitializedEntity &Entity,
	InitListExpr *IList, QualType DeclType,
	unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex);
	void CheckVectorType(const InitializedEntity &Entity,
	InitListExpr *IList, QualType DeclType, unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex);
	void CheckStructUnionTypes(const InitializedEntity &Entity,
	InitListExpr *IList, QualType DeclType,
	CXXRecordDecl::base_class_range Bases,
	RecordDecl::field_iterator Field,
	bool SubobjectIsDesignatorContext, unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex,
	bool TopLevelObject = false);
	void CheckArrayType(const InitializedEntity &Entity,
	InitListExpr *IList, QualType &DeclType,
	llvm::APSInt elementIndex,
	bool SubobjectIsDesignatorContext, unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex);
	bool CheckDesignatedInitializer(const InitializedEntity &Entity,
	InitListExpr IList, DesignatedInitExpr DIE,
	unsigned DesigIdx,
	QualType &CurrentObjectType,
	RecordDecl::field_iterator *NextField,
	llvm::APSInt *NextElementIndex,
	unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex,
	bool FinishSubobjectInit,
	bool TopLevelObject);
	InitListExpr getStructuredSubobjectInit(InitListExpr IList, unsigned Index,
	QualType CurrentObjectType,
	InitListExpr *StructuredList,
	unsigned StructuredIndex,
	SourceRange InitRange,
	bool IsFullyOverwritten = false);
	void UpdateStructuredListElement(InitListExpr *StructuredList,
	unsigned &StructuredIndex,
	Expr *expr);
	// Upper bound on the number of initializable elements of an array type.
	int numArrayElements(QualType DeclType);
	// Number of initializable bases and named fields of a record type.
	int numStructUnionElements(QualType DeclType);

	// Builds (or, with VerifyOnly, only checks) the initialization of Entity
	// when no initializer was written for it.
	static ExprResult PerformEmptyInit(Sema &SemaRef,
	SourceLocation Loc,
	const InitializedEntity &Entity,
	bool VerifyOnly,
	bool TreatUnavailableAsInvalid);

	// Explanation on the "FillWithNoInit" mode:
	//
	// Assume we have the following definitions (Case#1):
	// struct P { char x[6][6]; } xp = { .x[1] = "bar" };
	// struct PP { struct P lp; } l = { .lp = xp, .lp.x[1][2] = 'f' };
	//
	// l.lp.x[1][0..1] should not be filled with implicit initializers because the
	// "base" initializer "xp" will provide values for them; l.lp.x[1] will be "baf".
	//
	// But if we have (Case#2):
	// struct PP l = { .lp = xp, .lp.x[1] = { [2] = 'f' } };
	//
	// l.lp.x[1][0..1] are implicitly initialized and do not use values from the
	// "base" initializer; l.lp.x[1] will be "\0\0f\0\0\0".
	//
	// To distinguish Case#1 from Case#2, and also to avoid leaving many "holes"
	// in the InitListExpr, the "holes" in Case#1 are filled not with empty
	// initializers but with special "NoInitExpr" place holders, which tells the
	// CodeGen not to generate any initializers for these parts.
	void FillInEmptyInitForBase(unsigned Init, const CXXBaseSpecifier &Base,
	const InitializedEntity &ParentEntity,
	InitListExpr *ILE, bool &RequiresSecondPass,
	bool FillWithNoInit);
	void FillInEmptyInitForField(unsigned Init, FieldDecl *Field,
	const InitializedEntity &ParentEntity,
	InitListExpr *ILE, bool &RequiresSecondPass,
	bool FillWithNoInit = false);
	// OuterILE/OuterIndex identify the list element that holds ILE when ILE is
	// nested; OuterILE is null for the top-level list.
	void FillInEmptyInitializations(const InitializedEntity &Entity,
	InitListExpr *ILE, bool &RequiresSecondPass,
	+ InitListExpr *OuterILE, unsigned OuterIndex,
	bool FillWithNoInit = false);
	bool CheckFlexibleArrayInit(const InitializedEntity &Entity,
	Expr InitExpr, FieldDecl Field,
	bool TopLevelObject);
	// Verification-mode check that Entity can be initialized with no
	// initializer; sets hadError on failure.
	void CheckEmptyInitializable(const InitializedEntity &Entity,
	SourceLocation Loc);

	public:
	InitListChecker(Sema &S, const InitializedEntity &Entity,
	InitListExpr *IL, QualType &T, bool VerifyOnly,
	bool TreatUnavailableAsInvalid);
	// Whether any error was recorded while checking.
	bool HadError() { return hadError; }

	// @brief Retrieves the fully-structured initializer list used for
	// semantic analysis and code generation.
	InitListExpr *getFullyStructuredList() const { return FullyStructuredList; }
	};

	} // end anonymous namespace

	/// Build the initialization of \p Entity for the case where no initializer
	/// was written for it.
	///
	/// In C++11 mode, entities whose base element type is a record are
	/// copy-initialized from an empty initializer list; everything else is
	/// value-initialized.
	///
	/// \param SemaRef the Sema instance used for analysis and diagnostics.
	/// \param Loc location used for the synthesized initializer and notes.
	/// \param Entity the entity being initialized.
	/// \param VerifyOnly if true, only check that the initialization is
	///        possible: no diagnostics are emitted and nothing is built.
	/// \param TreatUnavailableAsInvalid forwarded to the initialization
	///        sequence when recovering via value-initialization.
	/// \returns ExprError() if the initialization is not possible; otherwise a
	///          null expression in VerifyOnly mode, or the result of performing
	///          the initialization sequence.
	ExprResult InitListChecker::PerformEmptyInit(Sema &SemaRef,
	                                             SourceLocation Loc,
	                                             const InitializedEntity &Entity,
	                                             bool VerifyOnly,
	                                             bool TreatUnavailableAsInvalid) {
	  InitializationKind Kind = InitializationKind::CreateValue(Loc, Loc, Loc,
	                                                            true);
	  MultiExprArg SubInit;
	  // NOTE(review): SubInit is assigned from this variable below and is used
	  // until the end of the function; presumably it refers to the variable
	  // itself, so keep the declaration at function scope.
	  Expr *InitExpr;
	  InitListExpr DummyInitList(SemaRef.Context, Loc, None, Loc);

	  // C++ [dcl.init.aggr]p7:
	  // If there are fewer initializer-clauses in the list than there are
	  // members in the aggregate, then each member not explicitly initialized
	  // ...
	  bool EmptyInitList = SemaRef.getLangOpts().CPlusPlus11 &&
	      Entity.getType()->getBaseElementTypeUnsafe()->isRecordType();
	  if (EmptyInitList) {
	    // C++1y / DR1070:
	    // shall be initialized [...] from an empty initializer list.
	    //
	    // We apply the resolution of this DR to C++11 but not C++98, since C++98
	    // does not have useful semantics for initialization from an init list.
	    // We treat this as copy-initialization, because aggregate initialization
	    // always performs copy-initialization on its elements.
	    //
	    // Only do this if we're initializing a class type, to avoid filling in
	    // the initializer list where possible.
	    InitExpr = VerifyOnly ? &DummyInitList : new (SemaRef.Context)
	                   InitListExpr(SemaRef.Context, Loc, None, Loc);
	    InitExpr->setType(SemaRef.Context.VoidTy);
	    SubInit = InitExpr;
	    Kind = InitializationKind::CreateCopy(Loc, Loc);
	  } else {
	    // C++03:
	    // shall be value-initialized.
	  }

	  InitializationSequence InitSeq(SemaRef, Entity, Kind, SubInit);
	  // libstdc++4.6 marks the vector default constructor as explicit in
	  // _GLIBCXX_DEBUG mode, so recover using the C++03 logic in that case.
	  // stlport does so too. Look for std::__debug for libstdc++, and for
	  // std:: for stlport. This is effectively a compiler-side implementation of
	  // LWG2193.
	  if (!InitSeq && EmptyInitList && InitSeq.getFailureKind() ==
	          InitializationSequence::FK_ExplicitConstructor) {
	    OverloadCandidateSet::iterator Best;
	    OverloadingResult O =
	        InitSeq.getFailedCandidateSet()
	            .BestViableFunction(SemaRef, Kind.getLocation(), Best);
	    (void)O;
	    assert(O == OR_Success && "Inconsistent overload resolution");
	    CXXConstructorDecl *CtorDecl = cast<CXXConstructorDecl>(Best->Function);
	    CXXRecordDecl *R = CtorDecl->getParent();

	    if (CtorDecl->getMinRequiredArguments() == 0 &&
	        CtorDecl->isExplicit() && R->getDeclName() &&
	        SemaRef.SourceMgr.isInSystemHeader(CtorDecl->getLocation())) {
	      // Walk outwards through the enclosing namespaces looking for std.
	      bool IsInStd = false;
	      for (NamespaceDecl *ND = dyn_cast<NamespaceDecl>(R->getDeclContext());
	           ND && !IsInStd; ND = dyn_cast<NamespaceDecl>(ND->getParent())) {
	        if (SemaRef.getStdNamespace()->InEnclosingNamespaceSetOf(ND))
	          IsInStd = true;
	      }

	      if (IsInStd && llvm::StringSwitch<bool>(R->getName())
	              .Cases("basic_string", "deque", "forward_list", true)
	              .Cases("list", "map", "multimap", "multiset", true)
	              .Cases("priority_queue", "queue", "set", "stack", true)
	              .Cases("unordered_map", "unordered_set", "vector", true)
	              .Default(false)) {
	        InitSeq.InitializeFrom(
	            SemaRef, Entity,
	            InitializationKind::CreateValue(Loc, Loc, Loc, true),
	            MultiExprArg(), /*TopLevelOfInitList=*/false,
	            TreatUnavailableAsInvalid);
	        // Emit a warning for this. System header warnings aren't shown
	        // by default, but people working on system headers should see it.
	        if (!VerifyOnly) {
	          SemaRef.Diag(CtorDecl->getLocation(),
	                       diag::warn_invalid_initializer_from_system_header);
	          if (Entity.getKind() == InitializedEntity::EK_Member)
	            SemaRef.Diag(Entity.getDecl()->getLocation(),
	                         diag::note_used_in_initialization_here);
	          else if (Entity.getKind() == InitializedEntity::EK_ArrayElement)
	            SemaRef.Diag(Loc, diag::note_used_in_initialization_here);
	        }
	      }
	    }
	  }
	  if (!InitSeq) {
	    if (!VerifyOnly) {
	      InitSeq.Diagnose(SemaRef, Entity, Kind, SubInit);
	      if (Entity.getKind() == InitializedEntity::EK_Member)
	        SemaRef.Diag(Entity.getDecl()->getLocation(),
	                     diag::note_in_omitted_aggregate_initializer)
	          << /*field*/1 << Entity.getDecl();
	      else if (Entity.getKind() == InitializedEntity::EK_ArrayElement) {
	        bool IsTrailingArrayNewMember =
	            Entity.getParent() &&
	            Entity.getParent()->isVariableLengthArrayNew();
	        SemaRef.Diag(Loc, diag::note_in_omitted_aggregate_initializer)
	          << (IsTrailingArrayNewMember ? 2 : /*array element*/0)
	          << Entity.getElementIndex();
	      }
	    }
	    return ExprError();
	  }

	  return VerifyOnly ? ExprResult(static_cast<Expr *>(nullptr))
	                    : InitSeq.Perform(SemaRef, Entity, Kind, SubInit);
	}

	/// Verification-mode check that \p Entity can be initialized without an
	/// explicit initializer. Sets hadError if the empty initialization is
	/// invalid; emits no diagnostics and builds nothing.
	///
	/// \param Entity the entity that would be left without an initializer.
	/// \param Loc location passed on to PerformEmptyInit.
	void InitListChecker::CheckEmptyInitializable(const InitializedEntity &Entity,
	                                              SourceLocation Loc) {
	  assert(VerifyOnly &&
	         "CheckEmptyInitializable is only inteded for verification mode.");
	  if (PerformEmptyInit(SemaRef, Loc, Entity, /*VerifyOnly*/true,
	                       TreatUnavailableAsInvalid).isInvalid())
	    hadError = true;
	}

	/// Fill in the initializer for the base class \p Base, stored at index
	/// \p Init of \p ILE.
	///
	/// If the slot is empty it receives either a NoInitExpr (when
	/// \p FillWithNoInit is set) or the result of PerformEmptyInit; on failure
	/// hadError is set and the slot is left untouched. If the slot already holds
	/// a nested initializer list or a designated-init update, that nested list is
	/// filled in recursively.
	void InitListChecker::FillInEmptyInitForBase(
	unsigned Init, const CXXBaseSpecifier &Base,
	const InitializedEntity &ParentEntity, InitListExpr *ILE,
	bool &RequiresSecondPass, bool FillWithNoInit) {
	assert(Init < ILE->getNumInits() && "should have been expanded");

	InitializedEntity BaseEntity = InitializedEntity::InitializeBase(
	SemaRef.Context, &Base, false, &ParentEntity);

	if (!ILE->getInit(Init)) {
	// No initializer was provided for this base: synthesize one.
	ExprResult BaseInit =
	FillWithNoInit ? new (SemaRef.Context) NoInitExpr(Base.getType())
	: PerformEmptyInit(SemaRef, ILE->getLocEnd(), BaseEntity,
	/VerifyOnly/ false,
	TreatUnavailableAsInvalid);
	if (BaseInit.isInvalid()) {
	hadError = true;
	return;
	}

	ILE->setInit(Init, BaseInit.getAs<Expr>());
	} else if (InitListExpr *InnerILE =
	dyn_cast<InitListExpr>(ILE->getInit(Init))) {
	// Nested list: recurse, passing this list and slot as the outer context.
	- FillInEmptyInitializations(BaseEntity, InnerILE,
	- RequiresSecondPass, FillWithNoInit);
	+ FillInEmptyInitializations(BaseEntity, InnerILE, RequiresSecondPass,
	+ ILE, Init, FillWithNoInit);
	} else if (DesignatedInitUpdateExpr *InnerDIUE =
	dyn_cast<DesignatedInitUpdateExpr>(ILE->getInit(Init))) {
	// Designated update: holes in the updater are always filled with NoInitExpr.
	FillInEmptyInitializations(BaseEntity, InnerDIUE->getUpdater(),
	- RequiresSecondPass, /FillWithNoInit =/true);
	+ RequiresSecondPass, ILE, Init,
	+ /FillWithNoInit =/true);
	}
	}

	/// Fill in the initializer for \p Field, stored at index \p Init of \p ILE.
	///
	/// When the slot is missing or empty, it is filled (in order of preference)
	/// with a NoInitExpr if \p FillWithNoInit is set, with the field's in-class
	/// initializer, or with the result of PerformEmptyInit; an uninitialized
	/// reference member is diagnosed as an error. \p RequiresSecondPass is set
	/// when the list had to be extended with a new expression. When the slot
	/// already holds a nested list or a designated-init update, that nested list
	/// is filled in recursively.
	void InitListChecker::FillInEmptyInitForField(unsigned Init, FieldDecl *Field,
	const InitializedEntity &ParentEntity,
	InitListExpr *ILE,
	bool &RequiresSecondPass,
	bool FillWithNoInit) {
	SourceLocation Loc = ILE->getLocEnd();
	unsigned NumInits = ILE->getNumInits();
	InitializedEntity MemberEntity
	= InitializedEntity::InitializeMember(Field, &ParentEntity);

	// Non-union records must already have a slot for every field.
	if (const RecordType *RType = ILE->getType()->getAs<RecordType>())
	if (!RType->getDecl()->isUnion())
	assert(Init < NumInits && "This ILE should have been expanded");

	if (Init >= NumInits \|\| !ILE->getInit(Init)) {
	if (FillWithNoInit) {
	Expr *Filler = new (SemaRef.Context) NoInitExpr(Field->getType());
	// setInit overwrites an existing slot; updateInit is used for an index past
	// the current end of the list.
	if (Init < NumInits)
	ILE->setInit(Init, Filler);
	else
	ILE->updateInit(SemaRef.Context, Init, Filler);
	return;
	}
	// C++1y [dcl.init.aggr]p7:
	// If there are fewer initializer-clauses in the list than there are
	// members in the aggregate, then each member not explicitly initialized
	// shall be initialized from its brace-or-equal-initializer [...]
	if (Field->hasInClassInitializer()) {
	ExprResult DIE = SemaRef.BuildCXXDefaultInitExpr(Loc, Field);
	if (DIE.isInvalid()) {
	hadError = true;
	return;
	}
	if (Init < NumInits)
	ILE->setInit(Init, DIE.get());
	else {
	ILE->updateInit(SemaRef.Context, Init, DIE.get());
	RequiresSecondPass = true;
	}
	return;
	}

	if (Field->getType()->isReferenceType()) {
	// C++ [dcl.init.aggr]p9:
	// If an incomplete or empty initializer-list leaves a
	// member of reference type uninitialized, the program is
	// ill-formed.
	SemaRef.Diag(Loc, diag::err_init_reference_member_uninitialized)
	<< Field->getType()
	<< ILE->getSyntacticForm()->getSourceRange();
	SemaRef.Diag(Field->getLocation(),
	diag::note_uninit_reference_member);
	hadError = true;
	return;
	}

	// No in-class initializer and not a reference: perform empty initialization.
	ExprResult MemberInit = PerformEmptyInit(SemaRef, Loc, MemberEntity,
	/VerifyOnly/false,
	TreatUnavailableAsInvalid);
	if (MemberInit.isInvalid()) {
	hadError = true;
	return;
	}

	if (hadError) {
	// Do nothing
	} else if (Init < NumInits) {
	ILE->setInit(Init, MemberInit.getAs<Expr>());
	} else if (!isa<ImplicitValueInitExpr>(MemberInit.get())) {
	// Empty initialization requires a constructor call, so
	// extend the initializer list to include the constructor
	// call and make a note that we'll need to take another pass
	// through the initializer list.
	ILE->updateInit(SemaRef.Context, Init, MemberInit.getAs<Expr>());
	RequiresSecondPass = true;
	}
	} else if (InitListExpr *InnerILE
	= dyn_cast<InitListExpr>(ILE->getInit(Init)))
	// Nested list: recurse, passing this list and slot as the outer context.
	FillInEmptyInitializations(MemberEntity, InnerILE,
	- RequiresSecondPass, FillWithNoInit);
	+ RequiresSecondPass, ILE, Init, FillWithNoInit);
	else if (DesignatedInitUpdateExpr *InnerDIUE
	= dyn_cast<DesignatedInitUpdateExpr>(ILE->getInit(Init)))
	// Designated update: holes in the updater are always filled with NoInitExpr.
	FillInEmptyInitializations(MemberEntity, InnerDIUE->getUpdater(),
	- RequiresSecondPass, /FillWithNoInit =/ true);
	+ RequiresSecondPass, ILE, Init,
	+ /FillWithNoInit =/true);
	}

	/// Recursively replaces NULL values within the given initializer list
	/// with expressions that perform value-initialization of the
	-/// appropriate type.
	+/// appropriate type, and finish off the InitListExpr formation.
	///
	/// \p OuterILE and \p OuterIndex name the list element that holds \p ILE
	/// when it is nested (null / 0 at the top level). \p RequiresSecondPass is
	/// set when a list was extended with a new expression. With
	/// \p FillWithNoInit, holes are filled with NoInitExpr instead of
	/// empty-initialization expressions.
	void
	InitListChecker::FillInEmptyInitializations(const InitializedEntity &Entity,
	InitListExpr *ILE,
	bool &RequiresSecondPass,
	+ InitListExpr *OuterILE,
	+ unsigned OuterIndex,
	bool FillWithNoInit) {
	assert((ILE->getType() != SemaRef.Context.VoidTy) &&
	"Should not have void type");

	+ // If this is a nested initializer list, we might have changed its contents
	+ // (and therefore some of its properties, such as instantiation-dependence)
	+ // while filling it in. Inform the outer initializer list so that its state
	+ // can be updated to match.
	+ // FIXME: We should fully build the inner initializers before constructing
	+ // the outer InitListExpr instead of mutating AST nodes after they have
	+ // been used as subexpressions of other nodes.
	// The destructor runs on every exit path of this function and re-sets the
	// outer element to its current value.
	+ struct UpdateOuterILEWithUpdatedInit {
	+ InitListExpr *Outer;
	+ unsigned OuterIndex;
	+ ~UpdateOuterILEWithUpdatedInit() {
	+ if (Outer)
	+ Outer->setInit(OuterIndex, Outer->getInit(OuterIndex));
	+ }
	+ } UpdateOuterRAII = {OuterILE, OuterIndex};
	+
	// A transparent ILE is not performing aggregate initialization and should
	// not be filled in.
	if (ILE->isTransparent())
	return;

	// Record types: fill bases and fields, then return.
	if (const RecordType *RType = ILE->getType()->getAs<RecordType>()) {
	const RecordDecl *RDecl = RType->getDecl();
	// Union with an explicitly initialized field: only that field is filled.
	if (RDecl->isUnion() && ILE->getInitializedFieldInUnion())
	FillInEmptyInitForField(0, ILE->getInitializedFieldInUnion(),
	Entity, ILE, RequiresSecondPass, FillWithNoInit);
	else if (RDecl->isUnion() && isa<CXXRecordDecl>(RDecl) &&
	cast<CXXRecordDecl>(RDecl)->hasInClassInitializer()) {
	// Union without an initialized field: use the first field that has an
	// in-class initializer.
	for (auto *Field : RDecl->fields()) {
	if (Field->hasInClassInitializer()) {
	FillInEmptyInitForField(0, Field, Entity, ILE, RequiresSecondPass,
	FillWithNoInit);
	break;
	}
	}
	} else {
	// The fields beyond ILE->getNumInits() are default initialized, so in
	// order to leave them uninitialized, the ILE is expanded and the extra
	// fields are then filled with NoInitExpr.
	unsigned NumElems = numStructUnionElements(ILE->getType());
	if (RDecl->hasFlexibleArrayMember())
	++NumElems;
	if (ILE->getNumInits() < NumElems)
	ILE->resizeInits(SemaRef.Context, NumElems);

	// Slot index shared by bases (first) and fields (after).
	unsigned Init = 0;

	if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RDecl)) {
	for (auto &Base : CXXRD->bases()) {
	if (hadError)
	return;

	FillInEmptyInitForBase(Init, Base, Entity, ILE, RequiresSecondPass,
	FillWithNoInit);
	++Init;
	}
	}

	for (auto *Field : RDecl->fields()) {
	// Unnamed bit-fields do not occupy a slot in the list.
	if (Field->isUnnamedBitfield())
	continue;

	if (hadError)
	return;

	FillInEmptyInitForField(Init, Field, Entity, ILE, RequiresSecondPass,
	FillWithNoInit);
	if (hadError)
	return;

	++Init;

	// Only look at the first initialization of a union.
	if (RDecl->isUnion())
	break;
	}
	}

	return;
	}

	// Non-record types: arrays, vectors, or anything else.
	QualType ElementType;

	InitializedEntity ElementEntity = Entity;
	unsigned NumInits = ILE->getNumInits();
	unsigned NumElements = NumInits;
	if (const ArrayType *AType = SemaRef.Context.getAsArrayType(ILE->getType())) {
	ElementType = AType->getElementType();
	if (const auto *CAType = dyn_cast<ConstantArrayType>(AType))
	NumElements = CAType->getSize().getZExtValue();
	// For an array new with an unknown bound, ask for one additional element
	// in order to populate the array filler.
	if (Entity.isVariableLengthArrayNew())
	++NumElements;
	ElementEntity = InitializedEntity::InitializeElement(SemaRef.Context,
	0, Entity);
	} else if (const VectorType *VType = ILE->getType()->getAs<VectorType>()) {
	ElementType = VType->getElementType();
	NumElements = VType->getNumElements();
	ElementEntity = InitializedEntity::InitializeElement(SemaRef.Context,
	0, Entity);
	} else
	ElementType = ILE->getType();

	for (unsigned Init = 0; Init != NumElements; ++Init) {
	if (hadError)
	return;

	if (ElementEntity.getKind() == InitializedEntity::EK_ArrayElement \|\|
	ElementEntity.getKind() == InitializedEntity::EK_VectorElement)
	ElementEntity.setElementIndex(Init);

	// Null both for an empty slot and for an index past the written initializers.
	Expr *InitExpr = (Init < NumInits ? ILE->getInit(Init) : nullptr);
	if (!InitExpr && Init < NumInits && ILE->hasArrayFiller())
	ILE->setInit(Init, ILE->getArrayFiller());
	else if (!InitExpr && !ILE->hasArrayFiller()) {
	Expr *Filler = nullptr;

	if (FillWithNoInit)
	Filler = new (SemaRef.Context) NoInitExpr(ElementType);
	else {
	ExprResult ElementInit = PerformEmptyInit(SemaRef, ILE->getLocEnd(),
	ElementEntity,
	/VerifyOnly/false,
	TreatUnavailableAsInvalid);
	if (ElementInit.isInvalid()) {
	hadError = true;
	return;
	}

	Filler = ElementInit.getAs<Expr>();
	}

	if (hadError) {
	// Do nothing
	} else if (Init < NumInits) {
	// For arrays, just set the expression used for value-initialization
	// of the "holes" in the array.
	if (ElementEntity.getKind() == InitializedEntity::EK_ArrayElement)
	ILE->setArrayFiller(Filler);
	else
	ILE->setInit(Init, Filler);
	} else {
	// For arrays, just set the expression used for value-initialization
	// of the rest of elements and exit.
	if (ElementEntity.getKind() == InitializedEntity::EK_ArrayElement) {
	ILE->setArrayFiller(Filler);
	return;
	}

	if (!isa<ImplicitValueInitExpr>(Filler) && !isa<NoInitExpr>(Filler)) {
	// Empty initialization requires a constructor call, so
	// extend the initializer list to include the constructor
	// call and make a note that we'll need to take another pass
	// through the initializer list.
	ILE->updateInit(SemaRef.Context, Init, Filler);
	RequiresSecondPass = true;
	}
	}
	} else if (InitListExpr *InnerILE
	= dyn_cast_or_null<InitListExpr>(InitExpr))
	// Nested list: recurse, passing this list and slot as the outer context.
	FillInEmptyInitializations(ElementEntity, InnerILE, RequiresSecondPass,
	- FillWithNoInit);
	+ ILE, Init, FillWithNoInit);
	else if (DesignatedInitUpdateExpr *InnerDIUE
	= dyn_cast_or_null<DesignatedInitUpdateExpr>(InitExpr))
	// Designated update: holes in the updater are always filled with NoInitExpr.
	FillInEmptyInitializations(ElementEntity, InnerDIUE->getUpdater(),
	- RequiresSecondPass, /FillWithNoInit =/ true);
	+ RequiresSecondPass, ILE, Init,
	+ /FillWithNoInit =/true);
	}
	}

	/// Check the initializer list \p IL against type \p T for \p Entity.
	///
	/// Builds the top-level structured list, runs the explicit-list check, and,
	/// unless an error occurred or VerifyOnly is set, fills in the empty
	/// initializers (with a second pass when the first one requests it).
	InitListChecker::InitListChecker(Sema &S, const InitializedEntity &Entity,
	InitListExpr *IL, QualType &T,
	bool VerifyOnly,
	bool TreatUnavailableAsInvalid)
	: SemaRef(S), VerifyOnly(VerifyOnly),
	TreatUnavailableAsInvalid(TreatUnavailableAsInvalid) {
	// FIXME: Check that IL isn't already the semantic form of some other
	// InitListExpr. If it is, we'd create a broken AST.

	hadError = false;

	FullyStructuredList =
	getStructuredSubobjectInit(IL, 0, T, nullptr, 0, IL->getSourceRange());
	CheckExplicitInitList(Entity, IL, T, FullyStructuredList,
	/TopLevelObject=/true);

	if (!hadError && !VerifyOnly) {
	bool RequiresSecondPass = false;
	// The top-level list has no outer list, hence the null OuterILE.
	- FillInEmptyInitializations(Entity, FullyStructuredList, RequiresSecondPass);
	+ FillInEmptyInitializations(Entity, FullyStructuredList, RequiresSecondPass,
	+ /OuterILE=/nullptr, /OuterIndex=/0);
	if (RequiresSecondPass && !hadError)
	FillInEmptyInitializations(Entity, FullyStructuredList,
	- RequiresSecondPass);
	+ RequiresSecondPass, nullptr, 0);
	}
	}

	/// Return the number of elements of \p DeclType when it is a constant array
	/// type, and 0x7FFFFFFF otherwise.
	int InitListChecker::numArrayElements(QualType DeclType) {
	  const ConstantArrayType *ConstArrTy =
	      SemaRef.Context.getAsConstantArrayType(DeclType);
	  if (!ConstArrTy) {
	    // FIXME: use a proper constant
	    return 0x7FFFFFFF;
	  }
	  return static_cast<int>(ConstArrTy->getSize().getZExtValue());
	}

	/// Count the initializable members of the record type \p DeclType: its base
	/// classes (for C++ records) plus every field that is not an unnamed
	/// bit-field. A union yields at most 1; otherwise a flexible array member is
	/// excluded from the count.
	int InitListChecker::numStructUnionElements(QualType DeclType) {
	  RecordDecl *RD = DeclType->getAs<RecordType>()->getDecl();

	  int Count = 0;
	  if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
	    Count += CXXRD->getNumBases();
	  for (const auto *FD : RD->fields()) {
	    if (FD->isUnnamedBitfield())
	      continue;
	    ++Count;
	  }

	  if (RD->isUnion())
	    return std::min(Count, 1);
	  if (RD->hasFlexibleArrayMember())
	    --Count;
	  return Count;
	}

	/// Determine whether Entity is an entity for which it is idiomatic to elide
	/// the braces in aggregate initialization.
	///
	/// \returns true only when \p Entity is a member with a parent entity, the
	///          parent record has no base classes, and the record has exactly
	///          one field.
	static bool isIdiomaticBraceElisionEntity(const InitializedEntity &Entity) {
	  // Recursive initialization of the one and only field within an aggregate
	  // class is considered idiomatic. This case arises in particular for
	  // initialization of std::array, where the C++ standard suggests the idiom of
	  //
	  // std::array<T, N> arr = {1, 2, 3};
	  //
	  // (where std::array is an aggregate struct containing a single array field).

	  // FIXME: Should aggregate initialization of a struct with a single
	  // base class and no members also suppress the warning?
	  if (Entity.getKind() != InitializedEntity::EK_Member || !Entity.getParent())
	    return false;

	  auto *ParentRD =
	      Entity.getParent()->getType()->castAs<RecordType>()->getDecl();
	  if (CXXRecordDecl *CXXRD = dyn_cast<CXXRecordDecl>(ParentRD))
	    if (CXXRD->getNumBases())
	      return false;

	  // True when advancing past the first field reaches the end of the fields.
	  auto FieldIt = ParentRD->field_begin();
	  assert(FieldIt != ParentRD->field_end() &&
	         "no fields but have initializer for member?");
	  return ++FieldIt == ParentRD->field_end();
	}

	/// Check whether the range of the initializer \p ParentIList from element
	/// \p Index onwards can be used to initialize an object of type \p T. Update
	/// \p Index to indicate how many elements of the list were consumed.
	///
	/// This also fills in \p StructuredList, from element \p StructuredIndex
	/// onwards, with the fully-braced, desugared form of the initialization.
	///
	/// \p T must be an array, record or vector type. A type with no
	/// initializable elements is an error (diagnosed unless in VerifyOnly mode).
	/// Outside VerifyOnly mode, a missing-braces warning with fix-its is emitted
	/// for array and record types unless the initializer is an idiomatic zero
	/// initializer or the entity is one for which brace elision is idiomatic.
	void InitListChecker::CheckImplicitInitList(const InitializedEntity &Entity,
	                                            InitListExpr *ParentIList,
	                                            QualType T, unsigned &Index,
	                                            InitListExpr *StructuredList,
	                                            unsigned &StructuredIndex) {
	  int maxElements = 0;

	  if (T->isArrayType())
	    maxElements = numArrayElements(T);
	  else if (T->isRecordType())
	    maxElements = numStructUnionElements(T);
	  else if (T->isVectorType())
	    maxElements = T->getAs<VectorType>()->getNumElements();
	  else
	    llvm_unreachable("CheckImplicitInitList(): Illegal type");

	  if (maxElements == 0) {
	    if (!VerifyOnly)
	      SemaRef.Diag(ParentIList->getInit(Index)->getLocStart(),
	                   diag::err_implicit_empty_initializer);
	    ++Index;
	    hadError = true;
	    return;
	  }

	  // Build a structured initializer list corresponding to this subobject.
	  InitListExpr *StructuredSubobjectInitList
	    = getStructuredSubobjectInit(ParentIList, Index, T, StructuredList,
	                                 StructuredIndex,
	          SourceRange(ParentIList->getInit(Index)->getLocStart(),
	                      ParentIList->getSourceRange().getEnd()));
	  unsigned StructuredSubobjectInitIndex = 0;

	  // Check the element types and build the structural subobject.
	  unsigned StartIndex = Index;
	  CheckListElementTypes(Entity, ParentIList, T,
	                        /*SubobjectIsDesignatorContext=*/false, Index,
	                        StructuredSubobjectInitList,
	                        StructuredSubobjectInitIndex);

	  if (!VerifyOnly) {
	    StructuredSubobjectInitList->setType(T);

	    // Index of the last initializer consumed (or StartIndex if none were).
	    unsigned EndIndex = (Index == StartIndex? StartIndex : Index - 1);
	    // Update the structured sub-object initializer so that it's ending
	    // range corresponds with the end of the last initializer it used.
	    if (EndIndex < ParentIList->getNumInits() &&
	        ParentIList->getInit(EndIndex)) {
	      SourceLocation EndLoc
	        = ParentIList->getInit(EndIndex)->getSourceRange().getEnd();
	      StructuredSubobjectInitList->setRBraceLoc(EndLoc);
	    }

	    // Complain about missing braces.
	    if ((T->isArrayType() || T->isRecordType()) &&
	        !ParentIList->isIdiomaticZeroInitializer(SemaRef.getLangOpts()) &&
	        !isIdiomaticBraceElisionEntity(Entity)) {
	      SemaRef.Diag(StructuredSubobjectInitList->getLocStart(),
	                   diag::warn_missing_braces)
	          << StructuredSubobjectInitList->getSourceRange()
	          << FixItHint::CreateInsertion(
	                 StructuredSubobjectInitList->getLocStart(), "{")
	          << FixItHint::CreateInsertion(
	                 SemaRef.getLocForEndOfToken(
	                     StructuredSubobjectInitList->getLocEnd()),
	                 "}");
	    }
	  }
	}

	/// Emit the "braces around scalar initializer" warning, with fix-its that
	/// remove both braces, when a scalar \p Entity was initialized from a
	/// single-element braced list spanning \p Braces.
	///
	/// Nothing is emitted during template instantiation, nor for entity kinds
	/// where the braces are expected or already diagnosed elsewhere.
	static void warnBracedScalarInit(Sema &S, const InitializedEntity &Entity,
	                                 SourceRange Braces) {
	  // A non-dependent initialization was already diagnosed at parse time, and
	  // a dependent one may not be scalar in every instantiation.
	  if (S.inTemplateInstantiation())
	    return;

	  bool BracesAreSuspicious = false;

	  switch (Entity.getKind()) {
	  case InitializedEntity::EK_Exception:
	  case InitializedEntity::EK_Base:
	  case InitializedEntity::EK_Delegating:
	  case InitializedEntity::EK_BlockElement:
	  case InitializedEntity::EK_LambdaToBlockConversionBlockElement:
	  case InitializedEntity::EK_Binding:
	    llvm_unreachable("unexpected braced scalar init");

	  case InitializedEntity::EK_RelatedResult:
	    // Already warned when the result itself was initialized.
	    break;

	  case InitializedEntity::EK_New:
	  case InitializedEntity::EK_Temporary:
	  case InitializedEntity::EK_CompoundLiteralInit:
	    // The braces belong to the syntax of the enclosing construct.
	    break;

	  case InitializedEntity::EK_Variable:
	  case InitializedEntity::EK_LambdaCapture:
	    // Possibly direct-list-initialization, so stay quiet.
	    // FIXME: Should we warn for copy-list-initialization in these cases?
	    break;

	  case InitializedEntity::EK_Member:
	    // Only aggregate initialization (a member with a parent entity) warns;
	    // ctor init lists and default member initializers do not.
	    BracesAreSuspicious = Entity.getParent() != nullptr;
	    break;

	  case InitializedEntity::EK_VectorElement:
	  case InitializedEntity::EK_ComplexElement:
	  case InitializedEntity::EK_ArrayElement:
	  case InitializedEntity::EK_Parameter:
	  case InitializedEntity::EK_Parameter_CF_Audited:
	  case InitializedEntity::EK_Result:
	    // Extra braces here are suspicious.
	    BracesAreSuspicious = true;
	    break;
	  }

	  if (!BracesAreSuspicious)
	    return;

	  S.Diag(Braces.getBegin(), diag::warn_braces_around_scalar_init)
	      << Braces
	      << FixItHint::CreateRemoval(Braces.getBegin())
	      << FixItHint::CreateRemoval(Braces.getEnd());
	}

	/// Check whether the initializer \p IList (that was written with explicit
	/// braces) can be used to initialize an object of type \p T.
	///
	/// This also fills in \p StructuredList with the fully-braced, desugared
	/// form of the initialization.
	void InitListChecker::CheckExplicitInitList(const InitializedEntity &Entity,
	InitListExpr *IList, QualType &T,
	InitListExpr *StructuredList,
	bool TopLevelObject) {
	if (!VerifyOnly) {
	SyntacticToSemantic[IList] = StructuredList;
	StructuredList->setSyntacticForm(IList);
	}

	unsigned Index = 0, StructuredIndex = 0;
	CheckListElementTypes(Entity, IList, T, /SubobjectIsDesignatorContext=/true,
	Index, StructuredList, StructuredIndex, TopLevelObject);
	if (!VerifyOnly) {
	QualType ExprTy = T;
	if (!ExprTy->isArrayType())
	ExprTy = ExprTy.getNonLValueExprType(SemaRef.Context);
	IList->setType(ExprTy);
	StructuredList->setType(ExprTy);
	}
	if (hadError)
	return;

	if (Index < IList->getNumInits()) {
	// We have leftover initializers
	if (VerifyOnly) {
	if (SemaRef.getLangOpts().CPlusPlus \|\|
	(SemaRef.getLangOpts().OpenCL &&
	IList->getType()->isVectorType())) {
	hadError = true;
	}
	return;
	}

	if (StructuredIndex == 1 &&
	IsStringInit(StructuredList->getInit(0), T, SemaRef.Context) ==
	SIF_None) {
	unsigned DK = diag::ext_excess_initializers_in_char_array_initializer;
	if (SemaRef.getLangOpts().CPlusPlus) {
	DK = diag::err_excess_initializers_in_char_array_initializer;
	hadError = true;
	}
	// Special-case
	SemaRef.Diag(IList->getInit(Index)->getLocStart(), DK)
	<< IList->getInit(Index)->getSourceRange();
	} else if (!T->isIncompleteType()) {
	// Don't complain for incomplete types, since we'll get an error
	// elsewhere
	QualType CurrentObjectType = StructuredList->getType();
	int initKind =
	CurrentObjectType->isArrayType()? 0 :
	CurrentObjectType->isVectorType()? 1 :
	CurrentObjectType->isScalarType()? 2 :
	CurrentObjectType->isUnionType()? 3 :
	4;

	unsigned DK = diag::ext_excess_initializers;
	if (SemaRef.getLangOpts().CPlusPlus) {
	DK = diag::err_excess_initializers;
	hadError = true;
	}
	if (SemaRef.getLangOpts().OpenCL && initKind == 1) {
	DK = diag::err_excess_initializers;
	hadError = true;
	}

	SemaRef.Diag(IList->getInit(Index)->getLocStart(), DK)
	<< initKind << IList->getInit(Index)->getSourceRange();
	}
	}

	if (!VerifyOnly && T->isScalarType() &&
	IList->getNumInits() == 1 && !isa<InitListExpr>(IList->getInit(0)))
	warnBracedScalarInit(SemaRef, Entity, IList->getSourceRange());
	}

	/// Dispatch on the kind of \p DeclType and check the initializers of
	/// \p IList, starting at \p Index, against it.
	///
	/// Complex (only when \p SubobjectIsDesignatorContext is set), scalar,
	/// vector, record, array and reference types are forwarded to the matching
	/// Check*Type helper. Void, function, Objective-C object and any remaining
	/// types are diagnosed (unless in verify-only mode) and recorded in hadError;
	/// for void and function types one initializer is also skipped.
	void InitListChecker::CheckListElementTypes(const InitializedEntity &Entity,
	                                            InitListExpr *IList,
	                                            QualType &DeclType,
	                                            bool SubobjectIsDesignatorContext,
	                                            unsigned &Index,
	                                            InitListExpr *StructuredList,
	                                            unsigned &StructuredIndex,
	                                            bool TopLevelObject) {
	  if (DeclType->isAnyComplexType() && SubobjectIsDesignatorContext) {
	    // Explicitly braced initializer for complex type can be real+imaginary
	    // parts.
	    CheckComplexType(Entity, IList, DeclType, Index,
	                     StructuredList, StructuredIndex);
	  } else if (DeclType->isScalarType()) {
	    CheckScalarType(Entity, IList, DeclType, Index,
	                    StructuredList, StructuredIndex);
	  } else if (DeclType->isVectorType()) {
	    CheckVectorType(Entity, IList, DeclType, Index,
	                    StructuredList, StructuredIndex);
	  } else if (DeclType->isRecordType()) {
	    assert(DeclType->isAggregateType() &&
	           "non-aggregate records should be handed in CheckSubElementType");
	    RecordDecl *RD = DeclType->getAs<RecordType>()->getDecl();
	    // Start with an empty base range; only C++ records can have bases.
	    auto Bases =
	        CXXRecordDecl::base_class_range(CXXRecordDecl::base_class_iterator(),
	                                        CXXRecordDecl::base_class_iterator());
	    if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RD))
	      Bases = CXXRD->bases();
	    CheckStructUnionTypes(Entity, IList, DeclType, Bases, RD->field_begin(),
	                          SubobjectIsDesignatorContext, Index, StructuredList,
	                          StructuredIndex, TopLevelObject);
	  } else if (DeclType->isArrayType()) {
	    // Array checking starts at element index zero, sized like size_t.
	    llvm::APSInt Zero(
	                    SemaRef.Context.getTypeSize(SemaRef.Context.getSizeType()),
	                    false);
	    CheckArrayType(Entity, IList, DeclType, Zero,
	                   SubobjectIsDesignatorContext, Index,
	                   StructuredList, StructuredIndex);
	  } else if (DeclType->isVoidType() || DeclType->isFunctionType()) {
	    // This type is invalid, issue a diagnostic.
	    ++Index;
	    if (!VerifyOnly)
	      SemaRef.Diag(IList->getLocStart(), diag::err_illegal_initializer_type)
	        << DeclType;
	    hadError = true;
	  } else if (DeclType->isReferenceType()) {
	    CheckReferenceType(Entity, IList, DeclType, Index,
	                       StructuredList, StructuredIndex);
	  } else if (DeclType->isObjCObjectType()) {
	    if (!VerifyOnly)
	      SemaRef.Diag(IList->getLocStart(), diag::err_init_objc_class)
	        << DeclType;
	    hadError = true;
	  } else {
	    if (!VerifyOnly)
	      SemaRef.Diag(IList->getLocStart(), diag::err_illegal_initializer_type)
	        << DeclType;
	    hadError = true;
	  }
	}

	/// Check the initializer at position \p Index of \p IList against a
	/// sub-element of type \p ElemType, advancing \p Index past what was consumed.
	///
	/// The cases are tried in order: reference types are handed to
	/// CheckReferenceType; a nested braced list is either unwrapped (single
	/// string-literal element) or, outside C++, checked as an explicit list;
	/// an ImplicitValueInitExpr is reused as-is; then the language/type specific
	/// single-expression checks run. If none of them consumes the initializer,
	/// brace elision is assumed via CheckImplicitInitList, or an error is recorded.
	void InitListChecker::CheckSubElementType(const InitializedEntity &Entity,
	InitListExpr *IList,
	QualType ElemType,
	unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex) {
	Expr *expr = IList->getInit(Index);

	if (ElemType->isReferenceType())
	return CheckReferenceType(Entity, IList, ElemType, Index,
	StructuredList, StructuredIndex);

	if (InitListExpr *SubInitList = dyn_cast<InitListExpr>(expr)) {
	// A braced list holding exactly one string initializer for ElemType is
	// treated as that string itself.
	if (SubInitList->getNumInits() == 1 &&
	IsStringInit(SubInitList->getInit(0), ElemType, SemaRef.Context) ==
	SIF_None) {
	expr = SubInitList->getInit(0);
	} else if (!SemaRef.getLangOpts().CPlusPlus) {
	InitListExpr *InnerStructuredList
	= getStructuredSubobjectInit(IList, Index, ElemType,
	StructuredList, StructuredIndex,
	SubInitList->getSourceRange(), true);
	CheckExplicitInitList(Entity, SubInitList, ElemType,
	InnerStructuredList);

	if (!hadError && !VerifyOnly) {
	bool RequiresSecondPass = false;
	// NOTE(review): the lines prefixed with '-' and '+' below are diff hunk
	// markers from the patch view (old vs. new trailing arguments of the
	// call), not C++ tokens.
	FillInEmptyInitializations(Entity, InnerStructuredList,
	- RequiresSecondPass);
	+ RequiresSecondPass, StructuredList,
	+ StructuredIndex);
	if (RequiresSecondPass && !hadError)
	FillInEmptyInitializations(Entity, InnerStructuredList,
	- RequiresSecondPass);
	+ RequiresSecondPass, StructuredList,
	+ StructuredIndex);
	}
	++StructuredIndex;
	++Index;
	return;
	}
	// C++ initialization is handled later.
	} else if (isa<ImplicitValueInitExpr>(expr)) {
	// This happens during template instantiation when we see an InitListExpr
	// that we've already checked once.
	assert(SemaRef.Context.hasSameType(expr->getType(), ElemType) &&
	"found implicit initialization for the wrong type");
	if (!VerifyOnly)
	UpdateStructuredListElement(StructuredList, StructuredIndex, expr);
	++Index;
	return;
	}

	if (SemaRef.getLangOpts().CPlusPlus) {
	// C++ [dcl.init.aggr]p2:
	// Each member is copy-initialized from the corresponding
	// initializer-clause.

	// FIXME: Better EqualLoc?
	InitializationKind Kind =
	InitializationKind::CreateCopy(expr->getLocStart(), SourceLocation());
	InitializationSequence Seq(SemaRef, Entity, Kind, expr,
	/TopLevelOfInitList/ true);

	// C++14 [dcl.init.aggr]p13:
	// If the assignment-expression can initialize a member, the member is
	// initialized. Otherwise [...] brace elision is assumed
	//
	// Brace elision is never performed if the element is not an
	// assignment-expression.
	if (Seq \|\| isa<InitListExpr>(expr)) {
	if (!VerifyOnly) {
	ExprResult Result =
	Seq.Perform(SemaRef, Entity, Kind, expr);
	if (Result.isInvalid())
	hadError = true;

	UpdateStructuredListElement(StructuredList, StructuredIndex,
	Result.getAs<Expr>());
	} else if (!Seq)
	hadError = true;
	++Index;
	return;
	}

	// Fall through for subaggregate initialization
	} else if (ElemType->isScalarType() \|\| ElemType->isAtomicType()) {
	// FIXME: Need to handle atomic aggregate types with implicit init lists.
	return CheckScalarType(Entity, IList, ElemType, Index,
	StructuredList, StructuredIndex);
	} else if (const ArrayType *arrayType =
	SemaRef.Context.getAsArrayType(ElemType)) {
	// arrayType can be incomplete if we're initializing a flexible
	// array member. There's nothing we can do with the completed
	// type here, though.

	if (IsStringInit(expr, arrayType, SemaRef.Context) == SIF_None) {
	if (!VerifyOnly) {
	CheckStringInit(expr, ElemType, arrayType, SemaRef);
	UpdateStructuredListElement(StructuredList, StructuredIndex, expr);
	}
	++Index;
	return;
	}

	// Fall through for subaggregate initialization.

	} else {
	assert((ElemType->isRecordType() \|\| ElemType->isVectorType() \|\|
	ElemType->isOpenCLSpecificType()) && "Unexpected type");

	// C99 6.7.8p13:
	//
	// The initializer for a structure or union object that has
	// automatic storage duration shall be either an initializer
	// list as described below, or a single expression that has
	// compatible structure or union type. In the latter case, the
	// initial value of the object, including unnamed members, is
	// that of the expression.
	ExprResult ExprRes = expr;
	if (SemaRef.CheckSingleAssignmentConstraints(
	ElemType, ExprRes, !VerifyOnly) != Sema::Incompatible) {
	if (ExprRes.isInvalid())
	hadError = true;
	else {
	ExprRes = SemaRef.DefaultFunctionArrayLvalueConversion(ExprRes.get());
	if (ExprRes.isInvalid())
	hadError = true;
	}
	UpdateStructuredListElement(StructuredList, StructuredIndex,
	ExprRes.getAs<Expr>());
	++Index;
	return;
	}
	// NOTE(review): the value returned by get() is discarded, so this call
	// appears to have no effect - confirm before removing.
	ExprRes.get();
	// Fall through for subaggregate initialization
	}

	// C++ [dcl.init.aggr]p12:
	//
	// [...] Otherwise, if the member is itself a non-empty
	// subaggregate, brace elision is assumed and the initializer is
	// considered for the initialization of the first member of
	// the subaggregate.
	// OpenCL vector initializer is handled elsewhere.
	if ((!SemaRef.getLangOpts().OpenCL && ElemType->isVectorType()) \|\|
	ElemType->isAggregateType()) {
	CheckImplicitInitList(Entity, IList, ElemType, Index, StructuredList,
	StructuredIndex);
	++StructuredIndex;
	} else {
	if (!VerifyOnly) {
	// We cannot initialize this element, so let
	// PerformCopyInitialization produce the appropriate diagnostic.
	SemaRef.PerformCopyInitialization(Entity, SourceLocation(), expr,
	/TopLevelOfInitList=/true);
	}
	hadError = true;
	++Index;
	++StructuredIndex;
	}
	}

	void InitListChecker::CheckComplexType(const InitializedEntity &Entity,
	InitListExpr *IList, QualType DeclType,
	unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex) {
	assert(Index == 0 && "Index in explicit init list must be zero");

	// As an extension, clang supports complex initializers, which initialize
	// a complex number component-wise. When an explicit initializer list for
	// a complex number contains two two initializers, this extension kicks in:
	// it exepcts the initializer list to contain two elements convertible to
	// the element type of the complex type. The first element initializes
	// the real part, and the second element intitializes the imaginary part.

	if (IList->getNumInits() != 2)
	return CheckScalarType(Entity, IList, DeclType, Index, StructuredList,
	StructuredIndex);

	// This is an extension in C. (The builtin _Complex type does not exist
	// in the C++ standard.)
	if (!SemaRef.getLangOpts().CPlusPlus && !VerifyOnly)
	SemaRef.Diag(IList->getLocStart(), diag::ext_complex_component_init)
	<< IList->getSourceRange();

	// Initialize the complex number.
	QualType elementType = DeclType->getAs<ComplexType>()->getElementType();
	InitializedEntity ElementEntity =
	InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity);

	for (unsigned i = 0; i < 2; ++i) {
	ElementEntity.setElementIndex(Index);
	CheckSubElementType(ElementEntity, IList, elementType, Index,
	StructuredList, StructuredIndex);
	}
	}

	/// Check the initializer at \p Index of \p IList against the scalar type
	/// \p DeclType and advance \p Index.
	///
	/// An empty list is an error before C++11 and a compatibility warning from
	/// C++11 on. A nested braced list is diagnosed as an extension and checked
	/// recursively; a designated initializer is an error. In verify-only mode
	/// only the feasibility of copy-initialization is tested. Otherwise the
	/// copy-initialization is performed, the syntactic list is updated if the
	/// expression changed, and the result is stored in \p StructuredList at
	/// \p StructuredIndex.
	void InitListChecker::CheckScalarType(const InitializedEntity &Entity,
	                                      InitListExpr *IList, QualType DeclType,
	                                      unsigned &Index,
	                                      InitListExpr *StructuredList,
	                                      unsigned &StructuredIndex) {
	  if (Index >= IList->getNumInits()) {
	    if (!VerifyOnly)
	      SemaRef.Diag(IList->getLocStart(),
	                   SemaRef.getLangOpts().CPlusPlus11 ?
	                     diag::warn_cxx98_compat_empty_scalar_initializer :
	                     diag::err_empty_scalar_initializer)
	        << IList->getSourceRange();
	    hadError = !SemaRef.getLangOpts().CPlusPlus11;
	    ++Index;
	    ++StructuredIndex;
	    return;
	  }

	  Expr *expr = IList->getInit(Index);
	  if (InitListExpr *SubIList = dyn_cast<InitListExpr>(expr)) {
	    // FIXME: This is invalid, and accepting it causes overload resolution
	    // to pick the wrong overload in some corner cases.
	    if (!VerifyOnly)
	      SemaRef.Diag(SubIList->getLocStart(),
	                   diag::ext_many_braces_around_scalar_init)
	        << SubIList->getSourceRange();

	    CheckScalarType(Entity, SubIList, DeclType, Index, StructuredList,
	                    StructuredIndex);
	    return;
	  } else if (isa<DesignatedInitExpr>(expr)) {
	    if (!VerifyOnly)
	      SemaRef.Diag(expr->getLocStart(),
	                   diag::err_designator_for_scalar_init)
	        << DeclType << expr->getSourceRange();
	    hadError = true;
	    ++Index;
	    ++StructuredIndex;
	    return;
	  }

	  if (VerifyOnly) {
	    if (!SemaRef.CanPerformCopyInitialization(Entity,expr))
	      hadError = true;
	    ++Index;
	    return;
	  }

	  ExprResult Result =
	    SemaRef.PerformCopyInitialization(Entity, expr->getLocStart(), expr,
	                                      /*TopLevelOfInitList=*/true);

	  Expr *ResultExpr = nullptr;

	  if (Result.isInvalid())
	    hadError = true; // types weren't compatible.
	  else {
	    ResultExpr = Result.getAs<Expr>();

	    if (ResultExpr != expr) {
	      // The type was promoted, update initializer list.
	      IList->setInit(Index, ResultExpr);
	    }
	  }
	  // hadError also reflects earlier failures; in that case the slot is only
	  // skipped, not filled.
	  if (hadError)
	    ++StructuredIndex;
	  else
	    UpdateStructuredListElement(StructuredList, StructuredIndex, ResultExpr);
	  ++Index;
	}

	/// Check the initializer at \p Index of \p IList against the reference type
	/// \p DeclType and advance \p Index.
	///
	/// A missing initializer is an error, as is a braced list before C++11. In
	/// verify-only mode only the feasibility of copy-initialization is tested.
	/// Otherwise the copy-initialization is performed and, when it succeeds, the
	/// converted expression replaces the original in \p IList and is stored in
	/// \p StructuredList at \p StructuredIndex.
	void InitListChecker::CheckReferenceType(const InitializedEntity &Entity,
	                                         InitListExpr *IList, QualType DeclType,
	                                         unsigned &Index,
	                                         InitListExpr *StructuredList,
	                                         unsigned &StructuredIndex) {
	  if (Index >= IList->getNumInits()) {
	    // FIXME: It would be wonderful if we could point at the actual member. In
	    // general, it would be useful to pass location information down the stack,
	    // so that we know the location (or decl) of the "current object" being
	    // initialized.
	    if (!VerifyOnly)
	      SemaRef.Diag(IList->getLocStart(),
	                   diag::err_init_reference_member_uninitialized)
	        << DeclType
	        << IList->getSourceRange();
	    hadError = true;
	    ++Index;
	    ++StructuredIndex;
	    return;
	  }

	  Expr *expr = IList->getInit(Index);
	  if (isa<InitListExpr>(expr) && !SemaRef.getLangOpts().CPlusPlus11) {
	    if (!VerifyOnly)
	      SemaRef.Diag(IList->getLocStart(), diag::err_init_non_aggr_init_list)
	        << DeclType << IList->getSourceRange();
	    hadError = true;
	    ++Index;
	    ++StructuredIndex;
	    return;
	  }

	  if (VerifyOnly) {
	    if (!SemaRef.CanPerformCopyInitialization(Entity,expr))
	      hadError = true;
	    ++Index;
	    return;
	  }

	  ExprResult Result =
	      SemaRef.PerformCopyInitialization(Entity, expr->getLocStart(), expr,
	                                        /*TopLevelOfInitList=*/true);

	  if (Result.isInvalid())
	    hadError = true;

	  // On failure Result yields a null expression; keep the original
	  // initializer in the syntactic list instead of overwriting it with null.
	  expr = Result.getAs<Expr>();
	  if (expr)
	    IList->setInit(Index, expr);

	  if (hadError)
	    ++StructuredIndex;
	  else
	    UpdateStructuredListElement(StructuredList, StructuredIndex, expr);
	  ++Index;
	}

	void InitListChecker::CheckVectorType(const InitializedEntity &Entity,
	InitListExpr *IList, QualType DeclType,
	unsigned &Index,
	InitListExpr *StructuredList,
	unsigned &StructuredIndex) {
	const VectorType *VT = DeclType->getAs<VectorType>();
	unsigned maxElements = VT->getNumElements();
	unsigned numEltsInit = 0;
	QualType elementType = VT->getElementType();

	if (Index >= IList->getNumInits()) {
	// Make sure the element type can be value-initialized.
	if (VerifyOnly)
	CheckEmptyInitializable(
	InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity),
	IList->getLocEnd());
	return;
	}

	if (!SemaRef.getLangOpts().OpenCL) {
	// If the initializing element is a vector, try to copy-initialize
	// instead of breaking it apart (which is doomed to failure anyway).
	Expr *Init = IList->getInit(Index);
	if (!isa<InitListExpr>(Init) && Init->getType()->isVectorType()) {
	if (VerifyOnly) {
	if (!SemaRef.CanPerformCopyInitialization(Entity, Init))
	hadError = true;
	++Index;
	return;
	}

	ExprResult Result =
	SemaRef.PerformCopyInitialization(Entity, Init->getLocStart(), Init,
	/TopLevelOfInitList=/true);

	Expr *ResultExpr = nullptr;
	if (Result.isInvalid())
	hadError = true; // types weren't compatible.
	else {
	ResultExpr = Result.getAs<Expr>();

	if (ResultExpr != Init) {
	// The type was promoted, update initializer list.
	IList->setInit(Index, ResultExpr);
	}
	}
	if (hadError)
	++StructuredIndex;
	else
	UpdateStructuredListElement(StructuredList, StructuredIndex,
	ResultExpr);
	++Index;
	return;
	}

	InitializedEntity ElementEntity =
	InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity);

	for (unsigned i = 0; i < maxElements; ++i, ++numEltsInit) {
	// Don't attempt to go past the end of the init list
	if (Index >= IList->getNumInits()) {
	if (VerifyOnly)
	CheckEmptyInitializable(ElementEntity, IList->getLocEnd());
	break;
	}

	ElementEntity.setElementIndex(Index);
	CheckSubElementType(ElementEntity, IList, elementType, Index,
	StructuredList, StructuredIndex);
	}

	if (VerifyOnly)
	return;

	bool isBigEndian = SemaRef.Context.getTargetInfo().isBigEndian();
	const VectorType *T = Entity.getType()->getAs<VectorType>();
	if (isBigEndian && (T->getVectorKind() == VectorType::NeonVector \|\|
	T->getVectorKind() == VectorType::NeonPolyVector)) {
	// The ability to use vector initializer lists is a GNU vector extension
	// and is unrelated to the NEON intrinsics in arm_neon.h. On little
	// endian machines it works fine, however on big endian machines it
	// exhibits surprising behaviour:
	//
	// uint32x2_t x = {42, 64};
	// return vget_lane_u32(x, 0); // Will return 64.
	//
	// Because of this, explicitly call out that it is non-portable.
	//
	SemaRef.Diag(IList->getLocStart(),
	diag::warn_neon_vector_initializer_non_portable);

	const char *typeCode;
	unsigned typeSize = SemaRef.Context.getTypeSize(elementType);

	if (elementType->isFloatingType())
	typeCode = "f";
	else if (elementType->isSignedIntegerType())
	typeCode = "s";
	else if (elementType->isUnsignedIntegerType())
	typeCode = "u";
	else
	llvm_unreachable("Invalid element type!");

	SemaRef.Diag(IList->getLocStart(),
	SemaRef.Context.getTypeSize(VT) > 64 ?
	diag::note_neon_vector_initializer_non_portable_q :
	diag::note_neon_vector_initializer_non_portable)
	<< typeCode << typeSize;
	}

	return;
	}

	InitializedEntity ElementEntity =
	InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity);

	// OpenCL initializers allows vectors to be constructed from vectors.
	for (unsigned i = 0; i < maxElements; ++i) {
	// Don't attempt to go past the end of the init list
	if (Index >= IList->getNumInits())
	break;

	ElementEntity.setElementIndex(Index);

	QualType IType = IList->getInit(Index)->getType();
	if (!IType->isVectorType()) {
	CheckSubElementType(ElementEntity, IList, elementType, Index,
	StructuredList, StructuredIndex);
	++numEltsInit;
	} else {
	QualType VecType;
	const VectorType *IVT = IType->getAs<VectorType>();
	unsigned numIElts = IVT->getNumElements();

	if (IType->isExtVectorType())
	VecType = SemaRef.Context.getExtVectorType(elementType, numIElts);
	else
	VecType = SemaRef.Context.getVectorType(elementType, numIElts,
	IVT->getVectorKind());
	CheckSubElementType(ElementEntity, IList, VecType, Index,
	StructuredList, StructuredIndex);
	numEltsInit += numIElts;
	}
	}

	// OpenCL requires all elements to be initialized.
	if (numEltsInit != maxElements) {
	if (!VerifyOnly)
	SemaRef.Diag(IList->getLocStart(),
	diag::err_vector_incorrect_num_initializers)
	<< (numEltsInit < maxElements) << maxElements << numEltsInit;
	hadError = true;
	}
	}

	/// Check the initializers of \p IList, starting at \p Index, against the
	/// array type \p DeclType, beginning at array element \p elementIndex.
	///
	/// A string initializer is placed directly into \p StructuredList. A
	/// variable-length array cannot be initialized and is an error. Otherwise
	/// initializers are consumed one element at a time, with designated
	/// initializers handled only when \p SubobjectIsDesignatorContext is set.
	/// For an incomplete array type, \p DeclType is replaced by a constant
	/// array type sized from the initializers (outside verify-only mode). In
	/// verify-only mode, elements left to be value-initialized are checked for
	/// that being possible.
	void InitListChecker::CheckArrayType(const InitializedEntity &Entity,
	                                     InitListExpr *IList, QualType &DeclType,
	                                     llvm::APSInt elementIndex,
	                                     bool SubobjectIsDesignatorContext,
	                                     unsigned &Index,
	                                     InitListExpr *StructuredList,
	                                     unsigned &StructuredIndex) {
	  const ArrayType *arrayType = SemaRef.Context.getAsArrayType(DeclType);

	  // Check for the special-case of initializing an array with a string.
	  if (Index < IList->getNumInits()) {
	    if (IsStringInit(IList->getInit(Index), arrayType, SemaRef.Context) ==
	        SIF_None) {
	      // We place the string literal directly into the resulting
	      // initializer list. This is the only place where the structure
	      // of the structured initializer list doesn't match exactly,
	      // because doing so would involve allocating one character
	      // constant for each string.
	      if (!VerifyOnly) {
	        CheckStringInit(IList->getInit(Index), DeclType, arrayType, SemaRef);
	        UpdateStructuredListElement(StructuredList, StructuredIndex,
	                                    IList->getInit(Index));
	        StructuredList->resizeInits(SemaRef.Context, StructuredIndex);
	      }
	      ++Index;
	      return;
	    }
	  }
	  if (const VariableArrayType *VAT = dyn_cast<VariableArrayType>(arrayType)) {
	    // Check for VLAs; in standard C it would be possible to check this
	    // earlier, but I don't know where clang accepts VLAs (gcc accepts
	    // them in all sorts of strange places).
	    if (!VerifyOnly)
	      SemaRef.Diag(VAT->getSizeExpr()->getLocStart(),
	                    diag::err_variable_object_no_init)
	        << VAT->getSizeExpr()->getSourceRange();
	    hadError = true;
	    ++Index;
	    ++StructuredIndex;
	    return;
	  }

	  // We might know the maximum number of elements in advance.
	  llvm::APSInt maxElements(elementIndex.getBitWidth(),
	                           elementIndex.isUnsigned());
	  bool maxElementsKnown = false;
	  if (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(arrayType)) {
	    maxElements = CAT->getSize();
	    // Bring elementIndex to the same width and signedness so the two can
	    // be compared.
	    elementIndex = elementIndex.extOrTrunc(maxElements.getBitWidth());
	    elementIndex.setIsUnsigned(maxElements.isUnsigned());
	    maxElementsKnown = true;
	  }

	  QualType elementType = arrayType->getElementType();
	  while (Index < IList->getNumInits()) {
	    Expr *Init = IList->getInit(Index);
	    if (DesignatedInitExpr *DIE = dyn_cast<DesignatedInitExpr>(Init)) {
	      // If we're not the subobject that matches up with the '{' for
	      // the designator, we shouldn't be handling the
	      // designator. Return immediately.
	      if (!SubobjectIsDesignatorContext)
	        return;

	      // Handle this designated initializer. elementIndex will be
	      // updated to be the next array element we'll initialize.
	      if (CheckDesignatedInitializer(Entity, IList, DIE, 0,
	                                     DeclType, nullptr, &elementIndex, Index,
	                                     StructuredList, StructuredIndex, true,
	                                     false)) {
	        hadError = true;
	        continue;
	      }

	      // The designator may have changed elementIndex's width; widen
	      // whichever value is narrower so they stay comparable.
	      if (elementIndex.getBitWidth() > maxElements.getBitWidth())
	        maxElements = maxElements.extend(elementIndex.getBitWidth());
	      else if (elementIndex.getBitWidth() < maxElements.getBitWidth())
	        elementIndex = elementIndex.extend(maxElements.getBitWidth());
	      elementIndex.setIsUnsigned(maxElements.isUnsigned());

	      // If the array is of incomplete type, keep track of the number of
	      // elements in the initializer.
	      if (!maxElementsKnown && elementIndex > maxElements)
	        maxElements = elementIndex;

	      continue;
	    }

	    // If we know the maximum number of elements, and we've already
	    // hit it, stop consuming elements in the initializer list.
	    if (maxElementsKnown && elementIndex == maxElements)
	      break;

	    InitializedEntity ElementEntity =
	      InitializedEntity::InitializeElement(SemaRef.Context, StructuredIndex,
	                                           Entity);
	    // Check this element.
	    CheckSubElementType(ElementEntity, IList, elementType, Index,
	                        StructuredList, StructuredIndex);
	    ++elementIndex;

	    // If the array is of incomplete type, keep track of the number of
	    // elements in the initializer.
	    if (!maxElementsKnown && elementIndex > maxElements)
	      maxElements = elementIndex;
	  }
	  if (!hadError && DeclType->isIncompleteArrayType() && !VerifyOnly) {
	    // If this is an incomplete array type, the actual type needs to
	    // be calculated here.
	    llvm::APSInt Zero(maxElements.getBitWidth(), maxElements.isUnsigned());
	    if (maxElements == Zero && !Entity.isVariableLengthArrayNew()) {
	      // Sizing an array implicitly to zero is not allowed by ISO C,
	      // but is supported by GNU.
	      SemaRef.Diag(IList->getLocStart(),
	                    diag::ext_typecheck_zero_array_size);
	    }

	    DeclType = SemaRef.Context.getConstantArrayType(elementType, maxElements,
	                                                     ArrayType::Normal, 0);
	  }
	  if (!hadError && VerifyOnly) {
	    // If there are any members of the array that get value-initialized, check
	    // that is possible. That happens if we know the bound and don't have
	    // enough elements, or if we're performing an array new with an unknown
	    // bound.
	    // FIXME: This needs to detect holes left by designated initializers too.
	    if ((maxElementsKnown && elementIndex < maxElements) ||
	        Entity.isVariableLengthArrayNew())
	      CheckEmptyInitializable(InitializedEntity::InitializeElement(
	                                                  SemaRef.Context, 0, Entity),
	                              IList->getLocEnd());
	  }
	}

	/// Decide whether \p InitExpr may initialize the flexible array member
	/// \p Field (GNU flexible array initializers) and, outside verify-only mode,
	/// emit the matching diagnostic plus a note pointing at \p Field.
	///
	/// \returns true if the initialization is rejected as an error, false if it
	/// is accepted as an extension.
	bool InitListChecker::CheckFlexibleArrayInit(const InitializedEntity &Entity,
	                                             Expr *InitExpr,
	                                             FieldDecl *Field,
	                                             bool TopLevelObject) {
	  // An empty braced list is always allowed as an extension.
	  bool IsEmptyList = isa<InitListExpr>(InitExpr) &&
	                     cast<InitListExpr>(InitExpr)->getNumInits() == 0;

	  bool Rejected;
	  if (IsEmptyList) {
	    Rejected = false;
	  } else if (SemaRef.getLangOpts().CPlusPlus ||
	             !TopLevelObject ||
	             Entity.getKind() != InitializedEntity::EK_Variable) {
	    // Rejected in C++ (not required for gcc compatibility, and it needs work
	    // to IRGen correctly in general), on non-top-level objects, and on
	    // anything which is not a variable.
	    Rejected = true;
	  } else {
	    // A variable at top level: only local variables are rejected.
	    Rejected = cast<VarDecl>(Entity.getDecl())->hasLocalStorage();
	  }

	  unsigned FlexArrayDiag = Rejected ? diag::err_flexible_array_init
	                                    : diag::ext_flexible_array_init;

	  if (!VerifyOnly) {
	    SemaRef.Diag(InitExpr->getLocStart(),
	                 FlexArrayDiag)
	      << InitExpr->getLocStart();
	    SemaRef.Diag(Field->getLocation(), diag::note_flexible_array_member)
	      << Field;
	  }

	  return Rejected;
	}

	void InitListChecker::CheckStructUnionTypes(
	const InitializedEntity &Entity, InitListExpr *IList, QualType DeclType,
	CXXRecordDecl::base_class_range Bases, RecordDecl::field_iterator Field,
	bool SubobjectIsDesignatorContext, unsigned &Index,
	InitListExpr *StructuredList, unsigned &StructuredIndex,
	bool TopLevelObject) {
	RecordDecl *structDecl = DeclType->getAs<RecordType>()->getDecl();

	// If the record is invalid, some of it's members are invalid. To avoid
	// confusion, we forgo checking the intializer for the entire record.
	if (structDecl->isInvalidDecl()) {
	// Assume it was supposed to consume a single initializer.
	++Index;
	hadError = true;
	return;
	}

	if (DeclType->isUnionType() && IList->getNumInits() == 0) {
	RecordDecl *RD = DeclType->getAs<RecordType>()->getDecl();

	// If there's a default initializer, use it.
	if (isa<CXXRecordDecl>(RD) && cast<CXXRecordDecl>(RD)->hasInClassInitializer()) {
	if (VerifyOnly)
	return;
	for (RecordDecl::field_iterator FieldEnd = RD->field_end();
	Field != FieldEnd; ++Field) {
	if (Field->hasInClassInitializer()) {
	StructuredList->setInitializedFieldInUnion(*Field);
	// FIXME: Actually build a CXXDefaultInitExpr?
	return;
	}
	}
	}

	// Value-initialize the first member of the union that isn't an unnamed
	// bitfield.
	for (RecordDecl::field_iterator FieldEnd = RD->field_end();
	Field != FieldEnd; ++Field) {
	if (!Field->isUnnamedBitfield()) {
	if (VerifyOnly)
	CheckEmptyInitializable(
	InitializedEntity::InitializeMember(*Field, &Entity),
	IList->getLocEnd());
	else
	StructuredList->setInitializedFieldInUnion(*Field);
	break;
	}
	}
	return;
	}

	bool InitializedSomething = false;

	// If we have any base classes, they are initialized prior to the fields.
	for (auto &Base : Bases) {
	Expr *Init = Index < IList->getNumInits() ? IList->getInit(Index) : nullptr;
	SourceLocation InitLoc = Init ? Init->getLocStart() : IList->getLocEnd();

	// Designated inits always initialize fields, so if we see one, all
	// remaining base classes have no explicit initializer.
	if (Init && isa<DesignatedInitExpr>(Init))
	Init = nullptr;

	InitializedEntity BaseEntity = InitializedEntity::InitializeBase(
	SemaRef.Context, &Base, false, &Entity);
	if (Init) {
	CheckSubElementType(BaseEntity, IList, Base.getType(), Index,
	StructuredList, StructuredIndex);
	InitializedSomething = true;
	} else if (VerifyOnly) {
	CheckEmptyInitializable(BaseEntity, InitLoc);
	}
	}

	// If structDecl is a forward declaration, this loop won't do
	// anything except look at designated initializers; That's okay,
	// because an error should get printed out elsewhere. It might be
	// worthwhile to skip over the rest of the initializer, though.
	RecordDecl *RD = DeclType->getAs<RecordType>()->getDecl();
	RecordDecl::field_iterator FieldEnd = RD->field_end();
	bool CheckForMissingFields =
	!IList->isIdiomaticZeroInitializer(SemaRef.getLangOpts());

	while (Index < IList->getNumInits()) {
	Expr *Init = IList->getInit(Index);

	if (DesignatedInitExpr *DIE = dyn_cast<DesignatedInitExpr>(Init)) {
	// If we're not the subobject that matches up with the '{' for
	// the designator, we shouldn't be handling the
	// designator. Return immediately.
	if (!SubobjectIsDesignatorContext)
	return;

	// Handle this designated initializer. Field will be updated to
	// the next field that we'll be initializing.
	if (CheckDesignatedInitializer(Entity, IList, DIE, 0,
	DeclType, &Field, nullptr, Index,
	StructuredList, StructuredIndex,
	true, TopLevelObject))
	hadError = true;

	InitializedSomething = true;

	// Disable check for missing fields when designators are used.
	// This matches gcc behaviour.
	CheckForMissingFields = false;
	continue;
	}

	if (Field == FieldEnd) {
	// We've run out of fields. We're done.
	break;
	}

	// We've already initialized a member of a union. We're done.
	if (InitializedSomething && DeclType->isUnionType())
	break;

	// If we've hit the flexible array member at the end, we're done.
	if (Field->getType()->isIncompleteArrayType())
	break;

	if (Field->isUnnamedBitfield()) {
	// Don't initialize unnamed bitfields, e.g. "int : 20;"
	++Field;
	continue;
	}

	// Make sure we can use this declaration.
	bool InvalidUse;
	if (VerifyOnly)
	InvalidUse = !SemaRef.CanUseDecl(*Field, TreatUnavailableAsInvalid);
	else
	InvalidUse = SemaRef.DiagnoseUseOfDecl(*Field,
	IList->getInit(Index)->getLocStart());
	if (InvalidUse) {
	++Index;
	++Field;
	hadError = true;
	continue;
	}

	InitializedEntity MemberEntity =
	InitializedEntity::InitializeMember(*Field, &Entity);
	CheckSubElementType(MemberEntity, IList, Field->getType(), Index,
	StructuredList, StructuredIndex);
	InitializedSomething = true;

	if (DeclType->isUnionType() && !VerifyOnly) {
	// Initialize the first field within the union.
	StructuredList->setInitializedFieldInUnion(*Field);
	}

	++Field;
	}

	// Emit warnings for missing struct field initializers.
	if (!VerifyOnly && InitializedSomething && CheckForMissingFields &&
	Field != FieldEnd && !Field->getType()->isIncompleteArrayType() &&
	!DeclType->isUnionType()) {
	// It is possible we have one or more unnamed bitfields remaining.
	// Find first (if any) named field and emit warning.
	for (RecordDecl::field_iterator it = Field, end = RD->field_end();
	it != end; ++it) {
	if (!it->isUnnamedBitfield() && !it->hasInClassInitializer()) {
	SemaRef.Diag(IList->getSourceRange().getEnd(),
	diag::warn_missing_field_initializers) << *it;
	break;
	}
	}
	}

	// Check that any remaining fields can be value-initialized.
	if (VerifyOnly && Field != FieldEnd && !DeclType->isUnionType() &&
	!Field->getType()->isIncompleteArrayType()) {
	// FIXME: Should check for holes left by designated initializers too.
	for (; Field != FieldEnd && !hadError; ++Field) {
	if (!Field->isUnnamedBitfield() && !Field->hasInClassInitializer())
	CheckEmptyInitializable(
	InitializedEntity::InitializeMember(*Field, &Entity),
	IList->getLocEnd());
	}
	}

	if (Field == FieldEnd \|\| !Field->getType()->isIncompleteArrayType() \|\|
	Index >= IList->getNumInits())
	return;

	if (CheckFlexibleArrayInit(Entity, IList->getInit(Index), *Field,
	TopLevelObject)) {
	hadError = true;
	++Index;
	return;
	}

	InitializedEntity MemberEntity =
	InitializedEntity::InitializeMember(*Field, &Entity);

	if (isa<InitListExpr>(IList->getInit(Index)))
	CheckSubElementType(MemberEntity, IList, Field->getType(), Index,
	StructuredList, StructuredIndex);
	else
	CheckImplicitInitList(MemberEntity, IList, Field->getType(), Index,
	StructuredList, StructuredIndex);
	}

	/// \brief Rewrite a designator that names a member of an anonymous
	/// struct or union as an explicit chain of field designators, one per
	/// link of \p IndirectField's chain, so that the designation spells out
	/// the full subobject path to the member.
	///
	/// Only the last designator of the chain keeps the dot and field
	/// locations of the designator being replaced; the designators for the
	/// enclosing anonymous members get default-constructed locations.
	static void ExpandAnonymousFieldDesignator(Sema &SemaRef,
	                                           DesignatedInitExpr *DIE,
	                                           unsigned DesigIdx,
	                                           IndirectFieldDecl *IndirectField) {
	  typedef DesignatedInitExpr::Designator Designator;

	  // Collect one replacement designator per link in the indirect-field chain.
	  SmallVector<Designator, 4> Replacements;
	  IndirectFieldDecl::chain_iterator Link = IndirectField->chain_begin();
	  IndirectFieldDecl::chain_iterator ChainEnd = IndirectField->chain_end();
	  while (Link != ChainEnd) {
	    SourceLocation DotLoc, FieldLoc;
	    if (Link + 1 == ChainEnd) {
	      // The final link is the member that was actually written in the source.
	      DotLoc = DIE->getDesignator(DesigIdx)->getDotLoc();
	      FieldLoc = DIE->getDesignator(DesigIdx)->getFieldLoc();
	    }
	    Replacements.push_back(
	        Designator((IdentifierInfo *)nullptr, DotLoc, FieldLoc));
	    assert(isa<FieldDecl>(*Link));
	    Replacements.back().setField(cast<FieldDecl>(*Link));
	    ++Link;
	  }

	  // Splice the replacement designators in place of the designator at
	  // DesigIdx.
	  Designator *First = &Replacements[0];
	  DIE->ExpandDesignator(SemaRef.Context, DesigIdx, First,
	                        First + Replacements.size());
	}

	/// Build a new DesignatedInitExpr with the same designators, index
	/// subexpressions, '='/':' location, GNU-syntax flag and initializer as
	/// \p DIE.
	static DesignatedInitExpr *CloneDesignatedInitExpr(Sema &SemaRef,
	                                                   DesignatedInitExpr *DIE) {
	  // Subexpression 0 is skipped; the initializer is passed separately via
	  // getInit() below.
	  const unsigned NumIndexExprs = DIE->getNumSubExprs() - 1;
	  SmallVector<Expr*, 4> IndexExprs;
	  IndexExprs.reserve(NumIndexExprs);
	  for (unsigned I = 0; I != NumIndexExprs; ++I)
	    IndexExprs.push_back(DIE->getSubExpr(I + 1));

	  return DesignatedInitExpr::Create(SemaRef.Context, DIE->designators(),
	                                    IndexExprs, DIE->getEqualOrColonLoc(),
	                                    DIE->usesGNUSyntax(), DIE->getInit());
	}

	namespace {

	// Typo-correction filter for field designators: a candidate is accepted
	// only when it is a FieldDecl whose redeclaration context equals the
	// struct or union the callback was constructed with.
	class FieldInitializerValidatorCCC : public CorrectionCandidateCallback {
	 public:
	  explicit FieldInitializerValidatorCCC(RecordDecl *RD) : Record(RD) {}

	  bool ValidateCandidate(const TypoCorrection &candidate) override {
	    if (FieldDecl *FD = candidate.getCorrectionDeclAs<FieldDecl>())
	      return FD->getDeclContext()->getRedeclContext()->Equals(Record);
	    // Not a field at all.
	    return false;
	  }

	 private:
	  // The record whose fields are acceptable corrections.
	  RecordDecl *Record;
	};

	} // end anonymous namespace

	/// @brief Check the well-formedness of a C99 designated initializer.
	///
	/// Determines whether the designated initializer @p DIE, which
	/// resides at the given @p Index within the initializer list @p
	/// IList, is well-formed for a current object of type
	/// @p CurrentObjectType (C99 6.7.8). The actual subobject that this
	/// designator refers to within the current subobject is returned in
	/// either @p NextField or @p NextElementIndex (whichever is appropriate).
	///
	/// The function recurses once per designator (@p DesigIdx + 1) and, once
	/// every designator has been consumed, checks the initializer expression
	/// itself against the designated subobject.
	///
	/// @param Entity The entity for the current object; member and element
	/// entities for the designated subobject are created with it as parent.
	///
	/// @param IList The initializer list in which this designated
	/// initializer occurs.
	///
	/// @param DIE The designated initializer expression.
	///
	/// @param DesigIdx The index of the current designator.
	///
	/// @param CurrentObjectType The type of the "current object" (C99 6.7.8p17),
	/// into which the designation in @p DIE should refer.
	///
	/// @param NextField If non-NULL and the first designator in @p DIE is
	/// a field, this will be set to the field declaration following the
	/// field named by the designator.
	///
	/// @param NextElementIndex If non-NULL and the first designator in @p
	/// DIE is an array designator or GNU array-range designator, this
	/// will be set to the index following the last index initialized by
	/// this designator.
	///
	/// @param Index Index into @p IList where the designated initializer
	/// @p DIE occurs.
	///
	/// @param StructuredList The initializer list expression that
	/// describes all of the subobject initializers in the order they'll
	/// actually be initialized.
	///
	/// @param StructuredIndex Index into @p StructuredList of the current
	/// subobject. For the first designator it is updated to the position that
	/// follows the designated field or element.
	///
	/// @param FinishSubobjectInit If true, and this is not the first
	/// designator, the remaining fields/elements of the current subobject are
	/// checked after the designated one.
	///
	/// @param TopLevelObject Forwarded to the flexible-array-member check; the
	/// recursive calls for nested designators pass false.
	///
	/// @returns true if there was an error, false otherwise.
	bool
	InitListChecker::CheckDesignatedInitializer(const InitializedEntity &Entity,
	                                            InitListExpr *IList,
	                                            DesignatedInitExpr *DIE,
	                                            unsigned DesigIdx,
	                                            QualType &CurrentObjectType,
	                                            RecordDecl::field_iterator *NextField,
	                                            llvm::APSInt *NextElementIndex,
	                                            unsigned &Index,
	                                            InitListExpr *StructuredList,
	                                            unsigned &StructuredIndex,
	                                            bool FinishSubobjectInit,
	                                            bool TopLevelObject) {
	  if (DesigIdx == DIE->size()) {
	    // Check the actual initialization for the designated object type.
	    bool prevHadError = hadError;

	    // Temporarily remove the designator expression from the
	    // initializer list that the child calls see, so that we don't try
	    // to re-process the designator.
	    unsigned OldIndex = Index;
	    IList->setInit(OldIndex, DIE->getInit());

	    CheckSubElementType(Entity, IList, CurrentObjectType, Index,
	                        StructuredList, StructuredIndex);

	    // Restore the designated initializer expression in the syntactic
	    // form of the initializer list.
	    if (IList->getInit(OldIndex) != DIE->getInit())
	      DIE->setInit(IList->getInit(OldIndex));
	    IList->setInit(OldIndex, DIE);

	    return hadError && !prevHadError;
	  }

	  DesignatedInitExpr::Designator *D = DIE->getDesignator(DesigIdx);
	  bool IsFirstDesignator = (DesigIdx == 0);
	  if (!VerifyOnly) {
	    assert((IsFirstDesignator || StructuredList) &&
	           "Need a non-designated initializer list to start from");

	    // Determine the structural initializer list that corresponds to the
	    // current subobject.
	    if (IsFirstDesignator)
	      StructuredList = SyntacticToSemantic.lookup(IList);
	    else {
	      Expr *ExistingInit = StructuredIndex < StructuredList->getNumInits() ?
	          StructuredList->getInit(StructuredIndex) : nullptr;
	      if (!ExistingInit && StructuredList->hasArrayFiller())
	        ExistingInit = StructuredList->getArrayFiller();

	      if (!ExistingInit)
	        StructuredList =
	          getStructuredSubobjectInit(IList, Index, CurrentObjectType,
	                                     StructuredList, StructuredIndex,
	                                     SourceRange(D->getLocStart(),
	                                                 DIE->getLocEnd()));
	      else if (InitListExpr *Result = dyn_cast<InitListExpr>(ExistingInit))
	        StructuredList = Result;
	      else {
	        if (DesignatedInitUpdateExpr *E =
	                dyn_cast<DesignatedInitUpdateExpr>(ExistingInit))
	          StructuredList = E->getUpdater();
	        else {
	          DesignatedInitUpdateExpr *DIUE =
	              new (SemaRef.Context) DesignatedInitUpdateExpr(SemaRef.Context,
	                                        D->getLocStart(), ExistingInit,
	                                        DIE->getLocEnd());
	          StructuredList->updateInit(SemaRef.Context, StructuredIndex, DIUE);
	          StructuredList = DIUE->getUpdater();
	        }

	        // We need to check on source range validity because the previous
	        // initializer does not have to be an explicit initializer. e.g.,
	        //
	        // struct P { int a, b; };
	        // struct PP { struct P p } l = { { .a = 2 }, .p.b = 3 };
	        //
	        // There is an overwrite taking place because the first braced initializer
	        // list "{ .a = 2 }" already provides value for .p.b (which is zero).
	        if (ExistingInit->getSourceRange().isValid()) {
	          // We are creating an initializer list that initializes the
	          // subobjects of the current object, but there was already an
	          // initialization that completely initialized the current
	          // subobject, e.g., by a compound literal:
	          //
	          // struct X { int a, b; };
	          // struct X xs[] = { [0] = (struct X) { 1, 2 }, [0].b = 3 };
	          //
	          // Here, xs[0].a == 0 and xs[0].b == 3, since the second,
	          // designated initializer re-initializes the whole
	          // subobject [0], overwriting previous initializers.
	          SemaRef.Diag(D->getLocStart(),
	                       diag::warn_subobject_initializer_overrides)
	            << SourceRange(D->getLocStart(), DIE->getLocEnd());

	          SemaRef.Diag(ExistingInit->getLocStart(),
	                       diag::note_previous_initializer)
	            << /*FIXME:has side effects=*/0
	            << ExistingInit->getSourceRange();
	        }
	      }
	    }
	    assert(StructuredList && "Expected a structured initializer list");
	  }

	  if (D->isFieldDesignator()) {
	    // C99 6.7.8p7:
	    //
	    //   If a designator has the form
	    //
	    //      . identifier
	    //
	    //   then the current object (defined below) shall have
	    //   structure or union type and the identifier shall be the
	    //   name of a member of that type.
	    const RecordType *RT = CurrentObjectType->getAs<RecordType>();
	    if (!RT) {
	      SourceLocation Loc = D->getDotLoc();
	      if (Loc.isInvalid())
	        Loc = D->getFieldLoc();
	      if (!VerifyOnly)
	        SemaRef.Diag(Loc, diag::err_field_designator_non_aggr)
	          << SemaRef.getLangOpts().CPlusPlus << CurrentObjectType;
	      ++Index;
	      return true;
	    }

	    FieldDecl *KnownField = D->getField();
	    if (!KnownField) {
	      IdentifierInfo *FieldName = D->getFieldName();
	      DeclContext::lookup_result Lookup = RT->getDecl()->lookup(FieldName);
	      for (NamedDecl *ND : Lookup) {
	        if (auto *FD = dyn_cast<FieldDecl>(ND)) {
	          KnownField = FD;
	          break;
	        }
	        if (auto *IFD = dyn_cast<IndirectFieldDecl>(ND)) {
	          // In verify mode, don't modify the original.
	          if (VerifyOnly)
	            DIE = CloneDesignatedInitExpr(SemaRef, DIE);
	          ExpandAnonymousFieldDesignator(SemaRef, DIE, DesigIdx, IFD);
	          D = DIE->getDesignator(DesigIdx);
	          KnownField = cast<FieldDecl>(*IFD->chain_begin());
	          break;
	        }
	      }
	      if (!KnownField) {
	        if (VerifyOnly) {
	          ++Index;
	          return true; // No typo correction when just trying this out.
	        }

	        // Name lookup found something, but it wasn't a field.
	        if (!Lookup.empty()) {
	          SemaRef.Diag(D->getFieldLoc(), diag::err_field_designator_nonfield)
	            << FieldName;
	          SemaRef.Diag(Lookup.front()->getLocation(),
	                       diag::note_field_designator_found);
	          ++Index;
	          return true;
	        }

	        // Name lookup didn't find anything.
	        // Determine whether this was a typo for another field name.
	        if (TypoCorrection Corrected = SemaRef.CorrectTypo(
	                DeclarationNameInfo(FieldName, D->getFieldLoc()),
	                Sema::LookupMemberName, /*Scope=*/nullptr, /*SS=*/nullptr,
	                llvm::make_unique<FieldInitializerValidatorCCC>(RT->getDecl()),
	                Sema::CTK_ErrorRecovery, RT->getDecl())) {
	          SemaRef.diagnoseTypo(
	              Corrected,
	              SemaRef.PDiag(diag::err_field_designator_unknown_suggest)
	                << FieldName << CurrentObjectType);
	          KnownField = Corrected.getCorrectionDeclAs<FieldDecl>();
	          hadError = true;
	        } else {
	          // Typo correction didn't find anything.
	          SemaRef.Diag(D->getFieldLoc(), diag::err_field_designator_unknown)
	            << FieldName << CurrentObjectType;
	          ++Index;
	          return true;
	        }
	      }
	    }

	    // Compute the position of the field in the structured list: C++ base
	    // classes come first, and unnamed bit-fields do not occupy a slot.
	    unsigned FieldIndex = 0;

	    if (auto *CXXRD = dyn_cast<CXXRecordDecl>(RT->getDecl()))
	      FieldIndex = CXXRD->getNumBases();

	    for (auto *FI : RT->getDecl()->fields()) {
	      if (FI->isUnnamedBitfield())
	        continue;
	      if (declaresSameEntity(KnownField, FI)) {
	        KnownField = FI;
	        break;
	      }
	      ++FieldIndex;
	    }

	    RecordDecl::field_iterator Field =
	        RecordDecl::field_iterator(DeclContext::decl_iterator(KnownField));

	    // All of the fields of a union are located at the same place in
	    // the initializer list.
	    if (RT->getDecl()->isUnion()) {
	      FieldIndex = 0;
	      if (!VerifyOnly) {
	        FieldDecl *CurrentField = StructuredList->getInitializedFieldInUnion();
	        if (CurrentField && !declaresSameEntity(CurrentField, *Field)) {
	          assert(StructuredList->getNumInits() == 1
	                 && "A union should never have more than one initializer!");

	          Expr *ExistingInit = StructuredList->getInit(0);
	          if (ExistingInit) {
	            // We're about to throw away an initializer, emit warning.
	            SemaRef.Diag(D->getFieldLoc(),
	                         diag::warn_initializer_overrides)
	              << D->getSourceRange();
	            SemaRef.Diag(ExistingInit->getLocStart(),
	                         diag::note_previous_initializer)
	              << /*FIXME:has side effects=*/0
	              << ExistingInit->getSourceRange();
	          }

	          // remove existing initializer
	          StructuredList->resizeInits(SemaRef.Context, 0);
	          StructuredList->setInitializedFieldInUnion(nullptr);
	        }

	        StructuredList->setInitializedFieldInUnion(*Field);
	      }
	    }

	    // Make sure we can use this declaration.
	    bool InvalidUse;
	    if (VerifyOnly)
	      InvalidUse = !SemaRef.CanUseDecl(*Field, TreatUnavailableAsInvalid);
	    else
	      InvalidUse = SemaRef.DiagnoseUseOfDecl(*Field, D->getFieldLoc());
	    if (InvalidUse) {
	      ++Index;
	      return true;
	    }

	    if (!VerifyOnly) {
	      // Update the designator with the field declaration.
	      D->setField(*Field);

	      // Make sure that our non-designated initializer list has space
	      // for a subobject corresponding to this field.
	      if (FieldIndex >= StructuredList->getNumInits())
	        StructuredList->resizeInits(SemaRef.Context, FieldIndex + 1);
	    }

	    // This designator names a flexible array member.
	    if (Field->getType()->isIncompleteArrayType()) {
	      bool Invalid = false;
	      if ((DesigIdx + 1) != DIE->size()) {
	        // We can't designate an object within the flexible array
	        // member (because GCC doesn't allow it).
	        if (!VerifyOnly) {
	          DesignatedInitExpr::Designator *NextD
	            = DIE->getDesignator(DesigIdx + 1);
	          SemaRef.Diag(NextD->getLocStart(),
	                       diag::err_designator_into_flexible_array_member)
	            << SourceRange(NextD->getLocStart(),
	                           DIE->getLocEnd());
	          SemaRef.Diag(Field->getLocation(), diag::note_flexible_array_member)
	            << *Field;
	        }
	        Invalid = true;
	      }

	      if (!hadError && !isa<InitListExpr>(DIE->getInit()) &&
	          !isa<StringLiteral>(DIE->getInit())) {
	        // The initializer is not an initializer list.
	        if (!VerifyOnly) {
	          SemaRef.Diag(DIE->getInit()->getLocStart(),
	                       diag::err_flexible_array_init_needs_braces)
	            << DIE->getInit()->getSourceRange();
	          SemaRef.Diag(Field->getLocation(), diag::note_flexible_array_member)
	            << *Field;
	        }
	        Invalid = true;
	      }

	      // Check GNU flexible array initializer.
	      if (!Invalid && CheckFlexibleArrayInit(Entity, DIE->getInit(), *Field,
	                                             TopLevelObject))
	        Invalid = true;

	      if (Invalid) {
	        ++Index;
	        return true;
	      }

	      // Initialize the array.
	      bool prevHadError = hadError;
	      unsigned newStructuredIndex = FieldIndex;
	      unsigned OldIndex = Index;
	      // As above, hide the designator from the child call and restore it
	      // afterwards.
	      IList->setInit(Index, DIE->getInit());

	      InitializedEntity MemberEntity =
	        InitializedEntity::InitializeMember(*Field, &Entity);
	      CheckSubElementType(MemberEntity, IList, Field->getType(), Index,
	                          StructuredList, newStructuredIndex);

	      IList->setInit(OldIndex, DIE);
	      if (hadError && !prevHadError) {
	        ++Field;
	        ++FieldIndex;
	        if (NextField)
	          *NextField = Field;
	        StructuredIndex = FieldIndex;
	        return true;
	      }
	    } else {
	      // Recurse to check later designated subobjects.
	      QualType FieldType = Field->getType();
	      unsigned newStructuredIndex = FieldIndex;

	      InitializedEntity MemberEntity =
	        InitializedEntity::InitializeMember(*Field, &Entity);
	      if (CheckDesignatedInitializer(MemberEntity, IList, DIE, DesigIdx + 1,
	                                     FieldType, nullptr, nullptr, Index,
	                                     StructuredList, newStructuredIndex,
	                                     FinishSubobjectInit, false))
	        return true;
	    }

	    // Find the position of the next field to be initialized in this
	    // subobject.
	    ++Field;
	    ++FieldIndex;

	    // If this is the first designator, our caller will continue checking
	    // the rest of this struct/class/union subobject.
	    if (IsFirstDesignator) {
	      if (NextField)
	        *NextField = Field;
	      StructuredIndex = FieldIndex;
	      return false;
	    }

	    if (!FinishSubobjectInit)
	      return false;

	    // We've already initialized something in the union; we're done.
	    if (RT->getDecl()->isUnion())
	      return hadError;

	    // Check the remaining fields within this class/struct/union subobject.
	    bool prevHadError = hadError;

	    // An empty base-class range: only the fields after the designated one
	    // remain to be checked.
	    auto NoBases =
	        CXXRecordDecl::base_class_range(CXXRecordDecl::base_class_iterator(),
	                                        CXXRecordDecl::base_class_iterator());
	    CheckStructUnionTypes(Entity, IList, CurrentObjectType, NoBases, Field,
	                          false, Index, StructuredList, FieldIndex);
	    return hadError && !prevHadError;
	  }

	  // C99 6.7.8p6:
	  //
	  //   If a designator has the form
	  //
	  //      [ constant-expression ]
	  //
	  //   then the current object (defined below) shall have array
	  //   type and the expression shall be an integer constant
	  //   expression. If the array is of unknown size, any
	  //   nonnegative value is valid.
	  //
	  // Additionally, cope with the GNU extension that permits
	  // designators of the form
	  //
	  //      [ constant-expression ... constant-expression ]
	  const ArrayType *AT = SemaRef.Context.getAsArrayType(CurrentObjectType);
	  if (!AT) {
	    if (!VerifyOnly)
	      SemaRef.Diag(D->getLBracketLoc(), diag::err_array_designator_non_array)
	        << CurrentObjectType;
	    ++Index;
	    return true;
	  }

	  Expr *IndexExpr = nullptr;
	  llvm::APSInt DesignatedStartIndex, DesignatedEndIndex;
	  if (D->isArrayDesignator()) {
	    IndexExpr = DIE->getArrayIndex(*D);
	    DesignatedStartIndex = IndexExpr->EvaluateKnownConstInt(SemaRef.Context);
	    DesignatedEndIndex = DesignatedStartIndex;
	  } else {
	    assert(D->isArrayRangeDesignator() && "Need array-range designator");

	    DesignatedStartIndex =
	      DIE->getArrayRangeStart(*D)->EvaluateKnownConstInt(SemaRef.Context);
	    DesignatedEndIndex =
	      DIE->getArrayRangeEnd(*D)->EvaluateKnownConstInt(SemaRef.Context);
	    IndexExpr = DIE->getArrayRangeEnd(*D);

	    // Codegen can't handle evaluating array range designators that have side
	    // effects, because we replicate the AST value for each initialized element.
	    // As such, set the sawArrayRangeDesignator() bit if we initialize multiple
	    // elements with something that has a side effect, so codegen can emit an
	    // "error unsupported" error instead of miscompiling the app.
	    if (DesignatedStartIndex.getZExtValue()!=DesignatedEndIndex.getZExtValue()&&
	        DIE->getInit()->HasSideEffects(SemaRef.Context) && !VerifyOnly)
	      FullyStructuredList->sawArrayRangeDesignator();
	  }

	  if (isa<ConstantArrayType>(AT)) {
	    // Bring both indices to the width and signedness of the array bound so
	    // they can be compared against it.
	    llvm::APSInt MaxElements(cast<ConstantArrayType>(AT)->getSize(), false);
	    DesignatedStartIndex
	      = DesignatedStartIndex.extOrTrunc(MaxElements.getBitWidth());
	    DesignatedStartIndex.setIsUnsigned(MaxElements.isUnsigned());
	    DesignatedEndIndex
	      = DesignatedEndIndex.extOrTrunc(MaxElements.getBitWidth());
	    DesignatedEndIndex.setIsUnsigned(MaxElements.isUnsigned());
	    if (DesignatedEndIndex >= MaxElements) {
	      if (!VerifyOnly)
	        SemaRef.Diag(IndexExpr->getLocStart(),
	                     diag::err_array_designator_too_large)
	          << DesignatedEndIndex.toString(10) << MaxElements.toString(10)
	          << IndexExpr->getSourceRange();
	      ++Index;
	      return true;
	    }
	  } else {
	    unsigned DesignatedIndexBitWidth =
	      ConstantArrayType::getMaxSizeBits(SemaRef.Context);
	    DesignatedStartIndex =
	      DesignatedStartIndex.extOrTrunc(DesignatedIndexBitWidth);
	    DesignatedEndIndex =
	      DesignatedEndIndex.extOrTrunc(DesignatedIndexBitWidth);
	    DesignatedStartIndex.setIsUnsigned(true);
	    DesignatedEndIndex.setIsUnsigned(true);
	  }

	  if (!VerifyOnly && StructuredList->isStringLiteralInit()) {
	    // We're modifying a string literal init; we have to decompose the string
	    // so we can modify the individual characters.
	    ASTContext &Context = SemaRef.Context;
	    Expr *SubExpr = StructuredList->getInit(0)->IgnoreParens();

	    // Compute the character type
	    QualType CharTy = AT->getElementType();

	    // Compute the type of the integer literals.
	    QualType PromotedCharTy = CharTy;
	    if (CharTy->isPromotableIntegerType())
	      PromotedCharTy = Context.getPromotedIntegerType(CharTy);
	    unsigned PromotedCharTyWidth = Context.getTypeSize(PromotedCharTy);

	    if (StringLiteral *SL = dyn_cast<StringLiteral>(SubExpr)) {
	      // Get the length of the string.
	      uint64_t StrLen = SL->getLength();
	      if (cast<ConstantArrayType>(AT)->getSize().ult(StrLen))
	        StrLen = cast<ConstantArrayType>(AT)->getSize().getZExtValue();
	      StructuredList->resizeInits(Context, StrLen);

	      // Build a literal for each character in the string, and put them into
	      // the init list.
	      for (unsigned i = 0, e = StrLen; i != e; ++i) {
	        llvm::APInt CodeUnit(PromotedCharTyWidth, SL->getCodeUnit(i));
	        Expr *Init = new (Context) IntegerLiteral(
	            Context, CodeUnit, PromotedCharTy, SubExpr->getExprLoc());
	        if (CharTy != PromotedCharTy)
	          Init = ImplicitCastExpr::Create(Context, CharTy, CK_IntegralCast,
	                                          Init, nullptr, VK_RValue);
	        StructuredList->updateInit(Context, i, Init);
	      }
	    } else {
	      ObjCEncodeExpr *E = cast<ObjCEncodeExpr>(SubExpr);
	      std::string Str;
	      Context.getObjCEncodingForType(E->getEncodedType(), Str);

	      // Get the length of the string.
	      uint64_t StrLen = Str.size();
	      if (cast<ConstantArrayType>(AT)->getSize().ult(StrLen))
	        StrLen = cast<ConstantArrayType>(AT)->getSize().getZExtValue();
	      StructuredList->resizeInits(Context, StrLen);

	      // Build a literal for each character in the string, and put them into
	      // the init list.
	      for (unsigned i = 0, e = StrLen; i != e; ++i) {
	        llvm::APInt CodeUnit(PromotedCharTyWidth, Str[i]);
	        Expr *Init = new (Context) IntegerLiteral(
	            Context, CodeUnit, PromotedCharTy, SubExpr->getExprLoc());
	        if (CharTy != PromotedCharTy)
	          Init = ImplicitCastExpr::Create(Context, CharTy, CK_IntegralCast,
	                                          Init, nullptr, VK_RValue);
	        StructuredList->updateInit(Context, i, Init);
	      }
	    }
	  }

	  // Make sure that our non-designated initializer list has space
	  // for a subobject corresponding to this array element.
	  if (!VerifyOnly &&
	      DesignatedEndIndex.getZExtValue() >= StructuredList->getNumInits())
	    StructuredList->resizeInits(SemaRef.Context,
	                                DesignatedEndIndex.getZExtValue() + 1);

	  // Repeatedly perform subobject initializations in the range
	  // [DesignatedStartIndex, DesignatedEndIndex].

	  // Move to the next designator
	  unsigned ElementIndex = DesignatedStartIndex.getZExtValue();
	  unsigned OldIndex = Index;

	  InitializedEntity ElementEntity =
	    InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity);

	  while (DesignatedStartIndex <= DesignatedEndIndex) {
	    // Recurse to check later designated subobjects.
	    QualType ElementType = AT->getElementType();
	    // Every element of the range is initialized from the same initializer,
	    // so rewind to the designator's position each time round.
	    Index = OldIndex;

	    ElementEntity.setElementIndex(ElementIndex);
	    if (CheckDesignatedInitializer(
	            ElementEntity, IList, DIE, DesigIdx + 1, ElementType, nullptr,
	            nullptr, Index, StructuredList, ElementIndex,
	            FinishSubobjectInit && (DesignatedStartIndex == DesignatedEndIndex),
	            false))
	      return true;

	    // Move to the next index in the array that we'll be initializing.
	    ++DesignatedStartIndex;
	    ElementIndex = DesignatedStartIndex.getZExtValue();
	  }

	  // If this is the first designator, our caller will continue checking
	  // the rest of this array subobject.
	  if (IsFirstDesignator) {
	    if (NextElementIndex)
	      *NextElementIndex = DesignatedStartIndex;
	    StructuredIndex = ElementIndex;
	    return false;
	  }

	  if (!FinishSubobjectInit)
	    return false;

	  // Check the remaining elements within this array subobject.
	  bool prevHadError = hadError;
	  CheckArrayType(Entity, IList, CurrentObjectType, DesignatedStartIndex,
	                 /*SubobjectIsDesignatorContext=*/false, Index,
	                 StructuredList, ElementIndex);
	  return hadError && !prevHadError;
	}

	// Get the structured initializer list for a subobject of type
	// @p CurrentObjectType.
	//
	// Returns null in verification-only mode. Otherwise returns the
	// InitListExpr already stored for the subobject (unless
	// @p IsFullyOverwritten is set), or creates a new one, sizes it from
	// @p CurrentObjectType and links it into @p StructuredList at
	// @p StructuredIndex. When @p StructuredList is null the new list is
	// instead registered as the semantic form of @p IList.
	//
	// @p InitRange supplies the brace locations of a newly created list and
	// the location of the "initializer overrides" warning. @p Index is the
	// position in @p IList used to guess how many initializers to reserve.
	InitListExpr *
	InitListChecker::getStructuredSubobjectInit(InitListExpr *IList, unsigned Index,
	                                            QualType CurrentObjectType,
	                                            InitListExpr *StructuredList,
	                                            unsigned StructuredIndex,
	                                            SourceRange InitRange,
	                                            bool IsFullyOverwritten) {
	  if (VerifyOnly)
	    return nullptr; // No structured list in verification-only mode.
	  Expr *ExistingInit = nullptr;
	  if (!StructuredList)
	    ExistingInit = SyntacticToSemantic.lookup(IList);
	  else if (StructuredIndex < StructuredList->getNumInits())
	    ExistingInit = StructuredList->getInit(StructuredIndex);

	  if (InitListExpr *Result = dyn_cast_or_null<InitListExpr>(ExistingInit)) {
	    // There might have already been initializers for subobjects of the current
	    // object, but a subsequent initializer list will overwrite the entirety
	    // of the current object. (See DR 253 and C99 6.7.8p21). e.g.,
	    //
	    // struct P { char x[6]; };
	    // struct P l = { .x[2] = 'x', .x = { [0] = 'f' } };
	    //
	    // The first designated initializer is ignored, and l.x is just "f".
	    if (!IsFullyOverwritten)
	      return Result;
	  }

	  if (ExistingInit) {
	    // We are creating an initializer list that initializes the
	    // subobjects of the current object, but there was already an
	    // initialization that completely initialized the current
	    // subobject, e.g., by a compound literal:
	    //
	    // struct X { int a, b; };
	    // struct X xs[] = { [0] = (struct X) { 1, 2 }, [0].b = 3 };
	    //
	    // Here, xs[0].a == 0 and xs[0].b == 3, since the second,
	    // designated initializer re-initializes the whole
	    // subobject [0], overwriting previous initializers.
	    SemaRef.Diag(InitRange.getBegin(),
	                 diag::warn_subobject_initializer_overrides)
	      << InitRange;
	    SemaRef.Diag(ExistingInit->getLocStart(),
	                 diag::note_previous_initializer)
	      << /*FIXME:has side effects=*/0
	      << ExistingInit->getSourceRange();
	  }

	  InitListExpr *Result
	    = new (SemaRef.Context) InitListExpr(SemaRef.Context,
	                                         InitRange.getBegin(), None,
	                                         InitRange.getEnd());

	  QualType ResultType = CurrentObjectType;
	  if (!ResultType->isArrayType())
	    ResultType = ResultType.getNonLValueExprType(SemaRef.Context);
	  Result->setType(ResultType);

	  // Pre-allocate storage for the structured initializer list.
	  unsigned NumElements = 0;
	  unsigned NumInits = 0;
	  bool GotNumInits = false;
	  if (!StructuredList) {
	    NumInits = IList->getNumInits();
	    GotNumInits = true;
	  } else if (Index < IList->getNumInits()) {
	    if (InitListExpr *SubList = dyn_cast<InitListExpr>(IList->getInit(Index))) {
	      NumInits = SubList->getNumInits();
	      GotNumInits = true;
	    }
	  }

	  if (const ArrayType *AType
	      = SemaRef.Context.getAsArrayType(CurrentObjectType)) {
	    if (const ConstantArrayType *CAType = dyn_cast<ConstantArrayType>(AType)) {
	      NumElements = CAType->getSize().getZExtValue();
	      // Simple heuristic so that we don't allocate a very large
	      // initializer with many empty entries at the end.
	      if (GotNumInits && NumElements > NumInits)
	        NumElements = 0;
	    }
	  } else if (const VectorType *VType = CurrentObjectType->getAs<VectorType>())
	    NumElements = VType->getNumElements();
	  else if (const RecordType *RType = CurrentObjectType->getAs<RecordType>()) {
	    RecordDecl *RDecl = RType->getDecl();
	    if (RDecl->isUnion())
	      NumElements = 1;
	    else
	      NumElements = std::distance(RDecl->field_begin(), RDecl->field_end());
	  }

	  Result->reserveInits(SemaRef.Context, NumElements);

	  // Link this new initializer list into the structured initializer
	  // lists.
	  if (StructuredList)
	    StructuredList->updateInit(SemaRef.Context, StructuredIndex, Result);
	  else {
	    Result->setSyntacticForm(IList);
	    SyntacticToSemantic[IList] = Result;
	  }

	  return Result;
	}

	/// Update the initializer at index @p StructuredIndex within the
	/// structured initializer list to the value @p expr.
	///
	/// Does nothing when @p StructuredList is null. Otherwise stores @p expr,
	/// warns if it replaces a previous initializer that has a valid source
	/// range, and advances @p StructuredIndex by one.
	void InitListChecker::UpdateStructuredListElement(InitListExpr *StructuredList,
	                                                  unsigned &StructuredIndex,
	                                                  Expr *expr) {
	  // No structured initializer list to update
	  if (!StructuredList)
	    return;

	  if (Expr *PrevInit = StructuredList->updateInit(SemaRef.Context,
	                                                  StructuredIndex, expr)) {
	    // This initializer overwrites a previous initializer. Warn.
	    // We need to check on source range validity because the previous
	    // initializer does not have to be an explicit initializer.
	    // struct P { int a, b; };
	    // struct PP { struct P p } l = { { .a = 2 }, .p.b = 3 };
	    // There is an overwrite taking place because the first braced initializer
	    // list "{ .a = 2 }" already provides value for .p.b (which is zero).
	    if (PrevInit->getSourceRange().isValid()) {
	      SemaRef.Diag(expr->getLocStart(),
	                   diag::warn_initializer_overrides)
	        << expr->getSourceRange();

	      SemaRef.Diag(PrevInit->getLocStart(),
	                   diag::note_previous_initializer)
	        << /*FIXME:has side effects=*/0
	        << PrevInit->getSourceRange();
	    }
	  }

	  ++StructuredIndex;
	}

	/// Validate an array-designator index expression.
	///
	/// Wraps VerifyIntegerConstantExpression and additionally rejects
	/// negative values with err_array_designator_negative. On success the
	/// (possibly converted) index expression is returned, @p Value holds the
	/// evaluated constant, and @p Value is marked unsigned. If the constant
	/// expression check itself fails, its invalid result is returned as is.
	static ExprResult
	CheckArrayDesignatorExpr(Sema &S, Expr *Index, llvm::APSInt &Value) {
	  // Remember where the index starts for the negative-value diagnostic.
	  SourceLocation StartLoc = Index->getLocStart();

	  // The index has to be an integer constant expression.
	  ExprResult Checked = S.VerifyIntegerConstantExpression(Index, &Value);
	  if (!Checked.isInvalid()) {
	    const bool IsNegative = Value.isSigned() && Value.isNegative();
	    if (IsNegative)
	      return S.Diag(StartLoc, diag::err_array_designator_negative)
	        << Value.toString(10) << Index->getSourceRange();

	    Value.setIsUnsigned(true);
	  }
	  return Checked;
	}

	/// Build a DesignatedInitExpr from a parsed designation.
	///
	/// Every designator in \p Desig is converted to its AST form. Array index and
	/// array range bounds that are neither type- nor value-dependent are checked
	/// with CheckArrayDesignatorExpr, and an empty range (end < start) is
	/// diagnosed. Returns ExprError() when any designator is invalid or when
	/// \p Init is invalid; otherwise returns the new DesignatedInitExpr, after
	/// emitting ext_designated_init if C99 is not enabled.
	ExprResult Sema::ActOnDesignatedInitializer(Designation &Desig,
	                                            SourceLocation Loc,
	                                            bool GNUSyntax,
	                                            ExprResult Init) {
	  typedef DesignatedInitExpr::Designator ASTDesignator;

	  bool Invalid = false;
	  SmallVector<ASTDesignator, 32> Designators;
	  SmallVector<Expr *, 32> InitExpressions;

	  // Build designators and check array designator expressions.
	  for (unsigned Idx = 0; Idx < Desig.getNumDesignators(); ++Idx) {
	    const Designator &D = Desig.getDesignator(Idx);
	    switch (D.getKind()) {
	    case Designator::FieldDesignator:
	      Designators.push_back(ASTDesignator(D.getField(), D.getDotLoc(),
	                                          D.getFieldLoc()));
	      break;

	    case Designator::ArrayDesignator: {
	      Expr *Index = static_cast<Expr *>(D.getArrayIndex());
	      llvm::APSInt IndexValue;
	      // Dependent indices cannot be evaluated yet; leave them unchecked.
	      if (!Index->isTypeDependent() && !Index->isValueDependent())
	        Index = CheckArrayDesignatorExpr(*this, Index, IndexValue).get();
	      if (!Index)
	        Invalid = true;
	      else {
	        // The designator stores the position of its index expression within
	        // InitExpressions, so record the size before pushing.
	        Designators.push_back(ASTDesignator(InitExpressions.size(),
	                                            D.getLBracketLoc(),
	                                            D.getRBracketLoc()));
	        InitExpressions.push_back(Index);
	      }
	      break;
	    }

	    case Designator::ArrayRangeDesignator: {
	      Expr *StartIndex = static_cast<Expr *>(D.getArrayRangeStart());
	      Expr *EndIndex = static_cast<Expr *>(D.getArrayRangeEnd());
	      llvm::APSInt StartValue;
	      llvm::APSInt EndValue;
	      bool StartDependent = StartIndex->isTypeDependent() ||
	                            StartIndex->isValueDependent();
	      bool EndDependent = EndIndex->isTypeDependent() ||
	                          EndIndex->isValueDependent();
	      if (!StartDependent)
	        StartIndex =
	            CheckArrayDesignatorExpr(*this, StartIndex, StartValue).get();
	      if (!EndDependent)
	        EndIndex = CheckArrayDesignatorExpr(*this, EndIndex, EndValue).get();

	      if (!StartIndex || !EndIndex)
	        Invalid = true;
	      else {
	        // Make sure we're comparing values with the same bit width.
	        if (StartDependent || EndDependent) {
	          // Nothing to compute.
	        } else if (StartValue.getBitWidth() > EndValue.getBitWidth())
	          EndValue = EndValue.extend(StartValue.getBitWidth());
	        else if (StartValue.getBitWidth() < EndValue.getBitWidth())
	          StartValue = StartValue.extend(EndValue.getBitWidth());

	        if (!StartDependent && !EndDependent && EndValue < StartValue) {
	          Diag(D.getEllipsisLoc(), diag::err_array_designator_empty_range)
	              << StartValue.toString(10) << EndValue.toString(10)
	              << StartIndex->getSourceRange() << EndIndex->getSourceRange();
	          Invalid = true;
	        } else {
	          Designators.push_back(ASTDesignator(InitExpressions.size(),
	                                              D.getLBracketLoc(),
	                                              D.getEllipsisLoc(),
	                                              D.getRBracketLoc()));
	          InitExpressions.push_back(StartIndex);
	          InitExpressions.push_back(EndIndex);
	        }
	      }
	      break;
	    }
	    }
	  }

	  if (Invalid || Init.isInvalid())
	    return ExprError();

	  // Clear out the expressions within the designation.
	  Desig.ClearExprs(*this);

	  DesignatedInitExpr *DIE
	    = DesignatedInitExpr::Create(Context,
	                                 Designators,
	                                 InitExpressions, Loc, GNUSyntax,
	                                 Init.getAs<Expr>());

	  if (!getLangOpts().C99)
	    Diag(DIE->getLocStart(), diag::ext_designated_init)
	      << DIE->getSourceRange();

	  return DIE;
	}

	//===----------------------------------------------------------------------===//
	// Initialization entity
	//===----------------------------------------------------------------------===//

	/// Construct the entity for element \p Index of \p Parent.
	///
	/// The kind and element type are taken from the parent's type, which is
	/// tried as an array, then a vector, and finally asserted to be a complex
	/// type.
	InitializedEntity::InitializedEntity(ASTContext &Context, unsigned Index,
	                                     const InitializedEntity &Parent)
	    : Parent(&Parent), Index(Index) {
	  const QualType ParentTy = Parent.getType();

	  if (const ArrayType *ArrayTy = Context.getAsArrayType(ParentTy)) {
	    Kind = EK_ArrayElement;
	    Type = ArrayTy->getElementType();
	    return;
	  }

	  if (const VectorType *VectorTy = ParentTy->getAs<VectorType>()) {
	    Kind = EK_VectorElement;
	    Type = VectorTy->getElementType();
	    return;
	  }

	  const ComplexType *ComplexTy = ParentTy->getAs<ComplexType>();
	  assert(ComplexTy && "Unexpected type");
	  Kind = EK_ComplexElement;
	  Type = ComplexTy->getElementType();
	}

	/// Create the entity describing initialization of the base class \p Base.
	///
	/// The base specifier pointer is stored as an integer in Result.Base, and its
	/// low bit is set when \p IsInheritedVirtualBase is true. The entity's type
	/// is the type of the base specifier.
	InitializedEntity
	InitializedEntity::InitializeBase(ASTContext &Context,
	                                  const CXXBaseSpecifier *Base,
	                                  bool IsInheritedVirtualBase,
	                                  const InitializedEntity *Parent) {
	  InitializedEntity Result;
	  Result.Kind = EK_Base;
	  Result.Parent = Parent;
	  Result.Base = reinterpret_cast<uintptr_t>(Base);
	  // Tag the low bit of the stored pointer value for inherited virtual bases.
	  if (IsInheritedVirtualBase)
	    Result.Base |= 0x01;

	  Result.Type = Base->getType();
	  return Result;
	}

	/// Return the name of the entity being initialized.
	///
	/// Parameters, variables, members and bindings report the name of their
	/// declaration, and lambda captures report the name built from
	/// Capture.VarID. Every other kind yields an empty DeclarationName, as does a
	/// parameter entity whose declaration pointer is null.
	DeclarationName InitializedEntity::getName() const {
	  switch (getKind()) {
	  case EK_Parameter:
	  case EK_Parameter_CF_Audited: {
	    // The low bit of Parameter is masked off before the value is used as a
	    // declaration pointer.
	    ParmVarDecl *D = reinterpret_cast<ParmVarDecl*>(Parameter & ~0x1);
	    return (D ? D->getDeclName() : DeclarationName());
	  }

	  case EK_Variable:
	  case EK_Member:
	  case EK_Binding:
	    return Variable.VariableOrMember->getDeclName();

	  case EK_LambdaCapture:
	    return DeclarationName(Capture.VarID);

	  case EK_Result:
	  case EK_Exception:
	  case EK_New:
	  case EK_Temporary:
	  case EK_Base:
	  case EK_Delegating:
	  case EK_ArrayElement:
	  case EK_VectorElement:
	  case EK_ComplexElement:
	  case EK_BlockElement:
	  case EK_LambdaToBlockConversionBlockElement:
	  case EK_CompoundLiteralInit:
	  case EK_RelatedResult:
	    return DeclarationName();
	  }

	  llvm_unreachable("Invalid EntityKind!");
	}

	/// Return the declaration behind this entity, or nullptr for kinds that
	/// have no declaration to report.
	ValueDecl *InitializedEntity::getDecl() const {
	  switch (getKind()) {
	  // Kinds with no declaration.
	  case EK_Result:
	  case EK_Exception:
	  case EK_New:
	  case EK_Temporary:
	  case EK_Base:
	  case EK_Delegating:
	  case EK_ArrayElement:
	  case EK_VectorElement:
	  case EK_ComplexElement:
	  case EK_BlockElement:
	  case EK_LambdaToBlockConversionBlockElement:
	  case EK_LambdaCapture:
	  case EK_CompoundLiteralInit:
	  case EK_RelatedResult:
	    return nullptr;

	  // Parameters: strip the low bit to recover the declaration pointer.
	  case EK_Parameter:
	  case EK_Parameter_CF_Audited:
	    return reinterpret_cast<ParmVarDecl*>(Parameter & ~0x1);

	  // Kinds that store their declaration directly.
	  case EK_Variable:
	  case EK_Member:
	  case EK_Binding:
	    return Variable.VariableOrMember;
	  }

	  llvm_unreachable("Invalid EntityKind!");
	}

	/// Report the stored NRVO flag for result and exception entities; every
	/// other kind answers false.
	bool InitializedEntity::allowsNRVO() const {
	  switch (getKind()) {
	  case EK_Variable:
	  case EK_Parameter:
	  case EK_Parameter_CF_Audited:
	  case EK_Member:
	  case EK_Binding:
	  case EK_New:
	  case EK_Temporary:
	  case EK_CompoundLiteralInit:
	  case EK_Base:
	  case EK_Delegating:
	  case EK_ArrayElement:
	  case EK_VectorElement:
	  case EK_ComplexElement:
	  case EK_BlockElement:
	  case EK_LambdaToBlockConversionBlockElement:
	  case EK_LambdaCapture:
	  case EK_RelatedResult:
	    // No NRVO information is kept for these kinds.
	    break;

	  case EK_Result:
	  case EK_Exception:
	    return LocAndNRVO.NRVO;
	  }

	  return false;
	}

	/// Print this entity and its chain of parents to \p OS, outermost first.
	///
	/// Each line is prefixed with one "`-" per level of nesting, followed by the
	/// kind, the qualified name of the declaration (when getDecl() returns one),
	/// and the type in single quotes.
	///
	/// \return the nesting depth at which a child of this entity is printed.
	unsigned InitializedEntity::dumpImpl(raw_ostream &OS) const {
	  assert(getParent() != this);

	  // Parents are dumped first so the output reads from the outside in.
	  unsigned Depth = 0;
	  if (getParent())
	    Depth = getParent()->dumpImpl(OS);

	  for (unsigned Level = 0; Level < Depth; ++Level)
	    OS << "`-";

	  switch (getKind()) {
	  case EK_Variable:
	    OS << "Variable";
	    break;
	  case EK_Parameter:
	    OS << "Parameter";
	    break;
	  case EK_Parameter_CF_Audited:
	    OS << "CF audited function Parameter";
	    break;
	  case EK_Result:
	    OS << "Result";
	    break;
	  case EK_Exception:
	    OS << "Exception";
	    break;
	  case EK_Member:
	    OS << "Member";
	    break;
	  case EK_Binding:
	    OS << "Binding";
	    break;
	  case EK_New:
	    OS << "New";
	    break;
	  case EK_Temporary:
	    OS << "Temporary";
	    break;
	  case EK_CompoundLiteralInit:
	    OS << "CompoundLiteral";
	    break;
	  case EK_RelatedResult:
	    OS << "RelatedResult";
	    break;
	  case EK_Base:
	    OS << "Base";
	    break;
	  case EK_Delegating:
	    OS << "Delegating";
	    break;
	  case EK_ArrayElement:
	    OS << "ArrayElement " << Index;
	    break;
	  case EK_VectorElement:
	    OS << "VectorElement " << Index;
	    break;
	  case EK_ComplexElement:
	    OS << "ComplexElement " << Index;
	    break;
	  case EK_BlockElement:
	    OS << "Block";
	    break;
	  case EK_LambdaToBlockConversionBlockElement:
	    OS << "Block (lambda)";
	    break;
	  case EK_LambdaCapture:
	    OS << "LambdaCapture " << DeclarationName(Capture.VarID);
	    break;
	  }

	  if (auto *Named = getDecl()) {
	    OS << " ";
	    Named->printQualifiedName(OS);
	  }

	  OS << " '" << getType().getAsString() << "'\n";

	  return Depth + 1;
	}

	/// Dump this entity and its parents to llvm::errs().
	LLVM_DUMP_METHOD void InitializedEntity::dump() const {
	  raw_ostream &Err = llvm::errs();
	  dumpImpl(Err);
	}

	//===----------------------------------------------------------------------===//
	// Initialization sequence
	//===----------------------------------------------------------------------===//

	/// Release whatever this step owns. Only the conversion-sequence steps have
	/// anything to free (their ICS pointer); all other kinds are a no-op.
	void InitializationSequence::Step::Destroy() {
	  switch (Kind) {
	  case SK_ConversionSequence:
	  case SK_ConversionSequenceNoNarrowing:
	    delete ICS;
	    break;

	  case SK_ResolveAddressOfOverloadedFunction:
	  case SK_CastDerivedToBaseRValue:
	  case SK_CastDerivedToBaseXValue:
	  case SK_CastDerivedToBaseLValue:
	  case SK_BindReference:
	  case SK_BindReferenceToTemporary:
	  case SK_FinalCopy:
	  case SK_ExtraneousCopyToTemporary:
	  case SK_UserConversion:
	  case SK_QualificationConversionRValue:
	  case SK_QualificationConversionXValue:
	  case SK_QualificationConversionLValue:
	  case SK_AtomicConversion:
	  case SK_LValueToRValue:
	  case SK_ListInitialization:
	  case SK_UnwrapInitList:
	  case SK_RewrapInitList:
	  case SK_ConstructorInitialization:
	  case SK_ConstructorInitializationFromList:
	  case SK_ZeroInitialization:
	  case SK_CAssignment:
	  case SK_StringInit:
	  case SK_ObjCObjectConversion:
	  case SK_ArrayLoopIndex:
	  case SK_ArrayLoopInit:
	  case SK_ArrayInit:
	  case SK_GNUArrayInit:
	  case SK_ParenthesizedArrayInit:
	  case SK_PassByIndirectCopyRestore:
	  case SK_PassByIndirectRestore:
	  case SK_ProduceObjCObject:
	  case SK_StdInitializerList:
	  case SK_StdInitializerListConstructorCall:
	  case SK_OCLSamplerInit:
	  case SK_OCLZeroEvent:
	  case SK_OCLZeroQueue:
	    // Nothing to release.
	    break;
	  }
	}

	bool InitializationSequence::isDirectReferenceBinding() const {
	// There can be some lvalue adjustments after the SK_BindReference step.
	for (auto I = Steps.rbegin(); I != Steps.rend(); ++I) {
	if (I->Kind == SK_BindReference)
	return true;
	if (I->Kind == SK_BindReferenceToTemporary)
	return false;
	}
	return false;
	}

	/// Determine whether this sequence failed because overload resolution was
	/// ambiguous.
	///
	/// Returns false for a sequence that did not fail. Among the failure kinds,
	/// only the overload-resolution failures can be ambiguous, and they are
	/// ambiguous exactly when the recorded overload result is OR_Ambiguous.
	bool InitializationSequence::isAmbiguous() const {
	  if (!Failed())
	    return false;

	  switch (getFailureKind()) {
	  case FK_TooManyInitsForReference:
	  case FK_ParenthesizedListInitForReference:
	  case FK_ArrayNeedsInitList:
	  case FK_ArrayNeedsInitListOrStringLiteral:
	  case FK_ArrayNeedsInitListOrWideStringLiteral:
	  case FK_NarrowStringIntoWideCharArray:
	  case FK_WideStringIntoCharArray:
	  case FK_IncompatWideStringIntoWideChar:
	  case FK_AddressOfOverloadFailed: // FIXME: Could do better
	  case FK_NonConstLValueReferenceBindingToTemporary:
	  case FK_NonConstLValueReferenceBindingToBitfield:
	  case FK_NonConstLValueReferenceBindingToVectorElement:
	  case FK_NonConstLValueReferenceBindingToUnrelated:
	  case FK_RValueReferenceBindingToLValue:
	  case FK_ReferenceInitDropsQualifiers:
	  case FK_ReferenceInitFailed:
	  case FK_ConversionFailed:
	  case FK_ConversionFromPropertyFailed:
	  case FK_TooManyInitsForScalar:
	  case FK_ParenthesizedListInitForScalar:
	  case FK_ReferenceBindingToInitList:
	  case FK_InitListBadDestinationType:
	  case FK_DefaultInitOfConst:
	  case FK_Incomplete:
	  case FK_ArrayTypeMismatch:
	  case FK_NonConstantArrayInit:
	  case FK_ListInitializationFailed:
	  case FK_VariableLengthArrayHasInitializer:
	  case FK_PlaceholderType:
	  case FK_ExplicitConstructor:
	  case FK_AddressOfUnaddressableFunction:
	    return false;

	  case FK_ReferenceInitOverloadFailed:
	  case FK_UserConversionOverloadFailed:
	  case FK_ConstructorOverloadFailed:
	  case FK_ListConstructorOverloadFailed:
	    return FailedOverloadResult == OR_Ambiguous;
	  }

	  // The switch above is over FailureKind, so name that enum in the message.
	  llvm_unreachable("Invalid FailureKind!");
	}

	/// True when the sequence is non-empty and its final step is
	/// SK_ConstructorInitialization.
	bool InitializationSequence::isConstructorInitialization() const {
	  if (Steps.empty())
	    return false;
	  return Steps.back().Kind == SK_ConstructorInitialization;
	}

	/// Append a step that resolves the address of an overloaded function to
	/// \p Function, recording how it was found and whether there were multiple
	/// candidates. The step's type is the function's type.
	void InitializationSequence::AddAddressOverloadResolutionStep(
	    FunctionDecl *Function, DeclAccessPair Found,
	    bool HadMultipleCandidates) {
	  Step Resolve;
	  Resolve.Kind = SK_ResolveAddressOfOverloadedFunction;
	  Resolve.Type = Function->getType();
	  Resolve.Function.HadMultipleCandidates = HadMultipleCandidates;
	  Resolve.Function.Function = Function;
	  Resolve.Function.FoundDecl = Found;
	  Steps.push_back(Resolve);
	}

	/// Append a derived-to-base cast step to \p BaseType; the step kind is
	/// chosen from the value category \p VK.
	void InitializationSequence::AddDerivedToBaseCastStep(QualType BaseType,
	                                                      ExprValueKind VK) {
	  Step Cast;
	  switch (VK) {
	  case VK_RValue:
	    Cast.Kind = SK_CastDerivedToBaseRValue;
	    break;
	  case VK_XValue:
	    Cast.Kind = SK_CastDerivedToBaseXValue;
	    break;
	  case VK_LValue:
	    Cast.Kind = SK_CastDerivedToBaseLValue;
	    break;
	  }
	  Cast.Type = BaseType;
	  Steps.push_back(Cast);
	}

	/// Append a reference-binding step of type \p T, binding to a temporary
	/// when \p BindingTemporary is set.
	void InitializationSequence::AddReferenceBindingStep(QualType T,
	                                                     bool BindingTemporary) {
	  Step Bind;
	  if (BindingTemporary)
	    Bind.Kind = SK_BindReferenceToTemporary;
	  else
	    Bind.Kind = SK_BindReference;
	  Bind.Type = T;
	  Steps.push_back(Bind);
	}

	/// Append an SK_FinalCopy step of type \p T.
	void InitializationSequence::AddFinalCopy(QualType T) {
	  Step Copy;
	  Copy.Kind = SK_FinalCopy;
	  Copy.Type = T;
	  Steps.push_back(Copy);
	}

	/// Append an SK_ExtraneousCopyToTemporary step of type \p T.
	void InitializationSequence::AddExtraneousCopyToTemporary(QualType T) {
	  Step Copy;
	  Copy.Kind = SK_ExtraneousCopyToTemporary;
	  Copy.Type = T;
	  Steps.push_back(Copy);
	}

	/// Append a user-defined conversion step that calls \p Function and yields
	/// type \p T, recording how the function was found and whether there were
	/// multiple candidates.
	void InitializationSequence::AddUserConversionStep(
	    FunctionDecl *Function, DeclAccessPair FoundDecl, QualType T,
	    bool HadMultipleCandidates) {
	  Step Conversion;
	  Conversion.Kind = SK_UserConversion;
	  Conversion.Type = T;
	  Conversion.Function.HadMultipleCandidates = HadMultipleCandidates;
	  Conversion.Function.Function = Function;
	  Conversion.Function.FoundDecl = FoundDecl;
	  Steps.push_back(Conversion);
	}

	/// Append a qualification-conversion step to \p Ty; the step kind follows
	/// the value category \p VK.
	void InitializationSequence::AddQualificationConversionStep(QualType Ty,
	                                                            ExprValueKind VK) {
	  Step Qual;
	  // Pre-set the kind so it is initialized on every path (works around a gcc
	  // warning); this is also the right value for VK_RValue.
	  Qual.Kind = SK_QualificationConversionRValue;
	  switch (VK) {
	  case VK_RValue:
	    break;
	  case VK_XValue:
	    Qual.Kind = SK_QualificationConversionXValue;
	    break;
	  case VK_LValue:
	    Qual.Kind = SK_QualificationConversionLValue;
	    break;
	  }
	  Qual.Type = Ty;
	  Steps.push_back(Qual);
	}

	/// Append an SK_AtomicConversion step of type \p Ty.
	void InitializationSequence::AddAtomicConversionStep(QualType Ty) {
	  Step Atomic;
	  Atomic.Kind = SK_AtomicConversion;
	  Atomic.Type = Ty;
	  Steps.push_back(Atomic);
	}

	/// Append an SK_LValueToRValue step of type \p Ty, which must carry no
	/// qualifiers.
	void InitializationSequence::AddLValueToRValueStep(QualType Ty) {
	  assert(!Ty.hasQualifiers() && "rvalues may not have qualifiers");

	  Step Load;
	  Load.Kind = SK_LValueToRValue;
	  Load.Type = Ty;
	  Steps.push_back(Load);
	}

	void InitializationSequence::AddConversionSequenceStep(
	const ImplicitConversionSequence &ICS, QualType T,
	bool TopLevelOfInitList) {
	Step S;
	S.Kind = TopLevelOfInitList ? SK_ConversionSequenceNoNarrowing
	: SK_ConversionSequence;
	S.Type = T;
	S.ICS = new ImplicitConversionSequence(ICS);
	Steps.push_back(S);
	}

	/// Append an SK_ListInitialization step of type \p T.
	void InitializationSequence::AddListInitializationStep(QualType T) {
	  Step ListInit;
	  ListInit.Kind = SK_ListInitialization;
	  ListInit.Type = T;
	  Steps.push_back(ListInit);
	}

	/// Append a constructor-call step of type \p T that invokes \p Constructor.
	///
	/// The step kind depends on the flags: without \p FromInitList it is a plain
	/// constructor initialization; with it, \p AsInitList selects between the
	/// std::initializer_list constructor call and construction from the list.
	void InitializationSequence::AddConstructorInitializationStep(
	    DeclAccessPair FoundDecl, CXXConstructorDecl *Constructor, QualType T,
	    bool HadMultipleCandidates, bool FromInitList, bool AsInitList) {
	  Step Construct;
	  if (!FromInitList)
	    Construct.Kind = SK_ConstructorInitialization;
	  else if (AsInitList)
	    Construct.Kind = SK_StdInitializerListConstructorCall;
	  else
	    Construct.Kind = SK_ConstructorInitializationFromList;
	  Construct.Type = T;
	  Construct.Function.HadMultipleCandidates = HadMultipleCandidates;
	  Construct.Function.Function = Constructor;
	  Construct.Function.FoundDecl = FoundDecl;
	  Steps.push_back(Construct);
	}

	/// Append an SK_ZeroInitialization step of type \p T.
	void InitializationSequence::AddZeroInitializationStep(QualType T) {
	  Step ZeroInit;
	  ZeroInit.Kind = SK_ZeroInitialization;
	  ZeroInit.Type = T;
	  Steps.push_back(ZeroInit);
	}

	/// Append an SK_CAssignment step of type \p T.
	void InitializationSequence::AddCAssignmentStep(QualType T) {
	  Step Assign;
	  Assign.Kind = SK_CAssignment;
	  Assign.Type = T;
	  Steps.push_back(Assign);
	}

	/// Append an SK_StringInit step of type \p T.
	void InitializationSequence::AddStringInitStep(QualType T) {
	  Step StringInit;
	  StringInit.Kind = SK_StringInit;
	  StringInit.Type = T;
	  Steps.push_back(StringInit);
	}

	/// Append an SK_ObjCObjectConversion step of type \p T.
	void InitializationSequence::AddObjCObjectConversionStep(QualType T) {
	  Step Convert;
	  Convert.Kind = SK_ObjCObjectConversion;
	  Convert.Type = T;
	  Steps.push_back(Convert);
	}

	/// Append an array-initialization step of type \p T, using the GNU variant
	/// of the step when \p IsGNUExtension is set.
	void InitializationSequence::AddArrayInitStep(QualType T, bool IsGNUExtension) {
	  Step ArrayInit;
	  if (IsGNUExtension)
	    ArrayInit.Kind = SK_GNUArrayInit;
	  else
	    ArrayInit.Kind = SK_ArrayInit;
	  ArrayInit.Type = T;
	  Steps.push_back(ArrayInit);
	}

	/// Wrap the current steps in an array-initialization loop.
	///
	/// An SK_ArrayLoopIndex step of the element type \p EltT is inserted at the
	/// front, and an SK_ArrayLoopInit step of the array type \p T is appended at
	/// the back.
	void InitializationSequence::AddArrayInitLoopStep(QualType T, QualType EltT) {
	  // One Step object is reused for both entries, as in the original code.
	  Step Loop;

	  Loop.Kind = SK_ArrayLoopIndex;
	  Loop.Type = EltT;
	  Steps.insert(Steps.begin(), Loop);

	  Loop.Kind = SK_ArrayLoopInit;
	  Loop.Type = T;
	  Steps.push_back(Loop);
	}

	/// Append an SK_ParenthesizedArrayInit step of type \p T.
	void InitializationSequence::AddParenthesizedArrayInitStep(QualType T) {
	  Step ParenInit;
	  ParenInit.Kind = SK_ParenthesizedArrayInit;
	  ParenInit.Type = T;
	  Steps.push_back(ParenInit);
	}

	/// Append a pass-by-indirect step of the given type; \p shouldCopy selects
	/// the copy-restore variant over the restore-only variant.
	void InitializationSequence::AddPassByIndirectCopyRestoreStep(QualType type,
	                                                              bool shouldCopy) {
	  Step Indirect;
	  if (shouldCopy)
	    Indirect.Kind = SK_PassByIndirectCopyRestore;
	  else
	    Indirect.Kind = SK_PassByIndirectRestore;
	  Indirect.Type = type;
	  Steps.push_back(Indirect);
	}

	/// Append an SK_ProduceObjCObject step of type \p T.
	void InitializationSequence::AddProduceObjCObjectStep(QualType T) {
	  Step Produce;
	  Produce.Kind = SK_ProduceObjCObject;
	  Produce.Type = T;
	  Steps.push_back(Produce);
	}

	/// Append an SK_StdInitializerList step of type \p T.
	void InitializationSequence::AddStdInitializerListConstructionStep(QualType T) {
	  Step InitList;
	  InitList.Kind = SK_StdInitializerList;
	  InitList.Type = T;
	  Steps.push_back(InitList);
	}

	/// Append an SK_OCLSamplerInit step of type \p T.
	void InitializationSequence::AddOCLSamplerInitStep(QualType T) {
	  Step Sampler;
	  Sampler.Kind = SK_OCLSamplerInit;
	  Sampler.Type = T;
	  Steps.push_back(Sampler);
	}

	/// Append an SK_OCLZeroEvent step of type \p T.
	void InitializationSequence::AddOCLZeroEventStep(QualType T) {
	  Step ZeroEvent;
	  ZeroEvent.Kind = SK_OCLZeroEvent;
	  ZeroEvent.Type = T;
	  Steps.push_back(ZeroEvent);
	}

	/// Append an SK_OCLZeroQueue step of type \p T.
	void InitializationSequence::AddOCLZeroQueueStep(QualType T) {
	  Step ZeroQueue;
	  ZeroQueue.Kind = SK_OCLZeroQueue;
	  ZeroQueue.Type = T;
	  Steps.push_back(ZeroQueue);
	}

	/// Bracket the current steps with an unwrap/rewrap pair for a one-element
	/// init list.
	///
	/// An SK_UnwrapInitList step typed as the list's single initializer goes at
	/// the front. An SK_RewrapInitList step of type \p T, which remembers
	/// \p Syntactic, goes at the back.
	void InitializationSequence::RewrapReferenceInitList(QualType T,
	                                                     InitListExpr *Syntactic) {
	  assert(Syntactic->getNumInits() == 1 &&
	         "Can only rewrap trivial init lists.");

	  // One Step object is reused for both entries, as in the original code.
	  Step Wrap;

	  Wrap.Kind = SK_UnwrapInitList;
	  Wrap.Type = Syntactic->getInit(0)->getType();
	  Steps.insert(Steps.begin(), Wrap);

	  Wrap.Kind = SK_RewrapInitList;
	  Wrap.Type = T;
	  Wrap.WrappingSyntacticList = Syntactic;
	  Steps.push_back(Wrap);
	}

	/// Mark the sequence as failed and record both the failure kind and the
	/// overload-resolution result that caused it.
	void InitializationSequence::SetOverloadFailure(FailureKind Failure,
	                                                OverloadingResult Result) {
	  setSequenceKind(FailedSequence);
	  // The parameter shadows the member, so qualify the member explicitly.
	  InitializationSequence::Failure = Failure;
	  FailedOverloadResult = Result;
	}

	//===----------------------------------------------------------------------===//
	// Attempt initialization
	//===----------------------------------------------------------------------===//

	/// Tries to add a zero initializer. Returns true if that worked.
	///
	/// Recovery is only attempted for variable entities whose declaration has no
	/// initializer and does not end inside a macro expansion. If Sema can
	/// provide a zero-initializer fix-it string for the variable's canonical
	/// type, a zero-initialization step and the fix-it are added to \p Sequence.
	static bool
	maybeRecoverWithZeroInitialization(Sema &S, InitializationSequence &Sequence,
	                                   const InitializedEntity &Entity) {
	  if (Entity.getKind() != InitializedEntity::EK_Variable)
	    return false;

	  VarDecl *VD = cast<VarDecl>(Entity.getDecl());
	  if (VD->getInit() || VD->getLocEnd().isMacroID())
	    return false;

	  QualType VariableTy = VD->getType().getCanonicalType();
	  // The fix-it is placed just past the last token of the declaration.
	  SourceLocation Loc = S.getLocForEndOfToken(VD->getLocEnd());
	  std::string Init = S.getFixItZeroInitializerForType(VariableTy, Loc);
	  if (!Init.empty()) {
	    Sequence.AddZeroInitializationStep(Entity.getType());
	    Sequence.SetZeroInitializationFixit(Init, Loc);
	    return true;
	  }
	  return false;
	}

	/// Under ObjC ARC, add a "produce object" step where the entity calls for
	/// one: for a consumed parameter, or for a function result of retainable
	/// type. Does nothing when ARC is off.
	static void MaybeProduceObjCObject(Sema &S,
	                                   InitializationSequence &Sequence,
	                                   const InitializedEntity &Entity) {
	  if (!S.getLangOpts().ObjCAutoRefCount)
	    return;

	  // When initializing a parameter, produce the value if it's marked
	  // __attribute__((ns_consumed)).
	  if (Entity.isParameterKind()) {
	    if (Entity.isParameterConsumed()) {
	      assert(Entity.getType()->isObjCRetainableType() &&
	             "consuming an object of unretainable type?");
	      Sequence.AddProduceObjCObjectStep(Entity.getType());
	    }
	    return;
	  }

	  if (Entity.getKind() != InitializedEntity::EK_Result)
	    return;

	  // When initializing a return value of retainable type, the object must be
	  // retained immediately. Any required autorelease happens at the last
	  // instant.
	  if (Entity.getType()->isObjCRetainableType())
	    Sequence.AddProduceObjCObjectStep(Entity.getType());
	}

	/// Forward declaration: TryInitializerListConstruction below calls this
	/// before its definition is visible.
	static void TryListInitialization(Sema &S,
	const InitializedEntity &Entity,
	const InitializationKind &Kind,
	InitListExpr *InitList,
	InitializationSequence &Sequence,
	bool TreatUnavailableAsInvalid);

	/// \brief Handle list-initialization of a std::initializer_list<T> object.
	///
	/// If \p DestType is a std::initializer_list specialization, its element
	/// type must be complete (otherwise an incomplete-type failure is recorded).
	/// A hidden temporary array of const elements, sized to the list, is then
	/// list-initialized from \p List. If that succeeds, the
	/// std::initializer_list construction step is appended.
	///
	/// \return true if \p DestType was a std::initializer_list (whether or not
	/// initialization succeeded), false otherwise.
	static bool TryInitializerListConstruction(Sema &S,
	                                           InitListExpr *List,
	                                           QualType DestType,
	                                           InitializationSequence &Sequence,
	                                           bool TreatUnavailableAsInvalid) {
	  QualType ElementTy;
	  if (!S.isStdInitializerList(DestType, &ElementTy))
	    return false;

	  if (!S.isCompleteType(List->getExprLoc(), ElementTy)) {
	    Sequence.setIncompleteTypeFailure(ElementTy);
	    return true;
	  }

	  // Try initializing a temporary array from the init list.
	  QualType HiddenArrayTy = S.Context.getConstantArrayType(
	      ElementTy.withConst(),
	      llvm::APInt(S.Context.getTypeSize(S.Context.getSizeType()),
	                  List->getNumInits()),
	      clang::ArrayType::Normal, 0);
	  InitializedEntity HiddenArray =
	      InitializedEntity::InitializeTemporary(HiddenArrayTy);
	  InitializationKind ListKind =
	      InitializationKind::CreateDirectList(List->getExprLoc());
	  TryListInitialization(S, HiddenArray, ListKind, List, Sequence,
	                        TreatUnavailableAsInvalid);

	  // Only add the construction step if the array initialization worked.
	  if (Sequence)
	    Sequence.AddStdInitializerListConstructionStep(DestType);
	  return true;
	}

	/// Check whether a constructor looks like a copy or move constructor of the
	/// class it was found in: it has at least one parameter, and the first
	/// parameter's type, with any reference and qualifiers stripped, is that
	/// class type.
	static bool hasCopyOrMoveCtorParam(ASTContext &Ctx,
	                                   const ConstructorInfo &Info) {
	  const bool HasParams = Info.Constructor->getNumParams() != 0;
	  if (!HasParams)
	    return false;

	  QualType FirstParamTy =
	      Info.Constructor->getParamDecl(0)->getType().getNonReferenceType();
	  QualType OwningClassTy =
	      Ctx.getRecordType(cast<CXXRecordDecl>(Info.FoundDecl->getDeclContext()));

	  return Ctx.hasSameUnqualifiedType(FirstParamTy, OwningClassTy);
	}

	/// Build the candidate set for constructor initialization of \p DestType
	/// and pick the best candidate.
	///
	/// \p CandidateSet is cleared and refilled with the constructors in
	/// \p Ctors, filtered by \p AllowExplicit and \p OnlyListConstructors. In
	/// C++17, with a single argument of complete class type, that type's visible
	/// conversion functions are added too. The overload result is returned and
	/// \p Best is set by BestViableFunction.
	static OverloadingResult
	ResolveConstructorOverload(Sema &S, SourceLocation DeclLoc,
	                           MultiExprArg Args,
	                           OverloadCandidateSet &CandidateSet,
	                           QualType DestType,
	                           DeclContext::lookup_result Ctors,
	                           OverloadCandidateSet::iterator &Best,
	                           bool CopyInitializing, bool AllowExplicit,
	                           bool OnlyListConstructors, bool IsListInit,
	                           bool SecondStepOfCopyInit = false) {
	  CandidateSet.clear(OverloadCandidateSet::CSK_InitByConstructor);

	  for (NamedDecl *D : Ctors) {
	    auto Info = getConstructorInfo(D);
	    if (!Info.Constructor || Info.Constructor->isInvalidDecl())
	      continue;

	    if (!AllowExplicit && Info.Constructor->isExplicit())
	      continue;

	    if (OnlyListConstructors && !S.isInitListConstructor(Info.Constructor))
	      continue;

	    // C++11 [over.best.ics]p4:
	    //   ... and the constructor or user-defined conversion function is a
	    //   candidate by
	    //   - 13.3.1.3, when the argument is the temporary in the second step
	    //     of a class copy-initialization, or
	    //   - 13.3.1.4, 13.3.1.5, or 13.3.1.6 (in all cases), [not handled here]
	    //   - the second phase of 13.3.1.7 when the initializer list has exactly
	    //     one element that is itself an initializer list, and the target is
	    //     the first parameter of a constructor of class X, and the conversion
	    //     is to X or reference to (possibly cv-qualified X),
	    //   user-defined conversion sequences are not considered.
	    bool SuppressUserConversions =
	        SecondStepOfCopyInit ||
	        (IsListInit && Args.size() == 1 && isa<InitListExpr>(Args[0]) &&
	         hasCopyOrMoveCtorParam(S.Context, Info));

	    if (Info.ConstructorTmpl)
	      S.AddTemplateOverloadCandidate(Info.ConstructorTmpl, Info.FoundDecl,
	                                     /*ExplicitArgs*/ nullptr, Args,
	                                     CandidateSet, SuppressUserConversions);
	    else {
	      // C++ [over.match.copy]p1:
	      //   - When initializing a temporary to be bound to the first parameter
	      //     of a constructor [for type T] that takes a reference to possibly
	      //     cv-qualified T as its first argument, called with a single
	      //     argument in the context of direct-initialization, explicit
	      //     conversion functions are also considered.
	      // FIXME: What if a constructor template instantiates to such a signature?
	      bool AllowExplicitConv = AllowExplicit && !CopyInitializing &&
	                               Args.size() == 1 &&
	                               hasCopyOrMoveCtorParam(S.Context, Info);
	      S.AddOverloadCandidate(Info.Constructor, Info.FoundDecl, Args,
	                             CandidateSet, SuppressUserConversions,
	                             /*PartialOverloading=*/false,
	                             /*AllowExplicit=*/AllowExplicitConv);
	    }
	  }

	  // FIXME: Work around a bug in C++17 guaranteed copy elision.
	  //
	  // When initializing an object of class type T by constructor
	  // ([over.match.ctor]) or by list-initialization ([over.match.list])
	  // from a single expression of class type U, conversion functions of
	  // U that convert to the non-reference type cv T are candidates.
	  // Explicit conversion functions are only candidates during
	  // direct-initialization.
	  //
	  // Note: SecondStepOfCopyInit is only ever true in this case when
	  // evaluating whether to produce a C++98 compatibility warning.
	  if (S.getLangOpts().CPlusPlus17 && Args.size() == 1 &&
	      !SecondStepOfCopyInit) {
	    Expr *Initializer = Args[0];
	    auto *SourceRD = Initializer->getType()->getAsCXXRecordDecl();
	    if (SourceRD && S.isCompleteType(DeclLoc, Initializer->getType())) {
	      const auto &Conversions = SourceRD->getVisibleConversionFunctions();
	      for (auto I = Conversions.begin(), E = Conversions.end(); I != E; ++I) {
	        NamedDecl *D = *I;
	        // Read the declaring class before stepping to the underlying
	        // declaration.
	        CXXRecordDecl *ActingDC = cast<CXXRecordDecl>(D->getDeclContext());
	        D = D->getUnderlyingDecl();

	        FunctionTemplateDecl *ConvTemplate = dyn_cast<FunctionTemplateDecl>(D);
	        CXXConversionDecl *Conv;
	        if (ConvTemplate)
	          Conv = cast<CXXConversionDecl>(ConvTemplate->getTemplatedDecl());
	        else
	          Conv = cast<CXXConversionDecl>(D);

	        if ((AllowExplicit && !CopyInitializing) || !Conv->isExplicit()) {
	          if (ConvTemplate)
	            S.AddTemplateConversionCandidate(ConvTemplate, I.getPair(),
	                                             ActingDC, Initializer, DestType,
	                                             CandidateSet, AllowExplicit,
	                                             /*AllowResultConversion*/false);
	          else
	            S.AddConversionCandidate(Conv, I.getPair(), ActingDC, Initializer,
	                                     DestType, CandidateSet, AllowExplicit,
	                                     /*AllowResultConversion*/false);
	        }
	      }
	    }
	  }

	  // Perform overload resolution and return the result.
	  return CandidateSet.BestViableFunction(S, DeclLoc, Best);
	}

	/// \brief Attempt initialization by constructor (C++ [dcl.init]), which
	/// enumerates the constructors of the initialized entity and performs overload
	/// resolution to select the best.
	///
	/// On success, the matching step is appended to \p Sequence. On failure, the
	/// failure kind is recorded in \p Sequence and the candidate set stays
	/// available through it.
	///
	/// \param DestType       The destination class type.
	/// \param DestArrayType  The destination type, which is either DestType or
	///                       a (possibly multidimensional) array of DestType.
	/// \param IsListInit     Is this list-initialization?
	/// \param IsInitListCopy Is this non-list-initialization resulting from a
	///                       list-initialization from {x} where x is the same
	///                       type as the entity?
	static void TryConstructorInitialization(Sema &S,
	                                         const InitializedEntity &Entity,
	                                         const InitializationKind &Kind,
	                                         MultiExprArg Args, QualType DestType,
	                                         QualType DestArrayType,
	                                         InitializationSequence &Sequence,
	                                         bool IsListInit = false,
	                                         bool IsInitListCopy = false) {
	  assert(((!IsListInit && !IsInitListCopy) ||
	          (Args.size() == 1 && isa<InitListExpr>(Args[0]))) &&
	         "IsListInit/IsInitListCopy must come with a single initializer list "
	         "argument.");
	  InitListExpr *ILE =
	      (IsListInit || IsInitListCopy) ? cast<InitListExpr>(Args[0]) : nullptr;
	  MultiExprArg UnwrappedArgs =
	      ILE ? MultiExprArg(ILE->getInits(), ILE->getNumInits()) : Args;

	  // The type we're constructing needs to be complete.
	  if (!S.isCompleteType(Kind.getLocation(), DestType)) {
	    Sequence.setIncompleteTypeFailure(DestType);
	    return;
	  }

	  // C++17 [dcl.init]p17:
	  //     - If the initializer expression is a prvalue and the cv-unqualified
	  //       version of the source type is the same class as the class of the
	  //       destination, the initializer expression is used to initialize the
	  //       destination object.
	  // Per DR (no number yet), this does not apply when initializing a base
	  // class or delegating to another constructor from a mem-initializer.
	  // ObjC++: Lambda captured by the block in the lambda to block conversion
	  // should avoid copy elision.
	  if (S.getLangOpts().CPlusPlus17 &&
	      Entity.getKind() != InitializedEntity::EK_Base &&
	      Entity.getKind() != InitializedEntity::EK_Delegating &&
	      Entity.getKind() !=
	          InitializedEntity::EK_LambdaToBlockConversionBlockElement &&
	      UnwrappedArgs.size() == 1 && UnwrappedArgs[0]->isRValue() &&
	      S.Context.hasSameUnqualifiedType(UnwrappedArgs[0]->getType(), DestType)) {
	    // Convert qualifications if necessary.
	    Sequence.AddQualificationConversionStep(DestType, VK_RValue);
	    if (ILE)
	      Sequence.RewrapReferenceInitList(DestType, ILE);
	    return;
	  }

	  const RecordType *DestRecordType = DestType->getAs<RecordType>();
	  assert(DestRecordType && "Constructor initialization requires record type");
	  CXXRecordDecl *DestRecordDecl
	    = cast<CXXRecordDecl>(DestRecordType->getDecl());

	  // Build the candidate set directly in the initialization sequence
	  // structure, so that it will persist if we fail.
	  OverloadCandidateSet &CandidateSet = Sequence.getFailedCandidateSet();

	  // Determine whether we are allowed to call explicit constructors or
	  // explicit conversion operators.
	  bool AllowExplicit = Kind.AllowExplicit() || IsListInit;
	  bool CopyInitialization = Kind.getKind() == InitializationKind::IK_Copy;

	  //   - Otherwise, if T is a class type, constructors are considered. The
	  //     applicable constructors are enumerated, and the best one is chosen
	  //     through overload resolution.
	  DeclContext::lookup_result Ctors = S.LookupConstructors(DestRecordDecl);

	  OverloadingResult Result = OR_No_Viable_Function;
	  OverloadCandidateSet::iterator Best;
	  bool AsInitializerList = false;

	  // C++11 [over.match.list]p1, per DR1467:
	  //   When objects of non-aggregate type T are list-initialized, such that
	  //   8.5.4 [dcl.init.list] specifies that overload resolution is performed
	  //   according to the rules in this section, overload resolution selects
	  //   the constructor in two phases:
	  //
	  //   - Initially, the candidate functions are the initializer-list
	  //     constructors of the class T and the argument list consists of the
	  //     initializer list as a single argument.
	  if (IsListInit) {
	    AsInitializerList = true;

	    // If the initializer list has no elements and T has a default constructor,
	    // the first phase is omitted.
	    if (!(UnwrappedArgs.empty() && DestRecordDecl->hasDefaultConstructor()))
	      Result = ResolveConstructorOverload(S, Kind.getLocation(), Args,
	                                          CandidateSet, DestType, Ctors, Best,
	                                          CopyInitialization, AllowExplicit,
	                                          /*OnlyListConstructors=*/true,
	                                          IsListInit);
	  }

	  // C++11 [over.match.list]p1:
	  //   - If no viable initializer-list constructor is found, overload resolution
	  //     is performed again, where the candidate functions are all the
	  //     constructors of the class T and the argument list consists of the
	  //     elements of the initializer list.
	  if (Result == OR_No_Viable_Function) {
	    AsInitializerList = false;
	    Result = ResolveConstructorOverload(S, Kind.getLocation(), UnwrappedArgs,
	                                        CandidateSet, DestType, Ctors, Best,
	                                        CopyInitialization, AllowExplicit,
	                                        /*OnlyListConstructors=*/false,
	                                        IsListInit);
	  }
	  // Any non-zero overload result is treated as a failure.
	  if (Result) {
	    Sequence.SetOverloadFailure(IsListInit ?
	                      InitializationSequence::FK_ListConstructorOverloadFailed :
	                      InitializationSequence::FK_ConstructorOverloadFailed,
	                                Result);
	    return;
	  }

	  bool HadMultipleCandidates = (CandidateSet.size() > 1);

	  // In C++17, ResolveConstructorOverload can select a conversion function
	  // instead of a constructor.
	  if (auto *CD = dyn_cast<CXXConversionDecl>(Best->Function)) {
	    // Add the user-defined conversion step that calls the conversion function.
	    QualType ConvType = CD->getConversionType();
	    assert(S.Context.hasSameUnqualifiedType(ConvType, DestType) &&
	           "should not have selected this conversion function");
	    Sequence.AddUserConversionStep(CD, Best->FoundDecl, ConvType,
	                                   HadMultipleCandidates);
	    if (!S.Context.hasSameType(ConvType, DestType))
	      Sequence.AddQualificationConversionStep(DestType, VK_RValue);
	    if (IsListInit)
	      Sequence.RewrapReferenceInitList(Entity.getType(), ILE);
	    return;
	  }

	  // C++11 [dcl.init]p6:
	  //   If a program calls for the default initialization of an object
	  //   of a const-qualified type T, T shall be a class type with a
	  //   user-provided default constructor.
	  // C++ core issue 253 proposal:
	  //   If the implicit default constructor initializes all subobjects, no
	  //   initializer should be required.
	  // The 253 proposal is for example needed to process libstdc++ headers in 5.x.
	  CXXConstructorDecl *CtorDecl = cast<CXXConstructorDecl>(Best->Function);
	  if (Kind.getKind() == InitializationKind::IK_Default &&
	      Entity.getType().isConstQualified()) {
	    if (!CtorDecl->getParent()->allowConstDefaultInit()) {
	      if (!maybeRecoverWithZeroInitialization(S, Sequence, Entity))
	        Sequence.SetFailed(InitializationSequence::FK_DefaultInitOfConst);
	      return;
	    }
	  }

	  // C++11 [over.match.list]p1:
	  //   In copy-list-initialization, if an explicit constructor is chosen, the
	  //   initializer is ill-formed.
	  if (IsListInit && !Kind.AllowExplicit() && CtorDecl->isExplicit()) {
	    Sequence.SetFailed(InitializationSequence::FK_ExplicitConstructor);
	    return;
	  }

	  // Add the constructor initialization step. Any cv-qualification conversion is
	  // subsumed by the initialization.
	  Sequence.AddConstructorInitializationStep(
	      Best->FoundDecl, CtorDecl, DestArrayType, HadMultipleCandidates,
	      IsListInit || IsInitListCopy, AsInitializerList);
	}

	/// Resolve an overloaded-function initializer against the reference's
	/// target type.
	///
	/// If the source type is not the overload placeholder, nothing happens. If
	/// resolution succeeds, a resolution step is added and both source type
	/// references are updated to the selected function's type. If it fails and
	/// the target is not a record type, an address-of-overload failure is
	/// recorded.
	///
	/// \return true only when a failure was recorded in \p Sequence.
	static bool
	ResolveOverloadedFunctionForReferenceBinding(Sema &S,
	                                             Expr *Initializer,
	                                             QualType &SourceType,
	                                             QualType &UnqualifiedSourceType,
	                                             QualType UnqualifiedTargetType,
	                                             InitializationSequence &Sequence) {
	  if (S.Context.getCanonicalType(UnqualifiedSourceType) !=
	      S.Context.OverloadTy)
	    return false;

	  DeclAccessPair FoundPair;
	  bool MultipleCandidates = false;
	  FunctionDecl *Resolved = S.ResolveAddressOfOverloadedFunction(
	      Initializer, UnqualifiedTargetType, false, FoundPair,
	      &MultipleCandidates);

	  if (Resolved) {
	    Sequence.AddAddressOverloadResolutionStep(Resolved, FoundPair,
	                                              MultipleCandidates);
	    SourceType = Resolved->getType();
	    UnqualifiedSourceType = SourceType.getUnqualifiedType();
	    return false;
	  }

	  // Unresolved with a record target is not reported as a failure here.
	  if (UnqualifiedTargetType->isRecordType())
	    return false;

	  Sequence.SetFailed(InitializationSequence::FK_AddressOfOverloadFailed);
	  return true;
	}

	// Forward declaration: the definition appears further down in this file, but
	// TryReferenceListInitialization below needs to call it first.
	static void TryReferenceInitializationCore(Sema &S,
	const InitializedEntity &Entity,
	const InitializationKind &Kind,
	Expr *Initializer,
	QualType cv1T1, QualType T1,
	Qualifiers T1Quals,
	QualType cv2T2, QualType T2,
	Qualifiers T2Quals,
	InitializationSequence &Sequence);

	// Forward declaration: the definition appears further down in this file.
	// InitList defaults to nullptr; TryListInitialization passes the (empty)
	// initializer list when it requests value-initialization.
	static void TryValueInitialization(Sema &S,
	const InitializedEntity &Entity,
	const InitializationKind &Kind,
	InitializationSequence &Sequence,
	InitListExpr *InitList = nullptr);

	/// \brief Attempt list initialization of a reference.
	///
	/// The sequence is failed with FK_ReferenceBindingToInitList when C++11 is
	/// not enabled or when the entity is a compound literal. Otherwise, a
	/// single-element list whose element is at least reference-related to the
	/// referenced type is bound through TryReferenceInitializationCore; any other
	/// list initializes a temporary of the referenced type through
	/// TryListInitialization and the reference is bound to that temporary.
	///
	/// \param S Semantic analysis object used for type queries and lookup.
	/// \param Entity The entity being initialized; its type is used as a
	///        reference type without a null check.
	/// \param Kind The kind of initialization, forwarded to the helpers.
	/// \param InitList The braced initializer list.
	/// \param Sequence Receives the initialization steps or the failure kind.
	/// \param TreatUnavailableAsInvalid Forwarded to TryListInitialization.
	static void TryReferenceListInitialization(Sema &S,
	                                           const InitializedEntity &Entity,
	                                           const InitializationKind &Kind,
	                                           InitListExpr *InitList,
	                                           InitializationSequence &Sequence,
	                                           bool TreatUnavailableAsInvalid) {
	  // First, catch C++03 where this isn't possible.
	  if (!S.getLangOpts().CPlusPlus11) {
	    Sequence.SetFailed(InitializationSequence::FK_ReferenceBindingToInitList);
	    return;
	  }
	  // Can't reference initialize a compound literal.
	  if (Entity.getKind() == InitializedEntity::EK_CompoundLiteralInit) {
	    Sequence.SetFailed(InitializationSequence::FK_ReferenceBindingToInitList);
	    return;
	  }

	  QualType DestType = Entity.getType();
	  QualType cv1T1 = DestType->getAs<ReferenceType>()->getPointeeType();
	  Qualifiers T1Quals;
	  QualType T1 = S.Context.getUnqualifiedArrayType(cv1T1, T1Quals);

	  // Reference initialization via an initializer list works thus:
	  // If the initializer list consists of a single element that is
	  // reference-related to the referenced type, bind directly to that element
	  // (possibly creating temporaries).
	  // Otherwise, initialize a temporary with the initializer list and
	  // bind to that.
	  if (InitList->getNumInits() == 1) {
	    Expr *Initializer = InitList->getInit(0);
	    QualType cv2T2 = Initializer->getType();
	    Qualifiers T2Quals;
	    QualType T2 = S.Context.getUnqualifiedArrayType(cv2T2, T2Quals);

	    // If this fails, creating a temporary wouldn't work either.
	    if (ResolveOverloadedFunctionForReferenceBinding(S, Initializer, cv2T2, T2,
	                                                     T1, Sequence))
	      return;

	    SourceLocation DeclLoc = Initializer->getLocStart();
	    // Only the relationship itself is needed; the flag outputs are ignored.
	    bool dummy1, dummy2, dummy3;
	    Sema::ReferenceCompareResult RefRelationship
	      = S.CompareReferenceRelationship(DeclLoc, cv1T1, cv2T2, dummy1,
	                                       dummy2, dummy3);
	    if (RefRelationship >= Sema::Ref_Related) {
	      // Try to bind the reference here.
	      TryReferenceInitializationCore(S, Entity, Kind, Initializer, cv1T1, T1,
	                                     T1Quals, cv2T2, T2, T2Quals, Sequence);
	      if (Sequence)
	        Sequence.RewrapReferenceInitList(cv1T1, InitList);
	      return;
	    }

	    // Update the initializer if we've resolved an overloaded function.
	    if (Sequence.step_begin() != Sequence.step_end())
	      Sequence.RewrapReferenceInitList(cv1T1, InitList);
	  }

	  // Not reference-related. Create a temporary and bind to that.
	  InitializedEntity TempEntity = InitializedEntity::InitializeTemporary(cv1T1);

	  TryListInitialization(S, TempEntity, Kind, InitList, Sequence,
	                        TreatUnavailableAsInvalid);
	  if (Sequence) {
	    // Only rvalue references and const non-volatile lvalue references may
	    // bind to the temporary.
	    if (DestType->isRValueReferenceType() ||
	        (T1Quals.hasConst() && !T1Quals.hasVolatile()))
	      Sequence.AddReferenceBindingStep(cv1T1, /*bindingTemporary=*/true);
	    else
	      Sequence.SetFailed(
	          InitializationSequence::FK_NonConstLValueReferenceBindingToTemporary);
	  }
	}

	/// \brief Attempt list initialization (C++0x [dcl.init.list])
	///
	/// Works through the cases in the order written below: too many initializers
	/// for a C++ scalar, reference destinations, incomplete record types, the
	/// DR1467 single-element cases (same or derived class type, character array
	/// from a string literal), non-aggregate class types and
	/// std::initializer_list, fixed-underlying-type enumerations in C++17,
	/// single-element lists of record type, and finally checking by
	/// InitListChecker in verify-only mode.
	///
	/// \param S Semantic analysis object used for type queries and lookup.
	/// \param Entity The entity being initialized.
	/// \param Kind The kind of initialization; direct-list-initialization is
	///        turned into direct-initialization for the single-element cases.
	/// \param InitList The braced initializer list.
	/// \param Sequence Receives the initialization steps or the failure kind.
	/// \param TreatUnavailableAsInvalid Forwarded to the helpers invoked here.
	static void TryListInitialization(Sema &S,
	                                  const InitializedEntity &Entity,
	                                  const InitializationKind &Kind,
	                                  InitListExpr *InitList,
	                                  InitializationSequence &Sequence,
	                                  bool TreatUnavailableAsInvalid) {
	  QualType DestType = Entity.getType();

	  // C++ doesn't allow scalar initialization with more than one argument.
	  // But C99 complex numbers are scalars and it makes sense there.
	  if (S.getLangOpts().CPlusPlus && DestType->isScalarType() &&
	      !DestType->isAnyComplexType() && InitList->getNumInits() > 1) {
	    Sequence.SetFailed(InitializationSequence::FK_TooManyInitsForScalar);
	    return;
	  }
	  if (DestType->isReferenceType()) {
	    TryReferenceListInitialization(S, Entity, Kind, InitList, Sequence,
	                                   TreatUnavailableAsInvalid);
	    return;
	  }

	  if (DestType->isRecordType() &&
	      !S.isCompleteType(InitList->getLocStart(), DestType)) {
	    Sequence.setIncompleteTypeFailure(DestType);
	    return;
	  }

	  // C++11 [dcl.init.list]p3, per DR1467:
	  // - If T is a class type and the initializer list has a single element of
	  //   type cv U, where U is T or a class derived from T, the object is
	  //   initialized from that element (by copy-initialization for
	  //   copy-list-initialization, or by direct-initialization for
	  //   direct-list-initialization).
	  // - Otherwise, if T is a character array and the initializer list has a
	  //   single element that is an appropriately-typed string literal
	  //   (8.5.2 [dcl.init.string]), initialization is performed as described
	  //   in that section.
	  // - Otherwise, if T is an aggregate, [...] (continue below).
	  if (S.getLangOpts().CPlusPlus11 && InitList->getNumInits() == 1) {
	    if (DestType->isRecordType()) {
	      QualType InitType = InitList->getInit(0)->getType();
	      if (S.Context.hasSameUnqualifiedType(InitType, DestType) ||
	          S.IsDerivedFrom(InitList->getLocStart(), InitType, DestType)) {
	        Expr *InitListAsExpr = InitList;
	        TryConstructorInitialization(S, Entity, Kind, InitListAsExpr, DestType,
	                                     DestType, Sequence,
	                                     /*InitListSyntax*/false,
	                                     /*IsInitListCopy*/true);
	        return;
	      }
	    }
	    if (const ArrayType *DestAT = S.Context.getAsArrayType(DestType)) {
	      Expr *SubInit[1] = {InitList->getInit(0)};
	      if (!isa<VariableArrayType>(DestAT) &&
	          IsStringInit(SubInit[0], DestAT, S.Context) == SIF_None) {
	        InitializationKind SubKind =
	            Kind.getKind() == InitializationKind::IK_DirectList
	                ? InitializationKind::CreateDirect(Kind.getLocation(),
	                                                   InitList->getLBraceLoc(),
	                                                   InitList->getRBraceLoc())
	                : Kind;
	        Sequence.InitializeFrom(S, Entity, SubKind, SubInit,
	                                /*TopLevelOfInitList*/ true,
	                                TreatUnavailableAsInvalid);

	        // TryStringLiteralInitialization() (in InitializeFrom()) will fail if
	        // the element is not an appropriately-typed string literal, in which
	        // case we should proceed as in C++11 (below).
	        if (Sequence) {
	          Sequence.RewrapReferenceInitList(Entity.getType(), InitList);
	          return;
	        }
	      }
	    }
	  }

	  // C++11 [dcl.init.list]p3:
	  //   - If T is an aggregate, aggregate initialization is performed.
	  if ((DestType->isRecordType() && !DestType->isAggregateType()) ||
	      (S.getLangOpts().CPlusPlus11 &&
	       S.isStdInitializerList(DestType, nullptr))) {
	    if (S.getLangOpts().CPlusPlus11) {
	      //   - Otherwise, if the initializer list has no elements and T is a
	      //     class type with a default constructor, the object is
	      //     value-initialized.
	      if (InitList->getNumInits() == 0) {
	        CXXRecordDecl *RD = DestType->getAsCXXRecordDecl();
	        if (RD->hasDefaultConstructor()) {
	          TryValueInitialization(S, Entity, Kind, Sequence, InitList);
	          return;
	        }
	      }

	      //   - Otherwise, if T is a specialization of std::initializer_list<E>,
	      //     an initializer_list object constructed [...]
	      if (TryInitializerListConstruction(S, InitList, DestType, Sequence,
	                                         TreatUnavailableAsInvalid))
	        return;

	      //   - Otherwise, if T is a class type, constructors are considered.
	      Expr *InitListAsExpr = InitList;
	      TryConstructorInitialization(S, Entity, Kind, InitListAsExpr, DestType,
	                                   DestType, Sequence, /*InitListSyntax*/true);
	    } else
	      Sequence.SetFailed(InitializationSequence::FK_InitListBadDestinationType);
	    return;
	  }

	  if (S.getLangOpts().CPlusPlus && !DestType->isAggregateType() &&
	      InitList->getNumInits() == 1) {
	    Expr *E = InitList->getInit(0);

	    //   - Otherwise, if T is an enumeration with a fixed underlying type,
	    //     the initializer-list has a single element v, and the initialization
	    //     is direct-list-initialization, the object is initialized with the
	    //     value T(v); if a narrowing conversion is required to convert v to
	    //     the underlying type of T, the program is ill-formed.
	    auto *ET = DestType->getAs<EnumType>();
	    if (S.getLangOpts().CPlusPlus17 &&
	        Kind.getKind() == InitializationKind::IK_DirectList &&
	        ET && ET->getDecl()->isFixed() &&
	        !S.Context.hasSameUnqualifiedType(E->getType(), DestType) &&
	        (E->getType()->isIntegralOrEnumerationType() ||
	         E->getType()->isFloatingType())) {
	      // There are two ways that T(v) can work when T is an enumeration type.
	      // If there is either an implicit conversion sequence from v to T or
	      // a conversion function that can convert from v to T, then we use that.
	      // Otherwise, if v is of integral, enumeration, or floating-point type,
	      // it is converted to the enumeration type via its underlying type.
	      // There is no overlap possible between these two cases (except when the
	      // source value is already of the destination type), and the first
	      // case is handled by the general case for single-element lists below.
	      ImplicitConversionSequence ICS;
	      ICS.setStandard();
	      ICS.Standard.setAsIdentityConversion();
	      if (!E->isRValue())
	        ICS.Standard.First = ICK_Lvalue_To_Rvalue;
	      // If E is of a floating-point type, then the conversion is ill-formed
	      // due to narrowing, but go through the motions in order to produce the
	      // right diagnostic.
	      ICS.Standard.Second = E->getType()->isFloatingType()
	                                ? ICK_Floating_Integral
	                                : ICK_Integral_Conversion;
	      ICS.Standard.setFromType(E->getType());
	      ICS.Standard.setToType(0, E->getType());
	      ICS.Standard.setToType(1, DestType);
	      ICS.Standard.setToType(2, DestType);
	      Sequence.AddConversionSequenceStep(ICS, ICS.Standard.getToType(2),
	                                         /*TopLevelOfInitList*/true);
	      Sequence.RewrapReferenceInitList(Entity.getType(), InitList);
	      return;
	    }

	    //   - Otherwise, if the initializer list has a single element of type E
	    //     [...references are handled above...], the object or reference is
	    //     initialized from that element (by copy-initialization for
	    //     copy-list-initialization, or by direct-initialization for
	    //     direct-list-initialization); if a narrowing conversion is required
	    //     to convert the element to T, the program is ill-formed.
	    //
	    // Per core-24034, this is direct-initialization if we were performing
	    // direct-list-initialization and copy-initialization otherwise.
	    // We can't use InitListChecker for this, because it always performs
	    // copy-initialization. This only matters if we might use an 'explicit'
	    // conversion operator, so we only need to handle the cases where the source
	    // is of record type.
	    if (InitList->getInit(0)->getType()->isRecordType()) {
	      InitializationKind SubKind =
	          Kind.getKind() == InitializationKind::IK_DirectList
	              ? InitializationKind::CreateDirect(Kind.getLocation(),
	                                                 InitList->getLBraceLoc(),
	                                                 InitList->getRBraceLoc())
	              : Kind;
	      Expr *SubInit[1] = { InitList->getInit(0) };
	      Sequence.InitializeFrom(S, Entity, SubKind, SubInit,
	                              /*TopLevelOfInitList*/true,
	                              TreatUnavailableAsInvalid);
	      if (Sequence)
	        Sequence.RewrapReferenceInitList(Entity.getType(), InitList);
	      return;
	    }
	  }

	  // Everything else is validated by the list checker in verify-only mode.
	  InitListChecker CheckInitList(S, Entity, InitList,
	          DestType, /*VerifyOnly=*/true, TreatUnavailableAsInvalid);
	  if (CheckInitList.HadError()) {
	    Sequence.SetFailed(InitializationSequence::FK_ListInitializationFailed);
	    return;
	  }

	  // Add the list initialization step with the built init list.
	  Sequence.AddListInitializationStep(DestType);
	}

	/// \brief Try a reference initialization that involves calling a conversion
	/// function.
	///
	/// Candidates are collected into the failed-candidate set owned by
	/// \p Sequence: converting constructors of the referenced type T1 (only when
	/// \p AllowRValues is set and T1 is a complete record type) and visible
	/// conversion functions of the initializer's type T2 (when T2 is a complete
	/// record type). Overload resolution then selects one, and the matching
	/// user-conversion, qualification, reference-binding and cast steps are
	/// appended to \p Sequence.
	///
	/// \param S Semantic analysis object used for lookup and overload resolution.
	/// \param Entity The reference entity being initialized.
	/// \param Kind Determines whether explicit constructors and explicit
	///        conversion functions may be used.
	/// \param Initializer The initializer expression.
	/// \param AllowRValues If false, constructors are not considered and only
	///        conversion functions returning an lvalue reference are candidates.
	/// \param IsLValueRef Selects the value kind (lvalue vs. xvalue) used for the
	///        trailing derived-to-base cast step.
	/// \param Sequence Receives the candidate set and the initialization steps.
	///
	/// \returns OR_No_Viable_Function if either record declaration is invalid,
	/// the failing result of overload resolution if it does not succeed, and
	/// OR_Success otherwise.
	static OverloadingResult TryRefInitWithConversionFunction(
	    Sema &S, const InitializedEntity &Entity, const InitializationKind &Kind,
	    Expr *Initializer, bool AllowRValues, bool IsLValueRef,
	    InitializationSequence &Sequence) {
	  QualType DestType = Entity.getType();
	  QualType cv1T1 = DestType->getAs<ReferenceType>()->getPointeeType();
	  QualType T1 = cv1T1.getUnqualifiedType();
	  QualType cv2T2 = Initializer->getType();
	  QualType T2 = cv2T2.getUnqualifiedType();

	  bool DerivedToBase;
	  bool ObjCConversion;
	  bool ObjCLifetimeConversion;
	  assert(!S.CompareReferenceRelationship(Initializer->getLocStart(),
	                                         T1, T2, DerivedToBase,
	                                         ObjCConversion,
	                                         ObjCLifetimeConversion) &&
	         "Must have incompatible references when binding via conversion");
	  // The flags are only consumed by the assert above; silence unused-variable
	  // warnings when asserts are compiled out.
	  (void)DerivedToBase;
	  (void)ObjCConversion;
	  (void)ObjCLifetimeConversion;

	  // Build the candidate set directly in the initialization sequence
	  // structure, so that it will persist if we fail.
	  OverloadCandidateSet &CandidateSet = Sequence.getFailedCandidateSet();
	  CandidateSet.clear(OverloadCandidateSet::CSK_InitByUserDefinedConversion);

	  // Determine whether we are allowed to call explicit constructors or
	  // explicit conversion operators.
	  bool AllowExplicit = Kind.AllowExplicit();
	  bool AllowExplicitConvs = Kind.allowExplicitConversionFunctionsInRefBinding();

	  const RecordType *T1RecordType = nullptr;
	  if (AllowRValues && (T1RecordType = T1->getAs<RecordType>()) &&
	      S.isCompleteType(Kind.getLocation(), T1)) {
	    // The type we're converting to is a class type. Enumerate its constructors
	    // to see if there is a suitable conversion.
	    CXXRecordDecl *T1RecordDecl = cast<CXXRecordDecl>(T1RecordType->getDecl());

	    for (NamedDecl *D : S.LookupConstructors(T1RecordDecl)) {
	      auto Info = getConstructorInfo(D);
	      if (!Info.Constructor)
	        continue;

	      if (!Info.Constructor->isInvalidDecl() &&
	          Info.Constructor->isConvertingConstructor(AllowExplicit)) {
	        if (Info.ConstructorTmpl)
	          S.AddTemplateOverloadCandidate(Info.ConstructorTmpl, Info.FoundDecl,
	                                         /*ExplicitArgs*/ nullptr,
	                                         Initializer, CandidateSet,
	                                         /*SuppressUserConversions=*/true);
	        else
	          S.AddOverloadCandidate(Info.Constructor, Info.FoundDecl,
	                                 Initializer, CandidateSet,
	                                 /*SuppressUserConversions=*/true);
	      }
	    }
	  }
	  if (T1RecordType && T1RecordType->getDecl()->isInvalidDecl())
	    return OR_No_Viable_Function;

	  const RecordType *T2RecordType = nullptr;
	  if ((T2RecordType = T2->getAs<RecordType>()) &&
	      S.isCompleteType(Kind.getLocation(), T2)) {
	    // The type we're converting from is a class type, enumerate its conversion
	    // functions.
	    CXXRecordDecl *T2RecordDecl = cast<CXXRecordDecl>(T2RecordType->getDecl());

	    const auto &Conversions = T2RecordDecl->getVisibleConversionFunctions();
	    for (auto I = Conversions.begin(), E = Conversions.end(); I != E; ++I) {
	      NamedDecl *D = *I;
	      CXXRecordDecl *ActingDC = cast<CXXRecordDecl>(D->getDeclContext());
	      // Look through a using-shadow declaration to the real conversion.
	      if (isa<UsingShadowDecl>(D))
	        D = cast<UsingShadowDecl>(D)->getTargetDecl();

	      FunctionTemplateDecl *ConvTemplate = dyn_cast<FunctionTemplateDecl>(D);
	      CXXConversionDecl *Conv;
	      if (ConvTemplate)
	        Conv = cast<CXXConversionDecl>(ConvTemplate->getTemplatedDecl());
	      else
	        Conv = cast<CXXConversionDecl>(D);

	      // If the conversion function doesn't return a reference type,
	      // it can't be considered for this conversion unless we're allowed to
	      // consider rvalues.
	      // FIXME: Do we need to make sure that we only consider conversion
	      // candidates with reference-compatible results? That might be needed to
	      // break recursion.
	      if ((AllowExplicitConvs || !Conv->isExplicit()) &&
	          (AllowRValues || Conv->getConversionType()->isLValueReferenceType())){
	        if (ConvTemplate)
	          S.AddTemplateConversionCandidate(ConvTemplate, I.getPair(),
	                                           ActingDC, Initializer,
	                                           DestType, CandidateSet,
	                                           /*AllowObjCConversionOnExplicit=*/
	                                             false);
	        else
	          S.AddConversionCandidate(Conv, I.getPair(), ActingDC,
	                                   Initializer, DestType, CandidateSet,
	                                   /*AllowObjCConversionOnExplicit=*/false);
	      }
	    }
	  }
	  if (T2RecordType && T2RecordType->getDecl()->isInvalidDecl())
	    return OR_No_Viable_Function;

	  SourceLocation DeclLoc = Initializer->getLocStart();

	  // Perform overload resolution. If it fails, return the failed result.
	  OverloadCandidateSet::iterator Best;
	  if (OverloadingResult Result
	        = CandidateSet.BestViableFunction(S, DeclLoc, Best))
	    return Result;

	  FunctionDecl *Function = Best->Function;
	  // This is the overload that will be used for this initialization step if we
	  // use this initialization. Mark it as referenced.
	  Function->setReferenced();

	  // Compute the returned type and value kind of the conversion.
	  QualType cv3T3;
	  if (isa<CXXConversionDecl>(Function))
	    cv3T3 = Function->getReturnType();
	  else
	    cv3T3 = T1;

	  ExprValueKind VK = VK_RValue;
	  if (cv3T3->isLValueReferenceType())
	    VK = VK_LValue;
	  else if (const auto *RRef = cv3T3->getAs<RValueReferenceType>())
	    VK = RRef->getPointeeType()->isFunctionType() ? VK_LValue : VK_XValue;
	  cv3T3 = cv3T3.getNonLValueExprType(S.Context);

	  // Add the user-defined conversion step.
	  bool HadMultipleCandidates = (CandidateSet.size() > 1);
	  Sequence.AddUserConversionStep(Function, Best->FoundDecl, cv3T3,
	                                 HadMultipleCandidates);

	  // Determine whether we'll need to perform derived-to-base adjustments or
	  // other conversions.
	  bool NewDerivedToBase = false;
	  bool NewObjCConversion = false;
	  bool NewObjCLifetimeConversion = false;
	  Sema::ReferenceCompareResult NewRefRelationship
	    = S.CompareReferenceRelationship(DeclLoc, T1, cv3T3,
	                                     NewDerivedToBase, NewObjCConversion,
	                                     NewObjCLifetimeConversion);

	  // Add the final conversion sequence, if necessary.
	  if (NewRefRelationship == Sema::Ref_Incompatible) {
	    assert(!isa<CXXConstructorDecl>(Function) &&
	           "should not have conversion after constructor");

	    ImplicitConversionSequence ICS;
	    ICS.setStandard();
	    ICS.Standard = Best->FinalConversion;
	    Sequence.AddConversionSequenceStep(ICS, ICS.Standard.getToType(2));

	    // Every implicit conversion results in a prvalue, except for a glvalue
	    // derived-to-base conversion, which we handle below.
	    cv3T3 = ICS.Standard.getToType(2);
	    VK = VK_RValue;
	  }

	  //   If the converted initializer is a prvalue, its type T4 is adjusted to
	  //   type "cv1 T4" and the temporary materialization conversion is applied.
	  //
	  // We adjust the cv-qualifications to match the reference regardless of
	  // whether we have a prvalue so that the AST records the change. In this
	  // case, T4 is "cv3 T3".
	  QualType cv1T4 = S.Context.getQualifiedType(cv3T3, cv1T1.getQualifiers());
	  if (cv1T4.getQualifiers() != cv3T3.getQualifiers())
	    Sequence.AddQualificationConversionStep(cv1T4, VK);
	  Sequence.AddReferenceBindingStep(cv1T4, VK == VK_RValue);
	  VK = IsLValueRef ? VK_LValue : VK_XValue;

	  if (NewDerivedToBase)
	    Sequence.AddDerivedToBaseCastStep(cv1T1, VK);
	  else if (NewObjCConversion)
	    Sequence.AddObjCObjectConversionStep(cv1T1);

	  return OR_Success;
	}

	// Forward declaration: called from TryReferenceInitializationCore below when
	// a class prvalue is bound directly to a reference in C++11 mode.
	static void CheckCXX98CompatAccessibleCopy(Sema &S,
	const InitializedEntity &Entity,
	Expr *CurInitExpr);

	/// \brief Attempt reference initialization (C++0x [dcl.init.ref])
	///
	/// Splits the referenced type and the initializer's type into their
	/// unqualified forms and qualifiers, resolves an overloaded-function
	/// initializer if there is one, and hands the rest of the work to
	/// TryReferenceInitializationCore.
	static void TryReferenceInitialization(Sema &S,
	                                       const InitializedEntity &Entity,
	                                       const InitializationKind &Kind,
	                                       Expr *Initializer,
	                                       InitializationSequence &Sequence) {
	  // Referenced type ("cv1 T1"), split into unqualified type and qualifiers.
	  QualType ReferentType =
	      Entity.getType()->getAs<ReferenceType>()->getPointeeType();
	  Qualifiers ReferentQuals;
	  QualType ReferentUnqual =
	      S.Context.getUnqualifiedArrayType(ReferentType, ReferentQuals);

	  // Initializer type ("cv2 T2"), split the same way.
	  QualType InitType = Initializer->getType();
	  Qualifiers InitQuals;
	  QualType InitUnqual = S.Context.getUnqualifiedArrayType(InitType, InitQuals);

	  // If the initializer is the address of an overloaded function, try
	  // to resolve the overloaded function. On success InitType and InitUnqual
	  // are updated to the type of the resulting function.
	  const bool ResolutionFailed = ResolveOverloadedFunctionForReferenceBinding(
	      S, Initializer, InitType, InitUnqual, ReferentUnqual, Sequence);
	  if (ResolutionFailed)
	    return;

	  // Delegate everything else to a subfunction.
	  TryReferenceInitializationCore(S, Entity, Kind, Initializer, ReferentType,
	                                 ReferentUnqual, ReferentQuals, InitType,
	                                 InitUnqual, InitQuals, Sequence);
	}

	/// Determine whether an expression is a non-referenceable glvalue (one to
	/// which a reference can never bind). Attempting to bind a reference to
	/// such a glvalue will always create a temporary.
	///
	/// \returns true if \p E refers to a bit-field or to a vector element.
	static bool isNonReferenceableGLValue(Expr *E) {
	  if (E->refersToBitField())
	    return true;
	  return E->refersToVectorElement();
	}

	/// \brief Reference initialization without resolving overloaded functions.
	///
	/// Walks the bullets of [dcl.init.ref]p5 in order: direct binding of an
	/// lvalue, binding through a conversion function, rejection of non-const
	/// lvalue references that would need a temporary, direct binding of
	/// xvalues/prvalues, and finally creation of a temporary initialized by an
	/// implicit conversion. The steps found, or the failure kind, are recorded in
	/// \p Sequence.
	///
	/// \param S Semantic analysis object used for type queries and conversions.
	/// \param Entity The reference entity being initialized.
	/// \param Kind The kind of initialization; C-style and functional casts relax
	///        the reference-compatibility requirement to reference-related.
	/// \param Initializer The initializer expression.
	/// \param cv1T1 The referenced type including qualifiers.
	/// \param T1 The referenced type without qualifiers.
	/// \param T1Quals The qualifiers of the referenced type.
	/// \param cv2T2 The initializer's type including qualifiers.
	/// \param T2 The initializer's type without qualifiers.
	/// \param T2Quals The qualifiers of the initializer's type.
	/// \param Sequence Receives the initialization steps or the failure kind.
	static void TryReferenceInitializationCore(Sema &S,
	                                           const InitializedEntity &Entity,
	                                           const InitializationKind &Kind,
	                                           Expr *Initializer,
	                                           QualType cv1T1, QualType T1,
	                                           Qualifiers T1Quals,
	                                           QualType cv2T2, QualType T2,
	                                           Qualifiers T2Quals,
	                                           InitializationSequence &Sequence) {
	  QualType DestType = Entity.getType();
	  SourceLocation DeclLoc = Initializer->getLocStart();
	  // Compute some basic properties of the types and the initializer.
	  bool isLValueRef = DestType->isLValueReferenceType();
	  bool isRValueRef = !isLValueRef;
	  bool DerivedToBase = false;
	  bool ObjCConversion = false;
	  bool ObjCLifetimeConversion = false;
	  Expr::Classification InitCategory = Initializer->Classify(S.Context);
	  Sema::ReferenceCompareResult RefRelationship
	    = S.CompareReferenceRelationship(DeclLoc, cv1T1, cv2T2, DerivedToBase,
	                                     ObjCConversion, ObjCLifetimeConversion);

	  // C++0x [dcl.init.ref]p5:
	  //   A reference to type "cv1 T1" is initialized by an expression of type
	  //   "cv2 T2" as follows:
	  //
	  //     - If the reference is an lvalue reference and the initializer
	  //       expression
	  // Note the analogous bullet points for rvalue refs to functions. Because
	  // there are no function rvalues in C++, rvalue refs to functions are treated
	  // like lvalue refs.
	  OverloadingResult ConvOvlResult = OR_Success;
	  bool T1Function = T1->isFunctionType();
	  if (isLValueRef || T1Function) {
	    if (InitCategory.isLValue() && !isNonReferenceableGLValue(Initializer) &&
	        (RefRelationship == Sema::Ref_Compatible ||
	         (Kind.isCStyleOrFunctionalCast() &&
	          RefRelationship == Sema::Ref_Related))) {
	      //   - is an lvalue (but is not a bit-field), and "cv1 T1" is
	      //     reference-compatible with "cv2 T2," or
	      if (T1Quals != T2Quals)
	        // Convert to cv1 T2. This should only add qualifiers unless this is a
	        // c-style cast. The removal of qualifiers in that case notionally
	        // happens after the reference binding, but that doesn't matter.
	        Sequence.AddQualificationConversionStep(
	            S.Context.getQualifiedType(T2, T1Quals),
	            Initializer->getValueKind());
	      if (DerivedToBase)
	        Sequence.AddDerivedToBaseCastStep(cv1T1, VK_LValue);
	      else if (ObjCConversion)
	        Sequence.AddObjCObjectConversionStep(cv1T1);

	      // We only create a temporary here when binding a reference to a
	      // bit-field or vector element. Those cases aren't supposed to be
	      // handled by this bullet, but the outcome is the same either way.
	      Sequence.AddReferenceBindingStep(cv1T1, false);
	      return;
	    }

	    //     - has a class type (i.e., T2 is a class type), where T1 is not
	    //       reference-related to T2, and can be implicitly converted to an
	    //       lvalue of type "cv3 T3," where "cv1 T1" is reference-compatible
	    //       with "cv3 T3" (this conversion is selected by enumerating the
	    //       applicable conversion functions (13.3.1.6) and choosing the best
	    //       one through overload resolution (13.3)),
	    // If we have an rvalue ref to function type here, the rhs must be
	    // an rvalue. DR1287 removed the "implicitly" here.
	    if (RefRelationship == Sema::Ref_Incompatible && T2->isRecordType() &&
	        (isLValueRef || InitCategory.isRValue())) {
	      ConvOvlResult = TryRefInitWithConversionFunction(
	          S, Entity, Kind, Initializer, /*AllowRValues*/ isRValueRef,
	          /*IsLValueRef*/ isLValueRef, Sequence);
	      if (ConvOvlResult == OR_Success)
	        return;
	      if (ConvOvlResult != OR_No_Viable_Function)
	        Sequence.SetOverloadFailure(
	            InitializationSequence::FK_ReferenceInitOverloadFailed,
	            ConvOvlResult);
	    }
	  }

	  //     - Otherwise, the reference shall be an lvalue reference to a
	  //       non-volatile const type (i.e., cv1 shall be const), or the reference
	  //       shall be an rvalue reference.
	  if (isLValueRef && !(T1Quals.hasConst() && !T1Quals.hasVolatile())) {
	    if (S.Context.getCanonicalType(T2) == S.Context.OverloadTy)
	      Sequence.SetFailed(InitializationSequence::FK_AddressOfOverloadFailed);
	    else if (ConvOvlResult && !Sequence.getFailedCandidateSet().empty())
	      Sequence.SetOverloadFailure(
	          InitializationSequence::FK_ReferenceInitOverloadFailed,
	          ConvOvlResult);
	    else if (!InitCategory.isLValue())
	      Sequence.SetFailed(
	          InitializationSequence::FK_NonConstLValueReferenceBindingToTemporary);
	    else {
	      // An lvalue initializer reached this point; pick the failure kind that
	      // explains why the direct-binding bullet above did not apply.
	      InitializationSequence::FailureKind FK;
	      switch (RefRelationship) {
	      case Sema::Ref_Compatible:
	        if (Initializer->refersToBitField())
	          FK = InitializationSequence::
	              FK_NonConstLValueReferenceBindingToBitfield;
	        else if (Initializer->refersToVectorElement())
	          FK = InitializationSequence::
	              FK_NonConstLValueReferenceBindingToVectorElement;
	        else
	          llvm_unreachable("unexpected kind of compatible initializer");
	        break;
	      case Sema::Ref_Related:
	        FK = InitializationSequence::FK_ReferenceInitDropsQualifiers;
	        break;
	      case Sema::Ref_Incompatible:
	        FK = InitializationSequence::
	            FK_NonConstLValueReferenceBindingToUnrelated;
	        break;
	      }
	      Sequence.SetFailed(FK);
	    }
	    return;
	  }

	  //    - If the initializer expression
	  //      - is an
	  // [<=14] xvalue (but not a bit-field), class prvalue, array prvalue, or
	  // [1z]   rvalue (but not a bit-field) or
	  //        function lvalue and "cv1 T1" is reference-compatible with "cv2 T2"
	  //
	  // Note: functions are handled above and below rather than here...
	  if (!T1Function &&
	      (RefRelationship == Sema::Ref_Compatible ||
	       (Kind.isCStyleOrFunctionalCast() &&
	        RefRelationship == Sema::Ref_Related)) &&
	      ((InitCategory.isXValue() && !isNonReferenceableGLValue(Initializer)) ||
	       (InitCategory.isPRValue() &&
	        (S.getLangOpts().CPlusPlus17 || T2->isRecordType() ||
	         T2->isArrayType())))) {
	    ExprValueKind ValueKind = InitCategory.isXValue() ? VK_XValue : VK_RValue;
	    if (InitCategory.isPRValue() && T2->isRecordType()) {
	      // The corresponding bullet in C++03 [dcl.init.ref]p5 gives the
	      // compiler the freedom to perform a copy here or bind to the
	      // object, while C++0x requires that we bind directly to the
	      // object. Hence, we always bind to the object without making an
	      // extra copy. However, C++03 requires that we check for the
	      // presence of a suitable copy constructor:
	      //
	      //   The constructor that would be used to make the copy shall
	      //   be callable whether or not the copy is actually done.
	      if (!S.getLangOpts().CPlusPlus11 && !S.getLangOpts().MicrosoftExt)
	        Sequence.AddExtraneousCopyToTemporary(cv2T2);
	      else if (S.getLangOpts().CPlusPlus11)
	        CheckCXX98CompatAccessibleCopy(S, Entity, Initializer);
	    }

	    // C++1z [dcl.init.ref]/5.2.1.2:
	    //   If the converted initializer is a prvalue, its type T4 is adjusted
	    //   to type "cv1 T4" and the temporary materialization conversion is
	    //   applied.
	    QualType cv1T4 = S.Context.getQualifiedType(cv2T2, T1Quals);
	    if (T1Quals != T2Quals)
	      Sequence.AddQualificationConversionStep(cv1T4, ValueKind);
	    Sequence.AddReferenceBindingStep(cv1T4, ValueKind == VK_RValue);
	    ValueKind = isLValueRef ? VK_LValue : VK_XValue;

	    //   In any case, the reference is bound to the resulting glvalue (or to
	    //   an appropriate base class subobject).
	    if (DerivedToBase)
	      Sequence.AddDerivedToBaseCastStep(cv1T1, ValueKind);
	    else if (ObjCConversion)
	      Sequence.AddObjCObjectConversionStep(cv1T1);
	    return;
	  }

	  //       - has a class type (i.e., T2 is a class type), where T1 is not
	  //         reference-related to T2, and can be implicitly converted to an
	  //         xvalue, class prvalue, or function lvalue of type "cv3 T3",
	  //         where "cv1 T1" is reference-compatible with "cv3 T3",
	  //
	  // DR1287 removes the "implicitly" here.
	  if (T2->isRecordType()) {
	    if (RefRelationship == Sema::Ref_Incompatible) {
	      ConvOvlResult = TryRefInitWithConversionFunction(
	          S, Entity, Kind, Initializer, /*AllowRValues*/ true,
	          /*IsLValueRef*/ isLValueRef, Sequence);
	      if (ConvOvlResult)
	        Sequence.SetOverloadFailure(
	            InitializationSequence::FK_ReferenceInitOverloadFailed,
	            ConvOvlResult);

	      return;
	    }

	    if (RefRelationship == Sema::Ref_Compatible &&
	        isRValueRef && InitCategory.isLValue()) {
	      Sequence.SetFailed(
	        InitializationSequence::FK_RValueReferenceBindingToLValue);
	      return;
	    }

	    Sequence.SetFailed(InitializationSequence::FK_ReferenceInitDropsQualifiers);
	    return;
	  }

	  //      - Otherwise, a temporary of type "cv1 T1" is created and initialized
	  //        from the initializer expression using the rules for a non-reference
	  //        copy-initialization (8.5). The reference is then bound to the
	  //        temporary. [...]

	  InitializedEntity TempEntity = InitializedEntity::InitializeTemporary(cv1T1);

	  // FIXME: Why do we use an implicit conversion here rather than trying
	  // copy-initialization?
	  ImplicitConversionSequence ICS
	    = S.TryImplicitConversion(Initializer, TempEntity.getType(),
	                              /*SuppressUserConversions=*/false,
	                              /*AllowExplicit=*/false,
	                              /*FIXME:InOverloadResolution=*/false,
	                              /*CStyle=*/Kind.isCStyleOrFunctionalCast(),
	                              /*AllowObjCWritebackConversion=*/false);

	  if (ICS.isBad()) {
	    // FIXME: Use the conversion function set stored in ICS to turn
	    // this into an overloading ambiguity diagnostic. However, we need
	    // to keep that set as an OverloadCandidateSet rather than as some
	    // other kind of set.
	    if (ConvOvlResult && !Sequence.getFailedCandidateSet().empty())
	      Sequence.SetOverloadFailure(
	          InitializationSequence::FK_ReferenceInitOverloadFailed,
	          ConvOvlResult);
	    else if (S.Context.getCanonicalType(T2) == S.Context.OverloadTy)
	      Sequence.SetFailed(InitializationSequence::FK_AddressOfOverloadFailed);
	    else
	      Sequence.SetFailed(InitializationSequence::FK_ReferenceInitFailed);
	    return;
	  } else {
	    Sequence.AddConversionSequenceStep(ICS, TempEntity.getType());
	  }

	  //        [...] If T1 is reference-related to T2, cv1 must be the
	  //        same cv-qualification as, or greater cv-qualification
	  //        than, cv2; otherwise, the program is ill-formed.
	  unsigned T1CVRQuals = T1Quals.getCVRQualifiers();
	  unsigned T2CVRQuals = T2Quals.getCVRQualifiers();
	  // T1's qualifiers must be a superset of T2's.
	  if (RefRelationship == Sema::Ref_Related &&
	      (T1CVRQuals | T2CVRQuals) != T1CVRQuals) {
	    Sequence.SetFailed(InitializationSequence::FK_ReferenceInitDropsQualifiers);
	    return;
	  }

	  //   [...] If T1 is reference-related to T2 and the reference is an rvalue
	  //   reference, the initializer expression shall not be an lvalue.
	  if (RefRelationship >= Sema::Ref_Related && !isLValueRef &&
	      InitCategory.isLValue()) {
	    Sequence.SetFailed(
	        InitializationSequence::FK_RValueReferenceBindingToLValue);
	    return;
	  }

	  Sequence.AddReferenceBindingStep(cv1T1, /*bindingTemporary=*/true);
	}

	/// \brief Attempt character array initialization from a string literal
	/// (C++ [dcl.init.string], C99 6.7.8).
	///
	/// Appends a string-initialization step for the entity's type to
	/// \p Sequence. \p S, \p Kind and \p Initializer are not inspected here.
	static void TryStringLiteralInitialization(Sema &S,
	                                           const InitializedEntity &Entity,
	                                           const InitializationKind &Kind,
	                                           Expr *Initializer,
	                                           InitializationSequence &Sequence) {
	  QualType ArrayTy = Entity.getType();
	  Sequence.AddStringInitStep(ArrayTy);
	}

	/// \brief Attempt value initialization (C++ [dcl.init]p7).
	///
	/// For a C++ class type (looking through arrays to the base element type), a
	/// zero-initialization step is added when the default constructor found by
	/// lookup exists, is defaulted on its canonical declaration and is not
	/// deleted; the rest of the work is then delegated to constructor
	/// initialization. Any other type is simply zero-initialized.
	///
	/// \param InitList the (empty) braced-init-list when this is
	/// list-value-initialization, or null otherwise. It is forwarded as the sole
	/// constructor argument so the constructor call sees list syntax.
	static void TryValueInitialization(Sema &S,
	                                   const InitializedEntity &Entity,
	                                   const InitializationKind &Kind,
	                                   InitializationSequence &Sequence,
	                                   InitListExpr *InitList) {
	  assert((!InitList || InitList->getNumInits() == 0) &&
	         "Shouldn't use value-init for non-empty init lists");

	  // C++98 [dcl.init]p5, C++11 [dcl.init]p7:
	  //
	  //   To value-initialize an object of type T means:
	  QualType T = Entity.getType();

	  //     -- if T is an array type, then each element is value-initialized;
	  T = S.Context.getBaseElementType(T);

	  if (const RecordType *RT = T->getAs<RecordType>()) {
	    if (CXXRecordDecl *ClassDecl = dyn_cast<CXXRecordDecl>(RT->getDecl())) {
	      bool NeedZeroInitialization = true;
	      // C++98:
	      // -- if T is a class type (clause 9) with a user-declared constructor
	      //    (12.1), then the default constructor for T is called (and the
	      //    initialization is ill-formed if T has no accessible default
	      //    constructor);
	      // C++11:
	      // -- if T is a class type (clause 9) with either no default constructor
	      //    (12.1 [class.ctor]) or a default constructor that is user-provided
	      //    or deleted, then the object is default-initialized;
	      //
	      // Note that the C++11 rule is the same as the C++98 rule if there are no
	      // defaulted or deleted constructors, so we just use it unconditionally.
	      CXXConstructorDecl *CD = S.LookupDefaultConstructor(ClassDecl);
	      if (!CD || !CD->getCanonicalDecl()->isDefaulted() || CD->isDeleted())
	        NeedZeroInitialization = false;

	      // -- if T is a (possibly cv-qualified) non-union class type without a
	      //    user-provided or deleted default constructor, then the object is
	      //    zero-initialized and, if T has a non-trivial default constructor,
	      //    default-initialized;
	      // The 'non-union' here was removed by DR1502. The 'non-trivial default
	      // constructor' part was removed by DR1507.
	      if (NeedZeroInitialization)
	        Sequence.AddZeroInitializationStep(Entity.getType());

	      // C++03:
	      // -- if T is a non-union class type without a user-declared constructor,
	      //    then every non-static data member and base class component of T is
	      //    value-initialized;
	      // [...] A program that calls for [...] value-initialization of an
	      // entity of reference type is ill-formed.
	      //
	      // C++11 doesn't need this handling, because value-initialization does not
	      // occur recursively there, and the implicit default constructor is
	      // defined as deleted in the problematic cases.
	      if (!S.getLangOpts().CPlusPlus11 &&
	          ClassDecl->hasUninitializedReferenceMember()) {
	        Sequence.SetFailed(InitializationSequence::FK_TooManyInitsForReference);
	        return;
	      }

	      // If this is list-value-initialization, pass the empty init list on when
	      // building the constructor call. This affects the semantics of a few
	      // things (such as whether an explicit default constructor can be called).
	      Expr *InitListAsExpr = InitList;
	      MultiExprArg Args(&InitListAsExpr, InitList ? 1 : 0);
	      bool InitListSyntax = InitList;

	      // FIXME: Instead of creating a CXXConstructExpr of array type here,
	      // wrap a class-typed CXXConstructExpr in an ArrayInitLoopExpr.
	      return TryConstructorInitialization(
	          S, Entity, Kind, Args, T, Entity.getType(), Sequence, InitListSyntax);
	    }
	  }

	  // Non-class types (and record types that are not C++ classes) are simply
	  // zero-initialized.
	  Sequence.AddZeroInitializationStep(Entity.getType());
	}

	/// \brief Attempt default initialization (C++ [dcl.init]p6).
	///
	/// In C++, class types (looking through arrays) go through constructor
	/// initialization with no arguments, and const-qualified non-class types fail
	/// with FK_DefaultInitOfConst unless recovery by zero-initialization applies.
	/// Types with an Objective-C lifetime qualifier are zero-initialized. In all
	/// other cases no step is added.
	static void TryDefaultInitialization(Sema &S,
	                                     const InitializedEntity &Entity,
	                                     const InitializationKind &Kind,
	                                     InitializationSequence &Sequence) {
	  assert(Kind.getKind() == InitializationKind::IK_Default);

	  // C++ [dcl.init]p6:
	  //   To default-initialize an object of type T means:
	  //     - if T is an array type, each element is default-initialized;
	  // so all further checks are made on the base element type.
	  const QualType EntityTy = Entity.getType();
	  QualType DestType = S.Context.getBaseElementType(EntityTy);
	  const bool IsCXX = S.getLangOpts().CPlusPlus;

	  if (IsCXX) {
	    //   - if T is a (possibly cv-qualified) class type (Clause 9), the default
	    //     constructor for T is called (and the initialization is ill-formed if
	    //     T has no accessible default constructor);
	    if (DestType->isRecordType()) {
	      TryConstructorInitialization(S, Entity, Kind, None, DestType, EntityTy,
	                                   Sequence);
	      return;
	    }

	    //   - otherwise, no initialization is performed.

	    //   If a program calls for the default initialization of an object of
	    //   a const-qualified type T, T shall be a class type with a user-provided
	    //   default constructor.
	    if (DestType.isConstQualified()) {
	      if (!maybeRecoverWithZeroInitialization(S, Sequence, Entity))
	        Sequence.SetFailed(InitializationSequence::FK_DefaultInitOfConst);
	      return;
	    }
	  }

	  // A destination type with an Objective-C lifetime property is
	  // zero-initialized.
	  if (DestType.getQualifiers().hasObjCLifetime())
	    Sequence.AddZeroInitializationStep(EntityTy);
	}

	/// \brief Attempt a user-defined conversion between two types (C++ [dcl.init]),
	/// which enumerates all conversion functions and performs overload resolution
	/// to select the best.
	///
	/// Candidates are the converting constructors of \p DestType (when it is a
	/// complete class type) and the visible conversion functions of the source
	/// type (when it is a complete class type). The candidate set lives in
	/// \p Sequence so that it survives a failure.
	///
	/// \param DestType the non-reference destination type.
	/// \param Kind supplies the source location and whether explicit
	/// constructors and conversion functions may be used.
	/// \param Initializer the single source expression.
	/// \param Sequence receives either the conversion steps or an
	/// FK_UserConversionOverloadFailed failure.
	/// \param TopLevelOfInitList forwarded to the final standard-conversion step.
	static void TryUserDefinedConversion(Sema &S,
	                                     QualType DestType,
	                                     const InitializationKind &Kind,
	                                     Expr *Initializer,
	                                     InitializationSequence &Sequence,
	                                     bool TopLevelOfInitList) {
	  assert(!DestType->isReferenceType() && "References are handled elsewhere");
	  QualType SourceType = Initializer->getType();
	  assert((DestType->isRecordType() || SourceType->isRecordType()) &&
	         "Must have a class type to perform a user-defined conversion");

	  // Build the candidate set directly in the initialization sequence
	  // structure, so that it will persist if we fail.
	  OverloadCandidateSet &CandidateSet = Sequence.getFailedCandidateSet();
	  CandidateSet.clear(OverloadCandidateSet::CSK_InitByUserDefinedConversion);

	  // Determine whether we are allowed to call explicit constructors or
	  // explicit conversion operators.
	  bool AllowExplicit = Kind.AllowExplicit();

	  if (const RecordType *DestRecordType = DestType->getAs<RecordType>()) {
	    // The type we're converting to is a class type. Enumerate its constructors
	    // to see if there is a suitable conversion.
	    CXXRecordDecl *DestRecordDecl
	      = cast<CXXRecordDecl>(DestRecordType->getDecl());

	    // Try to complete the type we're converting to.
	    if (S.isCompleteType(Kind.getLocation(), DestType)) {
	      for (NamedDecl *D : S.LookupConstructors(DestRecordDecl)) {
	        auto Info = getConstructorInfo(D);
	        if (!Info.Constructor)
	          continue;

	        if (!Info.Constructor->isInvalidDecl() &&
	            Info.Constructor->isConvertingConstructor(AllowExplicit)) {
	          if (Info.ConstructorTmpl)
	            S.AddTemplateOverloadCandidate(Info.ConstructorTmpl, Info.FoundDecl,
	                                           /*ExplicitArgs*/ nullptr,
	                                           Initializer, CandidateSet,
	                                           /*SuppressUserConversions=*/true);
	          else
	            S.AddOverloadCandidate(Info.Constructor, Info.FoundDecl,
	                                   Initializer, CandidateSet,
	                                   /*SuppressUserConversions=*/true);
	        }
	      }
	    }
	  }

	  SourceLocation DeclLoc = Initializer->getLocStart();

	  if (const RecordType *SourceRecordType = SourceType->getAs<RecordType>()) {
	    // The type we're converting from is a class type, enumerate its conversion
	    // functions.

	    // We can only enumerate the conversion functions for a complete type; if
	    // the type isn't complete, simply skip this step.
	    if (S.isCompleteType(DeclLoc, SourceType)) {
	      CXXRecordDecl *SourceRecordDecl
	        = cast<CXXRecordDecl>(SourceRecordType->getDecl());

	      const auto &Conversions =
	          SourceRecordDecl->getVisibleConversionFunctions();
	      for (auto I = Conversions.begin(), E = Conversions.end(); I != E; ++I) {
	        NamedDecl *D = *I;
	        // The acting context is taken from the declaration as found, before
	        // any using-shadow declaration is looked through.
	        CXXRecordDecl *ActingDC = cast<CXXRecordDecl>(D->getDeclContext());
	        if (isa<UsingShadowDecl>(D))
	          D = cast<UsingShadowDecl>(D)->getTargetDecl();

	        FunctionTemplateDecl *ConvTemplate = dyn_cast<FunctionTemplateDecl>(D);
	        CXXConversionDecl *Conv;
	        if (ConvTemplate)
	          Conv = cast<CXXConversionDecl>(ConvTemplate->getTemplatedDecl());
	        else
	          Conv = cast<CXXConversionDecl>(D);

	        if (AllowExplicit || !Conv->isExplicit()) {
	          if (ConvTemplate)
	            S.AddTemplateConversionCandidate(ConvTemplate, I.getPair(),
	                                             ActingDC, Initializer, DestType,
	                                             CandidateSet, AllowExplicit);
	          else
	            S.AddConversionCandidate(Conv, I.getPair(), ActingDC,
	                                     Initializer, DestType, CandidateSet,
	                                     AllowExplicit);
	        }
	      }
	    }
	  }

	  // Perform overload resolution. If it fails, return the failed result.
	  OverloadCandidateSet::iterator Best;
	  if (OverloadingResult Result
	        = CandidateSet.BestViableFunction(S, DeclLoc, Best)) {
	    Sequence.SetOverloadFailure(
	                        InitializationSequence::FK_UserConversionOverloadFailed,
	                                Result);
	    return;
	  }

	  FunctionDecl *Function = Best->Function;
	  Function->setReferenced();
	  bool HadMultipleCandidates = (CandidateSet.size() > 1);

	  if (isa<CXXConstructorDecl>(Function)) {
	    // Add the user-defined conversion step. Any cv-qualification conversion is
	    // subsumed by the initialization. Per DR5, the created temporary is of the
	    // cv-unqualified type of the destination.
	    Sequence.AddUserConversionStep(Function, Best->FoundDecl,
	                                   DestType.getUnqualifiedType(),
	                                   HadMultipleCandidates);

	    // C++14 and before:
	    //   - if the function is a constructor, the call initializes a temporary
	    //     of the cv-unqualified version of the destination type. The [...]
	    //     temporary [...] is then used to direct-initialize, according to the
	    //     rules above, the object that is the destination of the
	    //     copy-initialization.
	    // Note that this just performs a simple object copy from the temporary.
	    //
	    // C++17:
	    //   - if the function is a constructor, the call is a prvalue of the
	    //     cv-unqualified version of the destination type whose return object
	    //     is initialized by the constructor. The call is used to
	    //     direct-initialize, according to the rules above, the object that
	    //     is the destination of the copy-initialization.
	    // Therefore we need to do nothing further.
	    //
	    // FIXME: Mark this copy as extraneous.
	    if (!S.getLangOpts().CPlusPlus17)
	      Sequence.AddFinalCopy(DestType);
	    else if (DestType.hasQualifiers())
	      Sequence.AddQualificationConversionStep(DestType, VK_RValue);
	    return;
	  }

	  // Add the user-defined conversion step that calls the conversion function.
	  QualType ConvType = Function->getCallResultType();
	  Sequence.AddUserConversionStep(Function, Best->FoundDecl, ConvType,
	                                 HadMultipleCandidates);

	  if (ConvType->getAs<RecordType>()) {
	    //   The call is used to direct-initialize [...] the object that is the
	    //   destination of the copy-initialization.
	    //
	    // In C++17, this does not call a constructor if we enter /17.6.1:
	    //   - If the initializer expression is a prvalue and the cv-unqualified
	    //     version of the source type is the same as the class of the
	    //     destination [... do not make an extra copy]
	    //
	    // FIXME: Mark this copy as extraneous.
	    if (!S.getLangOpts().CPlusPlus17 ||
	        Function->getReturnType()->isReferenceType() ||
	        !S.Context.hasSameUnqualifiedType(ConvType, DestType))
	      Sequence.AddFinalCopy(DestType);
	    else if (!S.Context.hasSameType(ConvType, DestType))
	      Sequence.AddQualificationConversionStep(DestType, VK_RValue);
	    return;
	  }

	  // If the conversion following the call to the conversion function
	  // is interesting, add it as a separate step.
	  if (Best->FinalConversion.First || Best->FinalConversion.Second ||
	      Best->FinalConversion.Third) {
	    ImplicitConversionSequence ICS;
	    ICS.setStandard();
	    ICS.Standard = Best->FinalConversion;
	    Sequence.AddConversionSequenceStep(ICS, DestType, TopLevelOfInitList);
	  }
	}

	/// An egregious hack for compatibility with libstdc++-4.2: in <tr1/hashtable>,
	/// a function with a pointer return type contains a 'return false;' statement.
	/// In C++11, 'false' is not a null pointer, so this breaks the build of any
	/// code using that header.
	///
	/// The workaround is to treat 'return false;' as zero-initializing the result
	/// when it appears in a pointer-returning function in a system header.
	///
	/// \returns true only if all of the following hold: C++11 is enabled,
	/// \p Entity is a function result of pointer type, and \p Init is the
	/// literal 'false' located in a system header.
	static bool isLibstdcxxPointerReturnFalseHack(Sema &S,
	                                              const InitializedEntity &Entity,
	                                              const Expr *Init) {
	  if (!S.getLangOpts().CPlusPlus11)
	    return false;

	  // Only a pointer-typed function result qualifies.
	  if (Entity.getKind() != InitializedEntity::EK_Result)
	    return false;
	  if (!Entity.getType()->isPointerType())
	    return false;

	  // The initializer must be exactly the boolean literal 'false'.
	  const auto *BoolLit = dyn_cast<CXXBoolLiteralExpr>(Init);
	  if (!BoolLit)
	    return false;
	  if (BoolLit->getValue())
	    return false;

	  return S.getSourceManager().isInSystemHeader(Init->getExprLoc());
	}

	/// Classification of an indirect copy/restore source expression. The non-zero
	/// enumerators double as indexes into diagnostic alternatives (the consumer
	/// subtracts one before selecting).
	enum InvalidICRKind {
	  IIK_okay = 0,
	  IIK_nonlocal = 1,
	  IIK_nonscalar = 2
	};

	/// Determines whether this expression is an acceptable ICR source.
	///
	/// Walks through parentheses, address-of operators, selected casts and
	/// conditional operators, classifying the operand that is reached.
	///
	/// \param C the AST context, used for the null-pointer-constant check.
	/// \param e the expression to classify.
	/// \param isAddressOf true once an address-of operator has been looked
	/// through on the way to \p e.
	/// \param isWeakAccess set to true when a declaration reference with
	/// __weak Objective-C lifetime is encountered; otherwise left untouched.
	/// \returns IIK_okay if the source is acceptable, or the kind of problem.
	static InvalidICRKind isInvalidICRSource(ASTContext &C, Expr *e,
	                                         bool isAddressOf, bool &isWeakAccess) {
	  // Skip parens.
	  e = e->IgnoreParens();

	  if (UnaryOperator *op = dyn_cast<UnaryOperator>(e)) {
	    // Skip address-of nodes.
	    if (op->getOpcode() == UO_AddrOf)
	      return isInvalidICRSource(C, op->getSubExpr(), /*addressof*/ true,
	                                isWeakAccess);

	  } else if (CastExpr *ce = dyn_cast<CastExpr>(e)) {
	    // Skip certain casts.
	    switch (ce->getCastKind()) {
	    case CK_Dependent:
	    case CK_BitCast:
	    case CK_LValueBitCast:
	    case CK_NoOp:
	      return isInvalidICRSource(C, ce->getSubExpr(), isAddressOf, isWeakAccess);

	    case CK_ArrayToPointerDecay:
	      return IIK_nonscalar;

	    case CK_NullToPointer:
	      return IIK_okay;

	    default:
	      break;
	    }

	  } else if (isa<DeclRefExpr>(e)) {
	    // If we have a declaration reference, it had better be a local variable.

	    // set isWeakAccess to true, to mean that there will be an implicit
	    // load which requires a cleanup.
	    if (e->getType().getObjCLifetime() == Qualifiers::OCL_Weak)
	      isWeakAccess = true;

	    if (!isAddressOf) return IIK_nonlocal;

	    VarDecl *var = dyn_cast<VarDecl>(cast<DeclRefExpr>(e)->getDecl());
	    if (!var) return IIK_nonlocal;

	    return (var->hasLocalStorage() ? IIK_okay : IIK_nonlocal);

	  } else if (ConditionalOperator *cond = dyn_cast<ConditionalOperator>(e)) {
	    // If we have a conditional operator, check both sides.
	    if (InvalidICRKind iik = isInvalidICRSource(C, cond->getLHS(), isAddressOf,
	                                                isWeakAccess))
	      return iik;

	    return isInvalidICRSource(C, cond->getRHS(), isAddressOf, isWeakAccess);

	  } else if (isa<ArraySubscriptExpr>(e)) {
	    // These are never scalar.
	    return IIK_nonscalar;

	  } else {
	    // Otherwise, it needs to be a null pointer constant.
	    return (e->isNullPointerConstant(C, Expr::NPC_ValueDependentIsNull)
	            ? IIK_okay : IIK_nonlocal);
	  }

	  // Reached for unary operators other than address-of and for cast kinds
	  // that are not looked through.
	  return IIK_nonlocal;
	}

	/// Check whether the given expression is a valid operand for an
	/// indirect copy/restore, emitting err_arc_nonlocal_writeback if it is not.
	///
	/// Under ARC, also marks the enclosing expression as needing cleanups when
	/// the source involves a __weak access.
	static void checkIndirectCopyRestoreSource(Sema &S, Expr *src) {
	  assert(src->isRValue());

	  bool SawWeakAccess = false;
	  const InvalidICRKind SourceKind =
	      isInvalidICRSource(S.Context, src, false, SawWeakAccess);

	  // If a weak access was seen, there will be an implicit load which
	  // requires a cleanup.
	  if (S.getLangOpts().ObjCAutoRefCount && SawWeakAccess)
	    S.Cleanup.setExprNeedsCleanups(true);

	  if (SourceKind != IIK_okay) {
	    // Shift the (non-zero) kind down to index into the diagnostic
	    // explanations.
	    const unsigned ExplanationIndex = static_cast<unsigned>(SourceKind) - 1;
	    S.Diag(src->getExprLoc(), diag::err_arc_nonlocal_writeback)
	        << ExplanationIndex
	        << src->getSourceRange();
	  }
	}

	/// \brief Determine whether we have compatible array types for the
	/// purposes of GNU by-copy array initialization.
	///
	/// \returns true if \p Dest and \p Source are the same type, or if they share
	/// an element type and \p Source is a constant array while \p Dest is an
	/// incomplete array.
	static bool hasCompatibleArrayTypes(ASTContext &Context, const ArrayType *Dest,
	                                    const ArrayType *Source) {
	  // Equivalent array types are trivially compatible.
	  const QualType DestTy(Dest, 0);
	  const QualType SourceTy(Source, 0);
	  if (Context.hasSameType(DestTy, SourceTy))
	    return true;

	  // Otherwise the element types must agree, and the only mismatch we allow
	  // is an incomplete destination array initialized from a constant array.
	  const bool SameElementType =
	      Context.hasSameType(Dest->getElementType(), Source->getElementType());
	  return SameElementType && Source->isConstantArrayType() &&
	         Dest->isIncompleteArrayType();
	}

	/// Try to treat the initialization of \p Entity from \p Initializer as an
	/// Objective-C writeback conversion.
	///
	/// An array-typed initializer is considered after array-to-pointer decay.
	/// On success, an optional lvalue-adjustment step (array decay or
	/// lvalue-to-rvalue) followed by a pass-by-indirect-copy-restore step is
	/// appended to \p Sequence.
	///
	/// \returns true if the steps were added, false if this is not a writeback
	/// conversion (in which case \p Sequence is left unchanged).
	static bool tryObjCWritebackConversion(Sema &S,
	                                       InitializationSequence &Sequence,
	                                       const InitializedEntity &Entity,
	                                       Expr *Initializer) {
	  bool ArrayDecay = false;
	  QualType ArgType = Initializer->getType();
	  QualType ArgPointee;
	  if (const ArrayType *ArgArrayType = S.Context.getAsArrayType(ArgType)) {
	    ArrayDecay = true;
	    ArgPointee = ArgArrayType->getElementType();
	    ArgType = S.Context.getPointerType(ArgPointee);
	  }

	  // Handle write-back conversion.
	  QualType ConvertedArgType;
	  if (!S.isObjCWritebackConversion(ArgType, Entity.getType(),
	                                   ConvertedArgType))
	    return false;

	  // We should copy unless we're passing to an argument explicitly
	  // marked 'out'.
	  bool ShouldCopy = true;
	  if (ParmVarDecl *param = cast_or_null<ParmVarDecl>(Entity.getDecl()))
	    ShouldCopy = (param->getObjCDeclQualifier() != ParmVarDecl::OBJC_TQ_Out);

	  // Do we need an lvalue conversion?
	  if (ArrayDecay || Initializer->isGLValue()) {
	    ImplicitConversionSequence ICS;
	    ICS.setStandard();
	    ICS.Standard.setAsIdentityConversion();

	    QualType ResultType;
	    if (ArrayDecay) {
	      ICS.Standard.First = ICK_Array_To_Pointer;
	      ResultType = S.Context.getPointerType(ArgPointee);
	    } else {
	      ICS.Standard.First = ICK_Lvalue_To_Rvalue;
	      ResultType = Initializer->getType().getNonLValueExprType(S.Context);
	    }

	    Sequence.AddConversionSequenceStep(ICS, ResultType);
	  }

	  Sequence.AddPassByIndirectCopyRestoreStep(Entity.getType(), ShouldCopy);
	  return true;
	}

	/// Try to initialize an OpenCL sampler.
	///
	/// Applies only when compiling OpenCL, \p DestType is a sampler type, and
	/// \p Initializer is either an integer constant expression or itself of
	/// sampler type.
	///
	/// \returns true if a sampler-initialization step was added to \p Sequence.
	static bool TryOCLSamplerInitialization(Sema &S,
	                                        InitializationSequence &Sequence,
	                                        QualType DestType,
	                                        Expr *Initializer) {
	  if (!S.getLangOpts().OpenCL || !DestType->isSamplerT() ||
	      (!Initializer->isIntegerConstantExpr(S.Context) &&
	       !Initializer->getType()->isSamplerT()))
	    return false;

	  Sequence.AddOCLSamplerInitStep(DestType);
	  return true;
	}

	//
	// OpenCL 1.2 spec, s6.12.10
	//
	// The event argument can also be used to associate the
	// async_work_group_copy with a previous async copy allowing
	// an event to be shared by multiple async copies; otherwise
	// event should be zero.
	//
	/// Try to initialize an OpenCL event from the integer constant zero.
	///
	/// \returns true if a zero-event step was added to \p Sequence; false when
	/// not compiling OpenCL, \p DestType is not an event type, or
	/// \p Initializer is not an integer constant expression equal to zero.
	static bool TryOCLZeroEventInitialization(Sema &S,
	                                          InitializationSequence &Sequence,
	                                          QualType DestType,
	                                          Expr *Initializer) {
	  if (!S.getLangOpts().OpenCL || !DestType->isEventT() ||
	      !Initializer->isIntegerConstantExpr(S.getASTContext()) ||
	      (Initializer->EvaluateKnownConstInt(S.getASTContext()) != 0))
	    return false;

	  Sequence.AddOCLZeroEventStep(DestType);
	  return true;
	}

	/// Try to initialize an OpenCL queue from the integer constant zero.
	///
	/// \returns true if a zero-queue step was added to \p Sequence; false when
	/// not compiling OpenCL, the OpenCL version is below 200, \p DestType is
	/// not a queue type, or \p Initializer is not an integer constant expression
	/// equal to zero.
	static bool TryOCLZeroQueueInitialization(Sema &S,
	                                          InitializationSequence &Sequence,
	                                          QualType DestType,
	                                          Expr *Initializer) {
	  if (!S.getLangOpts().OpenCL || S.getLangOpts().OpenCLVersion < 200 ||
	      !DestType->isQueueT() ||
	      !Initializer->isIntegerConstantExpr(S.getASTContext()) ||
	      (Initializer->EvaluateKnownConstInt(S.getASTContext()) != 0))
	    return false;

	  Sequence.AddOCLZeroQueueStep(DestType);
	  return true;
	}

	/// Construct the sequence by running InitializeFrom on the given entity,
	/// initialization kind and arguments. The failed-candidate set starts out
	/// as a normal candidate set located at the initialization's location.
	InitializationSequence::InitializationSequence(
	    Sema &S, const InitializedEntity &Entity, const InitializationKind &Kind,
	    MultiExprArg Args, bool TopLevelOfInitList, bool TreatUnavailableAsInvalid)
	    : FailedCandidateSet(Kind.getLocation(),
	                         OverloadCandidateSet::CSK_Normal) {
	  InitializeFrom(S, Entity, Kind, Args, TopLevelOfInitList,
	                 TreatUnavailableAsInvalid);
	}

	/// Tries to get a FunctionDecl out of `E`. Returns true if `E` is a direct
	/// reference to a function for which
	/// Sema::checkAddressOfFunctionIsAvailable reports false. Returns false if
	/// `E` is not a DeclRefExpr naming a function, or if that check succeeds.
	static bool isExprAnUnaddressableFunction(Sema &S, const Expr *E) {
	  auto *DRE = dyn_cast<DeclRefExpr>(E);
	  if (!DRE || !isa<FunctionDecl>(DRE->getDecl()))
	    return false;

	  return !S.checkAddressOfFunctionIsAvailable(
	      cast<FunctionDecl>(DRE->getDecl()));
	}

	/// Determine whether we can perform an elementwise array copy for this kind
	/// of entity.
	///
	/// Array elements inherit the answer from their parent entity; an array
	/// element without a parent, and every kind not listed below, yields false.
	static bool canPerformArrayCopy(const InitializedEntity &Entity) {
	  switch (Entity.getKind()) {
	  case InitializedEntity::EK_LambdaCapture:
	    // C++ [expr.prim.lambda]p24:
	    //   For array members, the array elements are direct-initialized in
	    //   increasing subscript order.
	    return true;

	  case InitializedEntity::EK_Variable:
	    // C++ [dcl.decomp]p1:
	    //   [...] each element is copy-initialized or direct-initialized from the
	    //   corresponding element of the assignment-expression [...]
	    return isa<DecompositionDecl>(Entity.getDecl());

	  case InitializedEntity::EK_Member:
	    // C++ [class.copy.ctor]p14:
	    //   - if the member is an array, each element is direct-initialized with
	    //     the corresponding subobject of x
	    return Entity.isImplicitMemberInitializer();

	  case InitializedEntity::EK_ArrayElement: {
	    // All the above cases are intended to apply recursively, even though none
	    // of them actually say that.
	    const auto *Parent = Entity.getParent();
	    return Parent && canPerformArrayCopy(*Parent);
	  }

	  default:
	    return false;
	  }
	}

	/// Compute the steps needed to initialize \p Entity from \p Args, following
	/// the case analysis of C++ [dcl.init]p16 (with extensions for C,
	/// Objective-C and OpenCL).
	///
	/// On success the steps are appended to this sequence; on failure the reason
	/// is recorded through SetFailed. If the destination type or any argument is
	/// type-dependent, the sequence is marked as DependentSequence and no steps
	/// are computed.
	///
	/// \param Args the initializer expressions. Entries may be rewritten in
	/// place (placeholder expressions are resolved, and the Objective-C checks
	/// may replace the single initializer).
	/// \param TopLevelOfInitList forwarded to conversion-sequence steps and to
	/// user-defined conversion handling.
	/// \param TreatUnavailableAsInvalid forwarded to list-initialization and to
	/// the recursive call used for array element copies.
	void InitializationSequence::InitializeFrom(Sema &S,
	                                            const InitializedEntity &Entity,
	                                            const InitializationKind &Kind,
	                                            MultiExprArg Args,
	                                            bool TopLevelOfInitList,
	                                            bool TreatUnavailableAsInvalid) {
	  ASTContext &Context = S.Context;

	  // Eliminate non-overload placeholder types in the arguments.  We
	  // need to do this before checking whether types are dependent
	  // because lowering a pseudo-object expression might well give us
	  // something of dependent type.
	  for (unsigned I = 0, E = Args.size(); I != E; ++I)
	    if (Args[I]->getType()->isNonOverloadPlaceholderType()) {
	      // FIXME: should we be doing this here?
	      ExprResult result = S.CheckPlaceholderExpr(Args[I]);
	      if (result.isInvalid()) {
	        SetFailed(FK_PlaceholderType);
	        return;
	      }
	      Args[I] = result.get();
	    }

	  // C++0x [dcl.init]p16:
	  //   The semantics of initializers are as follows. The destination type is
	  //   the type of the object or reference being initialized and the source
	  //   type is the type of the initializer expression. The source type is not
	  //   defined when the initializer is a braced-init-list or when it is a
	  //   parenthesized list of expressions.
	  QualType DestType = Entity.getType();

	  if (DestType->isDependentType() ||
	      Expr::hasAnyTypeDependentArguments(Args)) {
	    SequenceKind = DependentSequence;
	    return;
	  }

	  // Almost everything is a normal sequence.
	  setSequenceKind(NormalSequence);

	  QualType SourceType;
	  Expr *Initializer = nullptr;
	  if (Args.size() == 1) {
	    Initializer = Args[0];
	    if (S.getLangOpts().ObjC1) {
	      // Both checks take Initializer and may replace it; store the result
	      // back into the argument list when either reports true.
	      if (S.CheckObjCBridgeRelatedConversions(Initializer->getLocStart(),
	                                              DestType, Initializer->getType(),
	                                              Initializer) ||
	          S.ConversionToObjCStringLiteralCheck(DestType, Initializer))
	        Args[0] = Initializer;
	    }
	    if (!isa<InitListExpr>(Initializer))
	      SourceType = Initializer->getType();
	  }

	  //     - If the initializer is a (non-parenthesized) braced-init-list, the
	  //       object is list-initialized (8.5.4).
	  if (Kind.getKind() != InitializationKind::IK_Direct) {
	    if (InitListExpr *InitList = dyn_cast_or_null<InitListExpr>(Initializer)) {
	      TryListInitialization(S, Entity, Kind, InitList, *this,
	                            TreatUnavailableAsInvalid);
	      return;
	    }
	  }

	  //     - If the destination type is a reference type, see 8.5.3.
	  if (DestType->isReferenceType()) {
	    // C++0x [dcl.init.ref]p1:
	    //   A variable declared to be a T& or T&&, that is, "reference to type T"
	    //   (8.3.2), shall be initialized by an object, or function, of type T or
	    //   by an object that can be converted into a T.
	    // (Therefore, multiple arguments are not permitted.)
	    if (Args.size() != 1)
	      SetFailed(FK_TooManyInitsForReference);
	    // C++17 [dcl.init.ref]p5:
	    //   A reference [...] is initialized by an expression [...] as follows:
	    // If the initializer is not an expression, presumably we should reject,
	    // but the standard fails to actually say so.
	    else if (isa<InitListExpr>(Args[0]))
	      SetFailed(FK_ParenthesizedListInitForReference);
	    else
	      TryReferenceInitialization(S, Entity, Kind, Args[0], *this);
	    return;
	  }

	  //     - If the initializer is (), the object is value-initialized.
	  if (Kind.getKind() == InitializationKind::IK_Value ||
	      (Kind.getKind() == InitializationKind::IK_Direct && Args.empty())) {
	    TryValueInitialization(S, Entity, Kind, *this);
	    return;
	  }

	  // Handle default initialization.
	  if (Kind.getKind() == InitializationKind::IK_Default) {
	    TryDefaultInitialization(S, Entity, Kind, *this);
	    return;
	  }

	  //     - If the destination type is an array of characters, an array of
	  //       char16_t, an array of char32_t, or an array of wchar_t, and the
	  //       initializer is a string literal, see 8.5.2.
	  //     - Otherwise, if the destination type is an array, the program is
	  //       ill-formed.
	  if (const ArrayType *DestAT = Context.getAsArrayType(DestType)) {
	    if (Initializer && isa<VariableArrayType>(DestAT)) {
	      SetFailed(FK_VariableLengthArrayHasInitializer);
	      return;
	    }

	    if (Initializer) {
	      switch (IsStringInit(Initializer, DestAT, Context)) {
	      case SIF_None:
	        TryStringLiteralInitialization(S, Entity, Kind, Initializer, *this);
	        return;
	      case SIF_NarrowStringIntoWideChar:
	        SetFailed(FK_NarrowStringIntoWideCharArray);
	        return;
	      case SIF_WideStringIntoChar:
	        SetFailed(FK_WideStringIntoCharArray);
	        return;
	      case SIF_IncompatWideStringIntoWideChar:
	        SetFailed(FK_IncompatWideStringIntoWideChar);
	        return;
	      case SIF_Other:
	        break;
	      }
	    }

	    // Some kinds of initialization permit an array to be initialized from
	    // another array of the same type, and perform elementwise initialization.
	    if (Initializer && isa<ConstantArrayType>(DestAT) &&
	        S.Context.hasSameUnqualifiedType(Initializer->getType(),
	                                         Entity.getType()) &&
	        canPerformArrayCopy(Entity)) {
	      // If source is a prvalue, use it directly.
	      if (Initializer->getValueKind() == VK_RValue) {
	        AddArrayInitStep(DestType, /*IsGNUExtension*/false);
	        return;
	      }

	      // Emit element-at-a-time copy loop.
	      InitializedEntity Element =
	          InitializedEntity::InitializeElement(S.Context, 0, Entity);
	      QualType InitEltT =
	          Context.getAsArrayType(Initializer->getType())->getElementType();
	      // A stack-allocated opaque value stands in for one source element
	      // while the element initialization is computed recursively.
	      OpaqueValueExpr OVE(Initializer->getExprLoc(), InitEltT,
	                          Initializer->getValueKind(),
	                          Initializer->getObjectKind());
	      Expr *OVEAsExpr = &OVE;
	      InitializeFrom(S, Element, Kind, OVEAsExpr, TopLevelOfInitList,
	                     TreatUnavailableAsInvalid);
	      if (!Failed())
	        AddArrayInitLoopStep(Entity.getType(), InitEltT);
	      return;
	    }

	    // Note: as a GNU C extension, we allow initialization of an
	    // array from a compound literal that creates an array of the same
	    // type, so long as the initializer has no side effects.
	    if (!S.getLangOpts().CPlusPlus && Initializer &&
	        isa<CompoundLiteralExpr>(Initializer->IgnoreParens()) &&
	        Initializer->getType()->isArrayType()) {
	      const ArrayType *SourceAT
	        = Context.getAsArrayType(Initializer->getType());
	      if (!hasCompatibleArrayTypes(S.Context, DestAT, SourceAT))
	        SetFailed(FK_ArrayTypeMismatch);
	      else if (Initializer->HasSideEffects(S.Context))
	        SetFailed(FK_NonConstantArrayInit);
	      else {
	        AddArrayInitStep(DestType, /*IsGNUExtension*/true);
	      }
	    }
	    // Note: as a GNU C++ extension, we allow list-initialization of a
	    // class member of array type from a parenthesized initializer list.
	    else if (S.getLangOpts().CPlusPlus &&
	             Entity.getKind() == InitializedEntity::EK_Member &&
	             Initializer && isa<InitListExpr>(Initializer)) {
	      TryListInitialization(S, Entity, Kind, cast<InitListExpr>(Initializer),
	                            *this, TreatUnavailableAsInvalid);
	      AddParenthesizedArrayInitStep(DestType);
	    } else if (DestAT->getElementType()->isCharType())
	      SetFailed(FK_ArrayNeedsInitListOrStringLiteral);
	    else if (IsWideCharCompatible(DestAT->getElementType(), Context))
	      SetFailed(FK_ArrayNeedsInitListOrWideStringLiteral);
	    else
	      SetFailed(FK_ArrayNeedsInitList);

	    return;
	  }

	  // Determine whether we should consider writeback conversions for
	  // Objective-C ARC.
	  bool allowObjCWritebackConversion = S.getLangOpts().ObjCAutoRefCount &&
	         Entity.isParameterKind();

	  // We're at the end of the line for C: it's either a write-back conversion
	  // or it's a C assignment. There's no need to check anything else.
	  if (!S.getLangOpts().CPlusPlus) {
	    // If allowed, check whether this is an Objective-C writeback conversion.
	    if (allowObjCWritebackConversion &&
	        tryObjCWritebackConversion(S, *this, Entity, Initializer)) {
	      return;
	    }

	    if (TryOCLSamplerInitialization(S, *this, DestType, Initializer))
	      return;

	    if (TryOCLZeroEventInitialization(S, *this, DestType, Initializer))
	      return;

	    if (TryOCLZeroQueueInitialization(S, *this, DestType, Initializer))
	      return;

	    // Handle initialization in C
	    AddCAssignmentStep(DestType);
	    MaybeProduceObjCObject(S, *this, Entity);
	    return;
	  }

	  assert(S.getLangOpts().CPlusPlus);

	  //     - If the destination type is a (possibly cv-qualified) class type:
	  if (DestType->isRecordType()) {
	    //     - If the initialization is direct-initialization, or if it is
	    //       copy-initialization where the cv-unqualified version of the
	    //       source type is the same class as, or a derived class of, the
	    //       class of the destination, constructors are considered. [...]
	    if (Kind.getKind() == InitializationKind::IK_Direct ||
	        (Kind.getKind() == InitializationKind::IK_Copy &&
	         (Context.hasSameUnqualifiedType(SourceType, DestType) ||
	          S.IsDerivedFrom(Initializer->getLocStart(), SourceType, DestType))))
	      TryConstructorInitialization(S, Entity, Kind, Args,
	                                   DestType, DestType, *this);
	    //     - Otherwise (i.e., for the remaining copy-initialization cases),
	    //       user-defined conversion sequences that can convert from the source
	    //       type to the destination type or (when a conversion function is
	    //       used) to a derived class thereof are enumerated as described in
	    //       13.3.1.4, and the best one is chosen through overload resolution
	    //       (13.3).
	    else
	      TryUserDefinedConversion(S, DestType, Kind, Initializer, *this,
	                               TopLevelOfInitList);
	    return;
	  }

	  assert(Args.size() >= 1 && "Zero-argument case handled above");

	  // The remaining cases all need a source type.
	  if (Args.size() > 1) {
	    SetFailed(FK_TooManyInitsForScalar);
	    return;
	  } else if (isa<InitListExpr>(Args[0])) {
	    SetFailed(FK_ParenthesizedListInitForScalar);
	    return;
	  }

	  //    - Otherwise, if the source type is a (possibly cv-qualified) class
	  //      type, conversion functions are considered.
	  if (!SourceType.isNull() && SourceType->isRecordType()) {
	    // For a conversion to _Atomic(T) from either T or a class type derived
	    // from T, initialize the T object then convert to _Atomic type.
	    bool NeedAtomicConversion = false;
	    if (const AtomicType *Atomic = DestType->getAs<AtomicType>()) {
	      if (Context.hasSameUnqualifiedType(SourceType, Atomic->getValueType()) ||
	          S.IsDerivedFrom(Initializer->getLocStart(), SourceType,
	                          Atomic->getValueType())) {
	        DestType = Atomic->getValueType();
	        NeedAtomicConversion = true;
	      }
	    }

	    TryUserDefinedConversion(S, DestType, Kind, Initializer, *this,
	                             TopLevelOfInitList);
	    MaybeProduceObjCObject(S, *this, Entity);
	    if (!Failed() && NeedAtomicConversion)
	      AddAtomicConversionStep(Entity.getType());
	    return;
	  }

	  //    - Otherwise, the initial value of the object being initialized is the
	  //      (possibly converted) value of the initializer expression. Standard
	  //      conversions (Clause 4) will be used, if necessary, to convert the
	  //      initializer expression to the cv-unqualified version of the
	  //      destination type; no user-defined conversions are considered.

	  ImplicitConversionSequence ICS
	    = S.TryImplicitConversion(Initializer, DestType,
	                              /*SuppressUserConversions*/true,
	                              /*AllowExplicitConversions*/ false,
	                              /*InOverloadResolution*/ false,
	                              /*CStyle=*/Kind.isCStyleOrFunctionalCast(),
	                              allowObjCWritebackConversion);

	  if (ICS.isStandard() &&
	      ICS.Standard.Second == ICK_Writeback_Conversion) {
	    // Objective-C ARC writeback conversion.

	    // We should copy unless we're passing to an argument explicitly
	    // marked 'out'.
	    bool ShouldCopy = true;
	    if (ParmVarDecl *Param = cast_or_null<ParmVarDecl>(Entity.getDecl()))
	      ShouldCopy = (Param->getObjCDeclQualifier() != ParmVarDecl::OBJC_TQ_Out);

	    // If there was an lvalue adjustment, add it as a separate conversion.
	    if (ICS.Standard.First == ICK_Array_To_Pointer ||
	        ICS.Standard.First == ICK_Lvalue_To_Rvalue) {
	      ImplicitConversionSequence LvalueICS;
	      LvalueICS.setStandard();
	      LvalueICS.Standard.setAsIdentityConversion();
	      LvalueICS.Standard.setAllToTypes(ICS.Standard.getToType(0));
	      LvalueICS.Standard.First = ICS.Standard.First;
	      AddConversionSequenceStep(LvalueICS, ICS.Standard.getToType(0));
	    }

	    AddPassByIndirectCopyRestoreStep(DestType, ShouldCopy);
	  } else if (ICS.isBad()) {
	    DeclAccessPair dap;
	    if (isLibstdcxxPointerReturnFalseHack(S, Entity, Initializer)) {
	      AddZeroInitializationStep(Entity.getType());
	    } else if (Initializer->getType() == Context.OverloadTy &&
	               !S.ResolveAddressOfOverloadedFunction(Initializer, DestType,
	                                                     false, dap))
	      SetFailed(InitializationSequence::FK_AddressOfOverloadFailed);
	    else if (Initializer->getType()->isFunctionType() &&
	             isExprAnUnaddressableFunction(S, Initializer))
	      SetFailed(InitializationSequence::FK_AddressOfUnaddressableFunction);
	    else
	      SetFailed(InitializationSequence::FK_ConversionFailed);
	  } else {
	    AddConversionSequenceStep(ICS, DestType, TopLevelOfInitList);

	    MaybeProduceObjCObject(S, *this, Entity);
	  }
	}

	/// Destroy every step recorded in this sequence, in order.
	InitializationSequence::~InitializationSequence() {
	  for (auto StepIt = Steps.begin(), StepEnd = Steps.end(); StepIt != StepEnd;
	       ++StepIt)
	    StepIt->Destroy();
	}

	//===----------------------------------------------------------------------===//
	// Perform initialization
	//===----------------------------------------------------------------------===//
	/// Map the kind of entity being initialized to the assignment action used
	/// when describing the conversion.
	///
	/// \param Diagnose when true, CF-audited parameters that are not
	/// Objective-C method parameters map to AA_Passing_CFAudited instead of
	/// AA_Passing.
	static Sema::AssignmentAction
	getAssignmentAction(const InitializedEntity &Entity, bool Diagnose = false) {
	  // True when the entity has a declaration whose context is an Objective-C
	  // method; only invoked for the parameter kinds.
	  const auto IsObjCMethodParam = [&Entity]() -> bool {
	    return Entity.getDecl() &&
	           isa<ObjCMethodDecl>(Entity.getDecl()->getDeclContext());
	  };

	  switch (Entity.getKind()) {
	  case InitializedEntity::EK_Variable:
	  case InitializedEntity::EK_New:
	  case InitializedEntity::EK_Exception:
	  case InitializedEntity::EK_Base:
	  case InitializedEntity::EK_Delegating:
	  case InitializedEntity::EK_Member:
	  case InitializedEntity::EK_Binding:
	  case InitializedEntity::EK_ArrayElement:
	  case InitializedEntity::EK_VectorElement:
	  case InitializedEntity::EK_ComplexElement:
	  case InitializedEntity::EK_BlockElement:
	  case InitializedEntity::EK_LambdaToBlockConversionBlockElement:
	  case InitializedEntity::EK_LambdaCapture:
	  case InitializedEntity::EK_CompoundLiteralInit:
	    return Sema::AA_Initializing;

	  case InitializedEntity::EK_Parameter:
	    if (IsObjCMethodParam())
	      return Sema::AA_Sending;
	    return Sema::AA_Passing;

	  case InitializedEntity::EK_Parameter_CF_Audited:
	    if (IsObjCMethodParam())
	      return Sema::AA_Sending;
	    return Diagnose ? Sema::AA_Passing_CFAudited : Sema::AA_Passing;

	  case InitializedEntity::EK_Result:
	    return Sema::AA_Returning;

	  case InitializedEntity::EK_Temporary:
	  case InitializedEntity::EK_RelatedResult:
	    // FIXME: Can we tell apart casting vs. converting?
	    return Sema::AA_Casting;
	  }

	  llvm_unreachable("Invalid EntityKind!");
	}

	/// \brief Decide whether an object created to initialize \p Entity should
	/// be bound as a temporary.
	///
	/// \returns true for parameters (including CF-audited ones), temporaries,
	/// related results and bindings; false for every other entity kind.
	static bool shouldBindAsTemporary(const InitializedEntity &Entity) {
	  switch (Entity.getKind()) {
	  case InitializedEntity::EK_Parameter:
	  case InitializedEntity::EK_Parameter_CF_Audited:
	  case InitializedEntity::EK_Temporary:
	  case InitializedEntity::EK_RelatedResult:
	  case InitializedEntity::EK_Binding:
	    return true;

	  case InitializedEntity::EK_Variable:
	  case InitializedEntity::EK_Member:
	  case InitializedEntity::EK_ArrayElement:
	  case InitializedEntity::EK_VectorElement:
	  case InitializedEntity::EK_ComplexElement:
	  case InitializedEntity::EK_Result:
	  case InitializedEntity::EK_Exception:
	  case InitializedEntity::EK_New:
	  case InitializedEntity::EK_Base:
	  case InitializedEntity::EK_Delegating:
	  case InitializedEntity::EK_BlockElement:
	  case InitializedEntity::EK_LambdaToBlockConversionBlockElement:
	  case InitializedEntity::EK_LambdaCapture:
	  case InitializedEntity::EK_CompoundLiteralInit:
	    return false;
	  }

	  llvm_unreachable("missed an InitializedEntity kind?");
	}

	/// \brief Decide whether \p Entity, once initialized with an object created
	/// for that initialization, requires destruction.
	///
	/// \returns true for the entity kinds listed in the first group below and
	/// false for those in the second group.
	static bool shouldDestroyEntity(const InitializedEntity &Entity) {
	  switch (Entity.getKind()) {
	  case InitializedEntity::EK_Variable:
	  case InitializedEntity::EK_Member:
	  case InitializedEntity::EK_Binding:
	  case InitializedEntity::EK_ArrayElement:
	  case InitializedEntity::EK_Parameter:
	  case InitializedEntity::EK_Parameter_CF_Audited:
	  case InitializedEntity::EK_Temporary:
	  case InitializedEntity::EK_RelatedResult:
	  case InitializedEntity::EK_Exception:
	  case InitializedEntity::EK_CompoundLiteralInit:
	    return true;

	  case InitializedEntity::EK_Result:
	  case InitializedEntity::EK_New:
	  case InitializedEntity::EK_Base:
	  case InitializedEntity::EK_Delegating:
	  case InitializedEntity::EK_VectorElement:
	  case InitializedEntity::EK_ComplexElement:
	  case InitializedEntity::EK_BlockElement:
	  case InitializedEntity::EK_LambdaToBlockConversionBlockElement:
	  case InitializedEntity::EK_LambdaCapture:
	    return false;
	  }

	  llvm_unreachable("missed an InitializedEntity kind?");
	}

	/// \brief Pick the source location at which diagnostics about initializing
	/// \p Entity should be reported.
	///
	/// \param Entity The entity being initialized.
	///
	/// \param Initializer The initializer expression; its start location is
	/// used for every entity kind that has no dedicated location of its own.
	static SourceLocation getInitializationLoc(const InitializedEntity &Entity,
	                                           Expr *Initializer) {
	  switch (Entity.getKind()) {
	  case InitializedEntity::EK_Variable:
	  case InitializedEntity::EK_Binding:
	    return Entity.getDecl()->getLocation();

	  case InitializedEntity::EK_Result:
	    return Entity.getReturnLoc();

	  case InitializedEntity::EK_Exception:
	    return Entity.getThrowLoc();

	  case InitializedEntity::EK_LambdaCapture:
	    return Entity.getCaptureLoc();

	  case InitializedEntity::EK_Member:
	  case InitializedEntity::EK_ArrayElement:
	  case InitializedEntity::EK_VectorElement:
	  case InitializedEntity::EK_ComplexElement:
	  case InitializedEntity::EK_Parameter:
	  case InitializedEntity::EK_Parameter_CF_Audited:
	  case InitializedEntity::EK_Temporary:
	  case InitializedEntity::EK_RelatedResult:
	  case InitializedEntity::EK_New:
	  case InitializedEntity::EK_Base:
	  case InitializedEntity::EK_Delegating:
	  case InitializedEntity::EK_BlockElement:
	  case InitializedEntity::EK_LambdaToBlockConversionBlockElement:
	  case InitializedEntity::EK_CompoundLiteralInit:
	    return Initializer->getLocStart();
	  }
	  llvm_unreachable("missed an InitializedEntity kind?");
	}

	/// \brief Make a (potentially elidable) temporary copy of the object
	/// provided by the given initializer by calling the appropriate copy
	/// constructor.
	///
	/// \param S The Sema object used for type-checking.
	///
	/// \param T The type of the temporary object, which must either be
	/// the type of the initializer expression or a superclass thereof.
	///
	/// \param Entity The entity being initialized.
	///
	/// \param CurInit The initializer expression.
	///
	/// \param IsExtraneousCopy Whether this is an "extraneous" copy that
	/// is permitted in C++03 (but not C++0x) when binding a reference to
	/// an rvalue.
	///
	/// \returns An expression that copies the initializer expression into
	/// a temporary object, or an error expression if a copy could not be
	/// created. \p CurInit is returned unchanged when it is already invalid,
	/// when \p T is not a class type, or when \p T cannot be completed.
	static ExprResult CopyObject(Sema &S,
	                             QualType T,
	                             const InitializedEntity &Entity,
	                             ExprResult CurInit,
	                             bool IsExtraneousCopy) {
	  if (CurInit.isInvalid())
	    return CurInit;
	  // Determine which class type we're copying to.
	  Expr *CurInitExpr = (Expr *)CurInit.get();
	  CXXRecordDecl *Class = nullptr;
	  if (const RecordType *Record = T->getAs<RecordType>())
	    Class = cast<CXXRecordDecl>(Record->getDecl());
	  if (!Class)
	    return CurInit;

	  SourceLocation Loc = getInitializationLoc(Entity, CurInit.get());

	  // Make sure that the type we are copying is complete.
	  if (S.RequireCompleteType(Loc, T, diag::err_temp_copy_incomplete))
	    return CurInit;

	  // Perform overload resolution using the class's constructors. Per
	  // C++11 [dcl.init]p16, second bullet for class types, this initialization
	  // is direct-initialization.
	  OverloadCandidateSet CandidateSet(Loc, OverloadCandidateSet::CSK_Normal);
	  DeclContext::lookup_result Ctors = S.LookupConstructors(Class);

	  OverloadCandidateSet::iterator Best;
	  switch (ResolveConstructorOverload(
	      S, Loc, CurInitExpr, CandidateSet, T, Ctors, Best,
	      /*CopyInitializing=*/false, /*AllowExplicit=*/true,
	      /*OnlyListConstructors=*/false, /*IsListInit=*/false,
	      /*SecondStepOfCopyInit=*/true)) {
	  case OR_Success:
	    break;

	  case OR_No_Viable_Function:
	    // For an extraneous copy outside a SFINAE context this is only an
	    // extension diagnostic, and the original initializer is kept.
	    S.Diag(Loc, IsExtraneousCopy && !S.isSFINAEContext()
	                    ? diag::ext_rvalue_to_reference_temp_copy_no_viable
	                    : diag::err_temp_copy_no_viable)
	        << (int)Entity.getKind() << CurInitExpr->getType()
	        << CurInitExpr->getSourceRange();
	    CandidateSet.NoteCandidates(S, OCD_AllCandidates, CurInitExpr);
	    if (!IsExtraneousCopy || S.isSFINAEContext())
	      return ExprError();
	    return CurInit;

	  case OR_Ambiguous:
	    S.Diag(Loc, diag::err_temp_copy_ambiguous)
	        << (int)Entity.getKind() << CurInitExpr->getType()
	        << CurInitExpr->getSourceRange();
	    CandidateSet.NoteCandidates(S, OCD_ViableCandidates, CurInitExpr);
	    return ExprError();

	  case OR_Deleted:
	    S.Diag(Loc, diag::err_temp_copy_deleted)
	        << (int)Entity.getKind() << CurInitExpr->getType()
	        << CurInitExpr->getSourceRange();
	    S.NoteDeletedFunction(Best->Function);
	    return ExprError();
	  }

	  bool HadMultipleCandidates = CandidateSet.size() > 1;

	  CXXConstructorDecl *Constructor = cast<CXXConstructorDecl>(Best->Function);
	  SmallVector<Expr*, 8> ConstructorArgs;

	  S.CheckConstructorAccess(Loc, Constructor, Best->FoundDecl, Entity,
	                           IsExtraneousCopy);

	  if (IsExtraneousCopy) {
	    // If this is a totally extraneous copy for C++03 reference
	    // binding purposes, just return the original initialization
	    // expression. We don't generate an (elided) copy operation here
	    // because doing so would require us to pass down a flag to avoid
	    // infinite recursion, where each step adds another extraneous,
	    // elidable copy.

	    // Instantiate the default arguments of any extra parameters in
	    // the selected copy constructor, as if we were going to create a
	    // proper call to the copy constructor.
	    for (unsigned I = 1, N = Constructor->getNumParams(); I != N; ++I) {
	      ParmVarDecl *Parm = Constructor->getParamDecl(I);
	      if (S.RequireCompleteType(Loc, Parm->getType(),
	                                diag::err_call_incomplete_argument))
	        break;

	      // Build the default argument expression; we don't actually care
	      // if this succeeds or not, because this routine will complain
	      // if there was a problem.
	      S.BuildCXXDefaultArgExpr(Loc, Constructor, Parm);
	    }

	    return CurInitExpr;
	  }

	  // Determine the arguments required to actually perform the
	  // constructor call (we might have derived-to-base conversions, or
	  // the copy constructor may have default arguments).
	  if (S.CompleteConstructorCall(Constructor, CurInitExpr, Loc, ConstructorArgs))
	    return ExprError();

	  // C++0x [class.copy]p32:
	  //   When certain criteria are met, an implementation is allowed to
	  //   omit the copy/move construction of a class object, even if the
	  //   copy/move constructor and/or destructor for the object have
	  //   side effects. [...]
	  //     - when a temporary class object that has not been bound to a
	  //       reference (12.2) would be copied/moved to a class object
	  //       with the same cv-unqualified type, the copy/move operation
	  //       can be omitted by constructing the temporary object
	  //       directly into the target of the omitted copy/move
	  //
	  // Note that the other three bullets are handled elsewhere. Copy
	  // elision for return statements and throw expressions are handled as part
	  // of constructor initialization, while copy elision for exception handlers
	  // is handled by the run-time.
	  //
	  // FIXME: If the function parameter is not the same type as the temporary, we
	  // should still be able to elide the copy, but we don't have a way to
	  // represent in the AST how much should be elided in this case.
	  bool Elidable =
	      CurInitExpr->isTemporaryObject(S.Context, Class) &&
	      S.Context.hasSameUnqualifiedType(
	          Best->Function->getParamDecl(0)->getType().getNonReferenceType(),
	          CurInitExpr->getType());

	  // Actually perform the constructor call.
	  CurInit = S.BuildCXXConstructExpr(Loc, T, Best->FoundDecl, Constructor,
	                                    Elidable,
	                                    ConstructorArgs,
	                                    HadMultipleCandidates,
	                                    /*ListInit*/ false,
	                                    /*StdInitListInit*/ false,
	                                    /*ZeroInit*/ false,
	                                    CXXConstructExpr::CK_Complete,
	                                    SourceRange());

	  // If we're supposed to bind temporaries, do so.
	  if (!CurInit.isInvalid() && shouldBindAsTemporary(Entity))
	    CurInit = S.MaybeBindToTemporary(CurInit.getAs<Expr>());
	  return CurInit;
	}

	/// \brief Check whether elidable copy construction for binding a reference to
	/// a temporary would have succeeded if we were building in C++98 mode, for
	/// -Wc++98-compat.
	///
	/// \param S The Sema object used for lookup, overload resolution and
	/// diagnostics. Asserted to be in C++11 mode.
	///
	/// \param Entity The entity being initialized.
	///
	/// \param CurInitExpr The initializer expression. Nothing is checked when
	/// its type is not a record type or when warn_cxx98_compat_temp_copy is
	/// ignored at the initialization location.
	static void CheckCXX98CompatAccessibleCopy(Sema &S,
	                                           const InitializedEntity &Entity,
	                                           Expr *CurInitExpr) {
	  assert(S.getLangOpts().CPlusPlus11);

	  const RecordType *Record = CurInitExpr->getType()->getAs<RecordType>();
	  if (!Record)
	    return;

	  SourceLocation Loc = getInitializationLoc(Entity, CurInitExpr);
	  if (S.Diags.isIgnored(diag::warn_cxx98_compat_temp_copy, Loc))
	    return;

	  // Find constructors which would have been considered.
	  OverloadCandidateSet CandidateSet(Loc, OverloadCandidateSet::CSK_Normal);
	  DeclContext::lookup_result Ctors =
	      S.LookupConstructors(cast<CXXRecordDecl>(Record->getDecl()));

	  // Perform overload resolution.
	  OverloadCandidateSet::iterator Best;
	  OverloadingResult OR = ResolveConstructorOverload(
	      S, Loc, CurInitExpr, CandidateSet, CurInitExpr->getType(), Ctors, Best,
	      /*CopyInitializing=*/false, /*AllowExplicit=*/true,
	      /*OnlyListConstructors=*/false, /*IsListInit=*/false,
	      /*SecondStepOfCopyInit=*/true);

	  // The same partial diagnostic is used for every outcome; the overload
	  // result is streamed into it as its first argument.
	  PartialDiagnostic Diag = S.PDiag(diag::warn_cxx98_compat_temp_copy)
	    << OR << (int)Entity.getKind() << CurInitExpr->getType()
	    << CurInitExpr->getSourceRange();

	  switch (OR) {
	  case OR_Success:
	    S.CheckConstructorAccess(Loc, cast<CXXConstructorDecl>(Best->Function),
	                             Best->FoundDecl, Entity, Diag);
	    // FIXME: Check default arguments as far as that's possible.
	    break;

	  case OR_No_Viable_Function:
	    S.Diag(Loc, Diag);
	    CandidateSet.NoteCandidates(S, OCD_AllCandidates, CurInitExpr);
	    break;

	  case OR_Ambiguous:
	    S.Diag(Loc, Diag);
	    CandidateSet.NoteCandidates(S, OCD_ViableCandidates, CurInitExpr);
	    break;

	  case OR_Deleted:
	    S.Diag(Loc, Diag);
	    S.NoteDeletedFunction(Best->Function);
	    break;
	  }
	}

	/// Emit a note pointing at the declaration behind \p Entity: the parameter
	/// declaration for parameter entities, or the method declaration for a
	/// related-result entity. Emits nothing in every other case, or when the
	/// parameter declaration has an invalid location.
	void InitializationSequence::PrintInitLocationNote(Sema &S,
	                                              const InitializedEntity &Entity) {
	  if (Entity.isParameterKind() && Entity.getDecl()) {
	    const auto *Param = Entity.getDecl();
	    const SourceLocation ParamLoc = Param->getLocation();
	    if (ParamLoc.isInvalid())
	      return;

	    // Unnamed parameters get the generic form of the note.
	    if (!Param->getDeclName()) {
	      S.Diag(ParamLoc, diag::note_parameter_here);
	      return;
	    }

	    S.Diag(ParamLoc, diag::note_parameter_named_here) << Param->getDeclName();
	    return;
	  }

	  const bool IsRelatedResult =
	      Entity.getKind() == InitializedEntity::EK_RelatedResult;
	  if (IsRelatedResult && Entity.getMethodDecl()) {
	    S.Diag(Entity.getMethodDecl()->getLocation(),
	           diag::note_method_return_type_change)
	        << Entity.getMethodDecl()->getDeclName();
	  }
	}

	/// Returns true if the parameters describe a constructor initialization of
	/// an explicit temporary object, e.g. "Point(x, y)".
	///
	/// Only temporary, compound-literal and related-result entities qualify.
	/// Direct-list initialization always counts; direct and value
	/// initialization count unless there is exactly one argument.
	static bool isExplicitTemporary(const InitializedEntity &Entity,
	                                const InitializationKind &Kind,
	                                unsigned NumArgs) {
	  const auto EK = Entity.getKind();
	  const bool IsTemporaryLikeEntity =
	      EK == InitializedEntity::EK_Temporary ||
	      EK == InitializedEntity::EK_CompoundLiteralInit ||
	      EK == InitializedEntity::EK_RelatedResult;
	  if (!IsTemporaryLikeEntity)
	    return false;

	  const auto IK = Kind.getKind();
	  if (IK == InitializationKind::IK_DirectList)
	    return true;

	  // FIXME: Hack to work around cast weirdness.
	  if (IK == InitializationKind::IK_Direct ||
	      IK == InitializationKind::IK_Value)
	    return NumArgs != 1;

	  return false;
	}

	/// Build the expression that calls the constructor selected by \p Step to
	/// initialize \p Entity.
	///
	/// \param S The Sema object used for type-checking and AST construction.
	/// \param Entity The entity being initialized.
	/// \param Kind The kind of initialization being performed.
	/// \param Args The arguments supplied to the initialization.
	/// \param Step The initialization step naming the constructor to call.
	/// \param ConstructorInitRequiresZeroInit Forwarded to the construct
	/// expression as its zero-initialization flag.
	/// \param IsListInitialization Whether this is a list-initialization.
	/// \param IsStdInitListInitialization Forwarded to the construct expression
	/// as its std::initializer_list flag.
	/// \param LBraceLoc Location of the opening brace for list-initialization.
	/// \param RBraceLoc Location of the closing brace for list-initialization.
	///
	/// \returns The constructed expression, bound as a temporary when
	/// shouldBindAsTemporary(Entity) holds, or an error expression on failure.
	static ExprResult
	PerformConstructorInitialization(Sema &S,
	                                 const InitializedEntity &Entity,
	                                 const InitializationKind &Kind,
	                                 MultiExprArg Args,
	                                 const InitializationSequence::Step& Step,
	                                 bool &ConstructorInitRequiresZeroInit,
	                                 bool IsListInitialization,
	                                 bool IsStdInitListInitialization,
	                                 SourceLocation LBraceLoc,
	                                 SourceLocation RBraceLoc) {
	  unsigned NumArgs = Args.size();
	  CXXConstructorDecl *Constructor
	    = cast<CXXConstructorDecl>(Step.Function.Function);
	  bool HadMultipleCandidates = Step.Function.HadMultipleCandidates;

	  // Build a call to the selected constructor.
	  SmallVector<Expr*, 8> ConstructorArgs;
	  SourceLocation Loc = (Kind.isCopyInit() && Kind.getEqualLoc().isValid())
	                         ? Kind.getEqualLoc()
	                         : Kind.getLocation();

	  if (Kind.getKind() == InitializationKind::IK_Default) {
	    // Force even a trivial, implicit default constructor to be
	    // semantically checked. We do this explicitly because we don't build
	    // the definition for completely trivial constructors.
	    assert(Constructor->getParent() && "No parent class for constructor.");
	    if (Constructor->isDefaulted() && Constructor->isDefaultConstructor() &&
	        Constructor->isTrivial() && !Constructor->isUsed(false))
	      S.DefineImplicitDefaultConstructor(Loc, Constructor);
	  }

	  ExprResult CurInit((Expr *)nullptr);

	  // C++ [over.match.copy]p1:
	  //   - When initializing a temporary to be bound to the first parameter
	  //     of a constructor that takes a reference to possibly cv-qualified
	  //     T as its first argument, called with a single argument in the
	  //     context of direct-initialization, explicit conversion functions
	  //     are also considered.
	  bool AllowExplicitConv =
	      Kind.AllowExplicit() && !Kind.isCopyInit() && Args.size() == 1 &&
	      hasCopyOrMoveCtorParam(S.Context,
	                             getConstructorInfo(Step.Function.FoundDecl));

	  // Determine the arguments required to actually perform the constructor
	  // call.
	  if (S.CompleteConstructorCall(Constructor, Args,
	                                Loc, ConstructorArgs,
	                                AllowExplicitConv,
	                                IsListInitialization))
	    return ExprError();


	  if (isExplicitTemporary(Entity, Kind, NumArgs)) {
	    // An explicitly-constructed temporary, e.g., X(1, 2).
	    if (S.DiagnoseUseOfDecl(Constructor, Loc))
	      return ExprError();

	    TypeSourceInfo *TSInfo = Entity.getTypeSourceInfo();
	    if (!TSInfo)
	      TSInfo = S.Context.getTrivialTypeSourceInfo(Entity.getType(), Loc);
	    SourceRange ParenOrBraceRange =
	      (Kind.getKind() == InitializationKind::IK_DirectList)
	      ? SourceRange(LBraceLoc, RBraceLoc)
	      : Kind.getParenRange();

	    // When the constructor was found through a using-declaration shadow,
	    // switch to the corresponding inheriting constructor.
	    if (auto *Shadow = dyn_cast<ConstructorUsingShadowDecl>(
	            Step.Function.FoundDecl.getDecl())) {
	      Constructor = S.findInheritingConstructor(Loc, Constructor, Shadow);
	      if (S.DiagnoseUseOfDecl(Constructor, Loc))
	        return ExprError();
	    }
	    S.MarkFunctionReferenced(Loc, Constructor);

	    CurInit = new (S.Context) CXXTemporaryObjectExpr(
	        S.Context, Constructor,
	        Entity.getType().getNonLValueExprType(S.Context), TSInfo,
	        ConstructorArgs, ParenOrBraceRange, HadMultipleCandidates,
	        IsListInitialization, IsStdInitListInitialization,
	        ConstructorInitRequiresZeroInit);
	  } else {
	    CXXConstructExpr::ConstructionKind ConstructKind =
	      CXXConstructExpr::CK_Complete;

	    if (Entity.getKind() == InitializedEntity::EK_Base) {
	      ConstructKind = Entity.getBaseSpecifier()->isVirtual() ?
	        CXXConstructExpr::CK_VirtualBase :
	        CXXConstructExpr::CK_NonVirtualBase;
	    } else if (Entity.getKind() == InitializedEntity::EK_Delegating) {
	      ConstructKind = CXXConstructExpr::CK_Delegating;
	    }

	    // Only get the parenthesis or brace range if it is a list initialization or
	    // direct construction.
	    SourceRange ParenOrBraceRange;
	    if (IsListInitialization)
	      ParenOrBraceRange = SourceRange(LBraceLoc, RBraceLoc);
	    else if (Kind.getKind() == InitializationKind::IK_Direct)
	      ParenOrBraceRange = Kind.getParenRange();

	    // If the entity allows NRVO, mark the construction as elidable
	    // unconditionally.
	    if (Entity.allowsNRVO())
	      CurInit = S.BuildCXXConstructExpr(Loc, Step.Type,
	                                        Step.Function.FoundDecl,
	                                        Constructor, /*Elidable=*/true,
	                                        ConstructorArgs,
	                                        HadMultipleCandidates,
	                                        IsListInitialization,
	                                        IsStdInitListInitialization,
	                                        ConstructorInitRequiresZeroInit,
	                                        ConstructKind,
	                                        ParenOrBraceRange);
	    else
	      CurInit = S.BuildCXXConstructExpr(Loc, Step.Type,
	                                        Step.Function.FoundDecl,
	                                        Constructor,
	                                        ConstructorArgs,
	                                        HadMultipleCandidates,
	                                        IsListInitialization,
	                                        IsStdInitListInitialization,
	                                        ConstructorInitRequiresZeroInit,
	                                        ConstructKind,
	                                        ParenOrBraceRange);
	  }
	  if (CurInit.isInvalid())
	    return ExprError();

	  // Only check access if all of that succeeded.
	  S.CheckConstructorAccess(Loc, Constructor, Step.Function.FoundDecl, Entity);
	  if (S.DiagnoseUseOfDecl(Step.Function.FoundDecl, Loc))
	    return ExprError();

	  if (shouldBindAsTemporary(Entity))
	    CurInit = S.MaybeBindToTemporary(CurInit.get());

	  return CurInit;
	}

	/// Determine whether the specified InitializedEntity definitely has a lifetime
	/// longer than the current full-expression. Conservatively returns false if
	/// it's unclear.
	static bool
	InitializedEntityOutlivesFullExpression(const InitializedEntity &Entity) {
	const InitializedEntity *Top = &Entity;
	while (Top->getParent())
	Top = Top->getParent();

	switch (Top->getKind()) {
	case InitializedEntity::EK_Variable:
	case InitializedEntity::EK_Result:
	case InitializedEntity::EK_Exception:
	case InitializedEntity::EK_Member:
	case InitializedEntity::EK_Binding:
	case InitializedEntity::EK_New:
	case InitializedEntity::EK_Base:
	case InitializedEntity::EK_Delegating:
	return true;

	case InitializedEntity::EK_ArrayElement:
	case InitializedEntity::EK_VectorElement:
	case InitializedEntity::EK_BlockElement:
	case InitializedEntity::EK_LambdaToBlockConversionBlockElement:
	case InitializedEntity::EK_ComplexElement:
	// Could not determine what the full initialization is. Assume it might not
	// outlive the full-expression.
	return false;

	case InitializedEntity::EK_Parameter:
	case InitializedEntity::EK_Parameter_CF_Audited:
	case InitializedEntity::EK_Temporary:
	case InitializedEntity::EK_LambdaCapture:
	case InitializedEntity::EK_CompoundLiteralInit:
	case InitializedEntity::EK_RelatedResult:
	// The entity being initialized might not outlive the full-expression.
	return false;
	}

	llvm_unreachable("unknown entity kind");
	}

	/// Determine the declaration which an initialized entity ultimately refers to,
	/// for the purpose of lifetime-extending a temporary bound to a reference in
	/// the initialization of \p Entity.
	///
	/// \param Entity The entity to inspect; member, array-element and base
	/// entities recurse into their parent.
	///
	/// \param FallbackDecl The entity returned when the walk reaches a
	/// delegating entity or a base entity that has no parent.
	///
	/// \returns The entity that extends the temporary's lifetime, or null when
	/// the switch below selects no extending entity for this kind.
	static const InitializedEntity *getEntityForTemporaryLifetimeExtension(
	const InitializedEntity *Entity,
	const InitializedEntity *FallbackDecl = nullptr) {
	// C++11 [class.temporary]p5:
	switch (Entity->getKind()) {
	case InitializedEntity::EK_Variable:
	// The temporary [...] persists for the lifetime of the reference
	return Entity;

	case InitializedEntity::EK_Member:
	// For subobjects, we look at the complete object.
	// The member itself is passed along as the fallback for the parent walk.
	if (Entity->getParent())
	return getEntityForTemporaryLifetimeExtension(Entity->getParent(),
	Entity);

	// except:
	// -- A temporary bound to a reference member in a constructor's
	// ctor-initializer persists until the constructor exits.
	return Entity;

	case InitializedEntity::EK_Binding:
	// Per [dcl.decomp]p3, the binding is treated as a variable of reference
	// type.
	return Entity;

	case InitializedEntity::EK_Parameter:
	case InitializedEntity::EK_Parameter_CF_Audited:
	// -- A temporary bound to a reference parameter in a function call
	// persists until the completion of the full-expression containing
	// the call.
	case InitializedEntity::EK_Result:
	// -- The lifetime of a temporary bound to the returned value in a
	// function return statement is not extended; the temporary is
	// destroyed at the end of the full-expression in the return statement.
	case InitializedEntity::EK_New:
	// -- A temporary bound to a reference in a new-initializer persists
	// until the completion of the full-expression containing the
	// new-initializer.
	return nullptr;

	case InitializedEntity::EK_Temporary:
	case InitializedEntity::EK_CompoundLiteralInit:
	case InitializedEntity::EK_RelatedResult:
	// We don't yet know the storage duration of the surrounding temporary.
	// Assume it's got full-expression duration for now, it will patch up our
	// storage duration if that's not correct.
	return nullptr;

	case InitializedEntity::EK_ArrayElement:
	// For subobjects, we look at the complete object.
	// Unlike members and bases, the caller's fallback is passed on unchanged.
	return getEntityForTemporaryLifetimeExtension(Entity->getParent(),
	FallbackDecl);

	case InitializedEntity::EK_Base:
	// For subobjects, we look at the complete object.
	if (Entity->getParent())
	return getEntityForTemporaryLifetimeExtension(Entity->getParent(),
	Entity);
	LLVM_FALLTHROUGH;
	case InitializedEntity::EK_Delegating:
	// We can reach this case for aggregate initialization in a constructor:
	// struct A { int &&r; };
	// struct B : A { B() : A{0} {} };
	// In this case, use the innermost field decl as the context.
	return FallbackDecl;

	case InitializedEntity::EK_BlockElement:
	case InitializedEntity::EK_LambdaToBlockConversionBlockElement:
	case InitializedEntity::EK_LambdaCapture:
	case InitializedEntity::EK_Exception:
	case InitializedEntity::EK_VectorElement:
	case InitializedEntity::EK_ComplexElement:
	return nullptr;
	}
	llvm_unreachable("unknown entity kind");
	}

	// Forward declaration: performReferenceExtension and performLifetimeExtension
	// call each other, so one of them has to be declared before its definition.
	static void performLifetimeExtension(Expr *Init,
	const InitializedEntity *ExtendingEntity);

	/// Update a glvalue expression that is used as the initializer of a reference
	/// to note that its lifetime is extended.
	///
	/// \param Init The initializer expression to walk through.
	///
	/// \param ExtendingEntity The entity whose declaration and mangling number
	/// are recorded on the materialized temporary, if one is found.
	///
	/// \return \c true if any temporary had its lifetime extended.
	static bool
	performReferenceExtension(Expr *Init,
	const InitializedEntity *ExtendingEntity) {
	// Walk past any constructs which we can lifetime-extend across.
	// The loop repeats until a full pass leaves Init unchanged.
	Expr *Old;
	do {
	Old = Init;

	if (InitListExpr *ILE = dyn_cast<InitListExpr>(Init)) {
	if (ILE->getNumInits() == 1 && ILE->isGLValue()) {
	// This is just redundant braces around an initializer. Step over it.
	Init = ILE->getInit(0);
	}
	}

	// Step over any subobject adjustments; we may have a materialized
	// temporary inside them.
	Init = const_cast<Expr *>(Init->skipRValueSubobjectAdjustments());

	// Per current approach for DR1376, look through casts to reference type
	// when performing lifetime extension.
	if (CastExpr *CE = dyn_cast<CastExpr>(Init))
	if (CE->getSubExpr()->isGLValue())
	Init = CE->getSubExpr();

	// Per the current approach for DR1299, look through array element access
	// when performing lifetime extension.
	if (auto *ASE = dyn_cast<ArraySubscriptExpr>(Init))
	Init = ASE->getBase();
	} while (Init != Old);

	if (MaterializeTemporaryExpr *ME = dyn_cast<MaterializeTemporaryExpr>(Init)) {
	// Update the storage duration of the materialized temporary.
	// FIXME: Rebuild the expression instead of mutating it.
	ME->setExtendingDecl(ExtendingEntity->getDecl(),
	ExtendingEntity->allocateManglingNumber());
	// The temporary's own initializer may contain further temporaries to extend.
	performLifetimeExtension(ME->GetTemporaryExpr(), ExtendingEntity);
	return true;
	}

	return false;
	}

	/// Update a prvalue expression that is going to be materialized as a
	/// lifetime-extended temporary.
	///
	/// \param Init The prvalue initializer to inspect. Subobject adjustments and
	/// a wrapping CXXBindTemporaryExpr are skipped first.
	///
	/// \param ExtendingEntity The entity passed on to every nested reference or
	/// lifetime extension performed for sub-initializers.
	static void performLifetimeExtension(Expr *Init,
	                                     const InitializedEntity *ExtendingEntity) {
	  // Dig out the expression which constructs the extended temporary.
	  Init = const_cast<Expr *>(Init->skipRValueSubobjectAdjustments());

	  if (CXXBindTemporaryExpr *BTE = dyn_cast<CXXBindTemporaryExpr>(Init))
	    Init = BTE->getSubExpr();

	  if (CXXStdInitializerListExpr *ILE =
	          dyn_cast<CXXStdInitializerListExpr>(Init)) {
	    performReferenceExtension(ILE->getSubExpr(), ExtendingEntity);
	    return;
	  }

	  if (InitListExpr *ILE = dyn_cast<InitListExpr>(Init)) {
	    if (ILE->getType()->isArrayType()) {
	      for (unsigned I = 0, N = ILE->getNumInits(); I != N; ++I)
	        performLifetimeExtension(ILE->getInit(I), ExtendingEntity);
	      return;
	    }

	    if (CXXRecordDecl *RD = ILE->getType()->getAsCXXRecordDecl()) {
	      assert(RD->isAggregate() && "aggregate init on non-aggregate");

	      // If we lifetime-extend a braced initializer which is initializing an
	      // aggregate, and that aggregate contains reference members which are
	      // bound to temporaries, those temporaries are also lifetime-extended.
	      if (RD->isUnion() && ILE->getInitializedFieldInUnion() &&
	          ILE->getInitializedFieldInUnion()->getType()->isReferenceType())
	        performReferenceExtension(ILE->getInit(0), ExtendingEntity);
	      else {
	        // Index counts initializers; unnamed bit-fields are skipped without
	        // advancing it.
	        unsigned Index = 0;
	        for (const auto *I : RD->fields()) {
	          if (Index >= ILE->getNumInits())
	            break;
	          if (I->isUnnamedBitfield())
	            continue;
	          Expr *SubInit = ILE->getInit(Index);
	          if (I->getType()->isReferenceType())
	            performReferenceExtension(SubInit, ExtendingEntity);
	          else if (isa<InitListExpr>(SubInit) ||
	                   isa<CXXStdInitializerListExpr>(SubInit))
	            // This may be either aggregate-initialization of a member or
	            // initialization of a std::initializer_list object. Either way,
	            // we should recursively lifetime-extend that initializer.
	            performLifetimeExtension(SubInit, ExtendingEntity);
	          ++Index;
	        }
	      }
	    }
	  }
	}

	/// Warn when a temporary is lifetime-extended by a field.
	///
	/// \param S The Sema object used to emit diagnostics.
	/// \param Entity The entity being initialized; its parent chain decides
	/// whether the reference is reported as a subobject of a member.
	/// \param Init The initializer expression the warning is attached to.
	/// \param IsInitializerList Whether the extended temporary is a
	/// std::initializer_list; this selects the dangling-initializer-list
	/// warning instead of the reference-member warning.
	/// \param ExtendingDecl The declaration extending the temporary. Nothing is
	/// diagnosed unless it is a FieldDecl.
	static void warnOnLifetimeExtension(Sema &S, const InitializedEntity &Entity,
	                                    const Expr *Init, bool IsInitializerList,
	                                    const ValueDecl *ExtendingDecl) {
	  // Warn if a field lifetime-extends a temporary.
	  if (isa<FieldDecl>(ExtendingDecl)) {
	    if (IsInitializerList) {
	      S.Diag(Init->getExprLoc(), diag::warn_dangling_std_initializer_list)
	        << /*at end of constructor*/true;
	      return;
	    }

	    // Any ancestor entity that is not a base class makes this a subobject
	    // of a member.
	    bool IsSubobjectMember = false;
	    for (const InitializedEntity *Ent = Entity.getParent(); Ent;
	         Ent = Ent->getParent()) {
	      if (Ent->getKind() != InitializedEntity::EK_Base) {
	        IsSubobjectMember = true;
	        break;
	      }
	    }
	    S.Diag(Init->getExprLoc(),
	           diag::warn_bind_ref_member_to_temporary)
	      << ExtendingDecl << Init->getSourceRange()
	      << IsSubobjectMember << IsInitializerList;
	    if (IsSubobjectMember)
	      S.Diag(ExtendingDecl->getLocation(),
	             diag::note_ref_subobject_of_member_declared_here);
	    else
	      S.Diag(ExtendingDecl->getLocation(),
	             diag::note_ref_or_ptr_member_declared_here)
	        << /*is pointer*/false;
	  }
	}

	// Forward declaration so that code appearing before the definition can call
	// this helper.
	static void DiagnoseNarrowingInInitList(Sema &S,
	const ImplicitConversionSequence &ICS,
	QualType PreNarrowingType,
	QualType EntityType,
	const Expr *PostInit);

	/// Provide warnings when std::move is used on construction.
	///
	/// \param S The Sema object used to emit diagnostics.
	///
	/// \param InitExpr The initializer expression to inspect; may be null, in
	/// which case nothing is checked.
	///
	/// \param IsReturnStmt Whether the initialization is that of a returned
	/// value. If so, \p InitExpr must be a single-argument copy/move
	/// construction and the moved operand must be a local variable of the
	/// destination type.
	static void CheckMoveOnConstruction(Sema &S, const Expr *InitExpr,
	                                    bool IsReturnStmt) {
	  if (!InitExpr)
	    return;

	  if (S.inTemplateInstantiation())
	    return;

	  QualType DestType = InitExpr->getType();
	  if (!DestType->isRecordType())
	    return;

	  unsigned DiagID = 0;
	  if (IsReturnStmt) {
	    const CXXConstructExpr *CCE =
	        dyn_cast<CXXConstructExpr>(InitExpr->IgnoreParens());
	    if (!CCE || CCE->getNumArgs() != 1)
	      return;

	    if (!CCE->getConstructor()->isCopyOrMoveConstructor())
	      return;

	    InitExpr = CCE->getArg(0)->IgnoreImpCasts();
	  }

	  // Find the std::move call and get the argument.
	  const CallExpr *CE = dyn_cast<CallExpr>(InitExpr->IgnoreParens());
	  if (!CE || CE->getNumArgs() != 1)
	    return;

	  const FunctionDecl *MoveFunction = CE->getDirectCallee();
	  if (!MoveFunction || !MoveFunction->isInStdNamespace() ||
	      !MoveFunction->getIdentifier() ||
	      !MoveFunction->getIdentifier()->isStr("move"))
	    return;

	  const Expr *Arg = CE->getArg(0)->IgnoreImplicit();

	  if (IsReturnStmt) {
	    const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(Arg->IgnoreParenImpCasts());
	    if (!DRE || DRE->refersToEnclosingVariableOrCapture())
	      return;

	    const VarDecl *VD = dyn_cast<VarDecl>(DRE->getDecl());
	    if (!VD || !VD->hasLocalStorage())
	      return;

	    // __block variables are not moved implicitly.
	    if (VD->hasAttr<BlocksAttr>())
	      return;

	    QualType SourceType = VD->getType();
	    if (!SourceType->isRecordType())
	      return;

	    if (!S.Context.hasSameUnqualifiedType(DestType, SourceType)) {
	      return;
	    }

	    // If we're returning a function parameter, copy elision
	    // is not possible.
	    if (isa<ParmVarDecl>(VD))
	      DiagID = diag::warn_redundant_move_on_return;
	    else
	      DiagID = diag::warn_pessimizing_move_on_return;
	  } else {
	    DiagID = diag::warn_pessimizing_move_on_initialization;
	    const Expr *ArgStripped = Arg->IgnoreImplicit()->IgnoreParens();
	    if (!ArgStripped->isRValue() || !ArgStripped->getType()->isRecordType())
	      return;
	  }

	  S.Diag(CE->getLocStart(), DiagID);

	  // Get all the locations for a fix-it. Don't emit the fix-it if any location
	  // is within a macro.
	  SourceLocation CallBegin = CE->getCallee()->getLocStart();
	  if (CallBegin.isMacroID())
	    return;
	  SourceLocation RParen = CE->getRParenLoc();
	  if (RParen.isMacroID())
	    return;
	  SourceLocation ArgLoc = Arg->getLocStart();

	  // Special testing for the argument location. Since the fix-it needs the
	  // location right before the argument, the argument location can be in a
	  // macro only if it is at the beginning of the macro.
	  while (ArgLoc.isMacroID() &&
	         S.getSourceManager().isAtStartOfImmediateMacroExpansion(ArgLoc)) {
	    ArgLoc = S.getSourceManager().getImmediateExpansionRange(ArgLoc).first;
	  }

	  // The argument is still inside a macro after stepping out of every
	  // expansion it begins, so no fix-it is offered. This guard previously
	  // tested the not-yet-assigned LParen location and could never fire.
	  if (ArgLoc.isMacroID())
	    return;

	  SourceLocation LParen = ArgLoc.getLocWithOffset(-1);

	  S.Diag(CE->getLocStart(), diag::note_remove_move)
	      << FixItHint::CreateRemoval(SourceRange(CallBegin, LParen))
	      << FixItHint::CreateRemoval(SourceRange(RParen, RParen));
	}

	/// Warn when \p E, ignoring parentheses and casts, is a dereference of a
	/// null pointer constant.
	static void CheckForNullPointerDereference(Sema &S, const Expr *E) {
	  // Check to see if we are dereferencing a null pointer. If so, this is
	  // undefined behavior, so warn about it. This only handles the pattern
	  // "*null", which is a very syntactic check.
	  const auto *Deref = dyn_cast<UnaryOperator>(E->IgnoreParenCasts());
	  if (!Deref)
	    return;
	  if (Deref->getOpcode() != UO_Deref)
	    return;

	  const Expr *Operand = Deref->getSubExpr()->IgnoreParenCasts();
	  if (!Operand->isNullPointerConstant(S.Context,
	                                      Expr::NPC_ValueDependentIsNotNull))
	    return;

	  S.DiagRuntimeBehavior(Deref->getOperatorLoc(), Deref,
	                        S.PDiag(diag::warn_binding_null_to_reference)
	                            << Deref->getSubExpr()->getSourceRange());
	}

	/// Allocate a MaterializeTemporaryExpr of type \p T wrapping \p Temporary in
	/// the AST context and update the cleanup state of the current expression.
	///
	/// \param BoundToLvalueReference Passed through to the new expression.
	MaterializeTemporaryExpr *
	Sema::CreateMaterializeTemporaryExpr(QualType T, Expr *Temporary,
	                                     bool BoundToLvalueReference) {
	  MaterializeTemporaryExpr *Materialized = new (Context)
	      MaterializeTemporaryExpr(T, Temporary, BoundToLvalueReference);

	  // Order an ExprWithCleanups for lifetime marks.
	  //
	  // TODO: It'll be good to have a single place to check the access of the
	  // destructor and generate ExprWithCleanups for various uses. Currently these
	  // are done in both CreateMaterializeTemporaryExpr and MaybeBindToTemporary,
	  // but there may be a chance to merge them.
	  Cleanup.setExprNeedsCleanups(false);

	  return Materialized;
	}

	/// Wrap the rvalue \p E in a MaterializeTemporaryExpr when C++11 is enabled.
	///
	/// \returns \p E unchanged when it is not an rvalue or when C++11 is not
	/// enabled; ExprError() when RequireCompleteType reports a failure for the
	/// expression's type; otherwise the newly created materialization.
	ExprResult Sema::TemporaryMaterializationConversion(Expr *E) {
	  // Anything that is not an rvalue is returned untouched.
	  if (!E->isRValue())
	    return E;

	  // In C++98, we don't want to implicitly create an xvalue.
	  // FIXME: This means that AST consumers need to deal with "prvalues" that
	  // denote materialized temporaries. Maybe we should add another ValueKind
	  // for "xvalue pretending to be a prvalue" for C++98 support.
	  if (!getLangOpts().CPlusPlus11)
	    return E;

	  // C++1z [conv.rval]/1: T shall be a complete type.
	  // FIXME: Does this ever matter (can we form a prvalue of incomplete type)?
	  // If so, we should check for a non-abstract class type here too.
	  QualType ExprTy = E->getType();
	  const bool CompletionFailed =
	      RequireCompleteType(E->getExprLoc(), ExprTy, diag::err_incomplete_type);
	  if (CompletionFailed)
	    return ExprError();

	  return CreateMaterializeTemporaryExpr(E->getType(), E, false);
	}

	/// Carry out this initialization sequence and build the resulting
	/// initializer expression.
	///
	/// A sequence that is in the failed state is diagnosed and yields an error.
	/// A dependent sequence returns its argument(s) without applying any steps
	/// (rebuilt as a ParenListExpr for non-cast direct initialization). Otherwise
	/// every recorded step is applied, in order, to the "current" initializer.
	///
	/// \param S          Sema instance used for diagnostics and AST construction.
	/// \param Entity     The entity being initialized.
	/// \param Kind       The kind of initialization being performed.
	/// \param Args       The initializer argument expressions.
	/// \param ResultType If non-null, receives the type of the initialized
	///                   entity; several steps update it, e.g. to complete an
	///                   incomplete array type.
	/// \returns the final initializer expression, a null ExprResult when the
	///          sequence has no steps, or ExprError() on failure.
	ExprResult
	InitializationSequence::Perform(Sema &S,
	const InitializedEntity &Entity,
	const InitializationKind &Kind,
	MultiExprArg Args,
	QualType *ResultType) {
	// A failed sequence can only be diagnosed; no expression is built for it.
	if (Failed()) {
	Diagnose(S, Entity, Kind, Args);
	return ExprError();
	}
	if (!ZeroInitializationFixit.empty()) {
	unsigned DiagID = diag::err_default_init_const;
	// Under MSVC compatibility, a declaration carrying SelectAnyAttr gets the
	// extension diagnostic instead of the error.
	if (Decl *D = Entity.getDecl())
	if (S.getLangOpts().MSVCCompat && D->hasAttr<SelectAnyAttr>())
	DiagID = diag::ext_default_init_const;

	// The initialization would have succeeded with this fixit. Since the fixit
	// is on the error, we need to build a valid AST in this case, so this isn't
	// handled in the Failed() branch above.
	QualType DestType = Entity.getType();
	S.Diag(Kind.getLocation(), DiagID)
	<< DestType << (bool)DestType->getAs<RecordType>()
	<< FixItHint::CreateInsertion(ZeroInitializationFixitLoc,
	ZeroInitializationFixit);
	}

	if (getKind() == DependentSequence) {
	// If the declaration is a non-dependent, incomplete array type
	// that has an initializer, then its type will be completed once
	// the initializer is instantiated.
	if (ResultType && !Entity.getType()->isDependentType() &&
	Args.size() == 1) {
	QualType DeclType = Entity.getType();
	if (const IncompleteArrayType *ArrayT
	= S.Context.getAsIncompleteArrayType(DeclType)) {
	// FIXME: We don't currently have the ability to accurately
	// compute the length of an initializer list without
	// performing full type-checking of the initializer list
	// (since we have to determine where braces are implicitly
	// introduced and such). So, we fall back to making the array
	// type a dependently-sized array type with no specified
	// bound.
	if (isa<InitListExpr>((Expr *)Args[0])) {
	SourceRange Brackets;

	// Scavenge the location of the brackets from the entity, if we can.
	if (auto *DD = dyn_cast_or_null<DeclaratorDecl>(Entity.getDecl())) {
	if (TypeSourceInfo *TInfo = DD->getTypeSourceInfo()) {
	TypeLoc TL = TInfo->getTypeLoc();
	if (IncompleteArrayTypeLoc ArrayLoc =
	TL.getAs<IncompleteArrayTypeLoc>())
	Brackets = ArrayLoc.getBracketsRange();
	}
	}

	*ResultType
	= S.Context.getDependentSizedArrayType(ArrayT->getElementType(),
	/NumElts=/nullptr,
	ArrayT->getSizeModifier(),
	ArrayT->getIndexTypeCVRQualifiers(),
	Brackets);
	}

	}
	}
	if (Kind.getKind() == InitializationKind::IK_Direct &&
	!Kind.isExplicitCast()) {
	// Rebuild the ParenListExpr.
	SourceRange ParenRange = Kind.getParenRange();
	return S.ActOnParenListExpr(ParenRange.getBegin(), ParenRange.getEnd(),
	Args);
	}
	// Every remaining dependent case hands back the first argument as-is.
	assert(Kind.getKind() == InitializationKind::IK_Copy \|\|
	Kind.isExplicitCast() \|\|
	Kind.getKind() == InitializationKind::IK_DirectList);
	return ExprResult(Args[0]);
	}

	// No steps means no initialization.
	if (Steps.empty())
	return ExprResult((Expr *)nullptr);

	if (S.getLangOpts().CPlusPlus11 && Entity.getType()->isReferenceType() &&
	Args.size() == 1 && isa<InitListExpr>(Args[0]) &&
	!Entity.isParameterKind()) {
	// Produce a C++98 compatibility warning if we are initializing a reference
	// from an initializer list. For parameters, we produce a better warning
	// elsewhere.
	Expr *Init = Args[0];
	S.Diag(Init->getLocStart(), diag::warn_cxx98_compat_reference_list_init)
	<< Init->getSourceRange();
	}

	// OpenCL v2.0 s6.13.11.1. atomic variables can be initialized in global scope
	QualType ETy = Entity.getType();
	Qualifiers TyQualifiers = ETy.getQualifiers();
	bool HasGlobalAS = TyQualifiers.hasAddressSpace() &&
	TyQualifiers.getAddressSpace() == LangAS::opencl_global;

	// Reject initializing an atomic variable that is not in the OpenCL global
	// address space.
	if (S.getLangOpts().OpenCLVersion >= 200 &&
	ETy->isAtomicType() && !HasGlobalAS &&
	Entity.getKind() == InitializedEntity::EK_Variable && Args.size() > 0) {
	S.Diag(Args[0]->getLocStart(), diag::err_opencl_atomic_init) << 1 <<
	SourceRange(Entity.getDecl()->getLocStart(), Args[0]->getLocEnd());
	return ExprError();
	}

	// Diagnose cases where we initialize a pointer to an array temporary, and the
	// pointer obviously outlives the temporary.
	if (Args.size() == 1 && Args[0]->getType()->isArrayType() &&
	Entity.getType()->isPointerType() &&
	InitializedEntityOutlivesFullExpression(Entity)) {
	const Expr *Init = Args[0]->skipRValueSubobjectAdjustments();
	if (auto *MTE = dyn_cast<MaterializeTemporaryExpr>(Init))
	Init = MTE->GetTemporaryExpr();
	// Note: this local 'Kind' shadows the InitializationKind parameter within
	// this block.
	Expr::LValueClassification Kind = Init->ClassifyLValue(S.Context);
	if (Kind == Expr::LV_ClassTemporary \|\| Kind == Expr::LV_ArrayTemporary)
	S.Diag(Init->getLocStart(), diag::warn_temporary_array_to_pointer_decay)
	<< Init->getSourceRange();
	}

	QualType DestType = Entity.getType().getNonReferenceType();
	// FIXME: Ugly hack around the fact that Entity.getType() is not
	// the same as Entity.getDecl()->getType() in cases involving type merging,
	// and we want latter when it makes sense.
	if (ResultType)
	*ResultType = Entity.getDecl() ? Entity.getDecl()->getType() :
	Entity.getType();

	// The expression built so far; each step below reads and replaces it.
	ExprResult CurInit((Expr *)nullptr);
	// Stack of base expressions pushed by SK_ArrayLoopIndex and popped by the
	// matching SK_ArrayLoopInit.
	SmallVector<Expr*, 4> ArrayLoopCommonExprs;

	// For initialization steps that start with a single initializer,
	// grab the only argument out of Args and place it into the "current"
	// initializer.
	switch (Steps.front().Kind) {
	case SK_ResolveAddressOfOverloadedFunction:
	case SK_CastDerivedToBaseRValue:
	case SK_CastDerivedToBaseXValue:
	case SK_CastDerivedToBaseLValue:
	case SK_BindReference:
	case SK_BindReferenceToTemporary:
	case SK_FinalCopy:
	case SK_ExtraneousCopyToTemporary:
	case SK_UserConversion:
	case SK_QualificationConversionLValue:
	case SK_QualificationConversionXValue:
	case SK_QualificationConversionRValue:
	case SK_AtomicConversion:
	case SK_LValueToRValue:
	case SK_ConversionSequence:
	case SK_ConversionSequenceNoNarrowing:
	case SK_ListInitialization:
	case SK_UnwrapInitList:
	case SK_RewrapInitList:
	case SK_CAssignment:
	case SK_StringInit:
	case SK_ObjCObjectConversion:
	case SK_ArrayLoopIndex:
	case SK_ArrayLoopInit:
	case SK_ArrayInit:
	case SK_GNUArrayInit:
	case SK_ParenthesizedArrayInit:
	case SK_PassByIndirectCopyRestore:
	case SK_PassByIndirectRestore:
	case SK_ProduceObjCObject:
	case SK_StdInitializerList:
	case SK_OCLSamplerInit:
	case SK_OCLZeroEvent:
	case SK_OCLZeroQueue: {
	assert(Args.size() == 1);
	CurInit = Args[0];
	if (!CurInit.get()) return ExprError();
	break;
	}

	// These kinds do not seed CurInit from a single argument; it stays null
	// until the step itself runs.
	case SK_ConstructorInitialization:
	case SK_ConstructorInitializationFromList:
	case SK_StdInitializerListConstructorCall:
	case SK_ZeroInitialization:
	break;
	}

	// Promote from an unevaluated context to an unevaluated list context in
	// C++11 list-initialization; we need to instantiate entities usable in
	// constant expressions here in order to perform narrowing checks =(
	EnterExpressionEvaluationContext Evaluated(
	S, EnterExpressionEvaluationContext::InitList,
	CurInit.get() && isa<InitListExpr>(CurInit.get()));

	// C++ [class.abstract]p2:
	// no objects of an abstract class can be created except as subobjects
	// of a class derived from it
	// The check is skipped for base-class and delegating initialization.
	auto checkAbstractType = [&](QualType T) -> bool {
	if (Entity.getKind() == InitializedEntity::EK_Base \|\|
	Entity.getKind() == InitializedEntity::EK_Delegating)
	return false;
	return S.RequireNonAbstractType(Kind.getLocation(), T,
	diag::err_allocation_of_abstract_type);
	};

	// Walk through the computed steps for the initialization sequence,
	// performing the specified conversions along the way.
	// Set by an SK_ZeroInitialization step that directly precedes a constructor
	// step, and passed on to PerformConstructorInitialization.
	bool ConstructorInitRequiresZeroInit = false;
	for (step_iterator Step = step_begin(), StepEnd = step_end();
	Step != StepEnd; ++Step) {
	// Stop as soon as a previous step left an invalid expression behind.
	if (CurInit.isInvalid())
	return ExprError();

	QualType SourceType = CurInit.get() ? CurInit.get()->getType() : QualType();

	switch (Step->Kind) {
	case SK_ResolveAddressOfOverloadedFunction:
	// Overload resolution determined which function to invoke; update the
	// initializer to reflect that choice.
	S.CheckAddressOfMemberAccess(CurInit.get(), Step->Function.FoundDecl);
	if (S.DiagnoseUseOfDecl(Step->Function.FoundDecl, Kind.getLocation()))
	return ExprError();
	CurInit = S.FixOverloadedFunctionReference(CurInit,
	Step->Function.FoundDecl,
	Step->Function.Function);
	break;

	case SK_CastDerivedToBaseRValue:
	case SK_CastDerivedToBaseXValue:
	case SK_CastDerivedToBaseLValue: {
	// We have a derived-to-base cast that produces either an rvalue or an
	// lvalue. Perform that cast.

	CXXCastPath BasePath;

	// Casts to inaccessible base classes are allowed with C-style casts.
	bool IgnoreBaseAccess = Kind.isCStyleOrFunctionalCast();
	if (S.CheckDerivedToBaseConversion(SourceType, Step->Type,
	CurInit.get()->getLocStart(),
	CurInit.get()->getSourceRange(),
	&BasePath, IgnoreBaseAccess))
	return ExprError();

	// The step kind selects the value category of the cast result.
	ExprValueKind VK =
	Step->Kind == SK_CastDerivedToBaseLValue ?
	VK_LValue :
	(Step->Kind == SK_CastDerivedToBaseXValue ?
	VK_XValue :
	VK_RValue);
	CurInit =
	ImplicitCastExpr::Create(S.Context, Step->Type, CK_DerivedToBase,
	CurInit.get(), &BasePath, VK);
	break;
	}

	case SK_BindReference:
	// Reference binding does not have any corresponding ASTs.

	// Check exception specifications
	if (S.CheckExceptionSpecCompatibility(CurInit.get(), DestType))
	return ExprError();

	// We don't check for e.g. function pointers here, since address
	// availability checks should only occur when the function first decays
	// into a pointer or reference.
	if (CurInit.get()->getType()->isFunctionProtoType()) {
	if (auto *DRE = dyn_cast<DeclRefExpr>(CurInit.get()->IgnoreParens())) {
	if (auto *FD = dyn_cast<FunctionDecl>(DRE->getDecl())) {
	if (!S.checkAddressOfFunctionIsAvailable(FD, /Complain=/true,
	DRE->getLocStart()))
	return ExprError();
	}
	}
	}

	// Even though we didn't materialize a temporary, the binding may still
	// extend the lifetime of a temporary. This happens if we bind a reference
	// to the result of a cast to reference type.
	if (const InitializedEntity *ExtendingEntity =
	getEntityForTemporaryLifetimeExtension(&Entity))
	if (performReferenceExtension(CurInit.get(), ExtendingEntity))
	warnOnLifetimeExtension(S, Entity, CurInit.get(),
	/IsInitializerList=/false,
	ExtendingEntity->getDecl());

	CheckForNullPointerDereference(S, CurInit.get());
	break;

	case SK_BindReferenceToTemporary: {
	// Make sure the "temporary" is actually an rvalue.
	assert(CurInit.get()->isRValue() && "not a temporary");

	// Check exception specifications
	if (S.CheckExceptionSpecCompatibility(CurInit.get(), DestType))
	return ExprError();

	// Materialize the temporary into memory.
	MaterializeTemporaryExpr *MTE = S.CreateMaterializeTemporaryExpr(
	Step->Type, CurInit.get(), Entity.getType()->isLValueReferenceType());

	// Maybe lifetime-extend the temporary's subobjects to match the
	// entity's lifetime.
	if (const InitializedEntity *ExtendingEntity =
	getEntityForTemporaryLifetimeExtension(&Entity))
	if (performReferenceExtension(MTE, ExtendingEntity))
	warnOnLifetimeExtension(S, Entity, CurInit.get(),
	/IsInitializerList=/false,
	ExtendingEntity->getDecl());

	// If we're extending this temporary to automatic storage duration -- we
	// need to register its cleanup during the full-expression's cleanups.
	if (MTE->getStorageDuration() == SD_Automatic &&
	MTE->getType().isDestructedType())
	S.Cleanup.setExprNeedsCleanups(true);

	CurInit = MTE;
	break;
	}

	case SK_FinalCopy:
	if (checkAbstractType(Step->Type))
	return ExprError();

	// If the overall initialization is initializing a temporary, we already
	// bound our argument if it was necessary to do so. If not (if we're
	// ultimately initializing a non-temporary), our argument needs to be
	// bound since it's initializing a function parameter.
	// FIXME: This is a mess. Rationalize temporary destruction.
	if (!shouldBindAsTemporary(Entity))
	CurInit = S.MaybeBindToTemporary(CurInit.get());
	CurInit = CopyObject(S, Step->Type, Entity, CurInit,
	/IsExtraneousCopy=/false);
	break;

	case SK_ExtraneousCopyToTemporary:
	CurInit = CopyObject(S, Step->Type, Entity, CurInit,
	/IsExtraneousCopy=/true);
	break;

	case SK_UserConversion: {
	// We have a user-defined conversion that invokes either a constructor
	// or a conversion function.
	CastKind CastKind;
	FunctionDecl *Fn = Step->Function.Function;
	DeclAccessPair FoundFn = Step->Function.FoundDecl;
	bool HadMultipleCandidates = Step->Function.HadMultipleCandidates;
	// Whether the conversion produced an object that may later need the
	// abstract-type and destructor checks below.
	bool CreatedObject = false;
	if (CXXConstructorDecl *Constructor = dyn_cast<CXXConstructorDecl>(Fn)) {
	// Build a call to the selected constructor.
	SmallVector<Expr*, 8> ConstructorArgs;
	SourceLocation Loc = CurInit.get()->getLocStart();

	// Determine the arguments required to actually perform the constructor
	// call.
	Expr *Arg = CurInit.get();
	if (S.CompleteConstructorCall(Constructor,
	MultiExprArg(&Arg, 1),
	Loc, ConstructorArgs))
	return ExprError();

	// Build an expression that constructs a temporary.
	CurInit = S.BuildCXXConstructExpr(Loc, Step->Type,
	FoundFn, Constructor,
	ConstructorArgs,
	HadMultipleCandidates,
	/ListInit/ false,
	/StdInitListInit/ false,
	/ZeroInit/ false,
	CXXConstructExpr::CK_Complete,
	SourceRange());
	if (CurInit.isInvalid())
	return ExprError();

	S.CheckConstructorAccess(Kind.getLocation(), Constructor, FoundFn,
	Entity);
	if (S.DiagnoseUseOfDecl(FoundFn, Kind.getLocation()))
	return ExprError();

	CastKind = CK_ConstructorConversion;
	CreatedObject = true;
	} else {
	// Build a call to the conversion function.
	CXXConversionDecl *Conversion = cast<CXXConversionDecl>(Fn);
	S.CheckMemberOperatorAccess(Kind.getLocation(), CurInit.get(), nullptr,
	FoundFn);
	if (S.DiagnoseUseOfDecl(FoundFn, Kind.getLocation()))
	return ExprError();

	// FIXME: Should we move this initialization into a separate
	// derived-to-base conversion? I believe the answer is "no", because
	// we don't want to turn off access control here for c-style casts.
	CurInit = S.PerformObjectArgumentInitialization(CurInit.get(),
	/Qualifier=/nullptr,
	FoundFn, Conversion);
	if (CurInit.isInvalid())
	return ExprError();

	// Build the actual call to the conversion function.
	CurInit = S.BuildCXXMemberCallExpr(CurInit.get(), FoundFn, Conversion,
	HadMultipleCandidates);
	if (CurInit.isInvalid())
	return ExprError();

	CastKind = CK_UserDefinedConversion;
	CreatedObject = Conversion->getReturnType()->isRecordType();
	}

	if (CreatedObject && checkAbstractType(CurInit.get()->getType()))
	return ExprError();

	// Wrap the call in an implicit cast that records which kind of user
	// conversion was applied; type and value kind are left unchanged.
	CurInit = ImplicitCastExpr::Create(S.Context, CurInit.get()->getType(),
	CastKind, CurInit.get(), nullptr,
	CurInit.get()->getValueKind());

	if (shouldBindAsTemporary(Entity))
	// The overall entity is temporary, so this expression should be
	// destroyed at the end of its full-expression.
	CurInit = S.MaybeBindToTemporary(CurInit.getAs<Expr>());
	else if (CreatedObject && shouldDestroyEntity(Entity)) {
	// The object outlasts the full-expression, but we need to prepare for
	// a destructor being run on it.
	// FIXME: It makes no sense to do this here. This should happen
	// regardless of how we initialized the entity.
	QualType T = CurInit.get()->getType();
	if (const RecordType *Record = T->getAs<RecordType>()) {
	CXXDestructorDecl *Destructor
	= S.LookupDestructor(cast<CXXRecordDecl>(Record->getDecl()));
	S.CheckDestructorAccess(CurInit.get()->getLocStart(), Destructor,
	S.PDiag(diag::err_access_dtor_temp) << T);
	S.MarkFunctionReferenced(CurInit.get()->getLocStart(), Destructor);
	if (S.DiagnoseUseOfDecl(Destructor, CurInit.get()->getLocStart()))
	return ExprError();
	}
	}
	break;
	}

	case SK_QualificationConversionLValue:
	case SK_QualificationConversionXValue:
	case SK_QualificationConversionRValue: {
	// Perform a qualification conversion; these can never go wrong.
	ExprValueKind VK =
	Step->Kind == SK_QualificationConversionLValue ?
	VK_LValue :
	(Step->Kind == SK_QualificationConversionXValue ?
	VK_XValue :
	VK_RValue);
	CurInit = S.ImpCastExprToType(CurInit.get(), Step->Type, CK_NoOp, VK);
	break;
	}

	case SK_AtomicConversion: {
	assert(CurInit.get()->isRValue() && "cannot convert glvalue to atomic");
	CurInit = S.ImpCastExprToType(CurInit.get(), Step->Type,
	CK_NonAtomicToAtomic, VK_RValue);
	break;
	}

	case SK_LValueToRValue: {
	assert(CurInit.get()->isGLValue() && "cannot load from a prvalue");
	CurInit = ImplicitCastExpr::Create(S.Context, Step->Type,
	CK_LValueToRValue, CurInit.get(),
	/BasePath=/nullptr, VK_RValue);
	break;
	}

	case SK_ConversionSequence:
	case SK_ConversionSequenceNoNarrowing: {
	// Map the kind of cast being performed (if any) onto the matching
	// checked-conversion kind.
	Sema::CheckedConversionKind CCK
	= Kind.isCStyleCast()? Sema::CCK_CStyleCast
	: Kind.isFunctionalCast()? Sema::CCK_FunctionalCast
	: Kind.isExplicitCast()? Sema::CCK_OtherCast
	: Sema::CCK_ImplicitConversion;
	ExprResult CurInitExprRes =
	S.PerformImplicitConversion(CurInit.get(), Step->Type, *Step->ICS,
	getAssignmentAction(Entity), CCK);
	if (CurInitExprRes.isInvalid())
	return ExprError();

	// Note: this is passed the pre-conversion expression, since CurInit is
	// only updated on the next statement.
	S.DiscardMisalignedMemberAddress(Step->Type.getTypePtr(), CurInit.get());

	CurInit = CurInitExprRes;

	if (Step->Kind == SK_ConversionSequenceNoNarrowing &&
	S.getLangOpts().CPlusPlus)
	DiagnoseNarrowingInInitList(S, *Step->ICS, SourceType, Entity.getType(),
	CurInit.get());

	break;
	}

	case SK_ListInitialization: {
	if (checkAbstractType(Step->Type))
	return ExprError();

	InitListExpr *InitList = cast<InitListExpr>(CurInit.get());
	// If we're not initializing the top-level entity, we need to create an
	// InitializeTemporary entity for our target type.
	QualType Ty = Step->Type;
	bool IsTemporary = !S.Context.hasSameType(Entity.getType(), Ty);
	InitializedEntity TempEntity = InitializedEntity::InitializeTemporary(Ty);
	InitializedEntity InitEntity = IsTemporary ? TempEntity : Entity;
	InitListChecker PerformInitList(S, InitEntity,
	InitList, Ty, /VerifyOnly=/false,
	/TreatUnavailableAsInvalid=/false);
	if (PerformInitList.HadError())
	return ExprError();

	// Hack: We must update *ResultType if available in order to set the
	// bounds of arrays, e.g. in 'int ar[] = {1, 2, 3};'.
	// Worst case: 'const int (&arref)[] = {1, 2, 3};'.
	if (ResultType &&
	ResultType->getNonReferenceType()->isIncompleteArrayType()) {
	if ((*ResultType)->isRValueReferenceType())
	Ty = S.Context.getRValueReferenceType(Ty);
	else if ((*ResultType)->isLValueReferenceType())
	Ty = S.Context.getLValueReferenceType(Ty,
	(*ResultType)->getAs<LValueReferenceType>()->isSpelledAsLValue());
	*ResultType = Ty;
	}

	InitListExpr *StructuredInitList =
	PerformInitList.getFullyStructuredList();
	// NOTE(review): the next statement discards its result and appears to be
	// a no-op.
	CurInit.get();
	CurInit = shouldBindAsTemporary(InitEntity)
	? S.MaybeBindToTemporary(StructuredInitList)
	: StructuredInitList;
	break;
	}

	case SK_ConstructorInitializationFromList: {
	if (checkAbstractType(Step->Type))
	return ExprError();

	// When an initializer list is passed for a parameter of type "reference
	// to object", we don't get an EK_Temporary entity, but instead an
	// EK_Parameter entity with reference type.
	// FIXME: This is a hack. What we really should do is create a user
	// conversion step for this case, but this makes it considerably more
	// complicated. For now, this will do.
	InitializedEntity TempEntity = InitializedEntity::InitializeTemporary(
	Entity.getType().getNonReferenceType());
	bool UseTemporary = Entity.getType()->isReferenceType();
	assert(Args.size() == 1 && "expected a single argument for list init");
	InitListExpr *InitList = cast<InitListExpr>(Args[0]);
	S.Diag(InitList->getExprLoc(), diag::warn_cxx98_compat_ctor_list_init)
	<< InitList->getSourceRange();
	// The list's elements become the constructor arguments.
	MultiExprArg Arg(InitList->getInits(), InitList->getNumInits());
	CurInit = PerformConstructorInitialization(S, UseTemporary ? TempEntity :
	Entity,
	Kind, Arg, *Step,
	ConstructorInitRequiresZeroInit,
	/IsListInitialization/true,
	/IsStdInitListInit/false,
	InitList->getLBraceLoc(),
	InitList->getRBraceLoc());
	break;
	}

	case SK_UnwrapInitList:
	// Replace the initializer list by its first element.
	CurInit = cast<InitListExpr>(CurInit.get())->getInit(0);
	break;

	case SK_RewrapInitList: {
	// Build a new single-element list around the current expression, reusing
	// the brace locations of the syntactic list recorded in the step.
	Expr *E = CurInit.get();
	InitListExpr *Syntactic = Step->WrappingSyntacticList;
	InitListExpr *ILE = new (S.Context) InitListExpr(S.Context,
	Syntactic->getLBraceLoc(), E, Syntactic->getRBraceLoc());
	ILE->setSyntacticForm(Syntactic);
	ILE->setType(E->getType());
	ILE->setValueKind(E->getValueKind());
	CurInit = ILE;
	break;
	}

	case SK_ConstructorInitialization:
	case SK_StdInitializerListConstructorCall: {
	if (checkAbstractType(Step->Type))
	return ExprError();

	// When an initializer list is passed for a parameter of type "reference
	// to object", we don't get an EK_Temporary entity, but instead an
	// EK_Parameter entity with reference type.
	// FIXME: This is a hack. What we really should do is create a user
	// conversion step for this case, but this makes it considerably more
	// complicated. For now, this will do.
	InitializedEntity TempEntity = InitializedEntity::InitializeTemporary(
	Entity.getType().getNonReferenceType());
	bool UseTemporary = Entity.getType()->isReferenceType();
	bool IsStdInitListInit =
	Step->Kind == SK_StdInitializerListConstructorCall;
	// If an earlier step produced an expression, it is the sole constructor
	// argument; otherwise the original Args are used.
	Expr *Source = CurInit.get();
	CurInit = PerformConstructorInitialization(
	S, UseTemporary ? TempEntity : Entity, Kind,
	Source ? MultiExprArg(Source) : Args, *Step,
	ConstructorInitRequiresZeroInit,
	/IsListInitialization/ IsStdInitListInit,
	/IsStdInitListInitialization/ IsStdInitListInit,
	/LBraceLoc/ SourceLocation(),
	/RBraceLoc/ SourceLocation());
	break;
	}

	case SK_ZeroInitialization: {
	step_iterator NextStep = Step;
	++NextStep;
	if (NextStep != StepEnd &&
	(NextStep->Kind == SK_ConstructorInitialization \|\|
	NextStep->Kind == SK_ConstructorInitializationFromList)) {
	// The need for zero-initialization is recorded directly into
	// the call to the object's constructor within the next step.
	ConstructorInitRequiresZeroInit = true;
	} else if (Kind.getKind() == InitializationKind::IK_Value &&
	S.getLangOpts().CPlusPlus &&
	!Kind.isImplicitValueInit()) {
	// Explicit value-initialization in C++ gets a CXXScalarValueInitExpr;
	// type source info is synthesized when the entity has none.
	TypeSourceInfo *TSInfo = Entity.getTypeSourceInfo();
	if (!TSInfo)
	TSInfo = S.Context.getTrivialTypeSourceInfo(Step->Type,
	Kind.getRange().getBegin());

	CurInit = new (S.Context) CXXScalarValueInitExpr(
	Entity.getType().getNonLValueExprType(S.Context), TSInfo,
	Kind.getRange().getEnd());
	} else {
	CurInit = new (S.Context) ImplicitValueInitExpr(Step->Type);
	}
	break;
	}

	case SK_CAssignment: {
	// Note: shadows the SourceType computed at the top of the loop body.
	QualType SourceType = CurInit.get()->getType();
	// Save off the initial CurInit in case we need to emit a diagnostic
	ExprResult InitialCurInit = CurInit;
	ExprResult Result = CurInit;
	Sema::AssignConvertType ConvTy =
	S.CheckSingleAssignmentConstraints(Step->Type, Result, true,
	Entity.getKind() == InitializedEntity::EK_Parameter_CF_Audited);
	if (Result.isInvalid())
	return ExprError();
	CurInit = Result;

	// If this is a call, allow conversion to a transparent union.
	ExprResult CurInitExprRes = CurInit;
	if (ConvTy != Sema::Compatible &&
	Entity.isParameterKind() &&
	S.CheckTransparentUnionArgumentConstraints(Step->Type, CurInitExprRes)
	== Sema::Compatible)
	ConvTy = Sema::Compatible;
	if (CurInitExprRes.isInvalid())
	return ExprError();
	CurInit = CurInitExprRes;

	// Written by DiagnoseAssignmentResult through the pointer passed below;
	// when set, a location note is emitted even if the result is not an error.
	bool Complained;
	if (S.DiagnoseAssignmentResult(ConvTy, Kind.getLocation(),
	Step->Type, SourceType,
	InitialCurInit.get(),
	getAssignmentAction(Entity, true),
	&Complained)) {
	PrintInitLocationNote(S, Entity);
	return ExprError();
	} else if (Complained)
	PrintInitLocationNote(S, Entity);
	break;
	}

	case SK_StringInit: {
	// CurInit itself is not replaced here; the check runs against *ResultType
	// when a result type was requested, otherwise against the step type.
	QualType Ty = Step->Type;
	CheckStringInit(CurInit.get(), ResultType ? *ResultType : Ty,
	S.Context.getAsArrayType(Ty), S);
	break;
	}

	case SK_ObjCObjectConversion:
	CurInit = S.ImpCastExprToType(CurInit.get(), Step->Type,
	CK_ObjCObjectLValueCast,
	CurInit.get()->getValueKind());
	break;

	case SK_ArrayLoopIndex: {
	// Wrap the current expression in an OpaqueValueExpr, subscript it with an
	// ArrayInitIndexExpr, and remember the wrapper for SK_ArrayLoopInit.
	Expr *Cur = CurInit.get();
	Expr *BaseExpr = new (S.Context)
	OpaqueValueExpr(Cur->getExprLoc(), Cur->getType(),
	Cur->getValueKind(), Cur->getObjectKind(), Cur);
	Expr *IndexExpr =
	new (S.Context) ArrayInitIndexExpr(S.Context.getSizeType());
	CurInit = S.CreateBuiltinArraySubscriptExpr(
	BaseExpr, Kind.getLocation(), IndexExpr, Kind.getLocation());
	ArrayLoopCommonExprs.push_back(BaseExpr);
	break;
	}

	case SK_ArrayLoopInit: {
	assert(!ArrayLoopCommonExprs.empty() &&
	"mismatched SK_ArrayLoopIndex and SK_ArrayLoopInit");
	Expr *Common = ArrayLoopCommonExprs.pop_back_val();
	CurInit = new (S.Context) ArrayInitLoopExpr(Step->Type, Common,
	CurInit.get());
	break;
	}

	case SK_GNUArrayInit:
	// Okay: we checked everything before creating this step. Note that
	// this is a GNU extension.
	S.Diag(Kind.getLocation(), diag::ext_array_init_copy)
	<< Step->Type << CurInit.get()->getType()
	<< CurInit.get()->getSourceRange();
	LLVM_FALLTHROUGH;
	case SK_ArrayInit:
	// If the destination type is an incomplete array type, update the
	// type accordingly.
	if (ResultType) {
	if (const IncompleteArrayType *IncompleteDest
	= S.Context.getAsIncompleteArrayType(Step->Type)) {
	if (const ConstantArrayType *ConstantSource
	= S.Context.getAsConstantArrayType(CurInit.get()->getType())) {
	*ResultType = S.Context.getConstantArrayType(
	IncompleteDest->getElementType(),
	ConstantSource->getSize(),
	ArrayType::Normal, 0);
	}
	}
	}
	break;

	case SK_ParenthesizedArrayInit:
	// Okay: we checked everything before creating this step. Note that
	// this is a GNU extension.
	S.Diag(Kind.getLocation(), diag::ext_array_init_parens)
	<< CurInit.get()->getSourceRange();
	break;

	case SK_PassByIndirectCopyRestore:
	case SK_PassByIndirectRestore:
	checkIndirectCopyRestoreSource(S, CurInit.get());
	CurInit = new (S.Context) ObjCIndirectCopyRestoreExpr(
	CurInit.get(), Step->Type,
	Step->Kind == SK_PassByIndirectCopyRestore);
	break;

	case SK_ProduceObjCObject:
	CurInit =
	ImplicitCastExpr::Create(S.Context, Step->Type, CK_ARCProduceObject,
	CurInit.get(), nullptr, VK_RValue);
	break;

	case SK_StdInitializerList: {
	S.Diag(CurInit.get()->getExprLoc(),
	diag::warn_cxx98_compat_initializer_list_init)
	<< CurInit.get()->getSourceRange();

	// Materialize the temporary into memory.
	MaterializeTemporaryExpr *MTE = S.CreateMaterializeTemporaryExpr(
	CurInit.get()->getType(), CurInit.get(),
	/BoundToLvalueReference=/false);

	// Maybe lifetime-extend the array temporary's subobjects to match the
	// entity's lifetime.
	if (const InitializedEntity *ExtendingEntity =
	getEntityForTemporaryLifetimeExtension(&Entity))
	if (performReferenceExtension(MTE, ExtendingEntity))
	warnOnLifetimeExtension(S, Entity, CurInit.get(),
	/IsInitializerList=/true,
	ExtendingEntity->getDecl());

	// Wrap it in a construction of a std::initializer_list<T>.
	CurInit = new (S.Context) CXXStdInitializerListExpr(Step->Type, MTE);

	// Bind the result, in case the library has given initializer_list a
	// non-trivial destructor.
	if (shouldBindAsTemporary(Entity))
	CurInit = S.MaybeBindToTemporary(CurInit.get());
	break;
	}

	case SK_OCLSamplerInit: {
	// Sampler initialization has 5 cases:
	// 1. function argument passing
	// 1a. argument is a file-scope variable
	// 1b. argument is a function-scope variable
	// 1c. argument is one of caller function's parameters
	// 2. variable initialization
	// 2a. initializing a file-scope variable
	// 2b. initializing a function-scope variable
	//
	// For file-scope variables, since they cannot be initialized by function
	// call of __translate_sampler_initializer in LLVM IR, their references
	// need to be replaced by a cast from their literal initializers to
	// sampler type. Since sampler variables can only be used in function
	// calls as arguments, we only need to replace them when handling the
	// argument passing.
	assert(Step->Type->isSamplerT() &&
	"Sampler initialization on non-sampler type.");
	Expr *Init = CurInit.get();
	// Note: shadows the SourceType computed at the top of the loop body.
	QualType SourceType = Init->getType();
	// Case 1
	if (Entity.isParameterKind()) {
	if (!SourceType->isSamplerT() && !SourceType->isIntegerType()) {
	// Diagnosed; CurInit is left unchanged.
	S.Diag(Kind.getLocation(), diag::err_sampler_argument_required)
	<< SourceType;
	break;
	} else if (const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(Init)) {
	auto Var = cast<VarDecl>(DRE->getDecl());
	// Case 1b and 1c
	// No cast from integer to sampler is needed.
	if (!Var->hasGlobalStorage()) {
	CurInit = ImplicitCastExpr::Create(S.Context, Step->Type,
	CK_LValueToRValue, Init,
	/BasePath=/nullptr, VK_RValue);
	break;
	}
	// Case 1a
	// For function call with a file-scope sampler variable as argument,
	// get the integer literal.
	// Do not diagnose if the file-scope variable does not have initializer
	// since this has already been diagnosed when parsing the variable
	// declaration.
	if (!Var->getInit() \|\| !isa<ImplicitCastExpr>(Var->getInit()))
	break;
	Init = cast<ImplicitCastExpr>(const_cast<Expr*>(
	Var->getInit()))->getSubExpr();
	SourceType = Init->getType();
	}
	} else {
	// Case 2
	// Check initializer is 32 bit integer constant.
	// If the initializer is taken from global variable, do not diagnose since
	// this has already been done when parsing the variable declaration.
	if (!Init->isConstantInitializer(S.Context, false))
	break;

	if (!SourceType->isIntegerType() \|\|
	32 != S.Context.getIntWidth(SourceType)) {
	S.Diag(Kind.getLocation(), diag::err_sampler_initializer_not_integer)
	<< SourceType;
	break;
	}

	llvm::APSInt Result;
	Init->EvaluateAsInt(Result, S.Context);
	const uint64_t SamplerValue = Result.getLimitedValue();
	// 32-bit value of sampler's initializer is interpreted as
	// bit-field with the following structure:
	// |unspecified|Filter|Addressing Mode| Normalized Coords|
	// |31        6|5    4|3             1|                 0|
	// This structure corresponds to enum values of sampler properties
	// defined in SPIR spec v1.2 and also opencl-c.h
	unsigned AddressingMode = (0x0E & SamplerValue) >> 1;
	unsigned FilterMode = (0x30 & SamplerValue) >> 4;
	// Out-of-range fields only produce warnings; the cast below is still
	// inserted.
	if (FilterMode != 1 && FilterMode != 2)
	S.Diag(Kind.getLocation(),
	diag::warn_sampler_initializer_invalid_bits)
	<< "Filter Mode";
	if (AddressingMode > 4)
	S.Diag(Kind.getLocation(),
	diag::warn_sampler_initializer_invalid_bits)
	<< "Addressing Mode";
	}

	// Cases 1a, 2a and 2b
	// Insert cast from integer to sampler.
	CurInit = S.ImpCastExprToType(Init, S.Context.OCLSamplerTy,
	CK_IntToOCLSampler);
	break;
	}
	case SK_OCLZeroEvent: {
	assert(Step->Type->isEventT() &&
	"Event initialization on non-event type.");

	CurInit = S.ImpCastExprToType(CurInit.get(), Step->Type,
	CK_ZeroToOCLEvent,
	CurInit.get()->getValueKind());
	break;
	}
	case SK_OCLZeroQueue: {
	assert(Step->Type->isQueueT() &&
	"Event initialization on non queue type.");

	CurInit = S.ImpCastExprToType(CurInit.get(), Step->Type,
	CK_ZeroToOCLQueue,
	CurInit.get()->getValueKind());
	break;
	}
	}
	}

	// Diagnose non-fatal problems with the completed initialization.
	if (Entity.getKind() == InitializedEntity::EK_Member &&
	cast<FieldDecl>(Entity.getDecl())->isBitField())
	S.CheckBitFieldInitialization(Kind.getLocation(),
	cast<FieldDecl>(Entity.getDecl()),
	CurInit.get());

	// Check for std::move on construction.
	if (const Expr *E = CurInit.get()) {
	CheckMoveOnConstruction(S, E,
	Entity.getKind() == InitializedEntity::EK_Result);
	}

	return CurInit;
	}

	/// Locate and diagnose an uninitialized reference subobject inside \p T.
	///
	/// A reference type is diagnosed directly at \p Loc. For a class type, the
	/// fields and then the base classes are searched recursively; when a nested
	/// reference is diagnosed, a note naming the enclosing record is added at
	/// \p Loc.
	///
	/// \returns true if a diagnostic was emitted, false otherwise.
	static bool DiagnoseUninitializedReference(Sema &S, SourceLocation Loc,
	                                           QualType T) {
	  if (T->isReferenceType()) {
	    S.Diag(Loc, diag::err_reference_without_init)
	        << T.getNonReferenceType();
	    return true;
	  }

	  CXXRecordDecl *Record = T->getBaseElementTypeUnsafe()->getAsCXXRecordDecl();
	  if (!Record)
	    return false;
	  if (!Record->hasUninitializedReferenceMember())
	    return false;

	  // Emits the trailing note once a nested reference has been diagnosed and
	  // yields the function's "found" result.
	  auto NoteAndSucceed = [&]() -> bool {
	    S.Diag(Loc, diag::note_value_initialization_here) << Record;
	    return true;
	  };

	  // Fields first; unnamed bit-fields are skipped.
	  for (const auto *Field : Record->fields()) {
	    if (!Field->isUnnamedBitfield() &&
	        DiagnoseUninitializedReference(S, Field->getLocation(),
	                                       Field->getType()))
	      return NoteAndSucceed();
	  }

	  // Then the base classes.
	  for (const auto &Base : Record->bases()) {
	    if (DiagnoseUninitializedReference(S, Base.getLocStart(), Base.getType()))
	      return NoteAndSucceed();
	  }

	  return false;
	}


	//===----------------------------------------------------------------------===//
	// Diagnose initialization failures
	//===----------------------------------------------------------------------===//

	/// Attach follow-up notes to a diagnostic for a "simple" conversion failure.
	///
	/// Notes are only produced when both the destination type (with any
	/// reference stripped) and the operand type are Objective-C object pointers.
	///
	/// \param S      Semantic analyzer used to emit the notes.
	/// \param entity The entity whose initialization failed.
	/// \param op     The initializer expression that could not be converted.
	static void emitBadConversionNotes(Sema &S, const InitializedEntity &entity,
	                                   Expr *op) {
	  QualType Target = entity.getType();

	  // Nothing to add unless this is an ObjC-pointer-to-ObjC-pointer conversion.
	  if (!Target.getNonReferenceType()->isObjCObjectPointerType())
	    return;
	  if (!op->getType()->isObjCObjectPointerType())
	    return;

	  // The operand may be a message send with a related result type; let Sema
	  // decide whether that deserves a note.
	  S.EmitRelatedResultTypeNote(op);

	  // For a return statement, the function itself may be the one expecting a
	  // related result type.
	  const bool IsReturnValue =
	      entity.getKind() == InitializedEntity::EK_Result;
	  if (IsReturnValue)
	    S.EmitRelatedResultTypeNoteForReturn(Target);
	}

	/// Re-run list-initialization checking in diagnosing mode so that the errors
	/// for a failed initializer list are actually emitted.
	///
	/// Two destination shapes are unwrapped before the checker runs:
	///  - std::initializer_list<E> (C++11): diagnose against a hidden temporary
	///    array of 'const E' with one element per initializer.
	///  - reference types: diagnose against a temporary of the referenced type,
	///    then add a note pointing at the reference being initialized.
	///
	/// \param S        Semantic analyzer used to emit diagnostics.
	/// \param Entity   The entity whose list-initialization failed.
	/// \param InitList The initializer list to re-check.
	static void diagnoseListInit(Sema &S, const InitializedEntity &Entity,
	                             InitListExpr *InitList) {
	  QualType DestType = Entity.getType();

	  QualType E;
	  if (S.getLangOpts().CPlusPlus11 && S.isStdInitializerList(DestType, &E)) {
	    QualType ArrayType = S.Context.getConstantArrayType(
	        E.withConst(),
	        llvm::APInt(S.Context.getTypeSize(S.Context.getSizeType()),
	                    InitList->getNumInits()),
	        clang::ArrayType::Normal, 0);
	    InitializedEntity HiddenArray =
	        InitializedEntity::InitializeTemporary(ArrayType);
	    diagnoseListInit(S, HiddenArray, InitList);
	    return;
	  }

	  if (DestType->isReferenceType()) {
	    // A list-initialization failure for a reference means that we tried to
	    // create a temporary of the inner type (per [dcl.init.list]p3.6) and the
	    // inner initialization failed.
	    QualType T = DestType->getAs<ReferenceType>()->getPointeeType();
	    diagnoseListInit(S, InitializedEntity::InitializeTemporary(T), InitList);
	    SourceLocation Loc = InitList->getLocStart();
	    if (auto *D = Entity.getDecl())
	      Loc = D->getLocation();
	    S.Diag(Loc, diag::note_in_reference_temporary_list_initializer) << T;
	    return;
	  }

	  // Constructing the checker with VerifyOnly == false is what emits the
	  // diagnostics; the object is otherwise only inspected by the assertion.
	  InitListChecker DiagnoseInitList(S, Entity, InitList, DestType,
	                                   /*VerifyOnly=*/false,
	                                   /*TreatUnavailableAsInvalid=*/false);
	  assert(DiagnoseInitList.HadError() &&
	         "Inconsistent init list check result.");
	}

	/// Emit diagnostics explaining why this initialization sequence failed.
	///
	/// Dispatches on the recorded failure kind and reports the matching error,
	/// together with any candidate or declaration notes, and finishes by calling
	/// PrintInitLocationNote for the entity.
	///
	/// \param S      Semantic analyzer used to emit diagnostics.
	/// \param Entity The entity that was being initialized.
	/// \param Kind   The kind of initialization that was attempted.
	/// \param Args   The initializer expressions that were supplied.
	///
	/// \returns false if the sequence did not fail (nothing is emitted), true
	/// otherwise.
	bool InitializationSequence::Diagnose(Sema &S,
	                                      const InitializedEntity &Entity,
	                                      const InitializationKind &Kind,
	                                      ArrayRef<Expr *> Args) {
	  if (!Failed())
	    return false;

	  QualType DestType = Entity.getType();
	  switch (Failure) {
	  case FK_TooManyInitsForReference:
	    // FIXME: Customize for the initialized entity?
	    if (Args.empty()) {
	      // Dig out the reference subobject which is uninitialized and diagnose it.
	      // If this is value-initialization, this could be nested some way within
	      // the target type.
	      assert(Kind.getKind() == InitializationKind::IK_Value ||
	             DestType->isReferenceType());
	      bool Diagnosed =
	          DiagnoseUninitializedReference(S, Kind.getLocation(), DestType);
	      assert(Diagnosed && "couldn't find uninitialized reference to diagnose");
	      (void)Diagnosed;
	    } else // FIXME: diagnostic below could be better!
	      S.Diag(Kind.getLocation(), diag::err_reference_has_multiple_inits)
	          << SourceRange(Args.front()->getLocStart(), Args.back()->getLocEnd());
	    break;
	  case FK_ParenthesizedListInitForReference:
	    S.Diag(Kind.getLocation(), diag::err_list_init_in_parens)
	        << 1 << Entity.getType() << Args[0]->getSourceRange();
	    break;

	  case FK_ArrayNeedsInitList:
	    S.Diag(Kind.getLocation(), diag::err_array_init_not_init_list) << 0;
	    break;
	  case FK_ArrayNeedsInitListOrStringLiteral:
	    S.Diag(Kind.getLocation(), diag::err_array_init_not_init_list) << 1;
	    break;
	  case FK_ArrayNeedsInitListOrWideStringLiteral:
	    S.Diag(Kind.getLocation(), diag::err_array_init_not_init_list) << 2;
	    break;
	  case FK_NarrowStringIntoWideCharArray:
	    S.Diag(Kind.getLocation(), diag::err_array_init_narrow_string_into_wchar);
	    break;
	  case FK_WideStringIntoCharArray:
	    S.Diag(Kind.getLocation(), diag::err_array_init_wide_string_into_char);
	    break;
	  case FK_IncompatWideStringIntoWideChar:
	    S.Diag(Kind.getLocation(),
	           diag::err_array_init_incompat_wide_string_into_wchar);
	    break;
	  case FK_ArrayTypeMismatch:
	  case FK_NonConstantArrayInit:
	    S.Diag(Kind.getLocation(),
	           (Failure == FK_ArrayTypeMismatch
	                ? diag::err_array_init_different_type
	                : diag::err_array_init_non_constant_array))
	        << DestType.getNonReferenceType()
	        << Args[0]->getType()
	        << Args[0]->getSourceRange();
	    break;

	  case FK_VariableLengthArrayHasInitializer:
	    S.Diag(Kind.getLocation(), diag::err_variable_object_no_init)
	        << Args[0]->getSourceRange();
	    break;

	  case FK_AddressOfOverloadFailed: {
	    // Re-run the resolution with the complain flag set to 'true'; the result
	    // itself is not used here.
	    DeclAccessPair Found;
	    S.ResolveAddressOfOverloadedFunction(Args[0],
	                                         DestType.getNonReferenceType(),
	                                         true,
	                                         Found);
	    break;
	  }

	  case FK_AddressOfUnaddressableFunction: {
	    auto *FD = cast<FunctionDecl>(cast<DeclRefExpr>(Args[0])->getDecl());
	    S.checkAddressOfFunctionIsAvailable(FD, /*Complain=*/true,
	                                        Args[0]->getLocStart());
	    break;
	  }

	  case FK_ReferenceInitOverloadFailed:
	  case FK_UserConversionOverloadFailed:
	    switch (FailedOverloadResult) {
	    case OR_Ambiguous:
	      if (Failure == FK_UserConversionOverloadFailed)
	        S.Diag(Kind.getLocation(), diag::err_typecheck_ambiguous_condition)
	            << Args[0]->getType() << DestType
	            << Args[0]->getSourceRange();
	      else
	        S.Diag(Kind.getLocation(), diag::err_ref_init_ambiguous)
	            << DestType << Args[0]->getType()
	            << Args[0]->getSourceRange();

	      FailedCandidateSet.NoteCandidates(S, OCD_ViableCandidates, Args);
	      break;

	    case OR_No_Viable_Function:
	      // The "nonviable" error is only emitted when RequireCompleteType
	      // returns false for the destination type.
	      if (!S.RequireCompleteType(Kind.getLocation(),
	                                 DestType.getNonReferenceType(),
	                                 diag::err_typecheck_nonviable_condition_incomplete,
	                                 Args[0]->getType(), Args[0]->getSourceRange()))
	        S.Diag(Kind.getLocation(), diag::err_typecheck_nonviable_condition)
	            << (Entity.getKind() == InitializedEntity::EK_Result)
	            << Args[0]->getType() << Args[0]->getSourceRange()
	            << DestType.getNonReferenceType();

	      FailedCandidateSet.NoteCandidates(S, OCD_AllCandidates, Args);
	      break;

	    case OR_Deleted: {
	      S.Diag(Kind.getLocation(), diag::err_typecheck_deleted_function)
	          << Args[0]->getType() << DestType.getNonReferenceType()
	          << Args[0]->getSourceRange();
	      OverloadCandidateSet::iterator Best;
	      OverloadingResult Ovl
	          = FailedCandidateSet.BestViableFunction(S, Kind.getLocation(), Best);
	      if (Ovl == OR_Deleted) {
	        S.NoteDeletedFunction(Best->Function);
	      } else {
	        llvm_unreachable("Inconsistent overload resolution?");
	      }
	      break;
	    }

	    case OR_Success:
	      llvm_unreachable("Conversion did not fail!");
	    }
	    break;

	  case FK_NonConstLValueReferenceBindingToTemporary:
	    if (isa<InitListExpr>(Args[0])) {
	      S.Diag(Kind.getLocation(),
	             diag::err_lvalue_reference_bind_to_initlist)
	          << DestType.getNonReferenceType().isVolatileQualified()
	          << DestType.getNonReferenceType()
	          << Args[0]->getSourceRange();
	      break;
	    }
	    LLVM_FALLTHROUGH;

	  case FK_NonConstLValueReferenceBindingToUnrelated:
	    S.Diag(Kind.getLocation(),
	           Failure == FK_NonConstLValueReferenceBindingToTemporary
	               ? diag::err_lvalue_reference_bind_to_temporary
	               : diag::err_lvalue_reference_bind_to_unrelated)
	        << DestType.getNonReferenceType().isVolatileQualified()
	        << DestType.getNonReferenceType()
	        << Args[0]->getType()
	        << Args[0]->getSourceRange();
	    break;

	  case FK_NonConstLValueReferenceBindingToBitfield: {
	    // We don't necessarily have an unambiguous source bit-field.
	    FieldDecl *BitField = Args[0]->getSourceBitField();
	    S.Diag(Kind.getLocation(), diag::err_reference_bind_to_bitfield)
	        << DestType.isVolatileQualified()
	        << (BitField ? BitField->getDeclName() : DeclarationName())
	        << (BitField != nullptr)
	        << Args[0]->getSourceRange();
	    if (BitField)
	      S.Diag(BitField->getLocation(), diag::note_bitfield_decl);
	    break;
	  }

	  case FK_NonConstLValueReferenceBindingToVectorElement:
	    S.Diag(Kind.getLocation(), diag::err_reference_bind_to_vector_element)
	        << DestType.isVolatileQualified()
	        << Args[0]->getSourceRange();
	    break;

	  case FK_RValueReferenceBindingToLValue:
	    S.Diag(Kind.getLocation(), diag::err_lvalue_to_rvalue_ref)
	        << DestType.getNonReferenceType() << Args[0]->getType()
	        << Args[0]->getSourceRange();
	    break;

	  case FK_ReferenceInitDropsQualifiers: {
	    QualType SourceType = Args[0]->getType();
	    QualType NonRefType = DestType.getNonReferenceType();
	    Qualifiers DroppedQualifiers =
	        SourceType.getQualifiers() - NonRefType.getQualifiers();

	    S.Diag(Kind.getLocation(), diag::err_reference_bind_drops_quals)
	        << SourceType
	        << NonRefType
	        << DroppedQualifiers.getCVRQualifiers()
	        << Args[0]->getSourceRange();
	    break;
	  }

	  case FK_ReferenceInitFailed:
	    S.Diag(Kind.getLocation(), diag::err_reference_bind_failed)
	        << DestType.getNonReferenceType()
	        << Args[0]->isLValue()
	        << Args[0]->getType()
	        << Args[0]->getSourceRange();
	    emitBadConversionNotes(S, Entity, Args[0]);
	    break;

	  case FK_ConversionFailed: {
	    QualType FromType = Args[0]->getType();
	    PartialDiagnostic PDiag = S.PDiag(diag::err_init_conversion_failed)
	        << (int)Entity.getKind()
	        << DestType
	        << Args[0]->isLValue()
	        << FromType
	        << Args[0]->getSourceRange();
	    S.HandleFunctionTypeMismatch(PDiag, FromType, DestType);
	    S.Diag(Kind.getLocation(), PDiag);
	    emitBadConversionNotes(S, Entity, Args[0]);
	    break;
	  }

	  case FK_ConversionFromPropertyFailed:
	    // No-op. This error has already been reported.
	    break;

	  case FK_TooManyInitsForScalar: {
	    SourceRange R;

	    auto *InitList = dyn_cast<InitListExpr>(Args[0]);
	    if (InitList && InitList->getNumInits() >= 1) {
	      R = SourceRange(InitList->getInit(0)->getLocEnd(), InitList->getLocEnd());
	    } else {
	      assert(Args.size() > 1 && "Expected multiple initializers!");
	      R = SourceRange(Args.front()->getLocEnd(), Args.back()->getLocEnd());
	    }

	    // Start the highlighted range just past the first initializer so that
	    // only the excess initializers are covered.
	    R.setBegin(S.getLocForEndOfToken(R.getBegin()));
	    if (Kind.isCStyleOrFunctionalCast())
	      S.Diag(Kind.getLocation(), diag::err_builtin_func_cast_more_than_one_arg)
	          << R;
	    else
	      S.Diag(Kind.getLocation(), diag::err_excess_initializers)
	          << /*scalar=*/2 << R;
	    break;
	  }

	  case FK_ParenthesizedListInitForScalar:
	    S.Diag(Kind.getLocation(), diag::err_list_init_in_parens)
	        << 0 << Entity.getType() << Args[0]->getSourceRange();
	    break;

	  case FK_ReferenceBindingToInitList:
	    S.Diag(Kind.getLocation(), diag::err_reference_bind_init_list)
	        << DestType.getNonReferenceType() << Args[0]->getSourceRange();
	    break;

	  case FK_InitListBadDestinationType:
	    S.Diag(Kind.getLocation(), diag::err_init_list_bad_dest_type)
	        << (DestType->isRecordType()) << DestType << Args[0]->getSourceRange();
	    break;

	  case FK_ListConstructorOverloadFailed:
	  case FK_ConstructorOverloadFailed: {
	    SourceRange ArgsRange;
	    if (Args.size())
	      ArgsRange = SourceRange(Args.front()->getLocStart(),
	                              Args.back()->getLocEnd());

	    // For list construction, report candidates against the elements of the
	    // initializer list rather than the list expression itself.
	    if (Failure == FK_ListConstructorOverloadFailed) {
	      assert(Args.size() == 1 &&
	             "List construction from other than 1 argument.");
	      InitListExpr *InitList = cast<InitListExpr>(Args[0]);
	      Args = MultiExprArg(InitList->getInits(), InitList->getNumInits());
	    }

	    // FIXME: Using "DestType" for the entity we're printing is probably
	    // bad.
	    switch (FailedOverloadResult) {
	    case OR_Ambiguous:
	      S.Diag(Kind.getLocation(), diag::err_ovl_ambiguous_init)
	          << DestType << ArgsRange;
	      FailedCandidateSet.NoteCandidates(S, OCD_ViableCandidates, Args);
	      break;

	    case OR_No_Viable_Function:
	      if (Kind.getKind() == InitializationKind::IK_Default &&
	          (Entity.getKind() == InitializedEntity::EK_Base ||
	           Entity.getKind() == InitializedEntity::EK_Member) &&
	          isa<CXXConstructorDecl>(S.CurContext)) {
	        // This is implicit default initialization of a member or
	        // base within a constructor. If no viable function was
	        // found, notify the user that they need to explicitly
	        // initialize this base/member.
	        CXXConstructorDecl *Constructor
	            = cast<CXXConstructorDecl>(S.CurContext);
	        const CXXRecordDecl *InheritedFrom = nullptr;
	        if (auto Inherited = Constructor->getInheritedConstructor())
	          InheritedFrom = Inherited.getShadowDecl()->getNominatedBaseClass();
	        if (Entity.getKind() == InitializedEntity::EK_Base) {
	          S.Diag(Kind.getLocation(), diag::err_missing_default_ctor)
	              << (InheritedFrom ? 2 : Constructor->isImplicit() ? 1 : 0)
	              << S.Context.getTypeDeclType(Constructor->getParent())
	              << /*base=*/0
	              << Entity.getType()
	              << InheritedFrom;

	          RecordDecl *BaseDecl
	              = Entity.getBaseSpecifier()->getType()->getAs<RecordType>()
	                    ->getDecl();
	          S.Diag(BaseDecl->getLocation(), diag::note_previous_decl)
	              << S.Context.getTagDeclType(BaseDecl);
	        } else {
	          S.Diag(Kind.getLocation(), diag::err_missing_default_ctor)
	              << (InheritedFrom ? 2 : Constructor->isImplicit() ? 1 : 0)
	              << S.Context.getTypeDeclType(Constructor->getParent())
	              << /*member=*/1
	              << Entity.getName()
	              << InheritedFrom;
	          S.Diag(Entity.getDecl()->getLocation(),
	                 diag::note_member_declared_at);

	          if (const RecordType *Record
	                  = Entity.getType()->getAs<RecordType>())
	            S.Diag(Record->getDecl()->getLocation(),
	                   diag::note_previous_decl)
	                << S.Context.getTagDeclType(Record->getDecl());
	        }
	        break;
	      }

	      S.Diag(Kind.getLocation(), diag::err_ovl_no_viable_function_in_init)
	          << DestType << ArgsRange;
	      FailedCandidateSet.NoteCandidates(S, OCD_AllCandidates, Args);
	      break;

	    case OR_Deleted: {
	      OverloadCandidateSet::iterator Best;
	      OverloadingResult Ovl
	          = FailedCandidateSet.BestViableFunction(S, Kind.getLocation(), Best);
	      if (Ovl != OR_Deleted) {
	        S.Diag(Kind.getLocation(), diag::err_ovl_deleted_init)
	            << true << DestType << ArgsRange;
	        llvm_unreachable("Inconsistent overload resolution?");
	        break;
	      }

	      // If this is a defaulted or implicitly-declared function, then
	      // it was implicitly deleted. Make it clear that the deletion was
	      // implicit.
	      if (S.isImplicitlyDeleted(Best->Function))
	        S.Diag(Kind.getLocation(), diag::err_ovl_deleted_special_init)
	            << S.getSpecialMember(cast<CXXMethodDecl>(Best->Function))
	            << DestType << ArgsRange;
	      else
	        S.Diag(Kind.getLocation(), diag::err_ovl_deleted_init)
	            << true << DestType << ArgsRange;

	      S.NoteDeletedFunction(Best->Function);
	      break;
	    }

	    case OR_Success:
	      llvm_unreachable("Conversion did not fail!");
	    }
	  }
	  break;

	  case FK_DefaultInitOfConst:
	    if (Entity.getKind() == InitializedEntity::EK_Member &&
	        isa<CXXConstructorDecl>(S.CurContext)) {
	      // This is implicit default-initialization of a const member in
	      // a constructor. Complain that it needs to be explicitly
	      // initialized.
	      CXXConstructorDecl *Constructor = cast<CXXConstructorDecl>(S.CurContext);
	      S.Diag(Kind.getLocation(), diag::err_uninitialized_member_in_ctor)
	          << (Constructor->getInheritedConstructor() ? 2 :
	              Constructor->isImplicit() ? 1 : 0)
	          << S.Context.getTypeDeclType(Constructor->getParent())
	          << /*const=*/1
	          << Entity.getName();
	      S.Diag(Entity.getDecl()->getLocation(), diag::note_previous_decl)
	          << Entity.getName();
	    } else {
	      S.Diag(Kind.getLocation(), diag::err_default_init_const)
	          << DestType << (bool)DestType->getAs<RecordType>();
	    }
	    break;

	  case FK_Incomplete:
	    S.RequireCompleteType(Kind.getLocation(), FailedIncompleteType,
	                          diag::err_init_incomplete_type);
	    break;

	  case FK_ListInitializationFailed: {
	    // Run the init list checker again to emit diagnostics.
	    InitListExpr *InitList = cast<InitListExpr>(Args[0]);
	    diagnoseListInit(S, Entity, InitList);
	    break;
	  }

	  case FK_PlaceholderType: {
	    // FIXME: Already diagnosed!
	    break;
	  }

	  case FK_ExplicitConstructor: {
	    S.Diag(Kind.getLocation(), diag::err_selected_explicit_constructor)
	        << Args[0]->getSourceRange();
	    OverloadCandidateSet::iterator Best;
	    OverloadingResult Ovl
	        = FailedCandidateSet.BestViableFunction(S, Kind.getLocation(), Best);
	    (void)Ovl;
	    assert(Ovl == OR_Success && "Inconsistent overload resolution");
	    CXXConstructorDecl *CtorDecl = cast<CXXConstructorDecl>(Best->Function);
	    S.Diag(CtorDecl->getLocation(),
	           diag::note_explicit_ctor_deduction_guide_here) << false;
	    break;
	  }
	  }

	  PrintInitLocationNote(S, Entity);
	  return true;
	}

	/// Write a human-readable description of this initialization sequence.
	///
	/// A failed sequence prints "Failed sequence: " followed by a description of
	/// the failure kind; a dependent sequence prints a fixed line. A normal
	/// sequence prints every step, separated by " -> ", each followed by the
	/// step's type in square brackets.
	///
	/// \param OS Stream that receives the description. The implicit conversion
	/// sequence steps call ICS->dump() with no stream argument (see the FIXMEs
	/// below), so that part of the output is not written through \p OS.
	void InitializationSequence::dump(raw_ostream &OS) const {
	  switch (SequenceKind) {
	  case FailedSequence: {
	    OS << "Failed sequence: ";
	    switch (Failure) {
	    case FK_TooManyInitsForReference:
	      OS << "too many initializers for reference";
	      break;

	    case FK_ParenthesizedListInitForReference:
	      OS << "parenthesized list init for reference";
	      break;

	    case FK_ArrayNeedsInitList:
	      OS << "array requires initializer list";
	      break;

	    case FK_AddressOfUnaddressableFunction:
	      OS << "address of unaddressable function was taken";
	      break;

	    case FK_ArrayNeedsInitListOrStringLiteral:
	      OS << "array requires initializer list or string literal";
	      break;

	    case FK_ArrayNeedsInitListOrWideStringLiteral:
	      OS << "array requires initializer list or wide string literal";
	      break;

	    case FK_NarrowStringIntoWideCharArray:
	      OS << "narrow string into wide char array";
	      break;

	    case FK_WideStringIntoCharArray:
	      OS << "wide string into char array";
	      break;

	    case FK_IncompatWideStringIntoWideChar:
	      OS << "incompatible wide string into wide char array";
	      break;

	    case FK_ArrayTypeMismatch:
	      OS << "array type mismatch";
	      break;

	    case FK_NonConstantArrayInit:
	      OS << "non-constant array initializer";
	      break;

	    case FK_AddressOfOverloadFailed:
	      OS << "address of overloaded function failed";
	      break;

	    case FK_ReferenceInitOverloadFailed:
	      OS << "overload resolution for reference initialization failed";
	      break;

	    case FK_NonConstLValueReferenceBindingToTemporary:
	      OS << "non-const lvalue reference bound to temporary";
	      break;

	    case FK_NonConstLValueReferenceBindingToBitfield:
	      OS << "non-const lvalue reference bound to bit-field";
	      break;

	    case FK_NonConstLValueReferenceBindingToVectorElement:
	      OS << "non-const lvalue reference bound to vector element";
	      break;

	    case FK_NonConstLValueReferenceBindingToUnrelated:
	      OS << "non-const lvalue reference bound to unrelated type";
	      break;

	    case FK_RValueReferenceBindingToLValue:
	      OS << "rvalue reference bound to an lvalue";
	      break;

	    case FK_ReferenceInitDropsQualifiers:
	      OS << "reference initialization drops qualifiers";
	      break;

	    case FK_ReferenceInitFailed:
	      OS << "reference initialization failed";
	      break;

	    case FK_ConversionFailed:
	      OS << "conversion failed";
	      break;

	    case FK_ConversionFromPropertyFailed:
	      OS << "conversion from property failed";
	      break;

	    case FK_TooManyInitsForScalar:
	      OS << "too many initializers for scalar";
	      break;

	    case FK_ParenthesizedListInitForScalar:
	      // This is the scalar variant; it used to print the same text as
	      // FK_ParenthesizedListInitForReference.
	      OS << "parenthesized list init for scalar";
	      break;

	    case FK_ReferenceBindingToInitList:
	      OS << "reference binding to initializer list";
	      break;

	    case FK_InitListBadDestinationType:
	      OS << "initializer list for non-aggregate, non-scalar type";
	      break;

	    case FK_UserConversionOverloadFailed:
	      OS << "overloading failed for user-defined conversion";
	      break;

	    case FK_ConstructorOverloadFailed:
	      OS << "constructor overloading failed";
	      break;

	    case FK_DefaultInitOfConst:
	      OS << "default initialization of a const variable";
	      break;

	    case FK_Incomplete:
	      OS << "initialization of incomplete type";
	      break;

	    case FK_ListInitializationFailed:
	      OS << "list initialization checker failure";
	      break;

	    case FK_VariableLengthArrayHasInitializer:
	      OS << "variable length array has an initializer";
	      break;

	    case FK_PlaceholderType:
	      OS << "initializer expression isn't contextually valid";
	      break;

	    case FK_ListConstructorOverloadFailed:
	      OS << "list constructor overloading failed";
	      break;

	    case FK_ExplicitConstructor:
	      OS << "list copy initialization chose explicit constructor";
	      break;
	    }
	    OS << '\n';
	    return;
	  }

	  case DependentSequence:
	    OS << "Dependent sequence\n";
	    return;

	  case NormalSequence:
	    OS << "Normal sequence: ";
	    break;
	  }

	  // Only a normal sequence reaches this point: print its steps in order.
	  for (step_iterator S = step_begin(), SEnd = step_end(); S != SEnd; ++S) {
	    if (S != step_begin()) {
	      OS << " -> ";
	    }

	    switch (S->Kind) {
	    case SK_ResolveAddressOfOverloadedFunction:
	      OS << "resolve address of overloaded function";
	      break;

	    case SK_CastDerivedToBaseRValue:
	      OS << "derived-to-base (rvalue)";
	      break;

	    case SK_CastDerivedToBaseXValue:
	      OS << "derived-to-base (xvalue)";
	      break;

	    case SK_CastDerivedToBaseLValue:
	      OS << "derived-to-base (lvalue)";
	      break;

	    case SK_BindReference:
	      OS << "bind reference to lvalue";
	      break;

	    case SK_BindReferenceToTemporary:
	      OS << "bind reference to a temporary";
	      break;

	    case SK_FinalCopy:
	      OS << "final copy in class direct-initialization";
	      break;

	    case SK_ExtraneousCopyToTemporary:
	      OS << "extraneous C++03 copy to temporary";
	      break;

	    case SK_UserConversion:
	      OS << "user-defined conversion via " << *S->Function.Function;
	      break;

	    case SK_QualificationConversionRValue:
	      OS << "qualification conversion (rvalue)";
	      break;

	    case SK_QualificationConversionXValue:
	      OS << "qualification conversion (xvalue)";
	      break;

	    case SK_QualificationConversionLValue:
	      OS << "qualification conversion (lvalue)";
	      break;

	    case SK_AtomicConversion:
	      OS << "non-atomic-to-atomic conversion";
	      break;

	    case SK_LValueToRValue:
	      OS << "load (lvalue to rvalue)";
	      break;

	    case SK_ConversionSequence:
	      OS << "implicit conversion sequence (";
	      S->ICS->dump(); // FIXME: use OS
	      OS << ")";
	      break;

	    case SK_ConversionSequenceNoNarrowing:
	      OS << "implicit conversion sequence with narrowing prohibited (";
	      S->ICS->dump(); // FIXME: use OS
	      OS << ")";
	      break;

	    case SK_ListInitialization:
	      OS << "list aggregate initialization";
	      break;

	    case SK_UnwrapInitList:
	      OS << "unwrap reference initializer list";
	      break;

	    case SK_RewrapInitList:
	      OS << "rewrap reference initializer list";
	      break;

	    case SK_ConstructorInitialization:
	      OS << "constructor initialization";
	      break;

	    case SK_ConstructorInitializationFromList:
	      OS << "list initialization via constructor";
	      break;

	    case SK_ZeroInitialization:
	      OS << "zero initialization";
	      break;

	    case SK_CAssignment:
	      OS << "C assignment";
	      break;

	    case SK_StringInit:
	      OS << "string initialization";
	      break;

	    case SK_ObjCObjectConversion:
	      OS << "Objective-C object conversion";
	      break;

	    case SK_ArrayLoopIndex:
	      OS << "indexing for array initialization loop";
	      break;

	    case SK_ArrayLoopInit:
	      OS << "array initialization loop";
	      break;

	    case SK_ArrayInit:
	      OS << "array initialization";
	      break;

	    case SK_GNUArrayInit:
	      OS << "array initialization (GNU extension)";
	      break;

	    case SK_ParenthesizedArrayInit:
	      OS << "parenthesized array initialization";
	      break;

	    case SK_PassByIndirectCopyRestore:
	      OS << "pass by indirect copy and restore";
	      break;

	    case SK_PassByIndirectRestore:
	      OS << "pass by indirect restore";
	      break;

	    case SK_ProduceObjCObject:
	      OS << "Objective-C object retention";
	      break;

	    case SK_StdInitializerList:
	      OS << "std::initializer_list from initializer list";
	      break;

	    case SK_StdInitializerListConstructorCall:
	      OS << "list initialization from std::initializer_list";
	      break;

	    case SK_OCLSamplerInit:
	      OS << "OpenCL sampler_t from integer constant";
	      break;

	    case SK_OCLZeroEvent:
	      OS << "OpenCL event_t from zero";
	      break;

	    case SK_OCLZeroQueue:
	      OS << "OpenCL queue_t from zero";
	      break;
	    }

	    OS << " [" << S->Type.getAsString() << ']';
	  }

	  OS << '\n';
	}

	void InitializationSequence::dump() const {
	dump(llvm::errs());
	}

	/// Diagnose a narrowing conversion inside an initializer list and, where a
	/// spelling for the target type is available, attach a note with a fix-it
	/// that wraps the initializer in a static_cast.
	///
	/// \param S                Semantic analyzer used to emit diagnostics.
	/// \param ICS              The implicit conversion sequence that was applied.
	///                         Only standard conversions and the trailing
	///                         standard conversion of a user-defined conversion
	///                         are inspected; other kinds return immediately.
	/// \param PreNarrowingType Type of the initializer before the conversion.
	/// \param EntityType       Type of the entity being initialized.
	/// \param PostInit         The initializer expression after conversion; its
	///                         source range locates the diagnostics.
	static void DiagnoseNarrowingInInitList(Sema &S,
	                                        const ImplicitConversionSequence &ICS,
	                                        QualType PreNarrowingType,
	                                        QualType EntityType,
	                                        const Expr *PostInit) {
	  const StandardConversionSequence *SCS = nullptr;
	  switch (ICS.getKind()) {
	  case ImplicitConversionSequence::StandardConversion:
	    SCS = &ICS.Standard;
	    break;
	  case ImplicitConversionSequence::UserDefinedConversion:
	    SCS = &ICS.UserDefined.After;
	    break;
	  case ImplicitConversionSequence::AmbiguousConversion:
	  case ImplicitConversionSequence::EllipsisConversion:
	  case ImplicitConversionSequence::BadConversion:
	    return;
	  }

	  // With Microsoft extensions enabled, or outside C++11, the warn_*
	  // diagnostics are used; otherwise the ext_* diagnostics are used.
	  const bool UseWarningDiag =
	      S.getLangOpts().MicrosoftExt || !S.getLangOpts().CPlusPlus11;

	  // C++11 [dcl.init.list]p7: Check whether this is a narrowing conversion.
	  APValue ConstantValue;
	  QualType ConstantType;
	  switch (SCS->getNarrowingKind(S.Context, PostInit, ConstantValue,
	                                ConstantType)) {
	  case NK_Not_Narrowing:
	  case NK_Dependent_Narrowing:
	    // No narrowing occurred.
	    return;

	  case NK_Type_Narrowing:
	    // This was a floating-to-integer conversion, which is always considered a
	    // narrowing conversion even if the value is a constant and can be
	    // represented exactly as an integer.
	    S.Diag(PostInit->getLocStart(),
	           UseWarningDiag ? diag::warn_init_list_type_narrowing
	                          : diag::ext_init_list_type_narrowing)
	        << PostInit->getSourceRange()
	        << PreNarrowingType.getLocalUnqualifiedType()
	        << EntityType.getLocalUnqualifiedType();
	    break;

	  case NK_Constant_Narrowing:
	    // A constant value was narrowed.
	    S.Diag(PostInit->getLocStart(),
	           UseWarningDiag ? diag::warn_init_list_constant_narrowing
	                          : diag::ext_init_list_constant_narrowing)
	        << PostInit->getSourceRange()
	        << ConstantValue.getAsString(S.getASTContext(), ConstantType)
	        << EntityType.getLocalUnqualifiedType();
	    break;

	  case NK_Variable_Narrowing:
	    // A variable's value may have been narrowed.
	    S.Diag(PostInit->getLocStart(),
	           UseWarningDiag ? diag::warn_init_list_variable_narrowing
	                          : diag::ext_init_list_variable_narrowing)
	        << PostInit->getSourceRange()
	        << PreNarrowingType.getLocalUnqualifiedType()
	        << EntityType.getLocalUnqualifiedType();
	    break;
	  }

	  // Build the "static_cast<T>(" prefix for the silencing fix-it.
	  SmallString<128> StaticCast;
	  llvm::raw_svector_ostream OS(StaticCast);
	  OS << "static_cast<";
	  if (const TypedefType *TT = EntityType->getAs<TypedefType>()) {
	    // It's important to use the typedef's name if there is one so that the
	    // fixit doesn't break code using types like int64_t.
	    //
	    // FIXME: This will break if the typedef requires qualification. But
	    // getQualifiedNameAsString() includes non-machine-parsable components.
	    OS << *TT->getDecl();
	  } else if (const BuiltinType *BT = EntityType->getAs<BuiltinType>())
	    OS << BT->getName(S.getLangOpts());
	  else {
	    // Oops, we didn't find the actual type of the variable. Don't emit a fixit
	    // with a broken cast.
	    return;
	  }
	  OS << ">(";
	  S.Diag(PostInit->getLocStart(), diag::note_init_list_narrowing_silence)
	      << PostInit->getSourceRange()
	      << FixItHint::CreateInsertion(PostInit->getLocStart(), OS.str())
	      << FixItHint::CreateInsertion(
	             S.getLocForEndOfToken(PostInit->getLocEnd()), ")");
	}

	//===----------------------------------------------------------------------===//
	// Initialization helper functions
	//===----------------------------------------------------------------------===//
	bool
	Sema::CanPerformCopyInitialization(const InitializedEntity &Entity,
	ExprResult Init) {
	if (Init.isInvalid())
	return false;

	Expr *InitE = Init.get();
	assert(InitE && "No initialization expression");

	InitializationKind Kind
	= InitializationKind::CreateCopy(InitE->getLocStart(), SourceLocation());
	InitializationSequence Seq(*this, Entity, Kind, InitE);
	return !Seq.Failed();
	}

	/// Copy-initialize \p Entity from \p Init and return the resulting
	/// expression.
	///
	/// \param Entity             The entity being initialized.
	/// \param EqualLoc           Location of the '='; when invalid, the start of
	///                           the initializer is used instead.
	/// \param Init               The initializer; an invalid result is
	///                           propagated as ExprError().
	/// \param TopLevelOfInitList Forwarded to the InitializationSequence.
	/// \param AllowExplicit      Forwarded to InitializationKind::CreateCopy.
	///
	/// While a parameter is being copy-initialized through a constructor, its
	/// type is kept on CurrentParameterCopyTypes. Meeting the same type again
	/// marks the sequence as failed instead of recursing.
	ExprResult
	Sema::PerformCopyInitialization(const InitializedEntity &Entity,
	                                SourceLocation EqualLoc,
	                                ExprResult Init,
	                                bool TopLevelOfInitList,
	                                bool AllowExplicit) {
	  if (Init.isInvalid())
	    return ExprError();

	  Expr *Source = Init.get();
	  assert(Source && "No initialization expression?");

	  if (EqualLoc.isInvalid())
	    EqualLoc = Source->getLocStart();

	  InitializationKind CopyKind = InitializationKind::CreateCopy(
	      Source->getLocStart(), EqualLoc, AllowExplicit);
	  InitializationSequence Sequence(*this, Entity, CopyKind, Source,
	                                  TopLevelOfInitList);

	  // Prevent infinite recursion when performing parameter copy-initialization.
	  const bool TrackingParamCopy =
	      Entity.isParameterKind() && Sequence.isConstructorInitialization();
	  if (TrackingParamCopy) {
	    const bool AlreadyCopying =
	        llvm::find(CurrentParameterCopyTypes, Entity.getType()) !=
	        CurrentParameterCopyTypes.end();
	    if (AlreadyCopying) {
	      Sequence.SetOverloadFailure(
	          InitializationSequence::FK_ConstructorOverloadFailed,
	          OR_No_Viable_Function);

	      // Try to give a meaningful diagnostic note for the problematic
	      // constructor.
	      const auto FinalStep = Sequence.step_end() - 1;
	      assert(FinalStep->Kind ==
	             InitializationSequence::SK_ConstructorInitialization);
	      const FunctionDecl *Ctor = FinalStep->Function.Function;
	      auto Match = llvm::find_if(
	          Sequence.getFailedCandidateSet(),
	          [Ctor](const OverloadCandidate &C) -> bool {
	            return C.Viable && C.Function == Ctor &&
	                   C.Conversions.size() > 0;
	          });
	      if (Match != Sequence.getFailedCandidateSet().end()) {
	        if (Ctor->getNumParams() > 0) {
	          // Turn the selected candidate into a bad-conversion failure on its
	          // first parameter.
	          Match->Viable = false;
	          Match->FailureKind = ovl_fail_bad_conversion;
	          Match->Conversions[0].setBad(BadConversionSequence::no_conversion,
	                                       Source,
	                                       Ctor->getParamDecl(0)->getType());
	        }
	      }
	    }
	    CurrentParameterCopyTypes.push_back(Entity.getType());
	  }

	  ExprResult Outcome = Sequence.Perform(*this, Entity, CopyKind, Source);

	  // Balance the push above now that the initialization has been performed.
	  if (TrackingParamCopy)
	    CurrentParameterCopyTypes.pop_back();

	  return Outcome;
	}

	/// Determine whether RD is, or is derived from, a specialization of CTD.
	///
	/// \param RD  The class to inspect, together with its bases.
	/// \param CTD The class template whose specializations are looked for.
	///
	/// \returns true if \p RD itself or any class visited by forallBases is a
	/// ClassTemplateSpecializationDecl whose specialized template declares the
	/// same entity as \p CTD.
	static bool isOrIsDerivedFromSpecializationOf(CXXRecordDecl *RD,
	                                              ClassTemplateDecl *CTD) {
	  // Predicate is phrased negatively because forallBases answers "does this
	  // hold for every base?"; one failing class is a match.
	  auto NotSpecialization = [&] (const CXXRecordDecl *Candidate) {
	    auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(Candidate);
	    return !CTSD || !declaresSameEntity(CTSD->getSpecializedTemplate(), CTD);
	  };
	  return !(NotSpecialization(RD) && RD->forallBases(NotSpecialization));
	}

	QualType Sema::DeduceTemplateSpecializationFromInitializer(
	TypeSourceInfo *TSInfo, const InitializedEntity &Entity,
	const InitializationKind &Kind, MultiExprArg Inits) {
	auto *DeducedTST = dyn_cast<DeducedTemplateSpecializationType>(
	TSInfo->getType()->getContainedDeducedType());
	assert(DeducedTST && "not a deduced template specialization type");

	// We can only perform deduction for class templates.
	auto TemplateName = DeducedTST->getTemplateName();
	auto *Template =
	dyn_cast_or_null<ClassTemplateDecl>(TemplateName.getAsTemplateDecl());
	if (!Template) {
	Diag(Kind.getLocation(),
	diag::err_deduced_non_class_template_specialization_type)
	<< (int)getTemplateNameKindForDiagnostics(TemplateName) << TemplateName;
	if (auto *TD = TemplateName.getAsTemplateDecl())
	Diag(TD->getLocation(), diag::note_template_decl_here);
	return QualType();
	}

	// Can't deduce from dependent arguments.
	if (Expr::hasAnyTypeDependentArguments(Inits))
	return Context.DependentTy;

	// FIXME: Perform "exact type" matching first, per CWG discussion?
	// Or implement this via an implied 'T(T) -> T' deduction guide?

	// FIXME: Do we need/want a std::initializer_list<T> special case?

	// Look up deduction guides, including those synthesized from constructors.
	//
	// C++1z [over.match.class.deduct]p1:
	// A set of functions and function templates is formed comprising:
	// - For each constructor of the class template designated by the
	// template-name, a function template [...]
	// - For each deduction-guide, a function or function template [...]
	DeclarationNameInfo NameInfo(
	Context.DeclarationNames.getCXXDeductionGuideName(Template),
	TSInfo->getTypeLoc().getEndLoc());
	LookupResult Guides(*this, NameInfo, LookupOrdinaryName);
	LookupQualifiedName(Guides, Template->getDeclContext());

	// FIXME: Do not diagnose inaccessible deduction guides. The standard isn't
	// clear on this, but they're not found by name so access does not apply.
	Guides.suppressDiagnostics();

	// Figure out if this is list-initialization.
	InitListExpr *ListInit =
	(Inits.size() == 1 && Kind.getKind() != InitializationKind::IK_Direct)
	? dyn_cast<InitListExpr>(Inits[0])
	: nullptr;

	// C++1z [over.match.class.deduct]p1:
	// Initialization and overload resolution are performed as described in
	// [dcl.init] and [over.match.ctor], [over.match.copy], or [over.match.list]
	// (as appropriate for the type of initialization performed) for an object
	// of a hypothetical class type, where the selected functions and function
	// templates are considered to be the constructors of that class type
	//
	// Since we know we're initializing a class type of a type unrelated to that
	// of the initializer, this reduces to something fairly reasonable.
	OverloadCandidateSet Candidates(Kind.getLocation(),
	OverloadCandidateSet::CSK_Normal);
	OverloadCandidateSet::iterator Best;
	auto tryToResolveOverload =
	[&](bool OnlyListConstructors) -> OverloadingResult {
	Candidates.clear(OverloadCandidateSet::CSK_Normal);
	for (auto I = Guides.begin(), E = Guides.end(); I != E; ++I) {
	NamedDecl D = (I)->getUnderlyingDecl();
	if (D->isInvalidDecl())
	continue;

	auto *TD = dyn_cast<FunctionTemplateDecl>(D);
	auto *GD = dyn_cast_or_null<CXXDeductionGuideDecl>(
	TD ? TD->getTemplatedDecl() : dyn_cast<FunctionDecl>(D));
	if (!GD)
	continue;

	// C++ [over.match.ctor]p1: (non-list copy-initialization from non-class)
	// For copy-initialization, the candidate functions are all the
	// converting constructors (12.3.1) of that class.
	// C++ [over.match.copy]p1: (non-list copy-initialization from class)
	// The converting constructors of T are candidate functions.
	if (Kind.isCopyInit() && !ListInit) {
	// Only consider converting constructors.
	if (GD->isExplicit())
	continue;

	// When looking for a converting constructor, deduction guides that
	// could never be called with one argument are not interesting to
	// check or note.
	if (GD->getMinRequiredArguments() > 1 \|\|
	(GD->getNumParams() == 0 && !GD->isVariadic()))
	continue;
	}

	// C++ [over.match.list]p1.1: (first phase list initialization)
	// Initially, the candidate functions are the initializer-list
	// constructors of the class T
	if (OnlyListConstructors && !isInitListConstructor(GD))
	continue;

	// C++ [over.match.list]p1.2: (second phase list initialization)
	// the candidate functions are all the constructors of the class T
	// C++ [over.match.ctor]p1: (all other cases)
	// the candidate functions are all the constructors of the class of
	// the object being initialized

	// C++ [over.best.ics]p4:
	// When [...] the constructor [...] is a candidate by
	// - [over.match.copy] (in all cases)
	// FIXME: The "second phase of [over.match.list] case can also
	// theoretically happen here, but it's not clear whether we can
	// ever have a parameter of the right type.
	bool SuppressUserConversions = Kind.isCopyInit();

	if (TD)
	AddTemplateOverloadCandidate(TD, I.getPair(), /ExplicitArgs/ nullptr,
	Inits, Candidates,
	SuppressUserConversions);
	else
	AddOverloadCandidate(GD, I.getPair(), Inits, Candidates,
	SuppressUserConversions);
	}
	return Candidates.BestViableFunction(*this, Kind.getLocation(), Best);
	};

	OverloadingResult Result = OR_No_Viable_Function;

	// C++11 [over.match.list]p1, per DR1467: for list-initialization, first
	// try initializer-list constructors.
	if (ListInit) {
	bool TryListConstructors = true;

	// Try list constructors unless the list is empty and the class has one or
	// more default constructors, in which case those constructors win.
	if (!ListInit->getNumInits()) {
	for (NamedDecl *D : Guides) {
	auto *FD = dyn_cast<FunctionDecl>(D->getUnderlyingDecl());
	if (FD && FD->getMinRequiredArguments() == 0) {
	TryListConstructors = false;
	break;
	}
	}
	} else if (ListInit->getNumInits() == 1) {
	// C++ [over.match.class.deduct]:
	// As an exception, the first phase in [over.match.list] (considering
	// initializer-list constructors) is omitted if the initializer list
	// consists of a single expression of type cv U, where U is a
	// specialization of C or a class derived from a specialization of C.
	Expr *E = ListInit->getInit(0);
	auto *RD = E->getType()->getAsCXXRecordDecl();
	if (!isa<InitListExpr>(E) && RD &&
	isOrIsDerivedFromSpecializationOf(RD, Template))
	TryListConstructors = false;
	}

	if (TryListConstructors)
	Result = tryToResolveOverload(/OnlyListConstructor/true);
	// Then unwrap the initializer list and try again considering all
	// constructors.
	Inits = MultiExprArg(ListInit->getInits(), ListInit->getNumInits());
	}

	// If list-initialization fails, or if we're doing any other kind of
	// initialization, we (eventually) consider constructors.
	if (Result == OR_No_Viable_Function)
	Result = tryToResolveOverload(/OnlyListConstructor/false);

	switch (Result) {
	case OR_Ambiguous:
	Diag(Kind.getLocation(), diag::err_deduced_class_template_ctor_ambiguous)
	<< TemplateName;
	// FIXME: For list-initialization candidates, it'd usually be better to
	// list why they were not viable when given the initializer list itself as
	// an argument.
	Candidates.NoteCandidates(*this, OCD_ViableCandidates, Inits);
	return QualType();

	case OR_No_Viable_Function: {
	CXXRecordDecl *Primary =
	cast<ClassTemplateDecl>(Template)->getTemplatedDecl();
	bool Complete =
	isCompleteType(Kind.getLocation(), Context.getTypeDeclType(Primary));
	Diag(Kind.getLocation(),
	Complete ? diag::err_deduced_class_template_ctor_no_viable
	: diag::err_deduced_class_template_incomplete)
	<< TemplateName << !Guides.empty();
	Candidates.NoteCandidates(*this, OCD_AllCandidates, Inits);
	return QualType();
	}

	case OR_Deleted: {
	Diag(Kind.getLocation(), diag::err_deduced_class_template_deleted)
	<< TemplateName;
	NoteDeletedFunction(Best->Function);
	return QualType();
	}

	case OR_Success:
	// C++ [over.match.list]p1:
	// In copy-list-initialization, if an explicit constructor is chosen, the
	// initialization is ill-formed.
	if (Kind.isCopyInit() && ListInit &&
	cast<CXXDeductionGuideDecl>(Best->Function)->isExplicit()) {
	bool IsDeductionGuide = !Best->Function->isImplicit();
	Diag(Kind.getLocation(), diag::err_deduced_class_template_explicit)
	<< TemplateName << IsDeductionGuide;
	Diag(Best->Function->getLocation(),
	diag::note_explicit_ctor_deduction_guide_here)
	<< IsDeductionGuide;
	return QualType();
	}

	// Make sure we didn't select an unusable deduction guide, and mark it
	// as referenced.
	DiagnoseUseOfDecl(Best->Function, Kind.getLocation());
	MarkFunctionReferenced(Kind.getLocation(), Best->Function);
	break;
	}

	// C++ [dcl.type.class.deduct]p1:
	// The placeholder is replaced by the return type of the function selected
	// by overload resolution for class template deduction.
	return SubstAutoType(TSInfo->getType(), Best->Function->getReturnType());
	}
	Index: head/contrib/llvm/tools/clang
	===================================================================
	--- head/contrib/llvm/tools/clang (revision 329409)
	+++ head/contrib/llvm/tools/clang (revision 329410)

	Property changes on: head/contrib/llvm/tools/clang
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/clang/dist-release_60:r328795-329405
	Index: head/contrib/llvm/tools/lld/COFF/PDB.cpp
	===================================================================
	--- head/contrib/llvm/tools/lld/COFF/PDB.cpp (revision 329409)
	+++ head/contrib/llvm/tools/lld/COFF/PDB.cpp (revision 329410)
	@@ -1,997 +1,1022 @@
	//===- PDB.cpp ------------------------------------------------------------===//
	//
	// The LLVM Linker
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "PDB.h"
	#include "Chunks.h"
	#include "Config.h"
	#include "Driver.h"
	#include "SymbolTable.h"
	#include "Symbols.h"
	#include "Writer.h"
	#include "lld/Common/ErrorHandler.h"
	#include "llvm/DebugInfo/CodeView/CVDebugRecord.h"
	#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
	#include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h"
	#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
	#include "llvm/DebugInfo/CodeView/MergingTypeTableBuilder.h"
	#include "llvm/DebugInfo/CodeView/RecordName.h"
	#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
	#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
	#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
	#include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
	#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h"
	#include "llvm/DebugInfo/CodeView/TypeStreamMerger.h"
	#include "llvm/DebugInfo/MSF/MSFBuilder.h"
	#include "llvm/DebugInfo/MSF/MSFCommon.h"
	#include "llvm/DebugInfo/PDB/GenericError.h"
	#include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h"
	#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
	#include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h"
	#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
	#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
	#include "llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h"
	#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
	#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
	#include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
	#include "llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h"
	#include "llvm/DebugInfo/PDB/Native/TpiHashing.h"
	#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
	#include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h"
	#include "llvm/DebugInfo/PDB/PDB.h"
	#include "llvm/Object/COFF.h"
	#include "llvm/Support/BinaryByteStream.h"
	#include "llvm/Support/Endian.h"
	#include "llvm/Support/JamCRC.h"
	#include "llvm/Support/Path.h"
	#include "llvm/Support/ScopedPrinter.h"
	#include <memory>

	using namespace lld;
	using namespace lld::coff;
	using namespace llvm;
	using namespace llvm::codeview;

	using llvm::object::coff_section;

	static ExitOnError ExitOnErr;

	namespace {
	/// Map from type index and item index in a type server PDB to the
	/// corresponding index in the destination PDB.
	struct CVIndexMap {
	// Remapping table consulted for type-index references; also used for item
	// references whenever IsTypeServerMap is false.
	SmallVector<TypeIndex, 0> TPIMap;
	// Remapping table consulted for item-index references, but only when
	// IsTypeServerMap is true.
	SmallVector<TypeIndex, 0> IPIMap;
	// Set when this map was produced by merging a type server PDB.
	bool IsTypeServerMap = false;
	};

	/// Collects CodeView debug information from the linked object files and
	/// assembles it into a PDB through the owned pdb::PDBFileBuilder.
	///
	/// All type tables and the file builder share the single bump allocator owned
	/// by this object.
	class PDBLinker {
	public:
	PDBLinker(SymbolTable *Symtab)
	: Alloc(), Symtab(Symtab), Builder(Alloc), TypeTable(Alloc),
	IDTable(Alloc), GlobalTypeTable(Alloc), GlobalIDTable(Alloc) {}

	/// Emit the basic PDB structure: initial streams, headers, etc.
	void initialize(const llvm::codeview::DebugInfo &BuildId);

	/// Link CodeView from each object file in the symbol table into the PDB.
	void addObjectsToPDB();

	/// Link CodeView from a single object file into the PDB.
	void addObjFile(ObjFile *File);

	/// Produce a mapping from the type and item indices used in the object
	/// file to those in the destination PDB.
	///
	/// If the object file uses a type server PDB (compiled with /Zi), merge TPI
	/// and IPI from the type server PDB and return a map for it. Each unique type
	/// server PDB is merged at most once, so this may return an existing index
	/// mapping.
	///
	/// If the object does not use a type server PDB (compiled with /Z7), we merge
	/// all the type and item records from the .debug$S stream and fill in the
	/// caller-provided ObjectIndexMap.
	- const CVIndexMap &mergeDebugT(ObjFile *File, CVIndexMap &ObjectIndexMap);
	+ Expected<const CVIndexMap&> mergeDebugT(ObjFile *File,
	+ CVIndexMap &ObjectIndexMap);

	/// Merge the type server PDB named by TS, or return the mapping (or the
	/// failure) already recorded for its GUID.
	- const CVIndexMap &maybeMergeTypeServerPDB(ObjFile *File,
	- TypeServer2Record &TS);
	+ Expected<const CVIndexMap&> maybeMergeTypeServerPDB(ObjFile *File,
	+ TypeServer2Record &TS);

	/// Add the section map and section contributions to the PDB.
	void addSections(ArrayRef<OutputSection *> OutputSections,
	ArrayRef<uint8_t> SectionTable);

	/// Record the contribution of chunk C, placed in output section OS, on the
	/// given linker module descriptor.
	void addSectionContrib(pdb::DbiModuleDescriptorBuilder &LinkerModule,
	OutputSection *OS, Chunk *C);

	/// Write the PDB to disk.
	void commit();

	private:
	/// Backing allocator handed to the builder and to every type table below.
	BumpPtrAllocator Alloc;

	SymbolTable *Symtab;

	pdb::PDBFileBuilder Builder;

	/// Type records that will go into the PDB TPI stream.
	MergingTypeTableBuilder TypeTable;

	/// Item records that will go into the PDB IPI stream.
	MergingTypeTableBuilder IDTable;

	/// Type records that will go into the PDB TPI stream (for /DEBUG:GHASH)
	GlobalTypeTableBuilder GlobalTypeTable;

	/// Item records that will go into the PDB IPI stream (for /DEBUG:GHASH)
	GlobalTypeTableBuilder GlobalIDTable;

	/// PDBs use a single global string table for filenames in the file checksum
	/// table.
	DebugStringTableSubsection PDBStrTab;

	llvm::SmallString<128> NativePath;

	std::vector<pdb::SecMapEntry> SectionMap;

	/// Type index mappings of type server PDBs that we've loaded so far.
	std::map<GUID, CVIndexMap> TypeServerIndexMappings;
	+
	+ /// List of TypeServer PDBs which cannot be loaded.
	+ /// Cached to prevent repeated load attempts.
	+ std::set<GUID> MissingTypeServerPDBs;
	};
	}

	/// Returns the first chunk in Sections whose section name equals Name, or
	/// nullptr when no chunk matches.
	static SectionChunk *findByName(ArrayRef<SectionChunk *> Sections,
	                                StringRef Name) {
	  for (SectionChunk *C : Sections)
	    if (C->getSectionName() == Name)
	      return C;
	  return nullptr;
	}

	/// Checks that Data begins with the little-endian debug section magic and
	/// returns the bytes that follow it. Calls fatal(), naming SecName, when the
	/// data is shorter than the magic or the magic does not match.
	static ArrayRef<uint8_t> consumeDebugMagic(ArrayRef<uint8_t> Data,
	                                           StringRef SecName) {
	  // The section magic occupies the first four bytes.
	  const size_t MagicSize = 4;
	  if (Data.size() < MagicSize)
	    fatal(SecName + " too short");

	  uint32_t Magic = support::endian::read32le(Data.data());
	  if (Magic != COFF::DEBUG_SECTION_MAGIC)
	    fatal(SecName + " has an invalid magic");

	  return Data.slice(MagicSize);
	}

	/// Returns the contents of File's debug chunk named SecName with the leading
	/// magic removed, or an empty array when the file has no such chunk.
	static ArrayRef<uint8_t> getDebugSection(ObjFile *File, StringRef SecName) {
	  SectionChunk *DebugSec = findByName(File->getDebugChunks(), SecName);
	  if (!DebugSec)
	    return {};
	  return consumeDebugMagic(DebugSec->getContents(), SecName);
	}

	// .debug$H is a clang extension to COFF, so a section may carry that name by
	// coincidence without holding data we understand. Returns true only when the
	// section starts with a debug_h_header whose magic, version (0) and hash
	// algorithm (SHA1) are the ones we expect, and the bytes after the header
	// divide evenly into 20-byte entries.
	static bool canUseDebugH(ArrayRef<uint8_t> DebugH) {
	  const size_t HeaderSize = sizeof(object::debug_h_header);
	  if (DebugH.size() < HeaderSize)
	    return false;

	  const auto *Header =
	      reinterpret_cast<const object::debug_h_header *>(DebugH.data());
	  if (Header->Magic != COFF::DEBUG_HASHES_SECTION_MAGIC)
	    return false;
	  if (Header->Version != 0)
	    return false;
	  if (Header->HashAlgorithm != uint16_t(GlobalTypeHashAlg::SHA1))
	    return false;

	  // Whatever follows the header must be a whole number of 20-byte entries.
	  ArrayRef<uint8_t> Payload = DebugH.drop_front(HeaderSize);
	  return Payload.size() % 20 == 0;
	}

	/// Returns the raw contents of File's .debug$H chunk when one exists and
	/// passes canUseDebugH(); otherwise returns None.
	static Optional<ArrayRef<uint8_t>> getDebugH(ObjFile *File) {
	  if (SectionChunk *Sec = findByName(File->getDebugChunks(), ".debug$H")) {
	    ArrayRef<uint8_t> Contents = Sec->getContents();
	    if (canUseDebugH(Contents))
	      return Contents;
	  }
	  return None;
	}

	/// Reinterprets the bytes after the debug_h_header as an array of
	/// GloballyHashedType. The caller must pass a section accepted by
	/// canUseDebugH(); this is asserted.
	static ArrayRef<GloballyHashedType>
	getHashesFromDebugH(ArrayRef<uint8_t> DebugH) {
	  assert(canUseDebugH(DebugH));

	  ArrayRef<uint8_t> Payload =
	      DebugH.drop_front(sizeof(object::debug_h_header));
	  const auto *FirstHash =
	      reinterpret_cast<const GloballyHashedType *>(Payload.data());
	  uint32_t Count = Payload.size() / sizeof(GloballyHashedType);
	  return ArrayRef<GloballyHashedType>(FirstHash, Count);
	}

	/// Sets the V80 version header on TpiBuilder, then hashes every record in
	/// TypeTable and appends the record and its hash to the builder. Calls
	/// fatal() if hashing a record fails.
	static void addTypeInfo(pdb::TpiStreamBuilder &TpiBuilder,
	                        TypeCollection &TypeTable) {
	  // Start the TPI or IPI stream header.
	  TpiBuilder.setVersionHeader(pdb::PdbTpiV80);

	  // Hash one in-memory record and hand both to the stream builder.
	  auto HashAndAppend = [&](TypeIndex TI, const CVType &Record) {
	    auto HashOrErr = pdb::hashTypeRecord(Record);
	    if (auto Err = HashOrErr.takeError())
	      fatal("type hashing error");
	    TpiBuilder.addTypeRecord(Record.RecordData, *HashOrErr);
	  };
	  TypeTable.ForEachRecord(HashAndAppend);
	}

	/// If the first record of Types is an LF_TYPESERVER2 record, deserializes and
	/// returns it. Returns None when Types is empty or starts with any other
	/// record kind. Calls fatal() when deserialization fails.
	static Optional<TypeServer2Record>
	maybeReadTypeServerRecord(CVTypeArray &Types) {
	  auto First = Types.begin();
	  if (First == Types.end())
	    return None;

	  const CVType &Record = *First;
	  if (Record.kind() != LF_TYPESERVER2)
	    return None;

	  TypeServer2Record Server;
	  if (auto EC = TypeDeserializer::deserializeAs(const_cast<CVType &>(Record),
	                                                Server))
	    fatal("error reading type server record: " + toString(std::move(EC)));
	  return std::move(Server);
	}

	/// Merge the .debug$T type information of File into the PDB type tables and
	/// return the index map to use for remapping that file's symbol records.
	///
	/// Returns ObjectIndexMap untouched when the file has no .debug$T data. When
	/// the first record is a type server record, the result comes from
	/// maybeMergeTypeServerPDB() instead. Otherwise the records are merged into
	/// the global-hash tables or the plain tables (depending on
	/// Config->DebugGHashes) and ObjectIndexMap.TPIMap is filled in.
	/// Read and merge failures are fatal.
	-const CVIndexMap &PDBLinker::mergeDebugT(ObjFile *File,
	- CVIndexMap &ObjectIndexMap) {
	+Expected<const CVIndexMap&> PDBLinker::mergeDebugT(ObjFile *File,
	+ CVIndexMap &ObjectIndexMap) {
	ArrayRef<uint8_t> Data = getDebugSection(File, ".debug$T");
	if (Data.empty())
	return ObjectIndexMap;

	// Parse the whole section as an array of CodeView type records.
	BinaryByteStream Stream(Data, support::little);
	CVTypeArray Types;
	BinaryStreamReader Reader(Stream);
	if (auto EC = Reader.readArray(Types, Reader.getLength()))
	fatal("Reader::readArray failed: " + toString(std::move(EC)));

	// Look through type servers. If we've already seen this type server, don't
	// merge any type information.
	if (Optional<TypeServer2Record> TS = maybeReadTypeServerRecord(Types))
	return maybeMergeTypeServerPDB(File, *TS);

	// This is a /Z7 object. Fill in the temporary, caller-provided
	// ObjectIndexMap.
	if (Config->DebugGHashes) {
	ArrayRef<GloballyHashedType> Hashes;
	// Backing storage for Hashes when they have to be computed here.
	std::vector<GloballyHashedType> OwnedHashes;
	// Prefer the hashes stored in a usable .debug$H section; otherwise compute
	// them from the type records.
	if (Optional<ArrayRef<uint8_t>> DebugH = getDebugH(File))
	Hashes = getHashesFromDebugH(*DebugH);
	else {
	OwnedHashes = GloballyHashedType::hashTypes(Types);
	Hashes = OwnedHashes;
	}

	if (auto Err = mergeTypeAndIdRecords(GlobalIDTable, GlobalTypeTable,
	ObjectIndexMap.TPIMap, Types, Hashes))
	fatal("codeview::mergeTypeAndIdRecords failed: " +
	toString(std::move(Err)));
	} else {
	if (auto Err = mergeTypeAndIdRecords(IDTable, TypeTable,
	ObjectIndexMap.TPIMap, Types))
	fatal("codeview::mergeTypeAndIdRecords failed: " +
	toString(std::move(Err)));
	}
	return ObjectIndexMap;
	}

	/// Try to open the file at TSPath as a PDB and confirm it is the type server
	/// identified by GuidFromObj.
	///
	/// Returns the native session on success. Returns an error when the file
	/// cannot be read, cannot be opened as a PDB, has no info stream, or carries a
	/// GUID different from GuidFromObj (reported as type_server_not_found).
	static Expected<std::unique_ptr<pdb::NativeSession>>
	tryToLoadPDB(const GUID &GuidFromObj, StringRef TSPath) {
	  ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr = MemoryBuffer::getFile(
	      TSPath, /*FileSize=*/-1, /*RequiresNullTerminator=*/false);
	  if (!MBOrErr)
	    return errorCodeToError(MBOrErr.getError());

	  // The file buffer is handed to the driver; the session is created over a
	  // memory buffer built from what takeBuffer() returns.
	  std::unique_ptr<pdb::IPDBSession> ThisSession;
	  if (auto EC = pdb::NativeSession::createFromPdb(
	          MemoryBuffer::getMemBuffer(Driver->takeBuffer(std::move(*MBOrErr)),
	                                     /*RequiresNullTerminator=*/false),
	          ThisSession))
	    return std::move(EC);

	  std::unique_ptr<pdb::NativeSession> NS(
	      static_cast<pdb::NativeSession *>(ThisSession.release()));
	  pdb::PDBFile &File = NS->getPDBFile();
	  auto ExpectedInfo = File.getPDBInfoStream();
	  // All PDB Files should have an Info stream.
	  if (!ExpectedInfo)
	    return ExpectedInfo.takeError();

	  // Just because a file with a matching name was found and it was an actual
	  // PDB file doesn't mean it matches. For it to match the InfoStream's GUID
	  // must match the GUID specified in the TypeServer2 record.
	  if (ExpectedInfo->getGuid() != GuidFromObj)
	    return make_error<pdb::GenericError>(
	        pdb::generic_error_code::type_server_not_found, TSPath);

	  return std::move(NS);
	}

	/// Merge the TPI and IPI streams of the type server PDB named by TS into the
	/// destination type tables and return the resulting index map.
	///
	/// Results are remembered per GUID: a previously merged PDB returns its
	/// existing map, and a PDB that previously failed to load returns
	/// type_server_not_found without another load attempt. The PDB is looked up
	/// first at the path stored in TS, then next to the object or archive file.
	/// Missing TPI/IPI streams and merge failures are fatal.
	-const CVIndexMap &PDBLinker::maybeMergeTypeServerPDB(ObjFile *File,
	- TypeServer2Record &TS) {
	- // First, check if we already loaded a PDB with this GUID. Return the type
	+Expected<const CVIndexMap&> PDBLinker::maybeMergeTypeServerPDB(ObjFile *File,
	+ TypeServer2Record &TS) {
	+ const GUID& TSId = TS.getGuid();
	+ StringRef TSPath = TS.getName();
	+
	+ // First, check if the PDB has previously failed to load.
	+ if (MissingTypeServerPDBs.count(TSId))
	+ return make_error<pdb::GenericError>(
	+ pdb::generic_error_code::type_server_not_found, TSPath);
	+
	+ // Second, check if we already loaded a PDB with this GUID. Return the type
	// index mapping if we have it.
	- auto Insertion = TypeServerIndexMappings.insert({TS.getGuid(), CVIndexMap()});
	+ auto Insertion = TypeServerIndexMappings.insert({TSId, CVIndexMap()});
	CVIndexMap &IndexMap = Insertion.first->second;
	if (!Insertion.second)
	return IndexMap;

	// Mark this map as a type server map.
	IndexMap.IsTypeServerMap = true;

	// Check for a PDB at:
	// 1. The given file path
	// 2. Next to the object file or archive file
	- auto ExpectedSession = tryToLoadPDB(TS.getGuid(), TS.getName());
	+ auto ExpectedSession = tryToLoadPDB(TSId, TSPath);
	if (!ExpectedSession) {
	consumeError(ExpectedSession.takeError());
	StringRef LocalPath =
	!File->ParentName.empty() ? File->ParentName : File->getName();
	SmallString<128> Path = sys::path::parent_path(LocalPath);
	sys::path::append(
	- Path, sys::path::filename(TS.getName(), sys::path::Style::windows));
	- ExpectedSession = tryToLoadPDB(TS.getGuid(), Path);
	+ Path, sys::path::filename(TSPath, sys::path::Style::windows));
	+ ExpectedSession = tryToLoadPDB(TSId, Path);
	}
	- if (auto E = ExpectedSession.takeError())
	- fatal("Type server PDB was not found: " + toString(std::move(E)));
	+ if (auto E = ExpectedSession.takeError()) {
	+ TypeServerIndexMappings.erase(TSId);
	+ MissingTypeServerPDBs.emplace(TSId);
	+ return std::move(E);
	+ }

	auto ExpectedTpi = (*ExpectedSession)->getPDBFile().getPDBTpiStream();
	if (auto E = ExpectedTpi.takeError())
	fatal("Type server does not have TPI stream: " + toString(std::move(E)));
	auto ExpectedIpi = (*ExpectedSession)->getPDBFile().getPDBIpiStream();
	// Report the stream that is actually missing: this is the IPI lookup.
	if (auto E = ExpectedIpi.takeError())
	fatal("Type server does not have IPI stream: " + toString(std::move(E)));

	if (Config->DebugGHashes) {
	// PDBs do not actually store global hashes, so when merging a type server
	// PDB we have to synthesize global hashes. To do this, we first synthesize
	// global hashes for the TPI stream, since it is independent, then we
	// synthesize hashes for the IPI stream, using the hashes for the TPI stream
	// as inputs.
	auto TpiHashes = GloballyHashedType::hashTypes(ExpectedTpi->typeArray());
	auto IpiHashes =
	GloballyHashedType::hashIds(ExpectedIpi->typeArray(), TpiHashes);

	// Merge TPI first, because the IPI stream will reference type indices.
	if (auto Err = mergeTypeRecords(GlobalTypeTable, IndexMap.TPIMap,
	ExpectedTpi->typeArray(), TpiHashes))
	fatal("codeview::mergeTypeRecords failed: " + toString(std::move(Err)));

	// Merge IPI.
	if (auto Err =
	mergeIdRecords(GlobalIDTable, IndexMap.TPIMap, IndexMap.IPIMap,
	ExpectedIpi->typeArray(), IpiHashes))
	fatal("codeview::mergeIdRecords failed: " + toString(std::move(Err)));
	} else {
	// Merge TPI first, because the IPI stream will reference type indices.
	if (auto Err = mergeTypeRecords(TypeTable, IndexMap.TPIMap,
	ExpectedTpi->typeArray()))
	fatal("codeview::mergeTypeRecords failed: " + toString(std::move(Err)));

	// Merge IPI.
	if (auto Err = mergeIdRecords(IDTable, IndexMap.TPIMap, IndexMap.IPIMap,
	ExpectedIpi->typeArray()))
	fatal("codeview::mergeIdRecords failed: " + toString(std::move(Err)));
	}

	return IndexMap;
	}

	/// Rewrites TI through TypeIndexMap. Simple type indices are left untouched.
	/// Returns false, leaving TI unchanged, when TI's array index lies outside
	/// the map; returns true otherwise.
	static bool remapTypeIndex(TypeIndex &TI, ArrayRef<TypeIndex> TypeIndexMap) {
	  if (!TI.isSimple()) {
	    auto Slot = TI.toArrayIndex();
	    if (Slot >= TypeIndexMap.size())
	      return false;
	    TI = TypeIndexMap[Slot];
	  }
	  return true;
	}

	/// Remaps, in place, every type or item index that TypeRefs locates inside
	/// Contents, using IndexMap.
	///
	/// Calls fatal() when a reference extends past the end of Contents. An index
	/// that cannot be remapped is logged and replaced with
	/// SimpleTypeKind::NotTranslated.
	static void remapTypesInSymbolRecord(ObjFile *File, SymbolKind SymKind,
	                                     MutableArrayRef<uint8_t> Contents,
	                                     const CVIndexMap &IndexMap,
	                                     ArrayRef<TiReference> TypeRefs) {
	  for (const TiReference &Ref : TypeRefs) {
	    unsigned ByteSize = Ref.Count * sizeof(TypeIndex);
	    if (Contents.size() < Ref.Offset + ByteSize)
	      fatal("symbol record too short");

	    // Item indices use the IPI map only for type-server maps; every other
	    // reference goes through the TPI map.
	    bool IsItemIndex = Ref.Kind == TiRefKind::IndexRef;
	    ArrayRef<TypeIndex> TypeOrItemMap = IndexMap.TPIMap;
	    if (IndexMap.IsTypeServerMap && IsItemIndex)
	      TypeOrItemMap = IndexMap.IPIMap;

	    auto *FirstIndex =
	        reinterpret_cast<TypeIndex *>(Contents.data() + Ref.Offset);
	    MutableArrayRef<TypeIndex> Indices(FirstIndex, Ref.Count);
	    for (TypeIndex &TI : Indices) {
	      if (remapTypeIndex(TI, TypeOrItemMap))
	        continue;
	      log("ignoring symbol record of kind 0x" + utohexstr(SymKind) + " in " +
	          File->getName() + " with bad " + (IsItemIndex ? "item" : "type") +
	          " index 0x" + utohexstr(TI.getIndex()));
	      TI = TypeIndex(SimpleTypeKind::NotTranslated);
	    }
	  }
	}

	/// Reads the record kind from the RecordPrefix at the start of RecordData.
	static SymbolKind symbolKind(ArrayRef<uint8_t> RecordData) {
	  const auto *Header =
	      reinterpret_cast<const RecordPrefix *>(RecordData.data());
	  uint16_t RawKind = Header->RecordKind;
	  return static_cast<SymbolKind>(RawKind);
	}

	/// MSVC translates S_PROC_ID_END to S_END, and S_[LG]PROC32_ID to S_[LG]PROC32
	///
	/// Rewrites the record held in RecordData in place. For the procedure kinds
	/// the embedded index is additionally replaced by the second type index
	/// discovered in the referenced IDTable record. Records of any other kind are
	/// left untouched.
	static void translateIdSymbols(MutableArrayRef<uint8_t> &RecordData,
	                               TypeCollection &IDTable) {
	  RecordPrefix *Prefix = reinterpret_cast<RecordPrefix *>(RecordData.data());

	  SymbolKind Kind = symbolKind(RecordData);

	  if (Kind == SymbolKind::S_PROC_ID_END) {
	    Prefix->RecordKind = SymbolKind::S_END;
	    return;
	  }

	  // In an object file, GPROC32_ID has an embedded reference which refers to the
	  // single object file type index namespace. This has already been translated
	  // to the PDB file's ID stream index space, but we need to convert this to a
	  // symbol that refers to the type stream index space. So we remap again from
	  // ID index space to type index space.
	  if (Kind == SymbolKind::S_GPROC32_ID || Kind == SymbolKind::S_LPROC32_ID) {
	    SmallVector<TiReference, 1> Refs;
	    auto Content = RecordData.drop_front(sizeof(RecordPrefix));
	    CVSymbol Sym(Kind, RecordData);
	    discoverTypeIndicesInSymbol(Sym, Refs);
	    assert(Refs.size() == 1);
	    assert(Refs.front().Count == 1);

	    TypeIndex *TI =
	        reinterpret_cast<TypeIndex *>(Content.data() + Refs[0].Offset);
	    // `TI` is the index of a FuncIdRecord or MemberFuncIdRecord which lives in
	    // the IPI stream, whose `FunctionType` member refers to the TPI stream.
	    // Note that LF_FUNC_ID and LF_MEMFUNC_ID have the same record layout, and
	    // in both cases we just need the second type index.
	    if (!TI->isSimple() && !TI->isNoneType()) {
	      CVType FuncIdData = IDTable.getType(*TI);
	      SmallVector<TypeIndex, 2> Indices;
	      discoverTypeIndices(FuncIdData, Indices);
	      assert(Indices.size() == 2);
	      *TI = Indices[1];
	    }

	    Kind = (Kind == SymbolKind::S_GPROC32_ID) ? SymbolKind::S_GPROC32
	                                              : SymbolKind::S_LPROC32;
	    Prefix->RecordKind = uint16_t(Kind);
	  }
	}

	/// Copy the symbol record. In a PDB, symbol records must be 4 byte aligned.
	/// The object file may not be aligned.
	///
	/// The copy is allocated from Alloc with its size rounded up to the PDB
	/// container alignment; padding bytes are zeroed and the copied prefix's
	/// RecordLen is updated to the padded size minus two. Returns the writable
	/// copy.
	static MutableArrayRef<uint8_t> copySymbolForPdb(const CVSymbol &Sym,
	                                                 BumpPtrAllocator &Alloc) {
	  size_t Size = alignTo(Sym.length(), alignOf(CodeViewContainer::Pdb));
	  assert(Size >= 4 && "record too short");
	  assert(Size <= MaxRecordLength && "record too long");
	  void *Mem = Alloc.Allocate(Size, 4);

	  // Copy the symbol record and zero out any padding bytes.
	  MutableArrayRef<uint8_t> NewData(reinterpret_cast<uint8_t *>(Mem), Size);
	  memcpy(NewData.data(), Sym.data().data(), Sym.length());
	  memset(NewData.data() + Sym.length(), 0, Size - Sym.length());

	  // Update the record prefix length. It should point to the beginning of the
	  // next record.
	  auto *Prefix = reinterpret_cast<RecordPrefix *>(Mem);
	  Prefix->RecordLen = Size - 2;
	  return NewData;
	}

	/// Tells whether a symbol of the given kind opens a scope. Scope-opening
	/// symbols carry "parent" and "end" fields holding the offset of the matching
	/// S_END or S_INLINESITE_END record.
	static bool symbolOpensScope(SymbolKind Kind) {
	  switch (Kind) {
	  default:
	    return false;
	  case SymbolKind::S_GPROC32:
	  case SymbolKind::S_LPROC32:
	  case SymbolKind::S_LPROC32_ID:
	  case SymbolKind::S_GPROC32_ID:
	  case SymbolKind::S_BLOCK32:
	  case SymbolKind::S_SEPCODE:
	  case SymbolKind::S_THUNK32:
	  case SymbolKind::S_INLINESITE:
	  case SymbolKind::S_INLINESITE2:
	    return true;
	  }
	}

	/// Tells whether a symbol of the given kind closes a scope: S_END,
	/// S_PROC_ID_END or S_INLINESITE_END.
	static bool symbolEndsScope(SymbolKind Kind) {
	  return Kind == SymbolKind::S_END || Kind == SymbolKind::S_PROC_ID_END ||
	         Kind == SymbolKind::S_INLINESITE_END;
	}

	/// View of the first two fields of a scope-opening symbol's content.
	/// scopeStackOpen() overlays this struct on the record bytes, so the field
	/// order and sizes must not change.
	struct ScopeRecord {
	// Written by scopeStackOpen(): ScopeOffset of the enclosing scope, or 0 when
	// the scope stack is empty.
	ulittle32_t PtrParent;
	// Written by scopeStackClose(): the offset passed in when the scope closes.
	ulittle32_t PtrEnd;
	};

	/// One entry of the open-scope stack maintained while merging symbol records.
	struct SymbolScope {
	// The opening record's scope fields, patched when the scope is closed.
	ScopeRecord *OpeningRecord;
	// Offset recorded when the scope was opened; becomes PtrParent of scopes
	// opened while this one is on top of the stack.
	uint32_t ScopeOffset;
	};

	/// Pushes a new scope for the scope-opening symbol Sym located at CurOffset.
	/// The symbol's PtrParent field is set to the offset of the scope currently on
	/// top of Stack, or to 0 when the stack is empty.
	static void scopeStackOpen(SmallVectorImpl<SymbolScope> &Stack,
	                           uint32_t CurOffset, CVSymbol &Sym) {
	  assert(symbolOpensScope(Sym.kind()));

	  // The scope fields sit at the start of the symbol content; cast away const
	  // so they can be patched in place.
	  const auto *ReadOnlyFields =
	      reinterpret_cast<const ScopeRecord *>(Sym.content().data());
	  auto *Fields = const_cast<ScopeRecord *>(ReadOnlyFields);

	  SymbolScope Entry;
	  Entry.ScopeOffset = CurOffset;
	  Entry.OpeningRecord = Fields;
	  Entry.OpeningRecord->PtrParent =
	      Stack.empty() ? 0 : Stack.back().ScopeOffset;
	  Stack.push_back(Entry);
	}

	/// Pops the innermost open scope and stores CurOffset in its PtrEnd field.
	/// When no scope is open, warns that the scopes in File are unbalanced and
	/// does nothing else.
	static void scopeStackClose(SmallVectorImpl<SymbolScope> &Stack,
	                            uint32_t CurOffset, ObjFile *File) {
	  if (!Stack.empty()) {
	    SymbolScope Closed = Stack.pop_back_val();
	    Closed.OpeningRecord->PtrEnd = CurOffset;
	    return;
	  }
	  warn("symbol scopes are not balanced in " + File->getName());
	}

	/// Decides whether Sym is written to the module's symbol stream. Every kind
	/// is, except S_GDATA32, S_CONSTANT, S_UDT, S_PROCREF and S_LPROCREF.
	static bool symbolGoesInModuleStream(const CVSymbol &Sym) {
	  switch (Sym.kind()) {
	  default:
	    // Includes S_LDATA32: unlike S_GDATA32 it does go in the module stream.
	    return true;
	  case SymbolKind::S_GDATA32:
	  case SymbolKind::S_CONSTANT:
	  case SymbolKind::S_UDT:
	  // S_PROCREF and S_LPROCREF should not show up here at all, because the
	  // linker synthesizes them in response to S_GPROC32 and S_LPROC32. If they
	  // do show up, keep them out of the module stream.
	  case SymbolKind::S_PROCREF:
	  case SymbolKind::S_LPROCREF:
	    return false;
	  }
	}

	/// Decides whether Sym contributes to the globals stream. True for
	/// S_CONSTANT, S_GDATA32, S_LDATA32, S_GPROC32, S_LPROC32, S_PROCREF and
	/// S_LPROCREF; false for everything else, S_UDT included.
	static bool symbolGoesInGlobalsStream(const CVSymbol &Sym) {
	  switch (Sym.kind()) {
	  default:
	    // FIXME: This also drops every S_UDT symbol (they go in neither the
	    // globals stream nor the module stream). They need special handling that
	    // has not been worked out yet; putting them all in the globals stream
	    // makes WinDbg fail to display local variables of class types, saying it
	    // cannot find the type Foo *. Dropping them is a stopgap.
	    return false;
	  case SymbolKind::S_CONSTANT:
	  case SymbolKind::S_GDATA32:
	  // S_LDATA32 goes in both the module stream and the globals stream.
	  case SymbolKind::S_LDATA32:
	  case SymbolKind::S_GPROC32:
	  case SymbolKind::S_LPROC32:
	  // S_PROCREF and S_LPROCREF should not show up here at all, because the
	  // linker synthesizes them in response to S_GPROC32 and S_LPROC32. If they
	  // do show up, copy them straight through.
	  case SymbolKind::S_PROCREF:
	  case SymbolKind::S_LPROCREF:
	    return true;
	  }
	}

	/// Adds the globals-stream entry for Sym to Builder.
	///
	/// Data, constant, UDT and procedure-reference symbols are added as they are.
	/// For S_GPROC32 / S_LPROC32 a procedure reference is built instead, pointing
	/// at the next symbol offset of File's module. Any other kind is a programming
	/// error.
	static void addGlobalSymbol(pdb::GSIStreamBuilder &Builder, ObjFile &File,
	                            const CVSymbol &Sym) {
	  const SymbolKind Kind = Sym.kind();
	  switch (Kind) {
	  case SymbolKind::S_GPROC32:
	  case SymbolKind::S_LPROC32: {
	    ProcRefSym PS(Kind == SymbolKind::S_LPROC32
	                      ? SymbolRecordKind::LocalProcRef
	                      : SymbolRecordKind::ProcRefSym);
	    PS.Module = static_cast<uint16_t>(File.ModuleDBI->getModuleIndex());
	    // For some reason, MSVC seems to add one to this value.
	    ++PS.Module;
	    PS.Name = getSymbolName(Sym);
	    PS.SumName = 0;
	    PS.SymOffset = File.ModuleDBI->getNextSymbolOffset();
	    Builder.addGlobalSymbol(PS);
	    return;
	  }
	  case SymbolKind::S_CONSTANT:
	  case SymbolKind::S_UDT:
	  case SymbolKind::S_GDATA32:
	  case SymbolKind::S_LDATA32:
	  case SymbolKind::S_PROCREF:
	  case SymbolKind::S_LPROCREF:
	    Builder.addGlobalSymbol(Sym);
	    return;
	  default:
	    llvm_unreachable("Invalid symbol kind!");
	  }
	}

	/// Reads every symbol record from SymData and merges it into the PDB.
	///
	/// Each record is copied, has its type and item indices remapped through
	/// IndexMap, is translated from its S_xxx_ID form, gets its scope fields
	/// filled in, and is then added to the globals stream and/or File's module
	/// stream. Records whose type-index layout is unknown are logged and skipped.
	static void mergeSymbolRecords(BumpPtrAllocator &Alloc, ObjFile *File,
	                               pdb::GSIStreamBuilder &GsiBuilder,
	                               const CVIndexMap &IndexMap,
	                               TypeCollection &IDTable,
	                               BinaryStreamRef SymData) {
	  // FIXME: Improve error recovery by warning and skipping records when
	  // possible.
	  CVSymbolArray Syms;
	  BinaryStreamReader Reader(SymData);
	  ExitOnErr(Reader.readArray(Syms, Reader.getLength()));

	  SmallVector<SymbolScope, 4> OpenScopes;
	  for (CVSymbol Sym : Syms) {
	    // Without knowing where the type indices live, the record cannot be
	    // remapped; skip it.
	    SmallVector<TiReference, 32> TypeRefs;
	    if (!discoverTypeIndicesInSymbol(Sym, TypeRefs)) {
	      log("ignoring unknown symbol record with kind 0x" + utohexstr(Sym.kind()));
	      continue;
	    }

	    // Work on a private, mutable copy of the record.
	    MutableArrayRef<uint8_t> RecordCopy = copySymbolForPdb(Sym, Alloc);

	    // Remap the indices in the part of the copy that follows the prefix.
	    MutableArrayRef<uint8_t> Payload =
	        RecordCopy.drop_front(sizeof(RecordPrefix));
	    remapTypesInSymbolRecord(File, Sym.kind(), Payload, IndexMap, TypeRefs);

	    // Object files may contain S_xxx_ID symbols; a PDB stores the "real"
	    // symbols instead.
	    translateIdSymbols(RecordCopy, IDTable);

	    SymbolKind MergedKind = symbolKind(RecordCopy);
	    CVSymbol Merged(MergedKind, RecordCopy);

	    // Maintain the scope stack so "Parent" and "End" fields get filled in.
	    if (symbolOpensScope(MergedKind)) {
	      scopeStackOpen(OpenScopes, File->ModuleDBI->getNextSymbolOffset(),
	                     Merged);
	    } else if (symbolEndsScope(MergedKind)) {
	      scopeStackClose(OpenScopes, File->ModuleDBI->getNextSymbolOffset(),
	                      File);
	    }

	    // The globals stream comes first: it may need the next symbol offset,
	    // which writing to the module's symbol stream would advance.
	    if (symbolGoesInGlobalsStream(Merged))
	      addGlobalSymbol(GsiBuilder, *File, Merged);

	    if (symbolGoesInModuleStream(Merged))
	      File->ModuleDBI->addSymbol(Merged);
	  }
	}

	// Copies a .debug$S chunk into freshly allocated memory via writeTo() and
	// returns the bytes that consumeDebugMagic() yields for that buffer.
	static ArrayRef<uint8_t> relocateDebugChunk(BumpPtrAllocator &Alloc,
	                                            SectionChunk *DebugChunk) {
	  const size_t Size = DebugChunk->getSize();
	  uint8_t *Storage = Alloc.Allocate<uint8_t>(Size);
	  assert(DebugChunk->OutputSectionOff == 0 &&
	         "debug sections should not be in output sections");
	  DebugChunk->writeTo(Storage);

	  ArrayRef<uint8_t> Relocated = makeArrayRef(Storage, Size);
	  return consumeDebugMagic(Relocated, ".debug$S");
	}

	// Adds one object file to the PDB: registers a module descriptor for it,
	// merges its type information, and then processes every live .debug$S
	// section (string table, file checksums, line tables and symbol records).
	//
	// NOTE(review): the lines below that start with "-" or "+" are unified-diff
	// markers recording a change to how the result of mergeDebugT() is handled;
	// they are not part of the C++ text itself.
	void PDBLinker::addObjFile(ObjFile *File) {
	// Add a module descriptor for every object file. We need to put an absolute
	// path to the object into the PDB. If this is a plain object, we make its
	// path absolute. If it's an object in an archive, we make the archive path
	// absolute.
	bool InArchive = !File->ParentName.empty();
	SmallString<128> Path = InArchive ? File->ParentName : File->getName();
	sys::fs::make_absolute(Path);
	sys::path::native(Path, sys::path::Style::windows);
	// For archive members the module is named after the member, while Path
	// (the archive) is recorded as the object file name below.
	StringRef Name = InArchive ? File->getName() : StringRef(Path);

	File->ModuleDBI = &ExitOnErr(Builder.getDbiBuilder().addModuleInfo(Name));
	File->ModuleDBI->setObjFileName(Path);

	// Before we can process symbol substreams from .debug$S, we need to process
	// type information, file checksums, and the string table. Add type info to
	// the PDB first, so that we can get the map from object file type and item
	// indices to PDB type and item indices.
	CVIndexMap ObjectIndexMap;
	- const CVIndexMap &IndexMap = mergeDebugT(File, ObjectIndexMap);
	+ auto IndexMapResult = mergeDebugT(File, ObjectIndexMap);
	+
	+ // If the .debug$T sections fail to merge, assume there is no debug info.
	+ if (!IndexMapResult) {
	+ warn("Type server PDB for " + Name + " is invalid, ignoring debug info. " +
	+ toString(IndexMapResult.takeError()));
	+ return;
	+ }
	+
	+ const CVIndexMap &IndexMap = *IndexMapResult;

	// Now do all live .debug$S sections.
	for (SectionChunk *DebugChunk : File->getDebugChunks()) {
	if (!DebugChunk->isLive() \|\| DebugChunk->getSectionName() != ".debug$S")
	continue;

	ArrayRef<uint8_t> RelocatedDebugContents =
	relocateDebugChunk(Alloc, DebugChunk);
	if (RelocatedDebugContents.empty())
	continue;

	DebugSubsectionArray Subsections;
	BinaryStreamReader Reader(RelocatedDebugContents, support::little);
	ExitOnErr(Reader.readArray(Subsections, RelocatedDebugContents.size()));

	// The string table and checksum subsections are only remembered here; they
	// are consumed after the loop, once both have been seen.
	DebugStringTableSubsectionRef CVStrTab;
	DebugChecksumsSubsectionRef Checksums;
	for (const DebugSubsectionRecord &SS : Subsections) {
	switch (SS.kind()) {
	case DebugSubsectionKind::StringTable:
	ExitOnErr(CVStrTab.initialize(SS.getRecordData()));
	break;
	case DebugSubsectionKind::FileChecksums:
	ExitOnErr(Checksums.initialize(SS.getRecordData()));
	break;
	case DebugSubsectionKind::Lines:
	// We can add the relocated line table directly to the PDB without
	// modification because the file checksum offsets will stay the same.
	File->ModuleDBI->addDebugSubsection(SS);
	break;
	case DebugSubsectionKind::Symbols:
	// The two calls differ only in the ID table that is passed along.
	if (Config->DebugGHashes) {
	mergeSymbolRecords(Alloc, File, Builder.getGsiBuilder(), IndexMap,
	GlobalIDTable, SS.getRecordData());
	} else {
	mergeSymbolRecords(Alloc, File, Builder.getGsiBuilder(), IndexMap,
	IDTable, SS.getRecordData());
	}
	break;
	default:
	// FIXME: Process the rest of the subsections.
	break;
	}
	}

	if (Checksums.valid()) {
	// Make a new file checksum table that refers to offsets in the PDB-wide
	// string table. Generally the string table subsection appears after the
	// checksum table, so we have to do this after looping over all the
	// subsections.
	if (!CVStrTab.valid())
	fatal(".debug$S sections must have both a string table subsection "
	"and a checksum subsection table or neither");
	auto NewChecksums = make_unique<DebugChecksumsSubsection>(PDBStrTab);
	for (FileChecksumEntry &FC : Checksums) {
	// Each checksum entry also registers its file as a source file of the
	// module.
	StringRef FileName = ExitOnErr(CVStrTab.getString(FC.FileNameOffset));
	ExitOnErr(Builder.getDbiBuilder().addModuleSourceFile(*File->ModuleDBI,
	FileName));
	NewChecksums->addChecksum(FileName, FC.Kind, FC.Checksum);
	}
	File->ModuleDBI->addDebugSubsection(std::move(NewChecksums));
	}
	}
	}

	// Builds the S_PUB32 record describing Def. The record carries the symbol's
	// name, a Function flag where applicable, and its position expressed as an
	// output section index plus an offset relative to that section's RVA.
	static PublicSym32 createPublic(Defined *Def) {
	  PublicSym32 Sym(SymbolKind::S_PUB32);
	  Sym.Name = Def->getName();

	  // A COFF definition is a function if its symbol says so; an import thunk
	  // always is. Anything else keeps the default flags.
	  bool IsFunction;
	  if (auto *COFFDef = dyn_cast<DefinedCOFF>(Def))
	    IsFunction = COFFDef->getCOFFSymbol().isFunctionDefinition();
	  else
	    IsFunction = isa<DefinedImportThunk>(Def);
	  if (IsFunction)
	    Sym.Flags = PublicSymFlags::Function;

	  OutputSection *Sec = Def->getChunk()->getOutputSection();
	  assert(Sec && "all publics should be in final image");
	  Sym.Segment = Sec->SectionIndex;
	  Sym.Offset = Def->getRVA() - Sec->getRVA();
	  return Sym;
	}

	// Add all object files to the PDB. Merge .debug$T sections into IpiData and
	// TpiData.
	void PDBLinker::addObjectsToPDB() {
	for (ObjFile *File : ObjFile::Instances)
	addObjFile(File);

	Builder.getStringTableBuilder().setStrings(PDBStrTab);

	// Construct TPI and IPI stream contents.
	if (Config->DebugGHashes) {
	addTypeInfo(Builder.getTpiBuilder(), GlobalTypeTable);
	addTypeInfo(Builder.getIpiBuilder(), GlobalIDTable);
	} else {
	addTypeInfo(Builder.getTpiBuilder(), TypeTable);
	addTypeInfo(Builder.getIpiBuilder(), IDTable);
	}

	// Compute the public and global symbols.
	auto &GsiBuilder = Builder.getGsiBuilder();
	std::vector<PublicSym32> Publics;
	Symtab->forEachSymbol([&Publics](Symbol *S) {
	// Only emit defined, live symbols that have a chunk.
	auto *Def = dyn_cast<Defined>(S);
	if (Def && Def->isLive() && Def->getChunk())
	Publics.push_back(createPublic(Def));
	});

	if (!Publics.empty()) {
	// Sort the public symbols and add them to the stream.
	std::sort(Publics.begin(), Publics.end(),
	[](const PublicSym32 &L, const PublicSym32 &R) {
	return L.Name < R.Name;
	});
	for (const PublicSym32 &Pub : Publics)
	GsiBuilder.addPublicSymbol(Pub);
	}
	}

	// Adds the three records every "* Linker *" module carries to Mod: an object
	// name record, a compile record describing the linker, and an environment
	// block holding the cwd, linker executable, PDB path and command line.
	static void addCommonLinkerModuleSymbols(StringRef Path,
	                                         pdb::DbiModuleDescriptorBuilder &Mod,
	                                         BumpPtrAllocator &Allocator) {
	  // Object name record.
	  ObjNameSym ObjName(SymbolRecordKind::ObjNameSym);
	  ObjName.Name = "* Linker *";
	  ObjName.Signature = 0;
	  Mod.addSymbol(codeview::SymbolSerializer::writeOneSymbol(
	      ObjName, Allocator, CodeViewContainer::Pdb));

	  // Compile record.
	  Compile3Sym Compile(SymbolRecordKind::Compile3Sym);
	  Compile.Machine = Config->is64() ? CPUType::X64 : CPUType::Intel80386;
	  Compile.Flags = CompileSym3Flags::None;

	  // Interestingly, if the backend version is 0.0.0.0, WinDbg reports that
	  // private symbols are not present when asked for local variables. A valid
	  // MSVC linker version makes locals display properly, so even though it
	  // does not represent LLVM's version we need it for compatibility.
	  Compile.VersionBackendMajor = 14;
	  Compile.VersionBackendMinor = 10;
	  Compile.VersionBackendBuild = 25019;
	  Compile.VersionBackendQFE = 0;

	  // MSVC also sets the frontend version to 0.0.0.0 because this module is
	  // specifically the linker (by definition a backend). The linker name can
	  // be "LLVM Linker" without any problems; only the backend version has to
	  // be a hardcoded magic number.
	  Compile.VersionFrontendMajor = 0;
	  Compile.VersionFrontendMinor = 0;
	  Compile.VersionFrontendBuild = 0;
	  Compile.VersionFrontendQFE = 0;
	  Compile.Version = "LLVM Linker";
	  Compile.setLanguage(SourceLanguage::Link);
	  Mod.addSymbol(codeview::SymbolSerializer::writeOneSymbol(
	      Compile, Allocator, CodeViewContainer::Pdb));

	  // Environment block: alternating key/value fields. The strings referenced
	  // by the fields stay alive until the record has been serialized below.
	  ArrayRef<StringRef> LinkerArgs = makeArrayRef(Config->Argv).drop_front();
	  std::string CommandLine = llvm::join(LinkerArgs, " ");
	  SmallString<64> Cwd;
	  sys::fs::current_path(Cwd);
	  SmallString<64> Exe = Config->Argv[0];
	  llvm::sys::fs::make_absolute(Exe);

	  EnvBlockSym Env(SymbolRecordKind::EnvBlockSym);
	  Env.Fields.push_back("cwd");
	  Env.Fields.push_back(Cwd);
	  Env.Fields.push_back("exe");
	  Env.Fields.push_back(Exe);
	  Env.Fields.push_back("pdb");
	  Env.Fields.push_back(Path);
	  Env.Fields.push_back("cmd");
	  Env.Fields.push_back(CommandLine);
	  Mod.addSymbol(codeview::SymbolSerializer::writeOneSymbol(
	      Env, Allocator, CodeViewContainer::Pdb));
	}

	// Appends a section record describing output section OS to the module Mod.
	static void addLinkerModuleSectionSymbol(pdb::DbiModuleDescriptorBuilder &Mod,
	                                         OutputSection &OS,
	                                         BumpPtrAllocator &Allocator) {
	  SectionSym Sec(SymbolRecordKind::SectionSym);
	  Sec.Name = OS.getName();
	  Sec.SectionNumber = OS.SectionIndex;
	  Sec.Rva = OS.getRVA();
	  Sec.Length = OS.getVirtualSize();
	  Sec.Characteristics = OS.getCharacteristics();
	  Sec.Alignment = 12; // 2^12 = 4KB

	  Mod.addSymbol(codeview::SymbolSerializer::writeOneSymbol(
	      Sec, Allocator, CodeViewContainer::Pdb));
	}

	// Creates a PDB file by running the PDBLinker stages in order: initialize,
	// add objects, add sections, commit.
	void coff::createPDB(SymbolTable *Symtab,
	                     ArrayRef<OutputSection *> OutputSections,
	                     ArrayRef<uint8_t> SectionTable,
	                     const llvm::codeview::DebugInfo &BuildId) {
	  PDBLinker Linker(Symtab);

	  Linker.initialize(BuildId);
	  Linker.addObjectsToPDB();
	  Linker.addSections(OutputSections, SectionTable);
	  Linker.commit();
	}

	// Prepares the PDB builder: reserves the predefined streams, fills in the
	// Info stream from BuildId, and sets up an empty DBI stream.
	void PDBLinker::initialize(const llvm::codeview::DebugInfo &BuildId) {
	  ExitOnErr(Builder.initialize(4096)); // 4096 is blocksize

	  // Create streams in MSF for predefined streams, namely
	  // PDB, TPI, DBI and IPI.
	  for (int Remaining = (int)pdb::kSpecialStreamCount; Remaining > 0;
	       --Remaining)
	    ExitOnErr(Builder.getMsfBuilder().addStream(0));

	  // Add an Info stream. The GUID is a byte-wise copy of the build id
	  // signature.
	  GUID Guid;
	  memcpy(&Guid, &BuildId.PDB70.Signature, sizeof(Guid));

	  auto &Info = Builder.getInfoBuilder();
	  Info.setAge(BuildId.PDB70.Age);
	  Info.setGuid(Guid);
	  Info.setSignature(time(nullptr));
	  Info.setVersion(pdb::PdbRaw_ImplVer::PdbImplVC70);

	  // Add an empty DBI stream.
	  pdb::DbiStreamBuilder &Dbi = Builder.getDbiBuilder();
	  Dbi.setAge(BuildId.PDB70.Age);
	  Dbi.setVersionHeader(pdb::PdbDbiV70);
	  ExitOnErr(Dbi.addDbgStream(pdb::DbgHeaderType::NewFPO, {}));
	}

	void PDBLinker::addSectionContrib(pdb::DbiModuleDescriptorBuilder &LinkerModule,
	OutputSection OS, Chunk C) {
	pdb::SectionContrib SC;
	memset(&SC, 0, sizeof(SC));
	SC.ISect = OS->SectionIndex;
	SC.Off = C->getRVA() - OS->getRVA();
	SC.Size = C->getSize();
	if (auto *SecChunk = dyn_cast<SectionChunk>(C)) {
	SC.Characteristics = SecChunk->Header->Characteristics;
	SC.Imod = SecChunk->File->ModuleDBI->getModuleIndex();
	ArrayRef<uint8_t> Contents = SecChunk->getContents();
	JamCRC CRC(0);
	ArrayRef<char> CharContents = makeArrayRef(
	reinterpret_cast<const char *>(Contents.data()), Contents.size());
	CRC.update(CharContents);
	SC.DataCrc = CRC.getCRC();
	} else {
	SC.Characteristics = OS->getCharacteristics();
	// FIXME: When we start creating DBI for import libraries, use those here.
	SC.Imod = LinkerModule.getModuleIndex();
	}
	SC.RelocCrc = 0; // FIXME
	Builder.getDbiBuilder().addSectionContrib(SC);
	}

	// Creates the "* Linker *" module and describes the image layout in the DBI
	// stream: per-section symbols, section contributions, the section map and
	// the COFF section header stream.
	void PDBLinker::addSections(ArrayRef<OutputSection *> OutputSections,
	                            ArrayRef<uint8_t> SectionTable) {
	  pdb::DbiStreamBuilder &Dbi = Builder.getDbiBuilder();

	  // Register the absolute, Windows-style PDB path as an EC name. It's not
	  // entirely clear what this is, but the * Linker * module uses it.
	  NativePath = Config->PDBPath;
	  sys::fs::make_absolute(NativePath);
	  sys::path::native(NativePath, sys::path::Style::windows);
	  const uint32_t PathNameIndex = Dbi.addECName(NativePath);

	  auto &LinkerModule = ExitOnErr(Dbi.addModuleInfo("* Linker *"));
	  LinkerModule.setPdbFilePathNI(PathNameIndex);
	  addCommonLinkerModuleSymbols(NativePath, LinkerModule, Alloc);

	  // Add section contributions. They must be ordered by ascending RVA.
	  for (OutputSection *Sec : OutputSections) {
	    addLinkerModuleSectionSymbol(LinkerModule, *Sec, Alloc);
	    for (Chunk *Piece : Sec->getChunks())
	      addSectionContrib(LinkerModule, Sec, Piece);
	  }

	  // Add Section Map stream. The raw section table bytes are viewed as an
	  // array of COFF section headers.
	  const auto *Headers =
	      (const object::coff_section *)SectionTable.data();
	  const size_t NumHeaders = SectionTable.size() / sizeof(object::coff_section);
	  ArrayRef<object::coff_section> Sections = {Headers, NumHeaders};
	  SectionMap = pdb::DbiStreamBuilder::createSectionMap(Sections);
	  Dbi.setSectionMap(SectionMap);

	  // Add COFF section header stream.
	  ExitOnErr(Dbi.addDbgStream(pdb::DbgHeaderType::SectionHdr, SectionTable));
	}

	// Writes the PDB to the path given by Config->PDBPath; a failure is passed
	// to ExitOnErr.
	void PDBLinker::commit() {
	  auto &OutputPath = Config->PDBPath;
	  ExitOnErr(Builder.commit(OutputPath));
	}
	Index: head/contrib/llvm/tools/lld/ELF/Driver.cpp
	===================================================================
	--- head/contrib/llvm/tools/lld/ELF/Driver.cpp (revision 329409)
	+++ head/contrib/llvm/tools/lld/ELF/Driver.cpp (revision 329410)
	@@ -1,1128 +1,1128 @@
	//===- Driver.cpp ---------------------------------------------------------===//
	//
	// The LLVM Linker
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// The driver drives the entire linking process. It is responsible for
	// parsing command line options and doing whatever it is instructed to do.
	//
	// One notable thing in the LLD's driver when compared to other linkers is
	// that the LLD's driver is agnostic on the host operating system.
	// Other linkers usually have implicit default values (such as a dynamic
	// linker path or library paths) for each host OS.
	//
	// I don't think implicit default values are useful because they are
	// usually explicitly specified by the compiler driver. They can even
	// be harmful when you are doing cross-linking. Therefore, in LLD, we
	// simply trust the compiler driver to pass all required options and
	// don't try to make effort on our side.
	//
	//===----------------------------------------------------------------------===//

	#include "Driver.h"
	#include "Config.h"
	#include "Filesystem.h"
	#include "ICF.h"
	#include "InputFiles.h"
	#include "InputSection.h"
	#include "LinkerScript.h"
	#include "OutputSections.h"
	#include "ScriptParser.h"
	#include "Strings.h"
	#include "SymbolTable.h"
	#include "Symbols.h"
	#include "SyntheticSections.h"
	#include "Target.h"
	#include "Writer.h"
	#include "lld/Common/Args.h"
	#include "lld/Common/Driver.h"
	#include "lld/Common/ErrorHandler.h"
	#include "lld/Common/Memory.h"
	#include "lld/Common/Threads.h"
	#include "lld/Common/Version.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compression.h"
	#include "llvm/Support/Path.h"
	#include "llvm/Support/TarWriter.h"
	#include "llvm/Support/TargetSelect.h"
	#include "llvm/Support/raw_ostream.h"
	#include <cstdlib>
	#include <utility>

	using namespace llvm;
	using namespace llvm::ELF;
	using namespace llvm::object;
	using namespace llvm::sys;

	using namespace lld;
	using namespace lld::elf;

	// Global configuration and driver objects. elf::link() assigns a fresh
	// instance to each of them at the start of every link.
	Configuration *elf::Config;
	LinkerDriver *elf::Driver;

	// Forward declaration so that LinkerDriver::main() can call it.
	static void setConfigs();

	// Runs one complete link. Args is the full command line including argv[0],
	// and diagnostics go to Error. Returns true when no error was reported; when
	// Config->ExitEarly is set the process exits here instead of returning.
	bool elf::link(ArrayRef<const char *> Args, bool CanExitEarly,
	               raw_ostream &Error) {
	  // Set up the error handler before anything can report a diagnostic.
	  auto &Handler = errorHandler();
	  Handler.LogName = Args[0];
	  Handler.ErrorLimitExceededMsg =
	      "too many errors emitted, stopping now (use "
	      "-error-limit=0 to see all errors)";
	  Handler.ErrorOS = &Error;
	  Handler.ColorDiagnostics = Error.has_colors();

	  // Reset the global state left behind by a previous invocation.
	  InputSections.clear();
	  OutputSections.clear();
	  Tar = nullptr;
	  BinaryFiles.clear();
	  BitcodeFiles.clear();
	  ObjectFiles.clear();
	  SharedFiles.clear();

	  // Create fresh global objects for this link.
	  Config = make<Configuration>();
	  Driver = make<LinkerDriver>();
	  Script = make<LinkerScript>();
	  Symtab = make<SymbolTable>();
	  Config->Argv = {Args.begin(), Args.end()};

	  Driver->main(Args, CanExitEarly);

	  // Exit immediately if we don't need to return to the caller.
	  // This saves time because the overhead of calling destructors
	  // for all globally-allocated objects is not negligible.
	  if (Config->ExitEarly) {
	    const int ExitCode = errorCount() ? 1 : 0;
	    exitLld(ExitCode);
	  }

	  freeArena();
	  return errorCount() == 0;
	}

	// Parses the value of a linker -m option into (ELF kind, machine, OS ABI).
	// A trailing "_fbsd" selects the FreeBSD OS ABI and is stripped before the
	// emulation name is looked up. An unknown name is reported as an error and
	// yields ELFNoneKind / EM_NONE.
	static std::tuple<ELFKind, uint16_t, uint8_t> parseEmulation(StringRef Emul) {
	  StringRef Name = Emul;
	  uint8_t OSABI = 0;
	  if (Name.consume_back("_fbsd"))
	    OSABI = ELFOSABI_FREEBSD;

	  using KindAndMachine = std::pair<ELFKind, uint16_t>;
	  KindAndMachine Found =
	      StringSwitch<KindAndMachine>(Name)
	          .Cases("aarch64elf", "aarch64linux", {ELF64LEKind, EM_AARCH64})
	          .Cases("armelf", "armelf_linux_eabi", {ELF32LEKind, EM_ARM})
	          .Case("elf32_x86_64", {ELF32LEKind, EM_X86_64})
	          .Cases("elf32btsmip", "elf32btsmipn32", {ELF32BEKind, EM_MIPS})
	          .Cases("elf32ltsmip", "elf32ltsmipn32", {ELF32LEKind, EM_MIPS})
	          .Case("elf32ppc", {ELF32BEKind, EM_PPC})
	          .Case("elf64btsmip", {ELF64BEKind, EM_MIPS})
	          .Case("elf64ltsmip", {ELF64LEKind, EM_MIPS})
	          .Case("elf64ppc", {ELF64BEKind, EM_PPC64})
	          .Cases("elf_amd64", "elf_x86_64", {ELF64LEKind, EM_X86_64})
	          .Case("elf_i386", {ELF32LEKind, EM_386})
	          .Case("elf_iamcu", {ELF32LEKind, EM_IAMCU})
	          .Default({ELFNoneKind, EM_NONE});

	  const ELFKind Kind = Found.first;
	  const uint16_t Machine = Found.second;
	  if (Kind == ELFNoneKind)
	    error("unknown emulation: " + Emul);
	  return std::make_tuple(Kind, Machine, OSABI);
	}

	// Parses MB as an archive and returns one (buffer, child offset) pair per
	// member file. When a --reproduce tar is active, members of thin archives
	// are also appended to it.
	static std::vector<std::pair<MemoryBufferRef, uint64_t>> getArchiveMembers(
	    MemoryBufferRef MB) {
	  std::unique_ptr<Archive> Ar =
	      CHECK(Archive::create(MB),
	            MB.getBufferIdentifier() + ": failed to parse archive");

	  const bool AddToTar = Ar->isThin() && Tar;
	  std::vector<std::pair<MemoryBufferRef, uint64_t>> Members;
	  Error Err = Error::success();
	  for (const ErrorOr<Archive::Child> &ChildOrErr : Ar->children(Err)) {
	    Archive::Child Child =
	        CHECK(ChildOrErr, MB.getBufferIdentifier() +
	                              ": could not get the child of the archive");
	    MemoryBufferRef ChildBuf =
	        CHECK(Child.getMemoryBufferRef(),
	              MB.getBufferIdentifier() +
	                  ": could not get the buffer for a child of the archive");
	    if (AddToTar)
	      Tar->append(relativeToRoot(check(Child.getFullName())),
	                  ChildBuf.getBuffer());
	    Members.push_back(std::make_pair(ChildBuf, Child.getChildOffset()));
	  }
	  if (Err)
	    fatal(MB.getBufferIdentifier() + ": Archive::children failed: " +
	          toString(std::move(Err)));

	  // Take ownership of memory buffers created for members of thin archives.
	  for (std::unique_ptr<MemoryBuffer> &Owned : Ar->takeThinBuffers())
	    make<std::unique_ptr<MemoryBuffer>>(std::move(Owned));

	  return Members;
	}

	// Opens the file at Path and appends the matching input file object(s) to
	// Files. Path has to be resolved already. WithLOption tells whether the file
	// was named through -l, which changes the default soname given to a DSO.
	void LinkerDriver::addFile(StringRef Path, bool WithLOption) {
	  using namespace sys::fs;

	  Optional<MemoryBufferRef> MaybeBuffer = readFile(Path);
	  if (!MaybeBuffer.hasValue())
	    return;
	  MemoryBufferRef MBRef = *MaybeBuffer;

	  // In binary input mode the contents are never inspected.
	  if (InBinary) {
	    Files.push_back(make<BinaryFile>(MBRef));
	    return;
	  }

	  switch (identify_magic(MBRef.getBuffer())) {
	  case file_magic::unknown:
	    // Anything we cannot identify is treated as a linker script.
	    readLinkerScript(MBRef);
	    return;

	  case file_magic::archive: {
	    // Handle -whole-archive.
	    if (InWholeArchive) {
	      for (const auto &Member : getArchiveMembers(MBRef))
	        Files.push_back(createObjectFile(Member.first, Path, Member.second));
	      return;
	    }

	    std::unique_ptr<Archive> Ar =
	        CHECK(Archive::create(MBRef), Path + ": failed to parse archive");

	    // If an archive file has no symbol table, it is likely that a user
	    // is attempting LTO and using a default ar command that doesn't
	    // understand the LLVM bitcode file. It is a pretty common error, so
	    // we'll handle it as if it had a symbol table.
	    const bool MissingSymtab = !Ar->isEmpty() && !Ar->hasSymbolTable();
	    if (MissingSymtab) {
	      for (const auto &Member : getArchiveMembers(MBRef))
	        Files.push_back(make<LazyObjFile>(Member.first, Path, Member.second));
	      return;
	    }

	    // Handle the regular case.
	    Files.push_back(make<ArchiveFile>(std::move(Ar)));
	    return;
	  }

	  case file_magic::elf_shared_object: {
	    if (Config->Relocatable) {
	      error("attempted static link of dynamic object " + Path);
	      return;
	    }

	    // DSOs usually have DT_SONAME tags in their ELF headers, and the
	    // sonames are used to identify DSOs. But if they are missing,
	    // they are identified by filenames. We don't know whether the new
	    // file has a DT_SONAME or not because we haven't parsed it yet.
	    // Here, we set the default soname for the file because we might
	    // need it later.
	    //
	    // If a file was specified by -lfoo, the directory part is not
	    // significant, as a user did not specify it. This behavior is
	    // compatible with GNU.
	    StringRef DefaultSoName = WithLOption ? path::filename(Path) : Path;
	    Files.push_back(createSharedFile(MBRef, DefaultSoName));
	    return;
	  }

	  default:
	    // Everything else is handled as an object file, lazily when InLib is
	    // set.
	    if (InLib)
	      Files.push_back(make<LazyObjFile>(MBRef, "", 0));
	    else
	      Files.push_back(createObjectFile(MBRef));
	  }
	}

	// Adds the library called Name (as given to -l) by looking it up in the
	// input search paths. Reports an error when no matching file is found.
	void LinkerDriver::addLibrary(StringRef Name) {
	  // searchLibrary() yields an Optional; it must be dereferenced to obtain
	  // the path that addFile() expects.
	  if (Optional<std::string> Path = searchLibrary(Name))
	    addFile(*Path, /*WithLOption=*/true);
	  else
	    error("unable to find library -l" + Name);
	}

	// Called on startup. LTO needs this because it calls LLVM functions to
	// compile bitcode files to native code. Technically the work could be
	// delayed until bitcode files are read, but the initialization is fast, so
	// we do not bother doing it lazily.
	static void initLLVM(opt::InputArgList &Args) {
	  InitializeAllTargets();
	  InitializeAllTargetMCs();
	  InitializeAllAsmPrinters();
	  InitializeAllAsmParsers();

	  // Parse and evaluate -mllvm options. The first element plays the role of
	  // the program name in the synthetic command line.
	  std::vector<const char *> LLVMArgv;
	  LLVMArgv.push_back("lld (LLVM option parsing)");
	  for (auto *MllvmArg : Args.filtered(OPT_mllvm))
	    LLVMArgv.push_back(MllvmArg->getValue());
	  cl::ParseCommandLineOptions(LLVMArgv.size(), LLVMArgv.data());
	}

	// Reports an error for every option, or combination of options, that is
	// not allowed.
	static void checkOptions(opt::InputArgList &Args) {
	  // The MIPS ABI as of 2016 does not support the GNU-style symbol lookup
	  // table which is a relatively new feature.
	  if (Config->EMachine == EM_MIPS && Config->GnuHash)
	    error("the .gnu.hash section is not compatible with the MIPS target.");

	  if (Config->FixCortexA53Errata843419 && Config->EMachine != EM_AARCH64)
	    error("--fix-cortex-a53-843419 is only supported on AArch64 targets.");

	  if (Config->Pie && Config->Shared)
	    error("-shared and -pie may not be used together");

	  // Options that only make sense for shared output.
	  if (!Config->Shared) {
	    if (!Config->FilterList.empty())
	      error("-F may not be used without -shared");
	    if (!Config->AuxiliaryList.empty())
	      error("-f may not be used without -shared");
	  }

	  if (!Config->Relocatable && !Config->DefineCommon)
	    error("-no-define-common not supported in non relocatable output");

	  // The remaining checks only apply to -r.
	  if (!Config->Relocatable)
	    return;

	  if (Config->Shared)
	    error("-r and -shared may not be used together");
	  if (Config->GcSections)
	    error("-r and --gc-sections may not be used together");
	  if (Config->ICF)
	    error("-r and --icf may not be used together");
	  if (Config->Pie)
	    error("-r and -pie may not be used together");
	}

	// Returns the value of the last --reproduce option, or the LLD_REPRODUCE
	// environment variable (possibly null) when the option is absent.
	static const char *getReproduceOption(opt::InputArgList &Args) {
	  auto *Reproduce = Args.getLastArg(OPT_reproduce);
	  if (!Reproduce)
	    return getenv("LLD_REPRODUCE");
	  return Reproduce->getValue();
	}

	// Returns true if any -z option on the command line has the value Key.
	static bool hasZOption(opt::InputArgList &Args, StringRef Key) {
	  for (auto *ZArg : Args.filtered(OPT_z)) {
	    if (Key == ZArg->getValue())
	      return true;
	  }
	  return false;
	}

	// Entry point of the driver. Parses ArgsArr (argv[0] included), handles
	// -help, -v/-version and --reproduce, then reads the configuration, creates
	// the input files and dispatches to the link<ELFT>() instantiation selected
	// by Config->EKind. Returns early, without linking, when only help or
	// version output was requested or when errors were reported.
	void LinkerDriver::main(ArrayRef<const char *> ArgsArr, bool CanExitEarly) {
	  ELFOptTable Parser;
	  opt::InputArgList Args = Parser.parse(ArgsArr.slice(1));

	  // Interpret this flag early because error() depends on them.
	  errorHandler().ErrorLimit = args::getInteger(Args, OPT_error_limit, 20);

	  // Handle -help
	  if (Args.hasArg(OPT_help)) {
	    printHelp(ArgsArr[0]);
	    return;
	  }

	  // Handle -v or -version.
	  //
	  // A note about "compatible with GNU linkers" message: this is a hack for
	  // scripts generated by GNU Libtool 2.4.6 (released in February 2014 and
	  // still the newest version in March 2017) or earlier to recognize LLD as
	  // a GNU compatible linker. As long as an output for the -v option
	  // contains "GNU" or "with BFD", they recognize us as GNU-compatible.
	  //
	  // This is somewhat ugly hack, but in reality, we had no choice other
	  // than doing this. Considering the very long release cycle of Libtool,
	  // it is not easy to improve it to recognize LLD as a GNU compatible
	  // linker in a timely manner. Even if we can make it, there are still a
	  // lot of "configure" scripts out there that are generated by old version
	  // of Libtool. We cannot convince every software developer to migrate to
	  // the latest version and re-generate scripts. So we have this hack.
	  if (Args.hasArg(OPT_v) || Args.hasArg(OPT_version))
	    message(getLLDVersion() + " (compatible with GNU linkers)");

	  // The behavior of -v or --version is a bit strange, but this is
	  // needed for compatibility with GNU linkers.
	  if (Args.hasArg(OPT_v) && !Args.hasArg(OPT_INPUT))
	    return;
	  if (Args.hasArg(OPT_version))
	    return;

	  Config->ExitEarly = CanExitEarly && !Args.hasArg(OPT_full_shutdown);
	  errorHandler().ExitEarly = Config->ExitEarly;

	  if (const char *Path = getReproduceOption(Args)) {
	    // Note that --reproduce is a debug option so you can ignore it
	    // if you are trying to understand the whole picture of the code.
	    Expected<std::unique_ptr<TarWriter>> ErrOrWriter =
	        TarWriter::create(Path, path::stem(Path));
	    if (ErrOrWriter) {
	      Tar = ErrOrWriter->get();
	      Tar->append("response.txt", createResponseFile(Args));
	      Tar->append("version.txt", getLLDVersion() + "\n");
	      // Hand ownership of the writer to an object created with make<>(),
	      // so that the raw Tar pointer stays usable.
	      make<std::unique_ptr<TarWriter>>(std::move(*ErrOrWriter));
	    } else {
	      error(Twine("--reproduce: failed to open ") + Path + ": " +
	            toString(ErrOrWriter.takeError()));
	    }
	  }

	  readConfigs(Args);
	  initLLVM(Args);
	  createFiles(Args);
	  inferMachineType();
	  setConfigs();
	  checkOptions(Args);
	  if (errorCount())
	    return;

	  switch (Config->EKind) {
	  case ELF32LEKind:
	    link<ELF32LE>(Args);
	    return;
	  case ELF32BEKind:
	    link<ELF32BE>(Args);
	    return;
	  case ELF64LEKind:
	    link<ELF64LE>(Args);
	    return;
	  case ELF64BEKind:
	    link<ELF64BE>(Args);
	    return;
	  default:
	    llvm_unreachable("unknown Config->EKind");
	  }
	}

	// Joins the values of all -rpath options into one colon-separated string.
	static std::string getRpath(opt::InputArgList &Args) {
	  std::vector<StringRef> Dirs = args::getStrings(Args, OPT_rpath);
	  std::string Joined = llvm::join(Dirs.begin(), Dirs.end(), ":");
	  return Joined;
	}

	// Determines what we should do if there are remaining unresolved
	// symbols after the name resolution.
	//
	// -r always ignores them. Otherwise the last of --unresolved-symbols,
	// --no-undefined and -z defs decides; when none of them is given, -shared
	// ignores unresolved symbols and everything else reports them, as an error
	// or a warning depending on --error/--warn-unresolved-symbols.
	static UnresolvedPolicy getUnresolvedSymbolPolicy(opt::InputArgList &Args) {
	  if (Args.hasArg(OPT_relocatable))
	    return UnresolvedPolicy::IgnoreAll;

	  UnresolvedPolicy ErrorOrWarn = Args.hasFlag(OPT_error_unresolved_symbols,
	                                              OPT_warn_unresolved_symbols, true)
	                                     ? UnresolvedPolicy::ReportError
	                                     : UnresolvedPolicy::Warn;

	  // Process the last of -unresolved-symbols, -no-undefined or -z defs.
	  // Walking the arguments in reverse makes the first decisive hit the last
	  // one on the command line.
	  for (auto *Arg : llvm::reverse(Args)) {
	    switch (Arg->getOption().getID()) {
	    case OPT_unresolved_symbols: {
	      StringRef S = Arg->getValue();
	      if (S == "ignore-all" || S == "ignore-in-object-files")
	        return UnresolvedPolicy::Ignore;
	      if (S == "ignore-in-shared-libs" || S == "report-all")
	        return ErrorOrWarn;
	      error("unknown --unresolved-symbols value: " + S);
	      continue;
	    }
	    case OPT_no_undefined:
	      return ErrorOrWarn;
	    case OPT_z:
	      // Only "-z defs" is decisive; other -z values are skipped.
	      if (StringRef(Arg->getValue()) == "defs")
	        return ErrorOrWarn;
	      continue;
	    }
	  }

	  // -shared implies -unresolved-symbols=ignore-all because missing
	  // symbols are likely to be resolved at runtime using other DSOs.
	  if (Config->Shared)
	    return UnresolvedPolicy::Ignore;
	  return ErrorOrWarn;
	}

	// Maps the --target2 value ("got-rel" when the option is absent) to a
	// Target2Policy. An unknown value is reported and treated as "got-rel".
	static Target2Policy getTarget2(opt::InputArgList &Args) {
	  const StringRef Mode = Args.getLastArgValue(OPT_target2, "got-rel");
	  if (Mode == "got-rel")
	    return Target2Policy::GotRel;
	  if (Mode == "abs")
	    return Target2Policy::Abs;
	  if (Mode == "rel")
	    return Target2Policy::Rel;

	  error("unknown --target2 option: " + Mode);
	  return Target2Policy::GotRel;
	}

	// Returns true if the last --oformat option asks for "binary". Any other
	// value is reported as an error and treated as not binary.
	static bool isOutputFormatBinary(opt::InputArgList &Args) {
	  auto *Format = Args.getLastArg(OPT_oformat);
	  if (!Format)
	    return false;

	  StringRef Value = Format->getValue();
	  if (Value == "binary")
	    return true;

	  error("unknown --oformat value: " + Value);
	  return false;
	}

	// Derives the discard policy from the last of --discard-all,
	// --discard-locals and --discard-none. -r forces DiscardPolicy::None.
	static DiscardPolicy getDiscard(opt::InputArgList &Args) {
	  if (Args.hasArg(OPT_relocatable))
	    return DiscardPolicy::None;

	  auto *Last =
	      Args.getLastArg(OPT_discard_all, OPT_discard_locals, OPT_discard_none);
	  if (!Last)
	    return DiscardPolicy::Default;

	  switch (Last->getOption().getID()) {
	  case OPT_discard_all:
	    return DiscardPolicy::All;
	  case OPT_discard_locals:
	    return DiscardPolicy::Locals;
	  default:
	    return DiscardPolicy::None;
	  }
	}

	// Returns the dynamic linker path chosen by the last of --dynamic-linker
	// and --no-dynamic-linker, or an empty string when neither is given or
	// --no-dynamic-linker comes last.
	static StringRef getDynamicLinker(opt::InputArgList &Args) {
	  auto *Arg = Args.getLastArg(OPT_dynamic_linker, OPT_no_dynamic_linker);
	  if (!Arg || Arg->getOption().getID() == OPT_no_dynamic_linker)
	    return "";
	  return Arg->getValue();
	}

	// Derives the strip policy from the last of --strip-all and --strip-debug.
	// -r, or the absence of both options, yields StripPolicy::None.
	static StripPolicy getStrip(opt::InputArgList &Args) {
	  if (Args.hasArg(OPT_relocatable))
	    return StripPolicy::None;

	  if (auto *Last = Args.getLastArg(OPT_strip_all, OPT_strip_debug)) {
	    const bool StripAll = Last->getOption().getID() == OPT_strip_all;
	    return StripAll ? StripPolicy::All : StripPolicy::Debug;
	  }
	  return StripPolicy::None;
	}

	// Parses S as a hexadecimal address, with or without a leading "0x". On
	// failure an error naming Arg is reported; the value left in the result
	// variable (initially 0) is returned either way.
	static uint64_t parseSectionAddress(StringRef S, const opt::Arg &Arg) {
	  StringRef Digits = S.startswith("0x") ? S.drop_front(2) : S;

	  uint64_t Address = 0;
	  const bool Parsed = to_integer(Digits, Address, 16);
	  if (!Parsed)
	    error("invalid argument: " + toString(Arg));
	  return Address;
	}

	// Collects the requested section start addresses: every
	// --section-start=<name>=<addr> first, then the last -Ttext, -Tdata and
	// -Tbss, which are stored under ".text", ".data" and ".bss".
	static StringMap<uint64_t> getSectionStartMap(opt::InputArgList &Args) {
	  StringMap<uint64_t> Starts;

	  for (auto *Arg : Args.filtered(OPT_section_start)) {
	    // Split "<name>=<addr>" at the first '='.
	    std::pair<StringRef, StringRef> NameAndAddr =
	        StringRef(Arg->getValue()).split('=');
	    Starts[NameAndAddr.first] = parseSectionAddress(NameAndAddr.second, *Arg);
	  }

	  if (auto *Text = Args.getLastArg(OPT_Ttext))
	    Starts[".text"] = parseSectionAddress(Text->getValue(), *Text);
	  if (auto *Data = Args.getLastArg(OPT_Tdata))
	    Starts[".data"] = parseSectionAddress(Data->getValue(), *Data);
	  if (auto *Bss = Args.getLastArg(OPT_Tbss))
	    Starts[".bss"] = parseSectionAddress(Bss->getValue(), *Bss);

	  return Starts;
	}

	// Maps the --sort-section value to a SortSectionPolicy. A missing option
	// yields the default policy; an unknown value is reported and also yields
	// the default policy.
	static SortSectionPolicy getSortSection(opt::InputArgList &Args) {
	  const StringRef Rule = Args.getLastArgValue(OPT_sort_section);
	  if (Rule.empty())
	    return SortSectionPolicy::Default;
	  if (Rule == "name")
	    return SortSectionPolicy::Name;
	  if (Rule == "alignment")
	    return SortSectionPolicy::Alignment;

	  error("unknown --sort-section rule: " + Rule);
	  return SortSectionPolicy::Default;
	}

	// Maps the --orphan-handling value ("place" when the option is absent) to
	// an OrphanHandlingPolicy. An unknown value is reported and treated as
	// "place".
	static OrphanHandlingPolicy getOrphanHandling(opt::InputArgList &Args) {
	  const StringRef Mode = Args.getLastArgValue(OPT_orphan_handling, "place");
	  if (Mode == "place")
	    return OrphanHandlingPolicy::Place;
	  if (Mode == "warn")
	    return OrphanHandlingPolicy::Warn;
	  if (Mode == "error")
	    return OrphanHandlingPolicy::Error;

	  error("unknown --orphan-handling mode: " + Mode);
	  return OrphanHandlingPolicy::Place;
	}

	// Parse --build-id or --build-id=<style>. We handle "tree" as a
	// synonym for "sha1" because all our hash functions including
	// -build-id=sha1 are actually tree hashes for performance reasons.
	//
	// Returns the build-id kind together with the user-supplied bytes, which
	// are non-empty only for the "0x<hex>" style. A plain --build-id selects
	// the Fast kind; an unknown style is reported and treated as "none".
	static std::pair<BuildIdKind, std::vector<uint8_t>>
	getBuildId(opt::InputArgList &Args) {
	  auto *Arg = Args.getLastArg(OPT_build_id, OPT_build_id_eq);
	  if (!Arg)
	    return {BuildIdKind::None, {}};

	  if (Arg->getOption().getID() == OPT_build_id)
	    return {BuildIdKind::Fast, {}};

	  StringRef S = Arg->getValue();
	  if (S == "md5")
	    return {BuildIdKind::Md5, {}};
	  if (S == "sha1" || S == "tree")
	    return {BuildIdKind::Sha1, {}};
	  if (S == "uuid")
	    return {BuildIdKind::Uuid, {}};
	  if (S.startswith("0x"))
	    return {BuildIdKind::Hexstring, parseHex(S.substr(2))};

	  if (S != "none")
	    error("unknown --build-id style: " + S);
	  return {BuildIdKind::None, {}};
	}

	// Returns whether debug sections should be compressed. "none" (also the
	// default) disables compression; any other value enables it, with an error
	// reported if the value is not "zlib" or if zlib is not available.
	static bool getCompressDebugSections(opt::InputArgList &Args) {
	  const StringRef Mode =
	      Args.getLastArgValue(OPT_compress_debug_sections, "none");
	  if (Mode == "none")
	    return false;

	  const bool IsZlib = (Mode == "zlib");
	  if (!IsZlib)
	    error("unknown --compress-debug-sections value: " + Mode);
	  if (!zlib::isAvailable())
	    error("--compress-debug-sections: zlib is not available");
	  return true;
	}

	// Parses S as a decimal integer. On failure an error mentioning the
	// spelling of Arg is reported; the value left in the result variable
	// (initially 0) is returned either way.
	static int parseInt(StringRef S, opt::Arg *Arg) {
	  int Value = 0;
	  const bool Parsed = to_integer(S, Value, 10);
	  if (!Parsed)
	    error(Arg->getSpelling() + ": number expected, but got '" + S + "'");
	  return Value;
	}

	// Initializes Config members by the command line options.
	//
	// Besides plain flag/value lookups, this also sets the errorHandler()
	// FatalWarnings/Verbose flags and ThreadsEnabled, and it reads the files
	// named by the symbol ordering, retain-symbols, dynamic-list and
	// version-script options. Invalid option values are reported via error().
	void LinkerDriver::readConfigs(opt::InputArgList &Args) {
	Config->AllowMultipleDefinition =
	Args.hasArg(OPT_allow_multiple_definition) \|\| hasZOption(Args, "muldefs");
	Config->AuxiliaryList = args::getStrings(Args, OPT_auxiliary);
	Config->Bsymbolic = Args.hasArg(OPT_Bsymbolic);
	Config->BsymbolicFunctions = Args.hasArg(OPT_Bsymbolic_functions);
	Config->Chroot = Args.getLastArgValue(OPT_chroot);
	Config->CompressDebugSections = getCompressDebugSections(Args);
	Config->DefineCommon = Args.hasFlag(OPT_define_common, OPT_no_define_common,
	!Args.hasArg(OPT_relocatable));
	Config->Demangle = Args.hasFlag(OPT_demangle, OPT_no_demangle, true);
	Config->DisableVerify = Args.hasArg(OPT_disable_verify);
	Config->Discard = getDiscard(Args);
	Config->DynamicLinker = getDynamicLinker(Args);
	Config->EhFrameHdr =
	Args.hasFlag(OPT_eh_frame_hdr, OPT_no_eh_frame_hdr, false);
	Config->EmitRelocs = Args.hasArg(OPT_emit_relocs);
	Config->EnableNewDtags = !Args.hasArg(OPT_disable_new_dtags);
	Config->Entry = Args.getLastArgValue(OPT_entry);
	Config->ExportDynamic =
	Args.hasFlag(OPT_export_dynamic, OPT_no_export_dynamic, false);
	errorHandler().FatalWarnings =
	Args.hasFlag(OPT_fatal_warnings, OPT_no_fatal_warnings, false);
	Config->FilterList = args::getStrings(Args, OPT_filter);
	Config->Fini = Args.getLastArgValue(OPT_fini, "_fini");
	Config->FixCortexA53Errata843419 = Args.hasArg(OPT_fix_cortex_a53_843419);
	Config->GcSections = Args.hasFlag(OPT_gc_sections, OPT_no_gc_sections, false);
	Config->GdbIndex = Args.hasFlag(OPT_gdb_index, OPT_no_gdb_index, false);
	Config->ICF = Args.hasFlag(OPT_icf_all, OPT_icf_none, false);
	Config->ICFData = Args.hasArg(OPT_icf_data);
	Config->Init = Args.getLastArgValue(OPT_init, "_init");
	Config->LTOAAPipeline = Args.getLastArgValue(OPT_lto_aa_pipeline);
	Config->LTONewPmPasses = Args.getLastArgValue(OPT_lto_newpm_passes);
	Config->LTOO = args::getInteger(Args, OPT_lto_O, 2);
	Config->LTOPartitions = args::getInteger(Args, OPT_lto_partitions, 1);
	Config->MapFile = Args.getLastArgValue(OPT_Map);
	Config->NoGnuUnique = Args.hasArg(OPT_no_gnu_unique);
	Config->MergeArmExidx =
	Args.hasFlag(OPT_merge_exidx_entries, OPT_no_merge_exidx_entries, true);
	Config->NoUndefinedVersion = Args.hasArg(OPT_no_undefined_version);
	Config->NoinhibitExec = Args.hasArg(OPT_noinhibit_exec);
	Config->Nostdlib = Args.hasArg(OPT_nostdlib);
	Config->OFormatBinary = isOutputFormatBinary(Args);
	Config->Omagic = Args.hasFlag(OPT_omagic, OPT_no_omagic, false);
	Config->OptRemarksFilename = Args.getLastArgValue(OPT_opt_remarks_filename);
	Config->OptRemarksWithHotness = Args.hasArg(OPT_opt_remarks_with_hotness);
	Config->Optimize = args::getInteger(Args, OPT_O, 1);
	Config->OrphanHandling = getOrphanHandling(Args);
	Config->OutputFile = Args.getLastArgValue(OPT_o);
	- Config->Pie = Args.hasFlag(OPT_pie, OPT_nopie, false);
	+ Config->Pie = Args.hasFlag(OPT_pie, OPT_no_pie, false);
	Config->PrintGcSections =
	Args.hasFlag(OPT_print_gc_sections, OPT_no_print_gc_sections, false);
	Config->Rpath = getRpath(Args);
	Config->Relocatable = Args.hasArg(OPT_relocatable);
	Config->SaveTemps = Args.hasArg(OPT_save_temps);
	Config->SearchPaths = args::getStrings(Args, OPT_library_path);
	Config->SectionStartMap = getSectionStartMap(Args);
	Config->Shared = Args.hasArg(OPT_shared);
	Config->SingleRoRx = Args.hasArg(OPT_no_rosegment);
	Config->SoName = Args.getLastArgValue(OPT_soname);
	Config->SortSection = getSortSection(Args);
	Config->Strip = getStrip(Args);
	Config->Sysroot = Args.getLastArgValue(OPT_sysroot);
	Config->Target1Rel = Args.hasFlag(OPT_target1_rel, OPT_target1_abs, false);
	Config->Target2 = getTarget2(Args);
	Config->ThinLTOCacheDir = Args.getLastArgValue(OPT_thinlto_cache_dir);
	Config->ThinLTOCachePolicy = CHECK(
	parseCachePruningPolicy(Args.getLastArgValue(OPT_thinlto_cache_policy)),
	"--thinlto-cache-policy: invalid cache policy");
	Config->ThinLTOJobs = args::getInteger(Args, OPT_thinlto_jobs, -1u);
	ThreadsEnabled = Args.hasFlag(OPT_threads, OPT_no_threads, true);
	Config->Trace = Args.hasArg(OPT_trace);
	Config->Undefined = args::getStrings(Args, OPT_undefined);
	Config->UnresolvedSymbols = getUnresolvedSymbolPolicy(Args);
	Config->Verbose = Args.hasArg(OPT_verbose);
	errorHandler().Verbose = Config->Verbose;
	Config->WarnCommon = Args.hasArg(OPT_warn_common);
	Config->ZCombreloc = !hasZOption(Args, "nocombreloc");
	Config->ZExecstack = hasZOption(Args, "execstack");
	Config->ZNocopyreloc = hasZOption(Args, "nocopyreloc");
	Config->ZNodelete = hasZOption(Args, "nodelete");
	Config->ZNodlopen = hasZOption(Args, "nodlopen");
	Config->ZNow = hasZOption(Args, "now");
	Config->ZOrigin = hasZOption(Args, "origin");
	Config->ZRelro = !hasZOption(Args, "norelro");
	Config->ZRetpolineplt = hasZOption(Args, "retpolineplt");
	Config->ZRodynamic = hasZOption(Args, "rodynamic");
	Config->ZStackSize = args::getZOptionValue(Args, OPT_z, "stack-size", 0);
	Config->ZText = !hasZOption(Args, "notext");
	Config->ZWxneeded = hasZOption(Args, "wxneeded");

	// Parse LTO plugin-related options for compatibility with gold.
	// These may override the DisableVerify, SaveTemps, LTOO, LTOPartitions and
	// ThinLTOJobs values set above. Values matching the prefixes/names listed
	// in the final branch are accepted without any action; everything else
	// is reported as an unknown option.
	for (auto *Arg : Args.filtered(OPT_plugin_opt, OPT_plugin_opt_eq)) {
	StringRef S = Arg->getValue();
	if (S == "disable-verify")
	Config->DisableVerify = true;
	else if (S == "save-temps")
	Config->SaveTemps = true;
	else if (S.startswith("O"))
	Config->LTOO = parseInt(S.substr(1), Arg);
	else if (S.startswith("lto-partitions="))
	Config->LTOPartitions = parseInt(S.substr(15), Arg);
	else if (S.startswith("jobs="))
	Config->ThinLTOJobs = parseInt(S.substr(5), Arg);
	else if (!S.startswith("/") && !S.startswith("-fresolution=") &&
	!S.startswith("-pass-through=") && !S.startswith("mcpu=") &&
	!S.startswith("thinlto") && S != "-function-sections" &&
	S != "-data-sections")
	error(Arg->getSpelling() + ": unknown option: " + S);
	}

	// Validate the LTO settings gathered from both the regular options and
	// the plugin options above.
	if (Config->LTOO > 3)
	error("invalid optimization level for LTO: " + Twine(Config->LTOO));
	if (Config->LTOPartitions == 0)
	error("--lto-partitions: number of threads must be > 0");
	if (Config->ThinLTOJobs == 0)
	error("--thinlto-jobs: number of threads must be > 0");

	// Parse ELF{32,64}{LE,BE} and CPU type.
	if (auto *Arg = Args.getLastArg(OPT_m)) {
	StringRef S = Arg->getValue();
	std::tie(Config->EKind, Config->EMachine, Config->OSABI) =
	parseEmulation(S);
	Config->MipsN32Abi = (S == "elf32btsmipn32" \|\| S == "elf32ltsmipn32");
	Config->Emulation = S;
	}

	// Parse -hash-style={sysv,gnu,both}.
	if (auto *Arg = Args.getLastArg(OPT_hash_style)) {
	StringRef S = Arg->getValue();
	if (S == "sysv")
	Config->SysvHash = true;
	else if (S == "gnu")
	Config->GnuHash = true;
	else if (S == "both")
	Config->SysvHash = Config->GnuHash = true;
	else
	error("unknown -hash-style: " + S);
	}

	// The print-map option replaces any map file name set from OPT_Map above
	// with "-".
	if (Args.hasArg(OPT_print_map))
	Config->MapFile = "-";

	// --omagic is an option to create old-fashioned executables in which
	// .text segments are writable. Today, the option is still in use to
	// create special-purpose programs such as boot loaders. It doesn't
	// make sense to create PT_GNU_RELRO for such executables.
	if (Config->Omagic)
	Config->ZRelro = false;

	std::tie(Config->BuildId, Config->BuildIdVector) = getBuildId(Args);

	// Parse -pack-dyn-relocs={android,none}.
	if (auto *Arg = Args.getLastArg(OPT_pack_dyn_relocs_eq)) {
	StringRef S = Arg->getValue();
	if (S == "android")
	Config->AndroidPackDynRelocs = true;
	else if (S != "none")
	error("unknown -pack-dyn-relocs format: " + S);
	}

	// Load the symbol ordering file; its contents are split by args::getLines.
	if (auto *Arg = Args.getLastArg(OPT_symbol_ordering_file))
	if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
	Config->SymbolOrderingFile = args::getLines(*Buffer);

	// If --retain-symbols-file is used, we'll keep only the symbols listed in
	// the file and discard all others.
	if (auto *Arg = Args.getLastArg(OPT_retain_symbols_file)) {
	Config->DefaultSymbolVersion = VER_NDX_LOCAL;
	if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
	for (StringRef S : args::getLines(*Buffer))
	Config->VersionScriptGlobals.push_back(
	{S, /IsExternCpp/ false, /HasWildcard/ false});
	}

	// Same flags and default as used for Config->ExportDynamic above; the
	// value is recomputed here instead of being read back from Config.
	bool HasExportDynamic =
	Args.hasFlag(OPT_export_dynamic, OPT_no_export_dynamic, false);

	// Parses -dynamic-list and -export-dynamic-symbol. They make some
	// symbols private. Note that -export-dynamic takes precedence over them
	// as it says all symbols should be exported.
	if (!HasExportDynamic) {
	for (auto *Arg : Args.filtered(OPT_dynamic_list))
	if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
	readDynamicList(*Buffer);

	for (auto *Arg : Args.filtered(OPT_export_dynamic_symbol))
	Config->DynamicList.push_back(
	{Arg->getValue(), /IsExternCpp/ false, /HasWildcard/ false});
	}

	// Read every version script given on the command line.
	for (auto *Arg : Args.filtered(OPT_version_script))
	if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
	readVersionScript(*Buffer);
	}

	// Some Config members do not directly correspond to any particular
	// command line options, but computed based on other Config values.
	// This function initialize such members. See Config.h for the details
	// of these values.
	static void setConfigs() {
	ELFKind Kind = Config->EKind;
	uint16_t Machine = Config->EMachine;

	// There is an ILP32 ABI for x86-64, although it's not very popular.
	// It is called the x32 ABI.
	bool IsX32 = (Kind == ELF32LEKind && Machine == EM_X86_64);

	Config->CopyRelocs = (Config->Relocatable \|\| Config->EmitRelocs);
	Config->Is64 = (Kind == ELF64LEKind \|\| Kind == ELF64BEKind);
	Config->IsLE = (Kind == ELF32LEKind \|\| Kind == ELF64LEKind);
	Config->Endianness =
	Config->IsLE ? support::endianness::little : support::endianness::big;
	Config->IsMips64EL = (Kind == ELF64LEKind && Machine == EM_MIPS);
	Config->IsRela = Config->Is64 \|\| IsX32 \|\| Config->MipsN32Abi;
	Config->Pic = Config->Pie \|\| Config->Shared;
	Config->Wordsize = Config->Is64 ? 8 : 4;
	}

	// Interprets the value of the "-format" option. Returns true for "binary"
	// and false for "elf" or "default". Any other value is reported as an error
	// and also yields false.
	static bool getBinaryOption(StringRef S) {
	  const bool IsBinary = S == "binary";
	  if (!IsBinary && S != "elf" && S != "default")
	    error("unknown -format value: " + S +
	          " (supported formats: elf, default, binary)");
	  return IsBinary;
	}

	// Iterates over Args in order. Input-like options (plain inputs, libraries
	// and linker scripts) are handed to addFile/addLibrary/readLinkerScript,
	// while the remaining handled options only flip the state (Config->AsNeeded,
	// Config->Static, InBinary, InWholeArchive, InLib) that is in effect for
	// the inputs that follow them. Reports an error if no file was added and
	// nothing else has failed so far.
	void LinkerDriver::createFiles(opt::InputArgList &Args) {
	  for (auto *Arg : Args) {
	    switch (Arg->getOption().getUnaliasedOption().getID()) {
	    // Inputs.
	    case OPT_INPUT:
	      addFile(Arg->getValue(), /*WithLOption=*/false);
	      break;
	    case OPT_library:
	      addLibrary(Arg->getValue());
	      break;
	    case OPT_script: {
	      Optional<std::string> Path = searchLinkerScript(Arg->getValue());
	      if (!Path) {
	        error(Twine("cannot find linker script ") + Arg->getValue());
	        break;
	      }
	      if (Optional<MemoryBufferRef> MB = readFile(*Path))
	        readLinkerScript(*MB);
	      break;
	    }

	    // State toggles.
	    case OPT_format:
	      InBinary = getBinaryOption(Arg->getValue());
	      break;
	    case OPT_as_needed:
	      Config->AsNeeded = true;
	      break;
	    case OPT_no_as_needed:
	      Config->AsNeeded = false;
	      break;
	    case OPT_Bstatic:
	      Config->Static = true;
	      break;
	    case OPT_Bdynamic:
	      Config->Static = false;
	      break;
	    case OPT_whole_archive:
	      InWholeArchive = true;
	      break;
	    case OPT_no_whole_archive:
	      InWholeArchive = false;
	      break;
	    case OPT_start_lib:
	      InLib = true;
	      break;
	    case OPT_end_lib:
	      InLib = false;
	      break;

	    // Every other option is handled elsewhere.
	    default:
	      break;
	    }
	  }

	  if (!Files.empty() || errorCount() != 0)
	    return;
	  error("no input files");
	}

	// If -m <machine_type> was not given, infer it from object files.
	void LinkerDriver::inferMachineType() {
	if (Config->EKind != ELFNoneKind)
	return;

	for (InputFile *F : Files) {
	if (F->EKind == ELFNoneKind)
	continue;
	Config->EKind = F->EKind;
	Config->EMachine = F->EMachine;
	Config->OSABI = F->OSABI;
	Config->MipsN32Abi = Config->EMachine == EM_MIPS && isMipsN32Abi(F);
	return;
	}
	error("target emulation unknown: -m or at least one .o file required");
	}

	// Parse -z max-page-size=<value>. The default value is defined by
	// each target.
	//
	// A value that is not a power of two is reported as an error and the
	// target default is returned in its place. Returning the rejected value
	// would let later arithmetic on Config->MaxPageSize operate on garbage; in
	// particular getImageBase computes "V % Config->MaxPageSize", which would
	// divide by zero for "-z max-page-size=0".
	static uint64_t getMaxPageSize(opt::InputArgList &Args) {
	  uint64_t Val = args::getZOptionValue(Args, OPT_z, "max-page-size",
	                                       Target->DefaultMaxPageSize);
	  if (!isPowerOf2_64(Val)) {
	    error("max-page-size: value isn't a power of 2");
	    return Target->DefaultMaxPageSize;
	  }
	  return Val;
	}

	// Parses the -image-base option. Returns None if the option is absent,
	// otherwise the parsed address (0 after reporting an error when the value
	// is not a number). An address that is not a multiple of the page size
	// only triggers a warning.
	//
	// Because this reads "Config->MaxPageSize", it has to be called after that
	// variable is initialized.
	static Optional<uint64_t> getImageBase(opt::InputArgList &Args) {
	  opt::Arg *Arg = Args.getLastArg(OPT_image_base);
	  if (Arg == nullptr)
	    return None;

	  StringRef Text = Arg->getValue();
	  uint64_t Base = 0;
	  if (to_integer(Text, Base)) {
	    if (Base % Config->MaxPageSize != 0)
	      warn("-image-base: address isn't multiple of page size: " + Text);
	    return Base;
	  }

	  error("-image-base: number expected, but got " + Text);
	  return 0;
	}

	// Parses `--exclude-libs=lib,lib,...` and returns the set of names.
	// The library names may be delimited by commas or colons. Every piece
	// between delimiters is inserted as is, including empty ones.
	static DenseSet<StringRef> getExcludeLibs(opt::InputArgList &Args) {
	  DenseSet<StringRef> Names;
	  for (auto *Arg : Args.filtered(OPT_exclude_libs)) {
	    StringRef Rest = Arg->getValue();
	    size_t Delim = Rest.find_first_of(",:");
	    while (Delim != StringRef::npos) {
	      Names.insert(Rest.substr(0, Delim));
	      Rest = Rest.substr(Delim + 1);
	      Delim = Rest.find_first_of(",:");
	    }
	    // Whatever follows the last delimiter (or the whole value).
	    Names.insert(Rest);
	  }
	  return Names;
	}

	// Returns the archive a file is associated with: the file's own name if it
	// is an ArchiveFile, its ArchiveName if that is non-empty, and None
	// otherwise.
	static Optional<StringRef> getArchiveName(InputFile *File) {
	  const bool IsArchive = isa<ArchiveFile>(File);
	  if (!IsArchive && File->ArchiveName.empty())
	    return None;
	  if (IsArchive)
	    return File->getName();
	  return File->ArchiveName;
	}

	// Handles the -exclude-libs option. If a static library file is specified
	// by the -exclude-libs option, all public symbols from the archive become
	// private unless otherwise specified by version scripts or something.
	// A special library name "ALL" means all archive files. Archives are
	// matched by the filename component of their path.
	//
	// This is not a popular option, but some programs such as bionic libc use it.
	template <class ELFT>
	static void excludeLibs(opt::InputArgList &Args, ArrayRef<InputFile *> Files) {
	  DenseSet<StringRef> Libs = getExcludeLibs(Args);
	  bool All = Libs.count("ALL");

	  for (InputFile *File : Files) {
	    Optional<StringRef> Archive = getArchiveName(File);
	    if (!Archive)
	      continue;
	    if (!All && !Libs.count(path::filename(*Archive)))
	      continue;

	    // Demote every non-local symbol of the matched archive member.
	    for (Symbol *Sym : File->getSymbols())
	      if (!Sym->isLocal())
	        Sym->VersionId = VER_NDX_LOCAL;
	  }
	}

	// Do actual linking. Note that when this function is called,
	// all linker scripts have already been parsed.
	//
	// The steps run in a fixed order: target/page-size setup, output file
	// checks, symbol table construction from Files, symbol post-processing
	// (exclude-libs, reserved symbols, version scripts, wrapping, LTO),
	// input section collection, size optimizations and finally writeResult().
	// The function returns early whenever errorCount() is non-zero at one of
	// the checkpoints below.
	template <class ELFT> void LinkerDriver::link(opt::InputArgList &Args) {
	Target = getTarget();

	// MaxPageSize has to be set first because getImageBase() reads it.
	Config->MaxPageSize = getMaxPageSize(Args);
	Config->ImageBase = getImageBase(Args);

	// If a -hash-style option was not given, set to a default value,
	// which varies depending on the target.
	if (!Args.hasArg(OPT_hash_style)) {
	if (Config->EMachine == EM_MIPS)
	Config->SysvHash = true;
	else
	Config->SysvHash = Config->GnuHash = true;
	}

	// Default output filename is "a.out" by the Unix tradition.
	if (Config->OutputFile.empty())
	Config->OutputFile = "a.out";

	// Fail early if the output file or map file is not writable. If a user has a
	// long link, e.g. due to a large LTO link, they do not wish to run it and
	// find that it failed because there was a mistake in their command-line.
	if (auto E = tryCreateFile(Config->OutputFile))
	error("cannot open output file " + Config->OutputFile + ": " + E.message());
	if (auto E = tryCreateFile(Config->MapFile))
	error("cannot open map file " + Config->MapFile + ": " + E.message());
	if (errorCount())
	return;

	// Use default entry point name if no name was given via the command
	// line nor linker scripts. For some reason, MIPS entry point name is
	// different from others.
	Config->WarnMissingEntry =
	(!Config->Entry.empty() \|\| (!Config->Shared && !Config->Relocatable));
	if (Config->Entry.empty() && !Config->Relocatable)
	Config->Entry = (Config->EMachine == EM_MIPS) ? "__start" : "_start";

	// Handle --trace-symbol.
	for (auto *Arg : Args.filtered(OPT_trace_symbol))
	Symtab->trace(Arg->getValue());

	// Add all files to the symbol table. This will add almost all
	// symbols that we need to the symbol table.
	for (InputFile *F : Files)
	Symtab->addFile<ELFT>(F);

	// Process -defsym option.
	// Each value is split at the first '='; the right-hand side is handed to
	// readDefsym as an in-memory buffer named "-defsym".
	for (auto *Arg : Args.filtered(OPT_defsym)) {
	StringRef From;
	StringRef To;
	std::tie(From, To) = StringRef(Arg->getValue()).split('=');
	readDefsym(From, MemoryBufferRef(To, "-defsym"));
	}

	// Now that we have every file, we can decide if we will need a
	// dynamic symbol table.
	// We need one if we were asked to export dynamic symbols or if we are
	// producing a shared library.
	// We also need one if any shared libraries are used and for pie executables
	// (probably because the dynamic linker needs it).
	Config->HasDynSymTab =
	!SharedFiles.empty() \|\| Config->Pic \|\| Config->ExportDynamic;

	// Some symbols (such as __ehdr_start) are defined lazily only when there
	// are undefined symbols for them, so we add these to trigger that logic.
	for (StringRef Sym : Script->ReferencedSymbols)
	Symtab->addUndefined<ELFT>(Sym);

	// Handle the `--undefined <sym>` options.
	for (StringRef S : Config->Undefined)
	Symtab->fetchIfLazy<ELFT>(S);

	// If an entry symbol is in a static archive, pull out that file now
	// to complete the symbol table. After this, no new names except a
	// few linker-synthesized ones will be added to the symbol table.
	Symtab->fetchIfLazy<ELFT>(Config->Entry);

	// Return if there were name resolution errors.
	if (errorCount())
	return;

	// Handle undefined symbols in DSOs.
	if (!Config->Shared)
	Symtab->scanShlibUndefined<ELFT>();

	// Handle the -exclude-libs option.
	if (Args.hasArg(OPT_exclude_libs))
	excludeLibs<ELFT>(Args, Files);

	// Create ElfHeader early. We need a dummy section in
	// addReservedSymbols to mark the created symbols as not absolute.
	Out::ElfHeader = make<OutputSection>("", 0, SHF_ALLOC);
	Out::ElfHeader->Size = sizeof(typename ELFT::Ehdr);

	// We need to create some reserved symbols such as _end. Create them.
	if (!Config->Relocatable)
	addReservedSymbols();

	// Apply version scripts.
	Symtab->scanVersionScript();

	// Create wrapped symbols for -wrap option.
	for (auto *Arg : Args.filtered(OPT_wrap))
	Symtab->addSymbolWrap<ELFT>(Arg->getValue());

	Symtab->addCombinedLTOObject<ELFT>();
	if (errorCount())
	return;

	// Apply symbol renames for -wrap.
	Symtab->applySymbolWrap();

	// Now that we have a complete list of input files.
	// Beyond this point, no new files are added.
	// Aggregate all input sections into one place.
	// Null entries and the InputSection::Discarded marker are skipped for
	// object files; sections of binary files are added unconditionally.
	for (InputFile *F : ObjectFiles)
	for (InputSectionBase *S : F->getSections())
	if (S && S != &InputSection::Discarded)
	InputSections.push_back(S);
	for (BinaryFile *F : BinaryFiles)
	for (InputSectionBase *S : F->getSections())
	InputSections.push_back(cast<InputSection>(S));

	// We do not want to emit debug sections if --strip-all
	// or -strip-debug are given.
	if (Config->Strip != StripPolicy::None)
	llvm::erase_if(InputSections, [](InputSectionBase *S) {
	return S->Name.startswith(".debug") \|\| S->Name.startswith(".zdebug");
	});

	Config->EFlags = Target->calcEFlags();

	if (Config->EMachine == EM_ARM) {
	// FIXME: These warnings can be removed when lld only uses these features
	// when the input objects have been compiled with an architecture that
	// supports them.
	if (Config->ARMHasBlx == false)
	warn("lld uses blx instruction, no object with architecture supporting "
	"feature detected.");
	if (Config->ARMJ1J2BranchEncoding == false)
	warn("lld uses extended branch encoding, no object with architecture "
	"supporting feature detected.");
	if (Config->ARMHasMovtMovw == false)
	warn("lld may use movt/movw, no object with architecture supporting "
	"feature detected.");
	}

	// This adds a .comment section containing a version string. We have to add it
	// before the decompressSections()/mergeSections() calls below because the
	// .comment section is a mergeable section.
	if (!Config->Relocatable)
	InputSections.push_back(createCommentSection());

	// Do size optimizations: garbage collection, merging of SHF_MERGE sections
	// and identical code folding.
	markLive<ELFT>();
	decompressSections();
	mergeSections();
	if (Config->ICF)
	doIcf<ELFT>();

	// Write the result to the file.
	writeResult<ELFT>();
	}
	Index: head/contrib/llvm/tools/lld/ELF/InputFiles.cpp
	===================================================================
	--- head/contrib/llvm/tools/lld/ELF/InputFiles.cpp (revision 329409)
	+++ head/contrib/llvm/tools/lld/ELF/InputFiles.cpp (revision 329410)
	@@ -1,1190 +1,1198 @@
	//===- InputFiles.cpp -----------------------------------------------------===//
	//
	// The LLVM Linker
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "InputFiles.h"
	#include "InputSection.h"
	#include "LinkerScript.h"
	#include "SymbolTable.h"
	#include "Symbols.h"
	#include "SyntheticSections.h"
	#include "lld/Common/ErrorHandler.h"
	#include "lld/Common/Memory.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/CodeGen/Analysis.h"
	#include "llvm/DebugInfo/DWARF/DWARFContext.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Module.h"
	#include "llvm/LTO/LTO.h"
	#include "llvm/MC/StringTableBuilder.h"
	#include "llvm/Object/ELFObjectFile.h"
	#include "llvm/Support/ARMAttributeParser.h"
	#include "llvm/Support/ARMBuildAttributes.h"
	#include "llvm/Support/Path.h"
	#include "llvm/Support/TarWriter.h"
	#include "llvm/Support/raw_ostream.h"

	using namespace llvm;
	using namespace llvm::ELF;
	using namespace llvm::object;
	using namespace llvm::sys;
	using namespace llvm::sys::fs;

	using namespace lld;
	using namespace lld::elf;

	std::vector<BinaryFile *> elf::BinaryFiles;
	std::vector<BitcodeFile *> elf::BitcodeFiles;
	std::vector<InputFile *> elf::ObjectFiles;
	std::vector<InputFile *> elf::SharedFiles;

	TarWriter *elf::Tar;

	// Stores the memory buffer reference M in MB and the file kind K in
	// FileKind; the constructor does nothing else.
	InputFile::InputFile(Kind K, MemoryBufferRef M) : MB(M), FileKind(K) {}

	// Opens the file at Path and returns a reference to its contents, or None
	// (after reporting an error) if it cannot be opened. The path is logged,
	// the buffer is kept alive by handing its owner to make<>(), and the
	// contents are also appended to the Tar archive when one is set.
	Optional<MemoryBufferRef> elf::readFile(StringRef Path) {
	  // The --chroot option changes our virtual root directory.
	  // This is useful when you are dealing with files created by --reproduce.
	  const bool UnderChroot = !Config->Chroot.empty() && Path.startswith("/");
	  if (UnderChroot)
	    Path = Saver.save(Config->Chroot + Path);

	  log(Path);

	  auto BufferOrErr = MemoryBuffer::getFile(Path);
	  auto EC = BufferOrErr.getError();
	  if (EC) {
	    error("cannot open " + Path + ": " + EC.message());
	    return None;
	  }

	  // Grab the reference before ownership moves away from the local owner.
	  std::unique_ptr<MemoryBuffer> &Owner = *BufferOrErr;
	  MemoryBufferRef Contents = Owner->getMemBufferRef();
	  make<std::unique_ptr<MemoryBuffer>>(std::move(Owner)); // take MB ownership

	  if (Tar != nullptr)
	    Tar->append(relativeToRoot(Path), Contents.getBuffer());
	  return Contents;
	}

	// Builds a string describing an error location. The result is
	// "<filename>:<line>" when Path has no directory component, and
	// "<filename>:<line> (<path>:<line>)" otherwise.
	static std::string createFileLineMsg(StringRef Path, unsigned Line) {
	  const std::string Suffix = ":" + std::to_string(Line);
	  std::string Msg = path::filename(Path);
	  const bool IsBareName = (Msg == Path);

	  Msg += Suffix;
	  if (!IsBareName) {
	    Msg += " (";
	    Msg += Path.str();
	    Msg += Suffix;
	    Msg += ")";
	  }
	  return Msg;
	}

	// Returns a source location string for Sym at the given offset in Sec.
	// Three sources are tried in order: line info for the offset, the recorded
	// location of a variable with the symbol's name, and finally the file's
	// SourceFile.
	template <class ELFT>
	static std::string getSrcMsgAux(ObjFile<ELFT> &File, const Symbol &Sym,
	                                InputSectionBase &Sec, uint64_t Offset) {
	  // In DWARF, functions and variables are stored to different places.
	  // First, lookup a function for a given offset.
	  Optional<DILineInfo> Info = File.getDILineInfo(&Sec, Offset);
	  if (Info)
	    return createFileLineMsg(Info->FileName, Info->Line);

	  // If it failed, lookup again as a variable.
	  Optional<std::pair<std::string, unsigned>> VarLoc =
	      File.getVariableLoc(Sym.getName());
	  if (VarLoc)
	    return createFileLineMsg(VarLoc->first, VarLoc->second);

	  // File.SourceFile contains STT_FILE symbol, and that is a last resort.
	  return File.SourceFile;
	}

	// Returns a source location string for Sym at Offset in Sec, or an empty
	// string if this file is not a regular object file. The work is forwarded
	// to getSrcMsgAux after casting to the ObjFile type selected by
	// Config->EKind.
	std::string InputFile::getSrcMsg(const Symbol &Sym, InputSectionBase &Sec,
	                                 uint64_t Offset) {
	  const bool IsObject = kind() == ObjKind;
	  if (!IsObject)
	    return "";

	  switch (Config->EKind) {
	  case ELF32LEKind:
	    return getSrcMsgAux(cast<ObjFile<ELF32LE>>(*this), Sym, Sec, Offset);
	  case ELF32BEKind:
	    return getSrcMsgAux(cast<ObjFile<ELF32BE>>(*this), Sym, Sec, Offset);
	  case ELF64LEKind:
	    return getSrcMsgAux(cast<ObjFile<ELF64LE>>(*this), Sym, Sec, Offset);
	  case ELF64BEKind:
	    return getSrcMsgAux(cast<ObjFile<ELF64BE>>(*this), Sym, Sec, Offset);
	  default:
	    llvm_unreachable("Invalid kind");
	  }
	}

	// Parses the DWARF line table of this object file into DwarfLine and
	// records, for every external variable DIE of the first compile unit, its
	// declaration file index and line in VariableLoc (keyed by variable name).
	// Nothing is recorded if there is no compile unit or no usable line table.
	template <class ELFT> void ObjFile<ELFT>::initializeDwarf() {
	  DWARFContext Dwarf(make_unique<LLDDwarfObj<ELFT>>(this));
	  const DWARFObject &Obj = Dwarf.getDWARFObj();
	  DwarfLine.reset(new DWARFDebugLine);
	  DWARFDataExtractor LineData(Obj, Obj.getLineSection(), Config->IsLE,
	                              Config->Wordsize);

	  // The second parameter is offset in .debug_line section
	  // for compilation unit (CU) of interest. We have only one
	  // CU (object file), so offset is always 0.
	  // FIXME: Provide the associated DWARFUnit if there is one. DWARF v5
	  // needs it in order to find indirect strings.
	  const DWARFDebugLine::LineTable *LT =
	      DwarfLine->getOrParseLineTable(LineData, 0, nullptr);

	  // Return if there is no debug information about CU available. Also bail
	  // out if no line table was handed back: LT is dereferenced in the loop
	  // below, and the other users of DwarfLine in this file treat a null
	  // table as "no information" as well.
	  // NOTE(review): presumably null signals a .debug_line parse failure --
	  // confirm against DWARFDebugLine::getOrParseLineTable.
	  if (!LT || !Dwarf.getNumCompileUnits())
	    return;

	  // Loop over variable records and insert them to VariableLoc.
	  DWARFCompileUnit *CU = Dwarf.getCompileUnitAtIndex(0);
	  for (const auto &Entry : CU->dies()) {
	    DWARFDie Die(CU, &Entry);
	    // Skip all tags that are not variables.
	    if (Die.getTag() != dwarf::DW_TAG_variable)
	      continue;

	    // Skip if a local variable because we don't need them for generating error
	    // messages. In general, only non-local symbols can fail to be linked.
	    if (!dwarf::toUnsigned(Die.find(dwarf::DW_AT_external), 0))
	      continue;

	    // Get the source filename index for the variable.
	    unsigned File = dwarf::toUnsigned(Die.find(dwarf::DW_AT_decl_file), 0);
	    if (!LT->hasFileAtIndex(File))
	      continue;

	    // Get the line number on which the variable is declared.
	    unsigned Line = dwarf::toUnsigned(Die.find(dwarf::DW_AT_decl_line), 0);

	    // Get the name of the variable and add the collected information to
	    // VariableLoc. Usually Name is non-empty, but it can be empty if the input
	    // object file lacks some debug info.
	    StringRef Name = dwarf::toString(Die.find(dwarf::DW_AT_name), "");
	    if (!Name.empty())
	      VariableLoc.insert({Name, {File, Line}});
	  }
	}

	// Returns the pair of file name and line number describing location of data
	// object (variable, array, etc) definition, or None if the line table is
	// missing, the name was not recorded by initializeDwarf, or the file name
	// cannot be resolved. DWARF data is initialized on first use.
	template <class ELFT>
	Optional<std::pair<std::string, unsigned>>
	ObjFile<ELFT>::getVariableLoc(StringRef Name) {
	  llvm::call_once(InitDwarfLine, [this]() { initializeDwarf(); });

	  // There is always only one CU so its offset is 0.
	  const DWARFDebugLine::LineTable *Table = DwarfLine->getLineTable(0);
	  if (Table == nullptr)
	    return None;

	  // Return if we have no debug information about data object.
	  auto Found = VariableLoc.find(Name);
	  if (Found == VariableLoc.end())
	    return None;
	  const auto &Loc = Found->second; // {file index, line}

	  // Take file name string from line table.
	  std::string FileName;
	  const bool Resolved = Table->getFileNameByIndex(
	      Loc.first, nullptr,
	      DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, FileName);
	  if (!Resolved)
	    return None;

	  return std::make_pair(FileName, Loc.second);
	}

	// Returns source line information for a given offset in section S using
	// DWARF debug info, or None if there is no line table or the lookup yields
	// line 0. DWARF data is initialized on first use.
	template <class ELFT>
	Optional<DILineInfo> ObjFile<ELFT>::getDILineInfo(InputSectionBase *S,
	                                                  uint64_t Offset) {
	  llvm::call_once(InitDwarfLine, [this]() { initializeDwarf(); });

	  // The offset to CU is 0.
	  const DWARFDebugLine::LineTable *Table = DwarfLine->getLineTable(0);
	  if (Table == nullptr)
	    return None;

	  // Use fake address calculated by adding section file offset and offset in
	  // section. See comments for ObjectInfo class.
	  const uint64_t FakeAddress = S->getOffsetInFile() + Offset;

	  DILineInfo Result;
	  Table->getFileLineInfoForAddress(
	      FakeAddress, nullptr,
	      DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, Result);
	  if (Result.Line != 0)
	    return Result;
	  return None;
	}

	// Returns "<file>:<line>" for a given offset in section S using DWARF debug
	// info, or an empty string if no line information is available.
	template <class ELFT>
	std::string ObjFile<ELFT>::getLineInfo(InputSectionBase *S, uint64_t Offset) {
	  Optional<DILineInfo> Info = getDILineInfo(S, Offset);
	  if (!Info)
	    return "";
	  return Info->FileName + ":" + std::to_string(Info->Line);
	}

	// Returns "<internal>" for a null file, "foo.a(bar.o)" for a file with an
	// archive name, or "baz.o" otherwise. The string is computed once and then
	// served from the file's ToStringCache.
	std::string lld::toString(const InputFile *F) {
	  if (F == nullptr)
	    return "<internal>";

	  auto &Cache = F->ToStringCache;
	  if (!Cache.empty())
	    return Cache;

	  if (F->ArchiveName.empty())
	    Cache = F->getName();
	  else
	    Cache = (F->ArchiveName + "(" + F->getName() + ")").str();
	  return Cache;
	}

	// Initializes EKind from the ELFT template parameter (endianness and
	// 32/64-bit) and reads EMachine and OSABI from the file's ELF header.
	template <class ELFT>
	ELFFileBase<ELFT>::ELFFileBase(Kind K, MemoryBufferRef MB) : InputFile(K, MB) {
	  const bool IsLittle = ELFT::TargetEndianness == support::little;
	  if (ELFT::Is64Bits)
	    EKind = IsLittle ? ELF64LEKind : ELF64BEKind;
	  else
	    EKind = IsLittle ? ELF32LEKind : ELF32BEKind;

	  const auto *Header = getObj().getHeader();
	  EMachine = Header->e_machine;
	  OSABI = Header->e_ident[llvm::ELF::EI_OSABI];
	}

	// Returns the part of ELFSyms that starts at index FirstNonLocal, i.e. the
	// symbol table without its leading local symbols.
	template <class ELFT>
	typename ELFT::SymRange ELFFileBase<ELFT>::getGlobalELFSyms() {
	  return ELFSyms.slice(FirstNonLocal);
	}

	// Returns the section index for Sym. The lookup is delegated to
	// getObj().getSectionIndex() with this file's ELFSyms and SymtabSHNDX;
	// a failed lookup is handled by CHECK with this file as the context.
	template <class ELFT>
	uint32_t ELFFileBase<ELFT>::getSectionIndex(const Elf_Sym &Sym) const {
	return CHECK(getObj().getSectionIndex(&Sym, ELFSyms, SymtabSHNDX), this);
	}

	// Loads the symbol table described by the section header Symtab: sets
	// FirstNonLocal from sh_info, ELFSyms from the section's symbols and
	// StringTable from the associated string table. Aborts with a fatal error
	// if sh_info is 0 or larger than the number of symbols.
	template <class ELFT>
	void ELFFileBase<ELFT>::initSymtab(ArrayRef<Elf_Shdr> Sections,
	                                   const Elf_Shdr *Symtab) {
	  FirstNonLocal = Symtab->sh_info;
	  ELFSyms = CHECK(getObj().symbols(Symtab), this);

	  const bool InfoInRange =
	      FirstNonLocal != 0 && FirstNonLocal <= ELFSyms.size();
	  if (!InfoInRange)
	    fatal(toString(this) + ": invalid sh_info in symbol table");

	  StringTable =
	      CHECK(getObj().getStringTableForSymtab(*Symtab, Sections), this);
	}

	// Constructs an object file of kind ObjKind over the buffer M and stores
	// the given ArchiveName in the file.
	template <class ELFT>
	ObjFile<ELFT>::ObjFile(MemoryBufferRef M, StringRef ArchiveName)
	: ELFFileBase<ELFT>(Base::ObjKind, M) {
	this->ArchiveName = ArchiveName;
	}

	// Returns the local symbols of this file: the entries of Symbols from
	// index 1 up to (but not including) FirstNonLocal. Returns an empty range
	// if Symbols is empty.
	template <class ELFT> ArrayRef<Symbol *> ObjFile<ELFT>::getLocalSymbols() {
	  ArrayRef<Symbol *> All = makeArrayRef(this->Symbols);
	  if (All.empty())
	    return {};

	  // Index 0 is skipped, so there are FirstNonLocal - 1 entries to return.
	  const size_t NumLocals = this->FirstNonLocal - 1;
	  return All.slice(1, NumLocals);
	}

	// Parses this object file: first the sections, then the symbols.
	// ComdatGroups is passed on to initializeSections, which uses it to
	// de-duplicate section groups by signature.
	template <class ELFT>
	void ObjFile<ELFT>::parse(DenseSet<CachedHashStringRef> &ComdatGroups) {
	// Read section and symbol tables.
	initializeSections(ComdatGroups);
	initializeSymbols();
	}

	// Sections with SHT_GROUP and comdat bits define comdat section groups.
	// They are identified and deduplicated by group name. This function
	// returns the group name of the group section Sec, loading the symbol
	// table first if that has not happened yet.
	template <class ELFT>
	StringRef ObjFile<ELFT>::getShtGroupSignature(ArrayRef<Elf_Shdr> Sections,
	                                              const Elf_Shdr &Sec) {
	  // Group signatures are stored as symbol names in object files.
	  // sh_info contains a symbol index, so we fetch a symbol and read its name.
	  if (this->ELFSyms.empty()) {
	    const Elf_Shdr *SymtabSec =
	        CHECK(object::getSection<ELFT>(Sections, Sec.sh_link), this);
	    this->initSymtab(Sections, SymtabSec);
	  }

	  const Elf_Sym *Sym =
	      CHECK(object::getSymbol<ELFT>(this->ELFSyms, Sec.sh_info), this);
	  StringRef Signature = CHECK(Sym->getName(this->StringTable), this);
	  if (!Signature.empty())
	    return Signature;

	  // As a special case, if a symbol is a section symbol and has no name,
	  // we use a section name as a signature.
	  //
	  // Such SHT_GROUP sections are invalid from the perspective of the ELF
	  // standard, but GNU gold 1.14 (the newest version as of July 2017) or
	  // older produce such sections as outputs for the -r option, so we need
	  // a bug-compatibility.
	  if (Sym->getType() != STT_SECTION)
	    return Signature;
	  return getSectionName(Sec);
	}

	// Returns the member entries of the group section Sec, i.e. its contents
	// without the leading flag word. Aborts with a fatal error if the section
	// is empty or its first word is not GRP_COMDAT.
	template <class ELFT>
	ArrayRef<typename ObjFile<ELFT>::Elf_Word>
	ObjFile<ELFT>::getShtGroupEntries(const Elf_Shdr &Sec) {
	  const ELFFile<ELFT> &Obj = this->getObj();
	  ArrayRef<Elf_Word> Words =
	      CHECK(Obj.template getSectionContentsAsArray<Elf_Word>(&Sec), this);

	  const bool IsComdatGroup = !Words.empty() && Words.front() == GRP_COMDAT;
	  if (!IsComdatGroup)
	    fatal(toString(this) + ": unsupported SHT_GROUP format");
	  return Words.drop_front();
	}

	// Returns true if the section described by Sec should be handled as a
	// mergeable section. Aborts with a fatal error for SHF_MERGE sections that
	// are malformed (size not a multiple of sh_entsize) or writable.
	template <class ELFT> bool ObjFile<ELFT>::shouldMerge(const Elf_Shdr &Sec) {
	  // We don't merge sections if -O0 (default is -O1). This makes sometimes
	  // the linker significantly faster, although the output will be bigger.
	  if (Config->Optimize == 0)
	    return false;

	  // Only sections that are flagged SHF_MERGE are candidates. This has to be
	  // checked before sh_entsize is validated; otherwise an ordinary section
	  // whose size is not a multiple of its sh_entsize would abort the link
	  // with a diagnostic about SHF_MERGE sections.
	  uint64_t Flags = Sec.sh_flags;
	  if (!(Flags & SHF_MERGE))
	    return false;

	  // A mergeable section with size 0 is useless because they don't have
	  // any data to merge. A mergeable string section with size 0 can be
	  // argued as invalid because it doesn't end with a null character.
	  // We'll avoid a mess by handling them as if they were non-mergeable.
	  if (Sec.sh_size == 0)
	    return false;

	  // Check for sh_entsize. The ELF spec is not clear about the zero
	  // sh_entsize. It says that "the member [sh_entsize] contains 0 if
	  // the section does not hold a table of fixed-size entries". We know
	  // that Rust 1.13 produces a string mergeable section with a zero
	  // sh_entsize. Here we just accept it rather than being picky about it.
	  uint64_t EntSize = Sec.sh_entsize;
	  if (EntSize == 0)
	    return false;
	  if (Sec.sh_size % EntSize)
	    fatal(toString(this) +
	          ": SHF_MERGE section size must be a multiple of sh_entsize");

	  if (Flags & SHF_WRITE)
	    fatal(toString(this) + ": writable SHF_MERGE section is not supported");

	  return true;
	}

	// Populates this->Sections with one entry per section header of the object.
	//
	// For every section header the loop either marks the slot as
	// &InputSection::Discarded, leaves it null (SHT_STRTAB, SHT_NULL, symbol
	// table sections), or stores the result of createInputSection(). SHT_GROUP
	// sections are de-duplicated through ComdatGroups: the first file that
	// inserts a signature keeps its members, later ones get their members
	// discarded. SHT_SYMTAB / SHT_SYMTAB_SHNDX sections initialize the symbol
	// table state instead of producing an input section.
	template <class ELFT>
	void ObjFile<ELFT>::initializeSections(
	DenseSet<CachedHashStringRef> &ComdatGroups) {
	const ELFFile<ELFT> &Obj = this->getObj();

	ArrayRef<Elf_Shdr> ObjSections = CHECK(this->getObj().sections(), this);
	uint64_t Size = ObjSections.size();
	this->Sections.resize(Size);
	this->SectionStringTable =
	CHECK(Obj.getSectionStringTable(ObjSections), this);

	for (size_t I = 0, E = ObjSections.size(); I < E; I++) {
	// A slot may already be marked as discarded by a duplicate SHT_GROUP
	// section processed in an earlier iteration (see the loop over
	// getShtGroupEntries() below).
	if (this->Sections[I] == &InputSection::Discarded)
	continue;
	const Elf_Shdr &Sec = ObjSections[I];

	// SHF_EXCLUDE'ed sections are discarded by the linker. However,
	// if -r is given, we'll let the final link discard such sections.
	// This is compatible with GNU.
	if ((Sec.sh_flags & SHF_EXCLUDE) && !Config->Relocatable) {
	this->Sections[I] = &InputSection::Discarded;
	continue;
	}

	switch (Sec.sh_type) {
	case SHT_GROUP: {
	// De-duplicate section groups by their signatures.
	StringRef Signature = getShtGroupSignature(ObjSections, Sec);
	bool IsNew = ComdatGroups.insert(CachedHashStringRef(Signature)).second;
	this->Sections[I] = &InputSection::Discarded;

	// If it is a new section group, we want to keep group members.
	// Group leader sections, which contain indices of group members, are
	// discarded because they are useless beyond this point. The only
	// exception is the -r option because in order to produce re-linkable
	// object files, we want to pass through basically everything.
	if (IsNew) {
	if (Config->Relocatable)
	this->Sections[I] = createInputSection(Sec);
	// Note: this `continue` also skips the SHF_LINK_ORDER handling at the
	// bottom of the loop body.
	continue;
	}

	// Otherwise, discard group members.
	for (uint32_t SecIndex : getShtGroupEntries(Sec)) {
	if (SecIndex >= Size)
	fatal(toString(this) +
	": invalid section index in group: " + Twine(SecIndex));
	this->Sections[SecIndex] = &InputSection::Discarded;
	}
	break;
	}
	case SHT_SYMTAB:
	this->initSymtab(ObjSections, &Sec);
	break;
	case SHT_SYMTAB_SHNDX:
	this->SymtabSHNDX = CHECK(Obj.getSHNDXTable(Sec, ObjSections), this);
	break;
	case SHT_STRTAB:
	case SHT_NULL:
	break;
	default:
	this->Sections[I] = createInputSection(Sec);
	}

	// .ARM.exidx sections have a reverse dependency on the InputSection they
	// have a SHF_LINK_ORDER dependency, this is identified by the sh_link.
	// NOTE(review): only the range of sh_link is validated here. The slot
	// this->Sections[Sec.sh_link] is dereferenced as-is, so this appears to
	// rely on the linked-to section having been created already (i.e. having
	// a lower index) and on this->Sections[I] being an InputSection -- confirm.
	if (Sec.sh_flags & SHF_LINK_ORDER) {
	if (Sec.sh_link >= this->Sections.size())
	fatal(toString(this) +
	": invalid sh_link index: " + Twine(Sec.sh_link));
	this->Sections[Sec.sh_link]->DependentSections.push_back(
	cast<InputSection>(this->Sections[I]));
	}
	}
	}

	// lld's ARM support relies on a few instructions and encodings that are not
	// present on every ARM architecture:
	// - the BLX instruction, for ARM/Thumb interworking;
	// - the extended (J1/J2) Thumb branch encoding in relocations;
	// - MOVT/MOVW in Thumb thunks.
	// The ARM attributes section records the architecture selected at compile
	// time. By convention, as soon as one input object targets an architecture
	// that provides a feature, lld is allowed to use it, so this function only
	// ever switches the Config flags on.
	static void updateSupportedARMFeatures(const ARMAttributeParser &Attributes) {
	  if (!Attributes.hasAttribute(ARMBuildAttrs::CPU_arch))
	    return;

	  auto Arch = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch);
	  switch (Arch) {
	  case ARMBuildAttrs::Pre_v4:
	  case ARMBuildAttrs::v4:
	  case ARMBuildAttrs::v4T:
	    // Architectures prior to v5 have no BLX: nothing to enable.
	    return;
	  case ARMBuildAttrs::v5T:
	  case ARMBuildAttrs::v5TE:
	  case ARMBuildAttrs::v5TEJ:
	  case ARMBuildAttrs::v6:
	  case ARMBuildAttrs::v6KZ:
	  case ARMBuildAttrs::v6K:
	    // Pre-Cortex architectures: BLX is available, but not the J1 = 1 J2 = 1
	    // Thumb branch range extension (v6T2 is the exception and is handled
	    // by the generic path below).
	    Config->ARMHasBlx = true;
	    return;
	  default:
	    break;
	  }

	  // Every remaining architecture has BLX and the extended branch encoding.
	  Config->ARMHasBlx = true;
	  Config->ARMJ1J2BranchEncoding = true;

	  // Of the architectures used in Cortex processors, only v6-M and v6S-M
	  // lack the MOVT and MOVW instructions.
	  const bool IsV6M =
	      Arch == ARMBuildAttrs::v6_M || Arch == ARMBuildAttrs::v6S_M;
	  if (!IsV6M)
	    Config->ARMHasMovtMovw = true;
	}

	// Returns the input section that the relocation section Sec applies to
	// (identified by sh_info), or nullptr if that section was discarded.
	// An out-of-range index or a target with no input section is fatal.
	template <class ELFT>
	InputSectionBase *ObjFile<ELFT>::getRelocTarget(const Elf_Shdr &Sec) {
	  const uint32_t TargetIdx = Sec.sh_info;
	  if (TargetIdx >= this->Sections.size())
	    fatal(toString(this) +
	          ": invalid relocated section index: " + Twine(TargetIdx));

	  InputSectionBase *Relocated = this->Sections[TargetIdx];
	  if (!Relocated)
	    fatal(toString(this) + ": unsupported relocation reference");

	  // Strictly speaking, a relocation section must be included in the
	  // group of the section it relocates. However, LLVM 3.3 and earlier
	  // would fail to do so, so we gracefully handle that case.
	  if (Relocated == &InputSection::Discarded)
	    return nullptr;
	  return Relocated;
	}

	// Creates a regular InputSection that has the same file, flags, type,
	// alignment, data and name as the given mergeable section.
	//
	// Both the parameter and the result are pointers: the body dereferences Sec
	// with `->`, make<>() yields a pointer, and the caller passes the pointer
	// obtained from dyn_cast<MergeInputSection>() and stores the result in an
	// InputSectionBase *.
	static InputSection *toRegularSection(MergeInputSection *Sec) {
	  return make<InputSection>(Sec->File, Sec->Flags, Sec->Type, Sec->Alignment,
	                            Sec->Data, Sec->Name);
	}

	// Creates the in-memory representation of section Sec.
	//
	// Returns:
	// - a newly made InputSection / EhInputSection / MergeInputSection for
	//   sections that are kept;
	// - &InputSection::Discarded for sections that are intentionally dropped
	//   (.note.GNU-stack, .note.GNU-split-stack, .gnu.linkonce.*, and every ARM
	//   attributes section after the first);
	// - nullptr for relocation sections that are consumed here (their contents
	//   are attached to the relocated section) or whose target was discarded.
	template <class ELFT>
	InputSectionBase *ObjFile<ELFT>::createInputSection(const Elf_Shdr &Sec) {
	  StringRef Name = getSectionName(Sec);

	  switch (Sec.sh_type) {
	  case SHT_ARM_ATTRIBUTES: {
	    if (Config->EMachine != EM_ARM)
	      break;
	    ARMAttributeParser Attributes;
	    ArrayRef<uint8_t> Contents =
	        CHECK(this->getObj().getSectionContents(&Sec), this);
	    Attributes.Parse(Contents, /*isLittle*/ Config->EKind == ELF32LEKind);
	    updateSupportedARMFeatures(Attributes);
	    // FIXME: Retain the first attribute section we see. The eglibc ARM
	    // dynamic loaders require the presence of an attribute section for dlopen
	    // to work. In a full implementation we would merge all attribute sections.
	    if (InX::ARMAttributes == nullptr) {
	      InX::ARMAttributes = make<InputSection>(*this, Sec, Name);
	      return InX::ARMAttributes;
	    }
	    return &InputSection::Discarded;
	  }
	  case SHT_RELA:
	  case SHT_REL: {
	    // Find the relocation target section and associate this
	    // section with it. The target can be discarded, for example
	    // if it is a duplicated member of a SHT_GROUP section; in that
	    // case we do not create or process the relocation section.
	    InputSectionBase *Target = getRelocTarget(Sec);
	    if (!Target)
	      return nullptr;

	    // This section contains relocation information.
	    // If -r is given, we do not interpret or apply relocations
	    // but just copy relocation sections to the output.
	    if (Config->Relocatable)
	      return make<InputSection>(*this, Sec, Name);

	    if (Target->FirstRelocation)
	      fatal(toString(this) +
	            ": multiple relocation sections to one section are not supported");

	    // Mergeable sections with relocations are tricky because relocations
	    // need to be taken into account when comparing section contents for
	    // merging. It's not worth supporting such mergeable sections because
	    // they are rare and it would complicate the internal design (we usually
	    // have to determine if two sections are mergeable early in the link
	    // process, long before applying relocations). We simply handle mergeable
	    // sections with relocations as non-mergeable.
	    if (auto *MS = dyn_cast<MergeInputSection>(Target)) {
	      Target = toRegularSection(MS);
	      this->Sections[Sec.sh_info] = Target;
	    }

	    size_t NumRelocations;
	    if (Sec.sh_type == SHT_RELA) {
	      ArrayRef<Elf_Rela> Rels = CHECK(this->getObj().relas(&Sec), this);
	      Target->FirstRelocation = Rels.begin();
	      NumRelocations = Rels.size();
	      Target->AreRelocsRela = true;
	    } else {
	      ArrayRef<Elf_Rel> Rels = CHECK(this->getObj().rels(&Sec), this);
	      Target->FirstRelocation = Rels.begin();
	      NumRelocations = Rels.size();
	      Target->AreRelocsRela = false;
	    }
	    assert(isUInt<31>(NumRelocations));
	    Target->NumRelocations = NumRelocations;

	    // Relocation sections processed by the linker are usually removed
	    // from the output, so we return `nullptr` in the normal case.
	    // However, if -emit-relocs is given, we need to leave them in the output.
	    // (Some post link analysis tools need this information.)
	    if (Config->EmitRelocs) {
	      InputSection *RelocSec = make<InputSection>(*this, Sec, Name);
	      // We will not emit relocation section if target was discarded.
	      Target->DependentSections.push_back(RelocSec);
	      return RelocSec;
	    }
	    return nullptr;
	  }
	  }

	  // The GNU linker uses the .note.GNU-stack section as a marker indicating
	  // that the code in the object file does not expect the stack to be
	  // executable (in terms of the NX bit). If all input files have the marker,
	  // the GNU linker adds a PT_GNU_STACK segment to tell the loader to
	  // make the stack non-executable. Most object files have this section as
	  // of 2017.
	  //
	  // But making the stack non-executable is the norm today for security
	  // reasons. Failure to do so may result in a serious security issue.
	  // Therefore, we make LLD always add PT_GNU_STACK unless it is
	  // explicitly told to do otherwise (by -z execstack). Because the stack
	  // executable-ness is controlled solely by command line options,
	  // .note.GNU-stack sections are simply ignored.
	  if (Name == ".note.GNU-stack")
	    return &InputSection::Discarded;

	  // Split stacks is a feature to support a discontiguous stack. At least
	  // as of 2017, it seems that the feature is not being used widely.
	  // Only GNU gold supports that. We don't. For the details about that,
	  // see https://gcc.gnu.org/wiki/SplitStacks
	  if (Name == ".note.GNU-split-stack") {
	    error(toString(this) +
	          ": object file compiled with -fsplit-stack is not supported");
	    return &InputSection::Discarded;
	  }

	  // The linkonce feature is a sort of proto-comdat. Some glibc i386 object
	  // files contain definitions of symbol "__x86.get_pc_thunk.bx" in linkonce
	  // sections. Drop those sections to avoid duplicate symbol errors.
	  // FIXME: This is glibc PR20543, we should remove this hack once that has been
	  // fixed for a while.
	  if (Name.startswith(".gnu.linkonce."))
	    return &InputSection::Discarded;

	  // The linker merges EH (exception handling) frames and creates a
	  // .eh_frame_hdr section for runtime. So we handle them with a special
	  // class. For relocatable outputs, they are just passed through.
	  if (Name == ".eh_frame" && !Config->Relocatable)
	    return make<EhInputSection>(*this, Sec, Name);

	  if (shouldMerge(Sec))
	    return make<MergeInputSection>(*this, Sec, Name);
	  return make<InputSection>(*this, Sec, Name);
	}

	// Looks up the name of Sec in this file's section string table; a lookup
	// failure is reported through CHECK with this file as context.
	template <class ELFT>
	StringRef ObjFile<ELFT>::getSectionName(const Elf_Shdr &Sec) {
	  const ELFFile<ELFT> &Obj = this->getObj();
	  return CHECK(Obj.getSectionName(&Sec, SectionStringTable), this);
	}

	// Creates one Symbol per entry of this->ELFSyms, in symbol-table order, and
	// appends them to this->Symbols.
	template <class ELFT> void ObjFile<ELFT>::initializeSymbols() {
	  const size_t NumSyms = this->ELFSyms.size();
	  this->Symbols.reserve(NumSyms);
	  for (size_t I = 0; I != NumSyms; ++I)
	    this->Symbols.push_back(createSymbol(&this->ELFSyms[I]));
	}

	// Creates the Symbol object for the ELF symbol table entry Sym.
	//
	// Local symbols are created directly with make<>() as Undefined or Defined
	// and are not entered into the global symbol table. Non-local symbols are
	// registered through Symtab (addUndefined / addCommon / addRegular). A
	// non-local symbol defined in a discarded section is added as undefined.
	//
	// Fatal errors are reported for an out-of-range section index, an invalid
	// name offset on a local symbol, a common symbol whose alignment (st_value)
	// is 0 or >= UINT32_MAX, and an unexpected binding.
	template <class ELFT> Symbol *ObjFile<ELFT>::createSymbol(const Elf_Sym *Sym) {
	  int Binding = Sym->getBinding();

	  uint32_t SecIdx = this->getSectionIndex(*Sym);
	  if (SecIdx >= this->Sections.size())
	    fatal(toString(this) + ": invalid section index: " + Twine(SecIdx));

	  InputSectionBase *Sec = this->Sections[SecIdx];
	  uint8_t StOther = Sym->st_other;
	  uint8_t Type = Sym->getType();
	  uint64_t Value = Sym->st_value;
	  uint64_t Size = Sym->st_size;

	  if (Binding == STB_LOCAL) {
	    // An STT_FILE symbol names the source file this object was built from.
	    if (Sym->getType() == STT_FILE)
	      SourceFile = CHECK(Sym->getName(this->StringTable), this);

	    if (this->StringTable.size() <= Sym->st_name)
	      fatal(toString(this) + ": invalid symbol name offset");

	    StringRefZ Name = this->StringTable.data() + Sym->st_name;
	    if (Sym->st_shndx == SHN_UNDEF)
	      return make<Undefined>(this, Name, Binding, StOther, Type);

	    return make<Defined>(this, Name, Binding, StOther, Type, Value, Size, Sec);
	  }

	  StringRef Name = CHECK(Sym->getName(this->StringTable), this);

	  switch (Sym->st_shndx) {
	  case SHN_UNDEF:
	    return Symtab->addUndefined<ELFT>(Name, Binding, StOther, Type,
	                                      /*CanOmitFromDynSym=*/false, this);
	  case SHN_COMMON:
	    // For a common symbol, st_value holds the alignment constraint.
	    if (Value == 0 || Value >= UINT32_MAX)
	      fatal(toString(this) + ": common symbol '" + Name +
	            "' has invalid alignment: " + Twine(Value));
	    return Symtab->addCommon(Name, Size, Value, Binding, StOther, Type, *this);
	  }

	  switch (Binding) {
	  default:
	    fatal(toString(this) + ": unexpected binding: " + Twine(Binding));
	  case STB_GLOBAL:
	  case STB_WEAK:
	  case STB_GNU_UNIQUE:
	    if (Sec == &InputSection::Discarded)
	      return Symtab->addUndefined<ELFT>(Name, Binding, StOther, Type,
	                                        /*CanOmitFromDynSym=*/false, this);
	    return Symtab->addRegular(Name, StOther, Type, Value, Size, Binding, Sec,
	                              this);
	  }
	}

	// Takes ownership of an already-opened archive. The InputFile initializer
	// reads the archive's MemoryBufferRef through the File parameter, and the
	// parameter is then moved into the member of the same name.
	ArchiveFile::ArchiveFile(std::unique_ptr<Archive> &&File)
	: InputFile(ArchiveKind, File->getMemoryBufferRef()),
	File(std::move(File)) {}

	// Registers every symbol listed in the archive's symbol table as a lazy
	// archive symbol and remembers the resulting Symbol in this->Symbols.
	template <class ELFT> void ArchiveFile::parse() {
	  Symbols.reserve(File->getNumberOfSymbols());
	  for (const Archive::Symbol &Entry : File->symbols()) {
	    StringRef SymName = Entry.getName();
	    Symbols.push_back(Symtab->addLazyArchive<ELFT>(SymName, *this, Entry));
	  }
	}

	// Returns the buffer of the archive member that defines Sym, paired with an
	// offset: the member's child offset for regular archives, 0 for thin ones.
	// If the member was already handed out before, an empty MemoryBufferRef and
	// offset 0 are returned instead.
	std::pair<MemoryBufferRef, uint64_t>
	ArchiveFile::getMember(const Archive::Symbol *Sym) {
	  Archive::Child C =
	      CHECK(Sym->getMember(), toString(this) +
	                                  ": could not get the member for symbol " +
	                                  Sym->getName());

	  // Each member is returned at most once, keyed by its offset.
	  const bool FirstRequest = Seen.insert(C.getChildOffset()).second;
	  if (!FirstRequest)
	    return {MemoryBufferRef(), 0};

	  MemoryBufferRef Ret =
	      CHECK(C.getMemoryBufferRef(),
	            toString(this) +
	                ": could not get the buffer for the member defining symbol " +
	                Sym->getName());

	  const bool Thin = C.getParent()->isThin();
	  // Members of thin archives are also appended to Tar when one is set.
	  if (Thin && Tar)
	    Tar->append(relativeToRoot(CHECK(C.getFullName(), this)), Ret.getBuffer());
	  return {Ret, Thin ? 0 : C.getChildOffset()};
	}

	// Constructs a shared-object input file over buffer M. SoName starts out as
	// DefaultSoName (parseSoName() replaces it when the file has a DT_SONAME
	// entry), and IsNeeded is initialized to the negation of Config->AsNeeded.
	template <class ELFT>
	SharedFile<ELFT>::SharedFile(MemoryBufferRef M, StringRef DefaultSoName)
	: ELFFileBase<ELFT>(Base::SharedKind, M), SoName(DefaultSoName),
	IsNeeded(!Config->AsNeeded) {}

	// Partially parses the shared object file so that getSoName can be called
	// on this object. As a side effect this also initializes the dynamic symbol
	// table and records the SHT_SYMTAB_SHNDX, SHT_GNU_versym and SHT_GNU_verdef
	// sections for later use.
	template <class ELFT> void SharedFile<ELFT>::parseSoName() {
	  const ELFFile<ELFT> Obj = this->getObj();
	  ArrayRef<Elf_Shdr> Sections = CHECK(Obj.sections(), this);
	  const Elf_Shdr *DynamicSec = nullptr;

	  // Pick up the sections of interest: SHT_DYNSYM, SHT_DYNAMIC,
	  // SHT_SYMTAB_SHNDX, SHT_GNU_versym and SHT_GNU_verdef.
	  for (const Elf_Shdr &Sec : Sections) {
	    switch (Sec.sh_type) {
	    case SHT_DYNSYM:
	      this->initSymtab(Sections, &Sec);
	      break;
	    case SHT_DYNAMIC:
	      DynamicSec = &Sec;
	      break;
	    case SHT_SYMTAB_SHNDX:
	      this->SymtabSHNDX = CHECK(Obj.getSHNDXTable(Sec, Sections), this);
	      break;
	    case SHT_GNU_versym:
	      this->VersymSec = &Sec;
	      break;
	    case SHT_GNU_verdef:
	      this->VerdefSec = &Sec;
	      break;
	    default:
	      break;
	    }
	  }

	  if (this->VersymSec && this->ELFSyms.empty())
	    error("SHT_GNU_versym should be associated with symbol table");

	  // Without a dynamic section there is no DT_SONAME to look for.
	  if (!DynamicSec)
	    return;

	  // The first DT_SONAME tag determines this->SoName.
	  ArrayRef<Elf_Dyn> Arr =
	      CHECK(Obj.template getSectionContentsAsArray<Elf_Dyn>(DynamicSec), this);
	  for (const Elf_Dyn &Dyn : Arr) {
	    if (Dyn.d_tag != DT_SONAME)
	      continue;
	    uint64_t NameOffset = Dyn.getVal();
	    if (NameOffset >= this->StringTable.size())
	      fatal(toString(this) + ": invalid DT_SONAME entry");
	    SoName = this->StringTable.data() + NameOffset;
	    return;
	  }
	}

	// Parses the version definitions in the object file if present. Returns a
	// vector whose nth element contains a pointer to the Elf_Verdef for version
	// identifier n. Version identifiers that are not definitions map to nullptr.
	// The vector always has at least length 1.
	//
	// Versym is an output parameter: when the file has both a versym and a
	// verdef section it is set to the versym entry of the first non-local
	// symbol; otherwise it is left untouched.
	template <class ELFT>
	std::vector<const typename ELFT::Verdef *>
	SharedFile<ELFT>::parseVerdefs(const Elf_Versym *&Versym) {
	  std::vector<const Elf_Verdef *> Verdefs(1);
	  // We only need to process symbol versions for this DSO if it has both a
	  // versym and a verdef section, which indicates that the DSO contains symbol
	  // version definitions.
	  if (!VersymSec || !VerdefSec)
	    return Verdefs;

	  // The location of the first global versym entry.
	  const char *Base = this->MB.getBuffer().data();
	  Versym = reinterpret_cast<const Elf_Versym *>(Base + VersymSec->sh_offset) +
	           this->FirstNonLocal;

	  // We cannot determine the largest verdef identifier without inspecting
	  // every Elf_Verdef, but both bfd and gold assign verdef identifiers
	  // sequentially starting from 1, so we predict that the largest identifier
	  // will be VerdefCount.
	  unsigned VerdefCount = VerdefSec->sh_info;
	  Verdefs.resize(VerdefCount + 1);

	  // Build the Verdefs array by following the chain of Elf_Verdef objects
	  // from the start of the .gnu.version_d section.
	  const char *Verdef = Base + VerdefSec->sh_offset;
	  for (unsigned I = 0; I != VerdefCount; ++I) {
	    auto *CurVerdef = reinterpret_cast<const Elf_Verdef *>(Verdef);
	    Verdef += CurVerdef->vd_next;
	    unsigned VerdefIndex = CurVerdef->vd_ndx;
	    // Grow the table if an identifier exceeds the prediction above.
	    if (Verdefs.size() <= VerdefIndex)
	      Verdefs.resize(VerdefIndex + 1);
	    Verdefs[VerdefIndex] = CurVerdef;
	  }

	  return Verdefs;
	}

	// Fully parse the shared object file. This must be called after parseSoName().
	//
	// Walks the global symbols of the DSO: names of undefined symbols are
	// collected in Undefs, and every other accepted symbol is registered with
	// Symtab->addShared(). A symbol that has a version definition is registered
	// a second time under the name "<Name>@<VersionName>".
	template <class ELFT> void SharedFile<ELFT>::parseRest() {
	// Create mapping from version identifiers to Elf_Verdef entries.
	const Elf_Versym *Versym = nullptr;
	Verdefs = parseVerdefs(Versym);

	ArrayRef<Elf_Shdr> Sections = CHECK(this->getObj().sections(), this);

	// Add symbols to the symbol table.
	Elf_Sym_Range Syms = this->getGlobalELFSyms();
	for (const Elf_Sym &Sym : Syms) {
	// When version information is present, Versym is advanced once per
	// symbol so that it stays in step with Syms.
	unsigned VersymIndex = VER_NDX_GLOBAL;
	if (Versym) {
	VersymIndex = Versym->vs_index;
	++Versym;
	}
	// Split the raw index into the hidden flag and the version identifier.
	bool Hidden = VersymIndex & VERSYM_HIDDEN;
	VersymIndex = VersymIndex & ~VERSYM_HIDDEN;

	StringRef Name = CHECK(Sym.getName(this->StringTable), this);
	if (Sym.isUndefined()) {
	Undefs.push_back(Name);
	continue;
	}

	if (Sym.getBinding() == STB_LOCAL) {
	warn("found local symbol '" + Name +
	"' in global part of symbol table in file " + toString(this));
	continue;
	}

	+ if (Config->EMachine == EM_MIPS) {
	+ // FIXME: MIPS BFD linker puts _gp_disp symbol into DSO files
	+ // and incorrectly assigns VER_NDX_LOCAL to this section global
	+ // symbol. Here is a workaround for this bug.
	+ if (Versym && VersymIndex == VER_NDX_LOCAL && Name == "_gp_disp")
	+ continue;
	+ }
	+
	const Elf_Verdef *Ver = nullptr;
	if (VersymIndex != VER_NDX_GLOBAL) {
	if (VersymIndex >= Verdefs.size() \|\| VersymIndex == VER_NDX_LOCAL) {
	error("corrupt input file: version definition index " +
	Twine(VersymIndex) + " for symbol " + Name +
	" is out of bounds\n>>> defined in " + toString(this));
	continue;
	}
	Ver = Verdefs[VersymIndex];
	} else {
	// VER_NDX_GLOBAL symbols are passed to addShared() with index 0.
	VersymIndex = 0;
	}

	// We do not usually care about alignments of data in shared object
	// files because the loader takes care of it. However, if we promote a
	// DSO symbol to point to .bss due to copy relocation, we need to keep
	// the original alignment requirements. We infer it here.
	// The inferred value is the largest power of two dividing st_value,
	// capped by the sh_addralign of the section the symbol is defined in.
	uint64_t Alignment = 1;
	if (Sym.st_value)
	Alignment = 1ULL << countTrailingZeros((uint64_t)Sym.st_value);
	if (0 < Sym.st_shndx && Sym.st_shndx < Sections.size()) {
	uint64_t SecAlign = Sections[Sym.st_shndx].sh_addralign;
	Alignment = std::min(Alignment, SecAlign);
	}
	if (Alignment > UINT32_MAX)
	error(toString(this) + ": alignment too large: " + Name);

	// Symbols marked VERSYM_HIDDEN are not added under their plain name.
	if (!Hidden)
	Symtab->addShared(Name, *this, Sym, Alignment, VersymIndex);

	// Also add the symbol with the versioned name to handle undefined symbols
	// with explicit versions.
	if (Ver) {
	StringRef VerName = this->StringTable.data() + Ver->getAux()->vda_name;
	Name = Saver.save(Name + "@" + VerName);
	Symtab->addShared(Name, *this, Sym, Alignment, VersymIndex);
	}
	}
	}

	// Maps the endianness and pointer width of a bitcode target triple to the
	// corresponding ELFKind.
	static ELFKind getBitcodeELFKind(const Triple &T) {
	  const bool Is64 = T.isArch64Bit();
	  if (!T.isLittleEndian())
	    return Is64 ? ELF64BEKind : ELF32BEKind;
	  return Is64 ? ELF64LEKind : ELF32LEKind;
	}

	// Derives the ELF e_machine value from the architecture of a bitcode
	// file's target triple. Path is only used to prefix the fatal error that
	// is raised for architectures not listed here.
	static uint8_t getBitcodeMachineKind(StringRef Path, const Triple &T) {
	  const auto Arch = T.getArch();
	  switch (Arch) {
	  case Triple::aarch64:
	    return EM_AARCH64;
	  case Triple::arm:
	  case Triple::thumb:
	    return EM_ARM;
	  case Triple::avr:
	    return EM_AVR;
	  case Triple::mips:
	  case Triple::mipsel:
	  case Triple::mips64:
	  case Triple::mips64el:
	    return EM_MIPS;
	  case Triple::ppc:
	    return EM_PPC;
	  case Triple::ppc64:
	    return EM_PPC64;
	  case Triple::x86:
	    if (T.isOSIAMCU())
	      return EM_IAMCU;
	    return EM_386;
	  case Triple::x86_64:
	    return EM_X86_64;
	  default:
	    break;
	  }
	  fatal(Path + ": could not infer e_machine from bitcode target triple " +
	        T.str());
	}

	// Opens MB as an LTO input file and derives EKind / EMachine from its
	// target triple.
	BitcodeFile::BitcodeFile(MemoryBufferRef MB, StringRef ArchiveName,
	                         uint64_t OffsetInArchive)
	    : InputFile(BitcodeKind, MB) {
	  this->ArchiveName = ArchiveName;

	  // The buffer handed to the LTO library is identified by ArchiveName (the
	  // fully resolved path of the archive) + member name + offset of the
	  // member in the archive. ThinLTO keys its internal data structures on
	  // the MemoryBufferRef identifier, so if two archives contained members
	  // with the same name, the identifiers would collide and only one of the
	  // objects would be taken into consideration at LTO time (which very
	  // likely causes undefined symbols later in the link stage).
	  StringRef UniqueId = Saver.save(ArchiveName + MB.getBufferIdentifier() +
	                                  utostr(OffsetInArchive));
	  MemoryBufferRef MBRef(MB.getBuffer(), UniqueId);
	  Obj = CHECK(lto::InputFile::create(MBRef), this);

	  Triple T(Obj->getTargetTriple());
	  EKind = getBitcodeELFKind(T);
	  EMachine = getBitcodeMachineKind(MB.getBufferIdentifier(), T);
	}

	// Translates an LLVM IR visibility into the matching ELF STV_* constant.
	static uint8_t mapVisibility(GlobalValue::VisibilityTypes GvVisibility) {
	  if (GvVisibility == GlobalValue::DefaultVisibility)
	    return STV_DEFAULT;
	  if (GvVisibility == GlobalValue::HiddenVisibility)
	    return STV_HIDDEN;
	  if (GvVisibility == GlobalValue::ProtectedVisibility)
	    return STV_PROTECTED;
	  llvm_unreachable("unknown visibility");
	}

	// Registers the bitcode symbol ObjSym of file F with the symbol table and
	// returns the resulting Symbol. A symbol that belongs to a comdat group
	// that was not kept (per KeptComdats) is added as undefined, exactly like
	// a symbol that is undefined in the bitcode file itself.
	template <class ELFT>
	static Symbol *createBitcodeSymbol(const std::vector<bool> &KeptComdats,
	                                   const lto::InputFile::Symbol &ObjSym,
	                                   BitcodeFile &F) {
	  StringRef NameRef = Saver.save(ObjSym.getName());
	  uint32_t Binding = ObjSym.isWeak() ? STB_WEAK : STB_GLOBAL;

	  uint8_t Type = ObjSym.isTLS() ? STT_TLS : STT_NOTYPE;
	  uint8_t Visibility = mapVisibility(ObjSym.getVisibility());
	  bool CanOmitFromDynSym = ObjSym.canBeOmittedFromSymbolTable();

	  // A comdat index of -1 means the symbol is not part of any comdat group.
	  int ComdatIdx = ObjSym.getComdatIndex();
	  const bool InDroppedComdat = ComdatIdx != -1 && !KeptComdats[ComdatIdx];
	  if (InDroppedComdat || ObjSym.isUndefined())
	    return Symtab->addUndefined<ELFT>(NameRef, Binding, Visibility, Type,
	                                      CanOmitFromDynSym, &F);

	  if (!ObjSym.isCommon())
	    return Symtab->addBitcode(NameRef, Binding, Visibility, Type,
	                              CanOmitFromDynSym, F);

	  return Symtab->addCommon(NameRef, ObjSym.getCommonSize(),
	                           ObjSym.getCommonAlignment(), Binding, Visibility,
	                           STT_OBJECT, F);
	}

	// Claims the comdat groups of this bitcode file in ComdatGroups and then
	// creates a Symbol for every symbol of the file.
	template <class ELFT>
	void BitcodeFile::parse(DenseSet<CachedHashStringRef> &ComdatGroups) {
	  // KeptComdats[i] is true iff this file is the first one to insert comdat
	  // group i into ComdatGroups.
	  std::vector<bool> KeptComdats;
	  for (StringRef GroupName : Obj->getComdatTable()) {
	    bool Inserted = ComdatGroups.insert(CachedHashStringRef(GroupName)).second;
	    KeptComdats.push_back(Inserted);
	  }

	  for (const lto::InputFile::Symbol &ObjSym : Obj->symbols())
	    Symbols.push_back(createBitcodeSymbol<ELFT>(KeptComdats, ObjSym, *this));
	}

	// Determines the ELFKind (class x data encoding) of the ELF file in MB.
	//
	// Reports a fatal error, prefixed with the buffer identifier, when the
	// data encoding or file class is not a known value, or when the buffer is
	// smaller than the ELF header of the detected class.
	static ELFKind getELFKind(MemoryBufferRef MB) {
	  unsigned char Size;
	  unsigned char Endian;
	  std::tie(Size, Endian) = getElfArchType(MB.getBuffer());

	  if (Endian != ELFDATA2LSB && Endian != ELFDATA2MSB)
	    fatal(MB.getBufferIdentifier() + ": invalid data encoding");
	  if (Size != ELFCLASS32 && Size != ELFCLASS64)
	    fatal(MB.getBufferIdentifier() + ": invalid file class");

	  // Make sure a complete ELF header of the detected class is present.
	  size_t BufSize = MB.getBuffer().size();
	  if ((Size == ELFCLASS32 && BufSize < sizeof(Elf32_Ehdr)) ||
	      (Size == ELFCLASS64 && BufSize < sizeof(Elf64_Ehdr)))
	    fatal(MB.getBufferIdentifier() + ": file is too short");

	  if (Size == ELFCLASS32)
	    return (Endian == ELFDATA2LSB) ? ELF32LEKind : ELF32BEKind;
	  return (Endian == ELFDATA2LSB) ? ELF64LEKind : ELF64BEKind;
	}

	// Wraps the whole input buffer in a single writable ".data" PROGBITS
	// section (alignment 8) and defines the _binary_<name>_{start,end,size}
	// symbols for it.
	void BinaryFile::parse() {
	  ArrayRef<uint8_t> Data = toArrayRef(MB.getBuffer());
	  auto *Section = make<InputSection>(nullptr, SHF_ALLOC | SHF_WRITE,
	                                     SHT_PROGBITS, 8, Data, ".data");
	  Sections.push_back(Section);

	  // For each input file foo that is embedded to a result as a binary
	  // blob, we define _binary_foo_{start,end,size} symbols, so that
	  // user programs can access blobs by name. Non-alphanumeric
	  // characters in a filename are replaced with underscore.
	  std::string S = "_binary_" + MB.getBufferIdentifier().str();
	  for (char &C : S)
	    if (!isAlnum(C))
	      C = '_';

	  // _start and _end are relative to the section; _size has no section and
	  // carries the blob size as its value.
	  Symtab->addRegular(Saver.save(S + "_start"), STV_DEFAULT, STT_OBJECT, 0, 0,
	                     STB_GLOBAL, Section, nullptr);
	  Symtab->addRegular(Saver.save(S + "_end"), STV_DEFAULT, STT_OBJECT,
	                     Data.size(), 0, STB_GLOBAL, Section, nullptr);
	  Symtab->addRegular(Saver.save(S + "_size"), STV_DEFAULT, STT_OBJECT,
	                     Data.size(), 0, STB_GLOBAL, nullptr, nullptr);
	}

	// Returns true if the magic bytes of MB identify it as an LLVM bitcode file.
	static bool isBitcode(MemoryBufferRef MB) {
	  using namespace sys::fs;
	  const file_magic Magic = identify_magic(MB.getBuffer());
	  return Magic == file_magic::bitcode;
	}

	// Creates the InputFile for a relocatable input: a BitcodeFile if MB holds
	// LLVM bitcode, otherwise the ObjFile instantiation matching the ELF kind
	// of MB.
	InputFile *elf::createObjectFile(MemoryBufferRef MB, StringRef ArchiveName,
	                                 uint64_t OffsetInArchive) {
	  if (isBitcode(MB))
	    return make<BitcodeFile>(MB, ArchiveName, OffsetInArchive);

	  const ELFKind Kind = getELFKind(MB);
	  switch (Kind) {
	  case ELF32LEKind:
	    return make<ObjFile<ELF32LE>>(MB, ArchiveName);
	  case ELF32BEKind:
	    return make<ObjFile<ELF32BE>>(MB, ArchiveName);
	  case ELF64LEKind:
	    return make<ObjFile<ELF64LE>>(MB, ArchiveName);
	  case ELF64BEKind:
	    return make<ObjFile<ELF64BE>>(MB, ArchiveName);
	  default:
	    llvm_unreachable("getELFKind");
	  }
	}

	// Creates the SharedFile instantiation matching the ELF kind of MB.
	InputFile *elf::createSharedFile(MemoryBufferRef MB, StringRef DefaultSoName) {
	  const ELFKind Kind = getELFKind(MB);
	  switch (Kind) {
	  case ELF32LEKind:
	    return make<SharedFile<ELF32LE>>(MB, DefaultSoName);
	  case ELF32BEKind:
	    return make<SharedFile<ELF32BE>>(MB, DefaultSoName);
	  case ELF64LEKind:
	    return make<SharedFile<ELF64LE>>(MB, DefaultSoName);
	  case ELF64BEKind:
	    return make<SharedFile<ELF64BE>>(MB, DefaultSoName);
	  default:
	    llvm_unreachable("getELFKind");
	  }
	}

	// Hands out the file's buffer exactly once: the first call returns MB and
	// sets Seen, every later call returns an empty MemoryBufferRef.
	MemoryBufferRef LazyObjFile::getBuffer() {
	  if (!Seen) {
	    Seen = true;
	    return MB;
	  }
	  return MemoryBufferRef();
	}

	// Turns the lazy file into a real object file. Returns nullptr when
	// getBuffer() yields an empty buffer (e.g. because the buffer was already
	// handed out).
	InputFile *LazyObjFile::fetch() {
	  MemoryBufferRef Buf = getBuffer();
	  const bool NothingToLoad = Buf.getBuffer().empty();
	  return NothingToLoad
	             ? nullptr
	             : createObjectFile(Buf, ArchiveName, OffsetInArchive);
	}

	template <class ELFT> void LazyObjFile::parse() {
	for (StringRef Sym : getSymbolNames())
	Symtab->addLazyObject<ELFT>(Sym, *this);
	}

	template <class ELFT> std::vector<StringRef> LazyObjFile::getElfSymbols() {
	typedef typename ELFT::Shdr Elf_Shdr;
	typedef typename ELFT::Sym Elf_Sym;
	typedef typename ELFT::SymRange Elf_Sym_Range;

	ELFFile<ELFT> Obj = check(ELFFile<ELFT>::create(this->MB.getBuffer()));
	ArrayRef<Elf_Shdr> Sections = CHECK(Obj.sections(), this);
	for (const Elf_Shdr &Sec : Sections) {
	if (Sec.sh_type != SHT_SYMTAB)
	continue;

	Elf_Sym_Range Syms = CHECK(Obj.symbols(&Sec), this);
	uint32_t FirstNonLocal = Sec.sh_info;
	StringRef StringTable =
	CHECK(Obj.getStringTableForSymtab(Sec, Sections), this);
	std::vector<StringRef> V;

	for (const Elf_Sym &Sym : Syms.slice(FirstNonLocal))
	if (Sym.st_shndx != SHN_UNDEF)
	V.push_back(CHECK(Sym.getName(StringTable), this));
	return V;
	}
	return {};
	}

	// Returns the names of all symbols of the bitcode file that are not
	// undefined. The names are copied with Saver.save() before being returned.
	std::vector<StringRef> LazyObjFile::getBitcodeSymbols() {
	  std::unique_ptr<lto::InputFile> Obj =
	      CHECK(lto::InputFile::create(this->MB), this);

	  std::vector<StringRef> Names;
	  for (const lto::InputFile::Symbol &Sym : Obj->symbols()) {
	    if (Sym.isUndefined())
	      continue;
	    Names.push_back(Saver.save(Sym.getName()));
	  }
	  return Names;
	}

	// Returns a vector of globally-visible defined symbol names.
	std::vector<StringRef> LazyObjFile::getSymbolNames() {
	if (isBitcode(this->MB))
	return getBitcodeSymbols();

	switch (getELFKind(this->MB)) {
	case ELF32LEKind:
	return getElfSymbols<ELF32LE>();
	case ELF32BEKind:
	return getElfSymbols<ELF32BE>();
	case ELF64LEKind:
	return getElfSymbols<ELF64LE>();
	case ELF64BEKind:
	return getElfSymbols<ELF64BE>();
	default:
	llvm_unreachable("getELFKind");
	}
	}

	// Explicit instantiations of the member function templates and class
	// templates defined in this file, one per ELF flavour (ELF32LE, ELF32BE,
	// ELF64LE, ELF64BE).
	template void ArchiveFile::parse<ELF32LE>();
	template void ArchiveFile::parse<ELF32BE>();
	template void ArchiveFile::parse<ELF64LE>();
	template void ArchiveFile::parse<ELF64BE>();

	template void BitcodeFile::parse<ELF32LE>(DenseSet<CachedHashStringRef> &);
	template void BitcodeFile::parse<ELF32BE>(DenseSet<CachedHashStringRef> &);
	template void BitcodeFile::parse<ELF64LE>(DenseSet<CachedHashStringRef> &);
	template void BitcodeFile::parse<ELF64BE>(DenseSet<CachedHashStringRef> &);

	template void LazyObjFile::parse<ELF32LE>();
	template void LazyObjFile::parse<ELF32BE>();
	template void LazyObjFile::parse<ELF64LE>();
	template void LazyObjFile::parse<ELF64BE>();

	template class elf::ELFFileBase<ELF32LE>;
	template class elf::ELFFileBase<ELF32BE>;
	template class elf::ELFFileBase<ELF64LE>;
	template class elf::ELFFileBase<ELF64BE>;

	template class elf::ObjFile<ELF32LE>;
	template class elf::ObjFile<ELF32BE>;
	template class elf::ObjFile<ELF64LE>;
	template class elf::ObjFile<ELF64BE>;

	template class elf::SharedFile<ELF32LE>;
	template class elf::SharedFile<ELF32BE>;
	template class elf::SharedFile<ELF64LE>;
	template class elf::SharedFile<ELF64BE>;
	Index: head/contrib/llvm/tools/lld/ELF/Options.td
	===================================================================
	--- head/contrib/llvm/tools/lld/ELF/Options.td (revision 329409)
	+++ head/contrib/llvm/tools/lld/ELF/Options.td (revision 329410)
	@@ -1,437 +1,437 @@
	include "llvm/Option/OptParser.td"

	// For options whose names are multiple letters, either one dash or
	// two can precede the option name except those that start with 'o'.
	// F, J and S are shorthands for Flag, Joined and Separate options that
	// accept both the "--" and the "-" prefix.
	class F<string name>: Flag<["--", "-"], name>;
	class J<string name>: Joined<["--", "-"], name>;
	class S<string name>: Separate<["--", "-"], name>;

	// Eq defines two records per option: a Separate option called `name`, and a
	// Joined option called `name=` (record suffix `_eq`) that is an alias of
	// the Separate one.
	multiclass Eq<string name> {
	def "": Separate<["--", "-"], name>;
	def _eq: Joined<["--", "-"], name # "=">, Alias<!cast<Separate>(NAME)>;
	}

	// Option records. `def` entries use the F/J/S shorthands directly; `defm`
	// entries instantiate the Eq multiclass and therefore exist both in a
	// separate-argument and in a `name=value` spelling.
	def auxiliary: S<"auxiliary">, HelpText<"Set DT_AUXILIARY field to the specified name">;

	def Bsymbolic: F<"Bsymbolic">, HelpText<"Bind defined symbols locally">;

	def Bsymbolic_functions: F<"Bsymbolic-functions">,
	HelpText<"Bind defined function symbols locally">;

	def Bdynamic: F<"Bdynamic">, HelpText<"Link against shared libraries">;

	def Bstatic: F<"Bstatic">, HelpText<"Do not link against shared libraries">;

	// build-id exists both as a plain flag and as a joined `build-id=` option.
	def build_id: F<"build-id">, HelpText<"Generate build ID note">;

	def build_id_eq: J<"build-id=">, HelpText<"Generate build ID note">;

	defm compress_debug_sections : Eq<"compress-debug-sections">,
	HelpText<"Compress DWARF debug sections">;

	defm defsym: Eq<"defsym">, HelpText<"Define a symbol alias">;

	defm library_path: Eq<"library-path">,
	HelpText<"Add a directory to the library search path">, MetaVarName<"<dir>">;

	// -O is declared with a single-dash prefix only.
	def O: JoinedOrSeparate<["-"], "O">, HelpText<"Optimize output file size">;

	defm Tbss: Eq<"Tbss">,
	HelpText<"Same as --section-start with .bss as the sectionname">;

	defm Tdata: Eq<"Tdata">,
	HelpText<"Same as --section-start with .data as the sectionname">;

	defm Ttext: Eq<"Ttext">,
	HelpText<"Same as --section-start with .text as the sectionname">;

	// Remaining long options, listed in alphabetical order of the option name.
	def allow_multiple_definition: F<"allow-multiple-definition">,
	HelpText<"Allow multiple definitions">;

	def as_needed: F<"as-needed">,
	HelpText<"Only set DT_NEEDED for shared libraries if used">;

	// -chroot doesn't have a help text because it is an internal option.
	def chroot: S<"chroot">;

	// color-diagnostics exists both as a plain flag and as a joined
	// `color-diagnostics=` option.
	def color_diagnostics: F<"color-diagnostics">,
	HelpText<"Use colors in diagnostics">;

	def color_diagnostics_eq: J<"color-diagnostics=">,
	HelpText<"Use colors in diagnostics">;

	def define_common: F<"define-common">,
	HelpText<"Assign space to common symbols">;

	def demangle: F<"demangle">, HelpText<"Demangle symbol names">;

	def disable_new_dtags: F<"disable-new-dtags">,
	HelpText<"Disable new dynamic tags">;

	def discard_all: F<"discard-all">, HelpText<"Delete all local symbols">;

	def discard_locals: F<"discard-locals">,
	HelpText<"Delete temporary local symbols">;

	def discard_none: F<"discard-none">,
	HelpText<"Keep all symbols in the symbol table">;

	def dynamic_linker: S<"dynamic-linker">,
	HelpText<"Which dynamic linker to use">;

	defm dynamic_list: Eq<"dynamic-list">,
	HelpText<"Read a list of dynamic symbols">;

	def eh_frame_hdr: F<"eh-frame-hdr">,
	HelpText<"Request creation of .eh_frame_hdr section and PT_GNU_EH_FRAME segment header">;

	def emit_relocs: F<"emit-relocs">, HelpText<"Generate relocations in output">;

	def enable_new_dtags: F<"enable-new-dtags">,
	HelpText<"Enable new dynamic tags">;

	def end_lib: F<"end-lib">,
	HelpText<"End a grouping of objects that should be treated as if they were together in an archive">;

	defm entry: Eq<"entry">, HelpText<"Name of entry point symbol">,
	MetaVarName<"<entry>">;

	defm error_limit: Eq<"error-limit">,
	HelpText<"Maximum number of errors to emit before stopping (0 = no limit)">;

	def error_unresolved_symbols: F<"error-unresolved-symbols">,
	HelpText<"Report unresolved symbols as errors">;

	defm exclude_libs: Eq<"exclude-libs">,
	HelpText<"Exclude static libraries from automatic export">;

	def export_dynamic: F<"export-dynamic">,
	HelpText<"Put symbols in the dynamic symbol table">;

	defm export_dynamic_symbol: Eq<"export-dynamic-symbol">,
	HelpText<"Put a symbol in the dynamic symbol table">;

	def fatal_warnings: F<"fatal-warnings">,
	HelpText<"Treat warnings as errors">;

	defm filter: Eq<"filter">,
	HelpText<"Set DT_FILTER field to the specified name">;

	defm fini: Eq<"fini">,
	HelpText<"Specify a finalizer function">, MetaVarName<"<symbol>">;

	def fix_cortex_a53_843419: F<"fix-cortex-a53-843419">,
	HelpText<"Apply fixes for AArch64 Cortex-A53 erratum 843419">;

	def full_shutdown : F<"full-shutdown">,
	HelpText<"Perform a full shutdown instead of calling _exit">;

	defm format: Eq<"format">,
	HelpText<"Change the input format of the inputs following this option">,
	MetaVarName<"<input-format>">;

	def gc_sections: F<"gc-sections">,
	HelpText<"Enable garbage collection of unused sections">;

	def gdb_index: F<"gdb-index">,
	HelpText<"Generate .gdb_index section">;

	defm hash_style: Eq<"hash-style">,
	HelpText<"Specify hash style (sysv, gnu or both)">;

	def help: F<"help">, HelpText<"Print option help">;

	def icf_all: F<"icf=all">, HelpText<"Enable identical code folding">;

	def icf_data: F<"icf-data">,
	HelpText<"Enable ICF to also fold identical read only data">;

	def icf_none: F<"icf=none">, HelpText<"Disable identical code folding">;

	defm image_base : Eq<"image-base">, HelpText<"Set the base address">;

	defm init: Eq<"init">, HelpText<"Specify an initializer function">,
	MetaVarName<"<symbol>">;

	defm library: Eq<"library">, HelpText<"Root name of library to use">,
	MetaVarName<"<libName>">;

	def lto_O: J<"lto-O">, MetaVarName<"<opt-level>">,
	HelpText<"Optimization level for LTO">;

	def m: JoinedOrSeparate<["-"], "m">, HelpText<"Set target emulation">;

	defm Map: Eq<"Map">, HelpText<"Print a link map to the specified file">;

	def merge_exidx_entries: F<"merge-exidx-entries">,
	HelpText<"Enable merging .ARM.exidx entries">;

	def nostdlib: F<"nostdlib">,
	HelpText<"Only search directories specified on the command line">;

	// Counterpart of as_needed ("Only set DT_NEEDED for shared libraries if used").
	def no_as_needed: F<"no-as-needed">,
	HelpText<"Always set DT_NEEDED for shared libraries">;

	def no_color_diagnostics: F<"no-color-diagnostics">,
	HelpText<"Do not use colors in diagnostics">;

	def no_define_common: F<"no-define-common">,
	HelpText<"Do not assign space to common symbols">;

	def no_demangle: F<"no-demangle">,
	HelpText<"Do not demangle symbol names">;

	def no_dynamic_linker: F<"no-dynamic-linker">,
	HelpText<"Inhibit output of .interp section">;

	def no_eh_frame_hdr: F<"no-eh-frame-hdr">,
	HelpText<"Do not create .eh_frame_hdr section">;

	def no_export_dynamic: F<"no-export-dynamic">;
	def no_fatal_warnings: F<"no-fatal-warnings">;

	def no_gc_sections: F<"no-gc-sections">,
	HelpText<"Disable garbage collection of unused sections">;

	def no_gdb_index: F<"no-gdb-index">,
	HelpText<"Do not generate .gdb_index section">;

	def no_gnu_unique: F<"no-gnu-unique">,
	HelpText<"Disable STB_GNU_UNIQUE symbol binding">;

	def no_merge_exidx_entries: F<"no-merge-exidx-entries">,
	HelpText<"Disable merging .ARM.exidx entries">;

	+def no_pie: F<"no-pie">, HelpText<"Do not create a position independent executable">;
	+
	def no_threads: F<"no-threads">,
	HelpText<"Do not run the linker multi-threaded">;

	def no_whole_archive: F<"no-whole-archive">,
	HelpText<"Restores the default behavior of loading archive members">;

	def noinhibit_exec: F<"noinhibit-exec">,
	HelpText<"Retain the executable output file whenever it is still usable">;
	-
	-def nopie: F<"nopie">, HelpText<"Do not create a position independent executable">;

	def no_omagic: Flag<["--"], "no-omagic">, MetaVarName<"<magic>">,
	HelpText<"Do not set the text data sections to be writable">;

	def no_print_gc_sections: F<"no-print-gc-sections">,
	HelpText<"Do not list removed unused sections">;

	def no_rosegment: F<"no-rosegment">,
	HelpText<"Do not put read-only non-executable sections in their own segment">;

	def no_undefined: F<"no-undefined">,
	HelpText<"Report unresolved symbols even if the linker is creating a shared library">;

	// Diagnoses version scripts that name symbols which are not defined.
	def no_undefined_version: F<"no-undefined-version">,
	HelpText<"Report version scripts that refer to undefined symbols">;

	def o: JoinedOrSeparate<["-"], "o">, MetaVarName<"<path>">,
	HelpText<"Path to file to write output">;

	def oformat: Separate<["--"], "oformat">, MetaVarName<"<format>">,
	HelpText<"Specify the binary format for the output object file">;

	def omagic: Flag<["--"], "omagic">, MetaVarName<"<magic>">,
	HelpText<"Set the text and data sections to be readable and writable">;

	defm orphan_handling: Eq<"orphan-handling">,
	HelpText<"Control how orphan sections are handled when linker script used">;

	def pack_dyn_relocs_eq: J<"pack-dyn-relocs=">, MetaVarName<"<format>">,
	HelpText<"Pack dynamic relocations in the given format (none or android)">;

	def pie: F<"pie">, HelpText<"Create a position independent executable">;

	def print_gc_sections: F<"print-gc-sections">,
	HelpText<"List removed unused sections">;

	def print_map: F<"print-map">,
	HelpText<"Print a link map to the standard output">;

	defm reproduce: Eq<"reproduce">,
	HelpText<"Dump linker invocation and input files for debugging">;

	defm rpath: Eq<"rpath">, HelpText<"Add a DT_RUNPATH to the output">;

	def relocatable: F<"relocatable">, HelpText<"Create relocatable object file">;

	defm retain_symbols_file: Eq<"retain-symbols-file">,
	HelpText<"Retain only the symbols listed in the file">,
	MetaVarName<"<file>">;

	defm script: Eq<"script">, HelpText<"Read linker script">;

	def section_start: S<"section-start">, MetaVarName<"<address>">,
	HelpText<"Set address of section">;

	def shared: F<"shared">, HelpText<"Build a shared object">;

	defm soname: Eq<"soname">, HelpText<"Set DT_SONAME">;

	defm sort_section: Eq<"sort-section">,
	HelpText<"Specifies sections sorting rule when linkerscript is used">;

	def start_lib: F<"start-lib">,
	HelpText<"Start a grouping of objects that should be treated as if they were together in an archive">;

	def strip_all: F<"strip-all">, HelpText<"Strip all symbols">;

	def strip_debug: F<"strip-debug">, HelpText<"Strip debugging information">;

	def symbol_ordering_file: S<"symbol-ordering-file">,
	HelpText<"Layout sections in the order specified by symbol file">;

	defm sysroot: Eq<"sysroot">, HelpText<"Set the system root">;

	def target1_rel: F<"target1-rel">, HelpText<"Interpret R_ARM_TARGET1 as R_ARM_REL32">;

	def target1_abs: F<"target1-abs">, HelpText<"Interpret R_ARM_TARGET1 as R_ARM_ABS32">;

	defm target2: Eq<"target2">,
	HelpText<"Interpret R_ARM_TARGET2 as <type>, where <type> is one of rel, abs, or got-rel">,
	MetaVarName<"<type>">;

	def threads: F<"threads">, HelpText<"Run the linker multi-threaded">;

	def trace: F<"trace">, HelpText<"Print the names of the input files">;

	defm trace_symbol : Eq<"trace-symbol">, HelpText<"Trace references to symbols">;

	defm undefined: Eq<"undefined">,
	HelpText<"Force undefined symbol during linking">;

	defm unresolved_symbols: Eq<"unresolved-symbols">,
	HelpText<"Determine how to handle unresolved symbols">;

	defm rsp_quoting: Eq<"rsp-quoting">,
	HelpText<"Quoting style for response files. Values supported: windows\|posix">;

	def v: Flag<["-"], "v">, HelpText<"Display the version number">;

	def verbose: F<"verbose">, HelpText<"Verbose mode">;

	def version: F<"version">, HelpText<"Display the version number and exit">;

	defm version_script: Eq<"version-script">, HelpText<"Read a version script">;

	def warn_common: F<"warn-common">,
	HelpText<"Warn about duplicate common symbols">;

	def warn_unresolved_symbols: F<"warn-unresolved-symbols">,
	HelpText<"Report unresolved symbols as warnings">;

	def whole_archive: F<"whole-archive">,
	HelpText<"Force load of all members in a static library">;

	defm wrap: Eq<"wrap">, HelpText<"Use wrapper functions for symbol">,
	MetaVarName<"<symbol>">;

	def z: JoinedOrSeparate<["-"], "z">, MetaVarName<"<option>">,
	HelpText<"Linker option extensions">;

	// Aliases
	def alias_auxiliary: Separate<["-"], "f">, Alias<auxiliary>;
	def alias_Bdynamic_call_shared: F<"call_shared">, Alias<Bdynamic>;
	def alias_Bdynamic_dy: F<"dy">, Alias<Bdynamic>;
	def alias_Bstatic_dn: F<"dn">, Alias<Bstatic>;
	def alias_Bstatic_non_shared: F<"non_shared">, Alias<Bstatic>;
	def alias_Bstatic_static: F<"static">, Alias<Bstatic>;
	def alias_define_common_d: Flag<["-"], "d">, Alias<define_common>;
	def alias_define_common_dc: F<"dc">, Alias<define_common>;
	def alias_define_common_dp: F<"dp">, Alias<define_common>;
	def alias_discard_all_x: Flag<["-"], "x">, Alias<discard_all>;
	def alias_discard_locals_X: Flag<["-"], "X">, Alias<discard_locals>;
	def alias_emit_relocs: Flag<["-"], "q">, Alias<emit_relocs>;
	def alias_entry_e: JoinedOrSeparate<["-"], "e">, Alias<entry>;
	def alias_export_dynamic_E: Flag<["-"], "E">, Alias<export_dynamic>;
	def alias_filter: Separate<["-"], "F">, Alias<filter>;
	def alias_format_b: S<"b">, Alias<format>;
	def alias_library: JoinedOrSeparate<["-"], "l">, Alias<library>;
	def alias_library_path: JoinedOrSeparate<["-"], "L">, Alias<library_path>;
	def alias_omagic: Flag<["-"], "N">, Alias<omagic>;
	def alias_o_output: Joined<["--"], "output=">, Alias<o>;
	def alias_o_output2 : Separate<["--"], "output">, Alias<o>;
	def alias_pie_pic_executable: F<"pic-executable">, Alias<pie>;
	def alias_print_map_M: Flag<["-"], "M">, Alias<print_map>;
	def alias_relocatable_r: Flag<["-"], "r">, Alias<relocatable>;
	def alias_rpath_R: JoinedOrSeparate<["-"], "R">, Alias<rpath>;
	def alias_script_T: JoinedOrSeparate<["-"], "T">, Alias<script>;
	def alias_shared_Bshareable: F<"Bshareable">, Alias<shared>;
	def alias_soname_h: JoinedOrSeparate<["-"], "h">, Alias<soname>;
	def alias_strip_all: Flag<["-"], "s">, Alias<strip_all>;
	def alias_strip_debug_S: Flag<["-"], "S">, Alias<strip_debug>;
	def alias_trace: Flag<["-"], "t">, Alias<trace>;
	def alias_trace_symbol_y : JoinedOrSeparate<["-"], "y">, Alias<trace_symbol>;
	def alias_Ttext_segment: S<"Ttext-segment">, Alias<Ttext>;
	def alias_Ttext_segment_eq: J<"Ttext-segment=">, Alias<Ttext>;
	def alias_undefined_u: JoinedOrSeparate<["-"], "u">, Alias<undefined>;
	def alias_version_V: Flag<["-"], "V">, Alias<version>;

	// Our symbol resolution algorithm handles symbols in archive files differently
	// than traditional linkers, so we don't need --start-group and --end-group.
	// These options are recognized for compatibility but ignored.
	def end_group: F<"end-group">;
	def end_group_paren: Flag<["-"], ")">;
	def start_group: F<"start-group">;
	def start_group_paren: Flag<["-"], "(">;

	// LTO-related options.
	def lto_aa_pipeline: J<"lto-aa-pipeline=">,
	HelpText<"AA pipeline to run during LTO. Used in conjunction with -lto-newpm-passes">;
	def lto_newpm_passes: J<"lto-newpm-passes=">,
	HelpText<"Passes to run during LTO">;
	def lto_partitions: J<"lto-partitions=">,
	HelpText<"Number of LTO codegen partitions">;
	def disable_verify: F<"disable-verify">;
	def mllvm: S<"mllvm">;
	def opt_remarks_filename: Separate<["--"], "opt-remarks-filename">,
	HelpText<"YAML output file for optimization remarks">;
	// Flag companion to opt_remarks_filename; accepted only with the "--" prefix.
	def opt_remarks_with_hotness: Flag<["--"], "opt-remarks-with-hotness">,
	HelpText<"Include hotness information in the optimization remarks file">;
	defm plugin_opt: Eq<"plugin-opt">,
	HelpText<"specifies LTO options for compatibility with GNU linkers">;
	def save_temps: F<"save-temps">;
	def thinlto_cache_dir: J<"thinlto-cache-dir=">,
	HelpText<"Path to ThinLTO cached object file directory">;
	def thinlto_cache_policy: S<"thinlto-cache-policy">,
	HelpText<"Pruning policy for the ThinLTO cache">;
	def thinlto_jobs: J<"thinlto-jobs=">, HelpText<"Number of ThinLTO jobs">;

	// Ignore LTO plugin-related options.
	// clang -flto passes -plugin and -plugin-opt to the linker. This is required
	// for ld.gold and ld.bfd to get LTO working. But it's not for lld which doesn't
	// rely on a plugin. Instead of detecting which linker is used on clang side we
	// just ignore the option on lld side as it's easier. In fact, the linker could
	// be called 'ld' and understanding which linker is used would require parsing of
	// --version output.
	def plugin: S<"plugin">;
	def plugin_eq: J<"plugin=">;

	// Options listed below are silently ignored for now for compatibility.
	def allow_shlib_undefined: F<"allow-shlib-undefined">;
	def cref: F<"cref">;
	def detect_odr_violations: F<"detect-odr-violations">;
	def g: Flag<["-"], "g">;
	def long_plt: F<"long-plt">;
	def no_add_needed: F<"no-add-needed">;
	def no_allow_shlib_undefined: F<"no-allow-shlib-undefined">;
	def no_copy_dt_needed_entries: F<"no-copy-dt-needed-entries">;
	def no_ctors_in_init_array: F<"no-ctors-in-init-array">;
	def no_keep_memory: F<"no-keep-memory">;
	def no_mmap_output_file: F<"no-mmap-output-file">;
	def no_warn_common: F<"no-warn-common">;
	def no_warn_mismatch: F<"no-warn-mismatch">;
	def rpath_link: S<"rpath-link">;
	def rpath_link_eq: J<"rpath-link=">;
	def sort_common: F<"sort-common">;
	def stats: F<"stats">;
	def warn_execstack: F<"warn-execstack">;
	def warn_once: F<"warn-once">;
	def warn_shared_textrel: F<"warn-shared-textrel">;
	def EB : F<"EB">;
	def EL : F<"EL">;
	def G: JoinedOrSeparate<["-"], "G">;
	def Qy : F<"Qy">;
	Index: head/contrib/llvm/tools/lld
	===================================================================
	--- head/contrib/llvm/tools/lld (revision 329409)
	+++ head/contrib/llvm/tools/lld (revision 329410)

	Property changes on: head/contrib/llvm/tools/lld
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/lld/dist-release_60:r328795-329405
	Index: head/contrib/llvm/tools/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp
	===================================================================
	--- head/contrib/llvm/tools/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp (revision 329409)
	+++ head/contrib/llvm/tools/lldb/source/Plugins/Platform/NetBSD/PlatformNetBSD.cpp (revision 329410)
	@@ -1,436 +1,374 @@
	//===-- PlatformNetBSD.cpp -------------------------------------- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "PlatformNetBSD.h"
	#include "lldb/Host/Config.h"

	// C Includes
	#include <stdio.h>
	#ifndef LLDB_DISABLE_POSIX
	#include <sys/utsname.h>
	#endif

	// C++ Includes
	// Other libraries and framework includes
	// Project includes
	#include "lldb/Core/Debugger.h"
	#include "lldb/Core/PluginManager.h"
	#include "lldb/Core/State.h"
	#include "lldb/Host/HostInfo.h"
	#include "lldb/Target/Process.h"
	#include "lldb/Target/Target.h"
	#include "lldb/Utility/FileSpec.h"
	#include "lldb/Utility/Log.h"
	#include "lldb/Utility/Status.h"
	#include "lldb/Utility/StreamString.h"

	// Define these constants from NetBSD mman.h for use when targeting
	// remote netbsd systems even when host has different values.
	#define MAP_PRIVATE 0x0002
	#define MAP_ANON 0x1000

	using namespace lldb;
	using namespace lldb_private;
	using namespace lldb_private::platform_netbsd;

	static uint32_t g_initialize_count = 0;

	//------------------------------------------------------------------

	// Plugin factory callback. Returns a non-host PlatformNetBSD when `force` is
	// true, or when `arch` is non-null, valid and its triple's OS is NetBSD;
	// otherwise returns an empty PlatformSP.
	PlatformSP PlatformNetBSD::CreateInstance(bool force, const ArchSpec *arch) {
	Log *log(GetLogIfAllCategoriesSet(LIBLLDB_LOG_PLATFORM));
	- if (log) {
	- const char *arch_name;
	- if (arch && arch->GetArchitectureName())
	- arch_name = arch->GetArchitectureName();
	- else
	- arch_name = "<null>";
	+ LLDB_LOG(log, "force = {0}, arch=({1}, {2})", force,
	+ arch ? arch->GetArchitectureName() : "<null>",
	+ arch ? arch->GetTriple().getTriple() : "<null>");

	- const char *triple_cstr =
	- arch ? arch->GetTriple().getTriple().c_str() : "<null>";
	-
	- log->Printf("PlatformNetBSD::%s(force=%s, arch={%s,%s})", __FUNCTION__,
	- force ? "true" : "false", arch_name, triple_cstr);
	- }
	-
	// Without `force`, only a valid architecture targeting NetBSD qualifies.
	bool create = force;
	if (create == false && arch && arch->IsValid()) {
	const llvm::Triple &triple = arch->GetTriple();
	switch (triple.getOS()) {
	case llvm::Triple::NetBSD:
	create = true;
	break;

	default:
	break;
	}
	}

	+ LLDB_LOG(log, "create = {0}", create);
	if (create) {
	- if (log)
	- log->Printf("PlatformNetBSD::%s() creating remote-netbsd platform",
	- __FUNCTION__);
	return PlatformSP(new PlatformNetBSD(false));
	}
	-
	- if (log)
	- log->Printf(
	- "PlatformNetBSD::%s() aborting creation of remote-netbsd platform",
	- __FUNCTION__);
	-
	return PlatformSP();
	}

	// Returns the plugin name: the host platform name when `is_host` is true and
	// "remote-netbsd" otherwise. Each name lives in a function-local static.
	ConstString PlatformNetBSD::GetPluginNameStatic(bool is_host) {
	  if (!is_host) {
	    static ConstString g_remote_name("remote-netbsd");
	    return g_remote_name;
	  }

	  static ConstString g_host_name(Platform::GetHostPlatformName());
	  return g_host_name;
	}

	// Returns the human-readable plugin description matching `is_host`.
	const char *PlatformNetBSD::GetPluginDescriptionStatic(bool is_host) {
	  return is_host ? "Local NetBSD user platform plug-in."
	                 : "Remote NetBSD user platform plug-in.";
	}

	// Returns the plugin name for this instance, chosen by whether it is the host
	// platform.
	ConstString PlatformNetBSD::GetPluginName() {
	  const bool is_host = IsHost();
	  return GetPluginNameStatic(is_host);
	}

	// Initializes the PlatformPOSIX base and, on the first call only, registers
	// the remote-netbsd plugin. When built on NetBSD the first call also installs
	// a host PlatformNetBSD as the host platform. Every call bumps
	// g_initialize_count.
	void PlatformNetBSD::Initialize() {
	  PlatformPOSIX::Initialize();

	  const bool first_call = (g_initialize_count == 0);
	  ++g_initialize_count;
	  if (!first_call)
	    return;

	#if defined(__NetBSD__)
	  PlatformSP default_platform_sp(new PlatformNetBSD(true));
	  default_platform_sp->SetSystemArchitecture(HostInfo::GetArchitecture());
	  Platform::SetHostPlatform(default_platform_sp);
	#endif
	  PluginManager::RegisterPlugin(
	      PlatformNetBSD::GetPluginNameStatic(false),
	      PlatformNetBSD::GetPluginDescriptionStatic(false),
	      PlatformNetBSD::CreateInstance, nullptr);
	}

	// Drops one reference taken by Initialize(); when the count reaches zero the
	// plugin is unregistered. The PlatformPOSIX base is always terminated.
	void PlatformNetBSD::Terminate() {
	  // The decrement only happens when the count is still positive.
	  if (g_initialize_count > 0 && --g_initialize_count == 0)
	    PluginManager::UnregisterPlugin(PlatformNetBSD::CreateInstance);

	  PlatformPOSIX::Terminate();
	}

	//------------------------------------------------------------------
	/// Constructor. Forwards \a is_host to the PlatformPOSIX base; this file
	/// passes true for the host platform and false for remote-netbsd.
	//------------------------------------------------------------------
	PlatformNetBSD::PlatformNetBSD(bool is_host)
	: PlatformPOSIX(is_host) // true selects the local host platform
	{}

	PlatformNetBSD::~PlatformNetBSD() = default;

	// Enumerates the architectures this platform supports. Stores entry `idx` in
	// `arch` and returns true when that entry exists and is usable, false
	// otherwise.
	bool PlatformNetBSD::GetSupportedArchitectureAtIndex(uint32_t idx,
	                                                     ArchSpec &arch) {
	  if (!IsHost()) {
	    // A remote platform object, when present, answers for itself.
	    if (m_remote_platform_sp)
	      return m_remote_platform_sp->GetSupportedArchitectureAtIndex(idx, arch);

	    // Otherwise offer a fixed list of NetBSD architectures.
	    const char *arch_name = nullptr;
	    if (idx == 0)
	      arch_name = "x86_64";
	    else if (idx == 1)
	      arch_name = "i386";
	    else
	      return false;

	    llvm::Triple triple;
	    triple.setOS(llvm::Triple::NetBSD);
	    triple.setArchName(arch_name);
	    // The vendor is deliberately left unset (rather than set to "unknown") so
	    // that it reads back as an empty string, which indicates that the vendor
	    // can be filled in when two architectures are merged.
	    arch.SetTriple(triple);
	    return true;
	  }

	  ArchSpec host_arch = HostInfo::GetArchitecture(HostInfo::eArchKindDefault);
	  if (!host_arch.GetTriple().isOSNetBSD())
	    return false;

	  if (idx == 0) {
	    arch = host_arch;
	    return arch.IsValid();
	  }

	  // Entry 1 is the 32-bit variant of a 64-bit default host architecture.
	  if (idx == 1 && host_arch.IsValid() && host_arch.GetTriple().isArch64Bit()) {
	    arch = HostInfo::GetArchitecture(HostInfo::eArchKind32);
	    return arch.IsValid();
	  }

	  return false;
	}

	// Writes the generic platform status to `strm` and, for the host platform on
	// POSIX builds, appends the kernel name, release and version from uname().
	void PlatformNetBSD::GetStatus(Stream &strm) {
	  Platform::GetStatus(strm);

	#ifndef LLDB_DISABLE_POSIX
	  // Local kernel details are only meaningful in host mode; otherwise they
	  // would describe a non-NetBSD machine (Mac OS, for example).
	  if (!IsHost())
	    return;

	  struct utsname host_info;
	  if (uname(&host_info) != 0)
	    return;

	  strm.Printf(" Kernel: %s\n", host_info.sysname);
	  strm.Printf(" Release: %s\n", host_info.release);
	  strm.Printf(" Version: %s\n", host_info.version);
	#endif
	}

	// Computes how many stops must be resumed past before the real inferior is
	// reached: one for eLaunchFlagDebug, one more when launching through a shell,
	// and another when that shell is csh, tcsh, zsh or sh.
	int32_t
	PlatformNetBSD::GetResumeCountForLaunchInfo(ProcessLaunchInfo &launch_info) {
	  // With eLaunchFlagDebug there is always the stop for the final exec into
	  // the true inferior.
	  int32_t resume_count = launch_info.GetFlags().Test(eLaunchFlagDebug) ? 1 : 0;

	  // No shell means nothing else to resume past.
	  const FileSpec &shell = launch_info.GetShell();
	  if (!shell)
	    return resume_count;

	  // The shell's own exec is one more stop.
	  ++resume_count;

	  // Reduce the shell path to its last component.
	  const std::string shell_path = shell.GetPath();
	  const char *shell_basename = shell_path.c_str();
	  if (const char *last_slash = strrchr(shell_basename, '/'))
	    shell_basename = last_slash + 1;

	  // These shells seem to re-exec themselves, which costs another resume.
	  static const char *const k_reexec_shells[] = {"csh", "tcsh", "zsh", "sh"};
	  for (const char *candidate : k_reexec_shells) {
	    if (strcmp(shell_basename, candidate) == 0) {
	      ++resume_count;
	      break;
	    }
	  }

	  return resume_count;
	}

	// The host platform can always debug; a remote platform can only while it is
	// connected.
	bool PlatformNetBSD::CanDebugProcess() {
	  if (IsHost())
	    return true;

	  return IsConnected();
	}

	// For local debugging, NetBSD will override the debug logic to use llgs-launch
	-// rather than
	-// lldb-launch, llgs-attach. This differs from current lldb-launch,
	-// debugserver-attach
	-// approach on MacOSX.
	-lldb::ProcessSP PlatformNetBSD::DebugProcess(
	- ProcessLaunchInfo &launch_info, Debugger &debugger,
	- Target *target, // Can be NULL, if NULL create a new
	- // target, else use existing one
	- Status &error) {
	+// rather than lldb-launch, llgs-attach. This differs from current lldb-launch,
	+// debugserver-attach approach on MacOSX.
	+lldb::ProcessSP
	+PlatformNetBSD::DebugProcess(ProcessLaunchInfo &launch_info, Debugger &debugger,
	+ Target *target, // Can be NULL, if NULL create a new
	+ // target, else use existing one
	+ Status &error) {
	Log *log(GetLogIfAllCategoriesSet(LIBLLDB_LOG_PLATFORM));
	- if (log)
	- log->Printf("PlatformNetBSD::%s entered (target %p)", __FUNCTION__,
	- static_cast<void *>(target));
	+ LLDB_LOG(log, "target {0}", target);

	// If we're a remote host, use standard behavior from parent class.
	if (!IsHost())
	return PlatformPOSIX::DebugProcess(launch_info, debugger, target, error);

	//
	// For local debugging, we'll insist on having ProcessGDBRemote create the
	// process.
	//

	ProcessSP process_sp;

	// Make sure we stop at the entry point
	launch_info.GetFlags().Set(eLaunchFlagDebug);

	// We always launch the process we are going to debug in a separate process
	// group, since then we can handle ^C interrupts ourselves w/o having to worry
	// about the target getting them as well.
	launch_info.SetLaunchInSeparateProcessGroup(true);

	// Ensure we have a target.
	if (target == nullptr) {
	- if (log)
	- log->Printf("PlatformNetBSD::%s creating new target", __FUNCTION__);
	-
	+ LLDB_LOG(log, "creating new target");
	TargetSP new_target_sp;
	error = debugger.GetTargetList().CreateTarget(debugger, "", "", false,
	nullptr, new_target_sp);
	if (error.Fail()) {
	- if (log)
	- log->Printf("PlatformNetBSD::%s failed to create new target: %s",
	- __FUNCTION__, error.AsCString());
	+ LLDB_LOG(log, "failed to create new target: {0}", error);
	return process_sp;
	}

	target = new_target_sp.get();
	if (!target) {
	error.SetErrorString("CreateTarget() returned nullptr");
	- if (log)
	- log->Printf("PlatformNetBSD::%s failed: %s", __FUNCTION__,
	- error.AsCString());
	+ LLDB_LOG(log, "error: {0}", error);
	return process_sp;
	}
	- } else {
	- if (log)
	- log->Printf("PlatformNetBSD::%s using provided target", __FUNCTION__);
	}

	// Mark target as currently selected target.
	debugger.GetTargetList().SetSelectedTarget(target);

	// Now create the gdb-remote process.
	- if (log)
	- log->Printf(
	- "PlatformNetBSD::%s having target create process with gdb-remote plugin",
	- __FUNCTION__);
	+ LLDB_LOG(log, "having target create process with gdb-remote plugin");
	process_sp = target->CreateProcess(
	launch_info.GetListenerForProcess(debugger), "gdb-remote", nullptr);

	if (!process_sp) {
	error.SetErrorString("CreateProcess() failed for gdb-remote process");
	- if (log)
	- log->Printf("PlatformNetBSD::%s failed: %s", __FUNCTION__,
	- error.AsCString());
	+ LLDB_LOG(log, "error: {0}", error);
	return process_sp;
	- } else {
	- if (log)
	- log->Printf("PlatformNetBSD::%s successfully created process",
	- __FUNCTION__);
	}

	+ LLDB_LOG(log, "successfully created process");
	// Adjust launch for a hijacker.
	ListenerSP listener_sp;
	if (!launch_info.GetHijackListener()) {
	- if (log)
	- log->Printf("PlatformNetBSD::%s setting up hijacker", __FUNCTION__);
	-
	+ LLDB_LOG(log, "setting up hijacker");
	listener_sp =
	Listener::MakeListener("lldb.PlatformNetBSD.DebugProcess.hijack");
	launch_info.SetHijackListener(listener_sp);
	process_sp->HijackProcessEvents(listener_sp);
	}

	// Log file actions.
	if (log) {
	- log->Printf(
	- "PlatformNetBSD::%s launching process with the following file actions:",
	- __FUNCTION__);
	-
	+ LLDB_LOG(log, "launching process with the following file actions:");
	StreamString stream;
	size_t i = 0;
	const FileAction *file_action;
	while ((file_action = launch_info.GetFileActionAtIndex(i++)) != nullptr) {
	file_action->Dump(stream);
	- log->PutCString(stream.GetData());
	+ LLDB_LOG(log, "{0}", stream.GetData());
	stream.Clear();
	}
	}

	// Do the launch.
	error = process_sp->Launch(launch_info);
	if (error.Success()) {
	// Handle the hijacking of process events.
	if (listener_sp) {
	const StateType state = process_sp->WaitForProcessToStop(
	llvm::None, NULL, false, listener_sp);

	- if (state == eStateStopped) {
	- if (log)
	- log->Printf("PlatformNetBSD::%s pid %" PRIu64 " state %s\n",
	- __FUNCTION__, process_sp->GetID(), StateAsCString(state));
	- } else {
	- if (log)
	- log->Printf("PlatformNetBSD::%s pid %" PRIu64
	- " state is not stopped - %s\n",
	- __FUNCTION__, process_sp->GetID(), StateAsCString(state));
	- }
	+ // {0} is the pid and {1} the state; each argument needs its own index.
	+ LLDB_LOG(log, "pid {0} state {1}", process_sp->GetID(), state);
	}

	// Hook up process PTY if we have one (which we should for local debugging
	// with llgs).
	int pty_fd = launch_info.GetPTY().ReleaseMasterFileDescriptor();
	if (pty_fd != PseudoTerminal::invalid_fd) {
	process_sp->SetSTDIOFileDescriptor(pty_fd);
	- if (log)
	- log->Printf("PlatformNetBSD::%s pid %" PRIu64
	- " hooked up STDIO pty to process",
	- __FUNCTION__, process_sp->GetID());
	- } else {
	- if (log)
	- log->Printf("PlatformNetBSD::%s pid %" PRIu64
	- " not using process STDIO pty",
	- __FUNCTION__, process_sp->GetID());
	- }
	+ LLDB_LOG(log, "hooked up STDIO pty to process");
	+ } else
	+ LLDB_LOG(log, "not using process STDIO pty");
	} else {
	- if (log)
	- log->Printf("PlatformNetBSD::%s process launch failed: %s", __FUNCTION__,
	- error.AsCString());
	+ LLDB_LOG(log, "process launch failed: {0}", error);
	// FIXME figure out appropriate cleanup here. Do we delete the target? Do
	// we delete the process? Does our caller do that?
	}

	return process_sp;
	}

	void PlatformNetBSD::CalculateTrapHandlerSymbolNames() {
	m_trap_handlers.push_back(ConstString("_sigtramp"));
	}

	// Builds the mmap argument list {addr, length, prot, flags, fd, offset}. The
	// eMmapFlagsPrivate and eMmapFlagsAnon bits of `flags` are translated to the
	// MAP_PRIVATE and MAP_ANON values #defined earlier in this file; the other
	// arguments are passed through unchanged. `arch` is not used here.
	MmapArgList PlatformNetBSD::GetMmapArgumentList(const ArchSpec &arch,
	addr_t addr, addr_t length,
	unsigned prot, unsigned flags,
	addr_t fd, addr_t offset) {
	// Start with no platform flags and add one per generic flag that is set.
	uint64_t flags_platform = 0;

	if (flags & eMmapFlagsPrivate)
	flags_platform \|= MAP_PRIVATE;
	if (flags & eMmapFlagsAnon)
	flags_platform \|= MAP_ANON;

	MmapArgList args({addr, length, prot, flags_platform, fd, offset});
	return args;
	}
	Index: head/contrib/llvm/tools/lldb/source/Plugins/Process/NetBSD/NativeProcessNetBSD.cpp
	===================================================================
	--- head/contrib/llvm/tools/lldb/source/Plugins/Process/NetBSD/NativeProcessNetBSD.cpp (revision 329409)
	+++ head/contrib/llvm/tools/lldb/source/Plugins/Process/NetBSD/NativeProcessNetBSD.cpp (revision 329410)
	@@ -1,914 +1,914 @@
	//===-- NativeProcessNetBSD.cpp ------------------------------- -- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "NativeProcessNetBSD.h"

	// C Includes

	// C++ Includes

	// Other libraries and framework includes
	#include "Plugins/Process/POSIX/ProcessPOSIXLog.h"
	#include "lldb/Core/State.h"
	#include "lldb/Host/HostProcess.h"
	#include "lldb/Host/common/NativeBreakpoint.h"
	#include "lldb/Host/common/NativeRegisterContext.h"
	#include "lldb/Host/posix/ProcessLauncherPosixFork.h"
	#include "lldb/Target/Process.h"
	#include "llvm/Support/Errno.h"

	// System includes - They have to be included after framework includes because
	// they define some
	// macros which collide with variable names in other modules
	// clang-format off
	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <sys/sysctl.h>
	#include <sys/wait.h>
	#include <uvm/uvm_prot.h>
	#include <elf.h>
	#include <util.h>
	// clang-format on

	using namespace lldb;
	using namespace lldb_private;
	using namespace lldb_private::process_netbsd;
	using namespace llvm;

	// Simple helper function to ensure flags are enabled on the given file
	// descriptor: reads the current status flags with F_GETFL and writes them back
	// combined with `flags` using F_SETFL. Returns a Status set from errno if
	// either fcntl call fails, and a default-constructed Status otherwise.
	static Status EnsureFDFlags(int fd, int flags) {
	Status error;

	// Fetch the flags currently set on the descriptor.
	int status = fcntl(fd, F_GETFL);
	if (status == -1) {
	error.SetErrorToErrno();
	return error;
	}

	// Keep the existing flags and add the requested ones.
	if (fcntl(fd, F_SETFL, status \| flags) == -1) {
	error.SetErrorToErrno();
	return error;
	}

	return error;
	}

	// -----------------------------------------------------------------------------
	// Public Static Methods
	// -----------------------------------------------------------------------------

	// Launches the inferior described by `launch_info` through
	// ProcessLauncherPosixFork, waits for it with waitpid, and wraps it in a
	// NativeProcessNetBSD whose threads are marked as stopped by SIGSTOP.
	// Returns an llvm::Error when the launch fails, the child is not in a stopped
	// state after waitpid, the architecture cannot be resolved, or
	// ReinitializeThreads() fails.
	llvm::Expected<std::unique_ptr<NativeProcessProtocol>>
	NativeProcessNetBSD::Factory::Launch(ProcessLaunchInfo &launch_info,
	NativeDelegate &native_delegate,
	MainLoop &mainloop) const {
	Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS));

	Status status;
	::pid_t pid = ProcessLauncherPosixFork()
	.LaunchProcess(launch_info, status)
	.GetProcessId();
	LLDB_LOG(log, "pid = {0:x}", pid);
	if (status.Fail()) {
	LLDB_LOG(log, "failed to launch process: {0}", status);
	return status.ToError();
	}

	// Wait for the child process to trap on its call to execve.
	int wstatus;
	::pid_t wpid = llvm::sys::RetryAfterSignal(-1, ::waitpid, pid, &wstatus, 0);
	assert(wpid == pid);
	(void)wpid;
	if (!WIFSTOPPED(wstatus)) {
	// The decoded wait status is the only argument, so it is index {0}.
	LLDB_LOG(log, "Could not sync with inferior process: wstatus={0}",
	WaitStatus::Decode(wstatus));
	return llvm::make_error<StringError>("Could not sync with inferior process",
	llvm::inconvertibleErrorCode());
	}
	LLDB_LOG(log, "inferior started, now in stopped state");

	ArchSpec arch;
	if ((status = ResolveProcessArchitecture(pid, arch)).Fail())
	return status.ToError();

	// Set the architecture to the exe architecture.
	LLDB_LOG(log, "pid = {0:x}, detected architecture {1}", pid,
	arch.GetArchitectureName());

	std::unique_ptr<NativeProcessNetBSD> process_up(new NativeProcessNetBSD(
	pid, launch_info.GetPTY().ReleaseMasterFileDescriptor(), native_delegate,
	arch, mainloop));

	status = process_up->ReinitializeThreads();
	if (status.Fail())
	return status.ToError();

	for (const auto &thread : process_up->m_threads)
	static_cast<NativeThreadNetBSD &>(*thread).SetStoppedBySignal(SIGSTOP);
	- process_up->SetState(StateType::eStateStopped);
	+ process_up->SetState(StateType::eStateStopped, false);

	return std::move(process_up);
	}

	// Attaches to the already running process `pid` and returns the process
	// object that tracks it. Failure to resolve the architecture or to attach is
	// returned as an llvm::Error.
	llvm::Expected<std::unique_ptr<NativeProcessProtocol>>
	NativeProcessNetBSD::Factory::Attach(
	    lldb::pid_t pid, NativeProcessProtocol::NativeDelegate &native_delegate,
	    MainLoop &mainloop) const {
	  Log *log = ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS);
	  LLDB_LOG(log, "pid = {0:x}", pid);

	  // The architecture is needed to construct the process object.
	  ArchSpec arch;
	  Status status = ResolveProcessArchitecture(pid, arch);
	  if (status.Fail())
	    return status.ToError();

	  // No terminal is associated with an attached process, hence the -1.
	  std::unique_ptr<NativeProcessNetBSD> process_up(
	      new NativeProcessNetBSD(pid, -1, native_delegate, arch, mainloop));

	  status = process_up->Attach();
	  if (status.Fail())
	    return status.ToError();

	  return std::move(process_up);
	}

	// -----------------------------------------------------------------------------
	// Public Instance Methods
	// -----------------------------------------------------------------------------

	// Builds the process object for `pid`.
	//
	// When a terminal descriptor is supplied it is switched to non-blocking mode.
	// A SIGCHLD callback forwarding to SigchldHandler() is then registered on
	// `mainloop`. Both steps are verified only through assert().
	NativeProcessNetBSD::NativeProcessNetBSD(::pid_t pid, int terminal_fd,
	                                         NativeDelegate &delegate,
	                                         const ArchSpec &arch,
	                                         MainLoop &mainloop)
	    : NativeProcessProtocol(pid, terminal_fd, delegate), m_arch(arch) {
	  if (m_terminal_fd != -1) {
	    Status fd_status = EnsureFDFlags(m_terminal_fd, O_NONBLOCK);
	    assert(fd_status.Success());
	  }

	  Status signal_status;
	  m_sigchld_handle = mainloop.RegisterSignal(
	      SIGCHLD, [this](MainLoopBase &) { SigchldHandler(); }, signal_status);
	  assert(m_sigchld_handle && signal_status.Success());
	}

	// Dispatches a stop reported by waitpid to the handler for its signal:
	// SIGTRAP and SIGSTOP have dedicated handlers, everything else goes to
	// MonitorSignal().
	void NativeProcessNetBSD::MonitorCallback(lldb::pid_t pid, int signal) {
	  if (signal == SIGTRAP) {
	    MonitorSIGTRAP(pid);
	    return;
	  }

	  if (signal == SIGSTOP) {
	    MonitorSIGSTOP(pid);
	    return;
	  }

	  MonitorSignal(pid, signal);
	}

	// Handles the exit of the inferior: drops all tracked threads, records the
	// exit status and moves the process to eStateExited, notifying the delegate.
	void NativeProcessNetBSD::MonitorExited(lldb::pid_t pid, WaitStatus status) {
	  Log *log = ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS);

	  LLDB_LOG(log, "got exit signal({0}) , pid = {1}", status, pid);

	  // No thread of the process is tracked any longer.
	  m_threads.clear();

	  SetExitStatus(status, true);

	  // The `true` argument asks for the delegate to be notified.
	  SetState(StateType::eStateExited, true);
	}

	// Handles a SIGSTOP stop of the inferior.
	//
	// Only a SIGSTOP sent by this very process (si_code == SI_USER and si_pid ==
	// getpid()) is acted upon: every tracked thread is then marked as stopped by
	// SIGSTOP. In every other case, including a failed PT_GET_SIGINFO, nothing
	// is done.
	void NativeProcessNetBSD::MonitorSIGSTOP(lldb::pid_t pid) {
	  ptrace_siginfo_t info;

	  const auto siginfo_err =
	      PtraceWrapper(PT_GET_SIGINFO, pid, &info, sizeof(info));

	  // Without signal details there is nothing to examine.
	  if (!siginfo_err.Success())
	    return;

	  // Ignore stops that were not requested by LLGS (LLDB GDB Server) itself.
	  const bool sent_by_us = info.psi_siginfo.si_code == SI_USER &&
	                          info.psi_siginfo.si_pid == ::getpid();
	  if (!sent_by_us)
	    return;

	  for (const auto &thread : m_threads) {
	    auto &netbsd_thread = static_cast<NativeThreadNetBSD &>(*thread);
	    netbsd_thread.SetStoppedBySignal(SIGSTOP, &info.psi_siginfo);
	  }
	}

	// Handles a SIGTRAP stop of the inferior, keyed on the si_code reported by
	// PT_GET_SIGINFO:
	//   TRAP_BRKPT - software breakpoint; threads are marked and the PC fixed up.
	//   TRAP_TRACE - single-step completed.
	//   TRAP_EXEC  - the inferior exec'd; threads are re-read and the delegate is
	//                told through NotifyDidExec().
	//   TRAP_DBREG - debug register trap; reported as a watchpoint or hardware
	//                breakpoint hit when the register context identifies one.
	// Other si_code values, and a failing PT_GET_SIGINFO, are ignored.
	void NativeProcessNetBSD::MonitorSIGTRAP(lldb::pid_t pid) {
	  Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS));
	  ptrace_siginfo_t info;

	  const auto siginfo_err =
	      PtraceWrapper(PT_GET_SIGINFO, pid, &info, sizeof(info));

	  // Get details on the signal raised.
	  if (siginfo_err.Fail()) {
	    return;
	  }

	  switch (info.psi_siginfo.si_code) {
	  case TRAP_BRKPT:
	    for (const auto &thread : m_threads) {
	      static_cast<NativeThreadNetBSD &>(*thread).SetStoppedByBreakpoint();
	      FixupBreakpointPCAsNeeded(static_cast<NativeThreadNetBSD &>(*thread));
	    }
	    SetState(StateType::eStateStopped, true);
	    break;
	  case TRAP_TRACE:
	    for (const auto &thread : m_threads)
	      static_cast<NativeThreadNetBSD &>(*thread).SetStoppedByTrace();
	    SetState(StateType::eStateStopped, true);
	    break;
	  case TRAP_EXEC: {
	    Status error = ReinitializeThreads();
	    if (error.Fail()) {
	      SetState(StateType::eStateInvalid);
	      return;
	    }

	    // Let our delegate know we have just exec'd.
	    NotifyDidExec();

	    for (const auto &thread : m_threads)
	      static_cast<NativeThreadNetBSD &>(*thread).SetStoppedByExec();
	    SetState(StateType::eStateStopped, true);
	  } break;
	  case TRAP_DBREG: {
	    // m_threads is ordered by insertion, not keyed by LWP id, so the thread
	    // that trapped has to be looked up by its id rather than used as an index.
	    NativeThreadNetBSD *trapped_thread = nullptr;
	    for (const auto &thread : m_threads) {
	      if (thread->GetID() == static_cast<lldb::tid_t>(info.psi_lwpid)) {
	        trapped_thread = &static_cast<NativeThreadNetBSD &>(*thread);
	        break;
	      }
	    }
	    if (trapped_thread == nullptr) {
	      LLDB_LOG(log,
	               "debug register trap for untracked thread, pid = {0}, "
	               "LWP = {1}",
	               GetID(), info.psi_lwpid);
	      break;
	    }

	    // If a watchpoint was hit, report it
	    uint32_t wp_index = LLDB_INVALID_INDEX32;
	    Status error = trapped_thread->GetRegisterContext().GetWatchpointHitIndex(
	        wp_index, (uintptr_t)info.psi_siginfo.si_addr);
	    if (error.Fail())
	      LLDB_LOG(log,
	               "received error while checking for watchpoint hits, pid = "
	               "{0}, LWP = {1}, error = {2}",
	               GetID(), info.psi_lwpid, error);
	    if (wp_index != LLDB_INVALID_INDEX32) {
	      for (const auto &thread : m_threads)
	        static_cast<NativeThreadNetBSD &>(*thread).SetStoppedByWatchpoint(
	            wp_index);
	      SetState(StateType::eStateStopped, true);
	      break;
	    }

	    // If a breakpoint was hit, report it
	    uint32_t bp_index = LLDB_INVALID_INDEX32;
	    error = trapped_thread->GetRegisterContext().GetHardwareBreakHitIndex(
	        bp_index, (uintptr_t)info.psi_siginfo.si_addr);
	    if (error.Fail())
	      LLDB_LOG(log,
	               "received error while checking for hardware "
	               "breakpoint hits, pid = {0}, LWP = {1}, error = {2}",
	               GetID(), info.psi_lwpid, error);
	    if (bp_index != LLDB_INVALID_INDEX32) {
	      for (const auto &thread : m_threads)
	        static_cast<NativeThreadNetBSD &>(*thread).SetStoppedByBreakpoint();
	      SetState(StateType::eStateStopped, true);
	      break;
	    }
	  } break;
	  }
	}

	// Handles a stop caused by any signal other than SIGTRAP/SIGSTOP.
	//
	// Every tracked thread is marked as stopped by the signal and the process
	// moves to eStateStopped. When PT_GET_SIGINFO succeeds the signal number and
	// siginfo reported by ptrace are used; when it fails `info` holds no valid
	// data, so the `signal` value obtained from waitpid is used instead and no
	// siginfo is attached.
	void NativeProcessNetBSD::MonitorSignal(lldb::pid_t pid, int signal) {
	  ptrace_siginfo_t info;
	  const auto siginfo_err =
	      PtraceWrapper(PT_GET_SIGINFO, pid, &info, sizeof(info));
	  const bool have_siginfo = siginfo_err.Success();

	  for (const auto &thread : m_threads) {
	    auto &netbsd_thread = static_cast<NativeThreadNetBSD &>(*thread);
	    if (have_siginfo)
	      netbsd_thread.SetStoppedBySignal(info.psi_siginfo.si_signo,
	                                       &info.psi_siginfo);
	    else
	      netbsd_thread.SetStoppedBySignal(signal);
	  }
	  SetState(StateType::eStateStopped, true);
	}

	// Thin wrapper around ptrace(2) that converts failure into a Status and logs
	// the call.
	//
	// errno is cleared before the call; a return value of -1 is reported as the
	// resulting errno. The raw return value is stored in `*result` when `result`
	// is not null.
	Status NativeProcessNetBSD::PtraceWrapper(int req, lldb::pid_t pid, void *addr,
	                                          int data, int *result) {
	  Log *log = ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PTRACE);
	  Status error;

	  errno = 0;
	  const int ret = ptrace(req, static_cast<::pid_t>(pid), addr, data);

	  if (ret == -1)
	    error.SetErrorToErrno();

	  if (result != nullptr)
	    *result = ret;

	  LLDB_LOG(log, "ptrace({0}, {1}, {2}, {3})={4:x}", req, pid, addr, data, ret);

	  if (error.Fail())
	    LLDB_LOG(log, "ptrace() failed: {0}", error);

	  return error;
	}

	// Reports in `actual_opcode_size` the size of the software breakpoint opcode
	// for the process architecture, which FixupBreakpointPCAsNeeded() uses as the
	// distance to rewind the PC.
	//
	// x86 and x86_64 share the one-byte 0xCC opcode, matching the architectures
	// accepted by GetSoftwareBreakpointTrapOpcode(). Any other architecture
	// asserts in debug builds and returns an error Status.
	Status NativeProcessNetBSD::GetSoftwareBreakpointPCOffset(
	    uint32_t &actual_opcode_size) {
	  // FIXME put this behind a breakpoint protocol class that can be
	  // set per architecture. Need ARM, MIPS support here.
	  static const uint8_t g_i386_opcode[] = {0xCC};
	  switch (m_arch.GetMachine()) {
	  case llvm::Triple::x86:
	  case llvm::Triple::x86_64:
	    actual_opcode_size = static_cast<uint32_t>(sizeof(g_i386_opcode));
	    return Status();
	  default:
	    assert(false && "CPU type not supported!");
	    return Status("CPU type not supported");
	  }
	}

	// After a breakpoint stop, rewinds the PC of `thread` to the address of the
	// software breakpoint that was hit, if there is one.
	//
	// The candidate address is the PC reported by the register context minus the
	// software breakpoint size. Nothing is changed (and success is returned)
	// when no breakpoint is registered at that address, when the breakpoint is
	// not a software breakpoint, or when the breakpoint size is zero. Errors
	// from querying the breakpoint size or from writing the PC are returned.
	Status
	NativeProcessNetBSD::FixupBreakpointPCAsNeeded(NativeThreadNetBSD &thread) {
	  Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_BREAKPOINTS));
	  Status error;
	  // Find out the size of a breakpoint (might depend on where we are in the
	  // code).
	  NativeRegisterContext &context = thread.GetRegisterContext();
	  uint32_t breakpoint_size = 0;
	  error = GetSoftwareBreakpointPCOffset(breakpoint_size);
	  if (error.Fail()) {
	    LLDB_LOG(log, "GetBreakpointSize() failed: {0}", error);
	    return error;
	  }
	  LLDB_LOG(log, "breakpoint size: {0}", breakpoint_size);

	  // First try probing for a breakpoint at a software breakpoint location: PC
	  // - breakpoint size.
	  const lldb::addr_t initial_pc_addr = context.GetPCfromBreakpointLocation();
	  lldb::addr_t breakpoint_addr = initial_pc_addr;
	  // Do not allow breakpoint probe to wrap around.
	  if (breakpoint_size > 0 && breakpoint_addr >= breakpoint_size)
	    breakpoint_addr -= breakpoint_size;

	  // Check if we stopped because of a breakpoint.
	  NativeBreakpointSP breakpoint_sp;
	  error = m_breakpoint_list.GetBreakpoint(breakpoint_addr, breakpoint_sp);
	  if (!error.Success() || !breakpoint_sp) {
	    // We didn't find one at a software probe location. Nothing to do.
	    LLDB_LOG(log,
	             "pid {0} no lldb breakpoint found at current pc with "
	             "adjustment: {1}",
	             GetID(), breakpoint_addr);
	    return Status();
	  }

	  // If the breakpoint is not a software breakpoint, nothing to do.
	  if (!breakpoint_sp->IsSoftwareBreakpoint()) {
	    LLDB_LOG(
	        log,
	        "pid {0} breakpoint found at {1:x}, not software, nothing to adjust",
	        GetID(), breakpoint_addr);
	    return Status();
	  }

	  // We have a software breakpoint and need to adjust the PC.
	  // Sanity check: a zero-sized breakpoint leaves nothing to rewind.
	  if (breakpoint_size == 0) {
	    // Nothing to do! How did we get here?
	    LLDB_LOG(log,
	             "pid {0} breakpoint found at {1:x}, it is software, but the "
	             "size is zero, nothing to do (unexpected)",
	             GetID(), breakpoint_addr);
	    return Status();
	  }

	  // Change the program counter.
	  LLDB_LOG(log, "pid {0} tid {1}: changing PC from {2:x} to {3:x}", GetID(),
	           thread.GetID(), initial_pc_addr, breakpoint_addr);
	  error = context.SetPC(breakpoint_addr);
	  if (error.Fail())
	    LLDB_LOG(log, "pid {0} tid {1}: failed to set PC: {2}", GetID(),
	             thread.GetID(), error);
	  return error;
	}

	// Resumes the inferior according to the action found for the first tracked
	// thread.
	//
	// eStateRunning issues PT_CONTINUE and eStateStepping issues PT_STEP, both
	// passing the action's signal as the ptrace data argument; on success every
	// thread and the process state are updated to match. A missing action is
	// treated as a no-op. eStateSuspended/eStateStopped are considered
	// unreachable, any other state yields an error Status. An error is also
	// returned when no thread is tracked, since there is then no thread to take
	// the action from.
	Status NativeProcessNetBSD::Resume(const ResumeActionList &resume_actions) {
	  Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS));
	  LLDB_LOG(log, "pid {0}", GetID());

	  // Guard the m_threads[0] access below.
	  if (m_threads.empty())
	    return Status("no threads to resume");

	  const auto &thread = m_threads[0];
	  const ResumeAction *const action =
	      resume_actions.GetActionForThread(thread->GetID(), true);

	  if (action == nullptr) {
	    LLDB_LOG(log, "no action specified for pid {0} tid {1}", GetID(),
	             thread->GetID());
	    return Status();
	  }

	  Status error;

	  switch (action->state) {
	  case eStateRunning: {
	    // Run the thread, possibly feeding it the signal.
	    error = NativeProcessNetBSD::PtraceWrapper(PT_CONTINUE, GetID(), (void *)1,
	                                               action->signal);
	    if (!error.Success())
	      return error;
	    for (const auto &running_thread : m_threads)
	      static_cast<NativeThreadNetBSD &>(*running_thread).SetRunning();
	    SetState(eStateRunning, true);
	    break;
	  }
	  case eStateStepping:
	    // Run the thread, possibly feeding it the signal.
	    error = NativeProcessNetBSD::PtraceWrapper(PT_STEP, GetID(), (void *)1,
	                                               action->signal);
	    if (!error.Success())
	      return error;
	    for (const auto &stepping_thread : m_threads)
	      static_cast<NativeThreadNetBSD &>(*stepping_thread).SetStepping();
	    SetState(eStateStepping, true);
	    break;

	  case eStateSuspended:
	  case eStateStopped:
	    llvm_unreachable("Unexpected state");

	  default:
	    return Status("NativeProcessNetBSD::%s (): unexpected state %s specified "
	                  "for pid %" PRIu64 ", tid %" PRIu64,
	                  __FUNCTION__, StateAsCString(action->state), GetID(),
	                  thread->GetID());
	  }

	  return Status();
	}

	// Asks the inferior to stop by sending it SIGSTOP. A failing kill() is
	// reported through errno in the returned Status.
	Status NativeProcessNetBSD::Halt() {
	  Status result;

	  const int rc = kill(GetID(), SIGSTOP);
	  if (rc != 0)
	    result.SetErrorToErrno();

	  return result;
	}

	// Stops monitoring the inferior and detaches from it with PT_DETACH. When
	// the process id is LLDB_INVALID_PROCESS_ID only the SIGCHLD handle is
	// released and success is returned.
	Status NativeProcessNetBSD::Detach() {
	  // Dropping the handle ends SIGCHLD monitoring for this process.
	  m_sigchld_handle.reset();

	  // Without a valid process id there is nothing to detach from.
	  if (GetID() == LLDB_INVALID_PROCESS_ID) {
	    Status error;
	    return error;
	  }

	  return PtraceWrapper(PT_DETACH, GetID());
	}

	// Delivers `signo` to the inferior with kill(). A non-zero return from
	// kill() is reported through errno in the returned Status.
	Status NativeProcessNetBSD::Signal(int signo) {
	  Status result;

	  const int rc = kill(GetID(), signo);
	  if (rc != 0)
	    result.SetErrorToErrno();

	  return result;
	}

	// Terminates the inferior with SIGKILL unless the current state shows there
	// is no live process left, in which case the request is logged and ignored.
	// A failing kill() is reported through errno in the returned Status.
	Status NativeProcessNetBSD::Kill() {
	  Log *log = ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS);
	  LLDB_LOG(log, "pid {0}", GetID());

	  Status error;

	  switch (m_state) {
	  // States in which a kill can be attempted.
	  case StateType::eStateConnected:
	  case StateType::eStateAttaching:
	  case StateType::eStateLaunching:
	  case StateType::eStateStopped:
	  case StateType::eStateRunning:
	  case StateType::eStateStepping:
	  case StateType::eStateSuspended:
	    break;

	  // Nothing to do - the process is already dead.
	  case StateType::eStateInvalid:
	  case StateType::eStateExited:
	  case StateType::eStateCrashed:
	  case StateType::eStateDetached:
	  case StateType::eStateUnloaded:
	    LLDB_LOG(log, "ignored for PID {0} due to current state: {1}", GetID(),
	             StateAsCString(m_state));
	    return error;
	  }

	  if (kill(GetID(), SIGKILL) != 0)
	    error.SetErrorToErrno();

	  return error;
	}

	// Describes in `range_info` the memory region that `load_addr` falls into.
	//
	// The cached region list is scanned in order:
	//  - if `load_addr` lies before a region, `range_info` describes the unmapped
	//    gap from `load_addr` up to that region's base;
	//  - if a region contains `load_addr`, that region's info is copied;
	//  - if no region matches, `range_info` describes an unmapped range starting
	//    at `load_addr` and ending at LLDB_INVALID_ADDRESS.
	// Returns an error when memory regions are known to be unsupported or when
	// the cache cannot be populated.
	Status NativeProcessNetBSD::GetMemoryRegionInfo(lldb::addr_t load_addr,
	MemoryRegionInfo &range_info) {

	if (m_supports_mem_region == LazyBool::eLazyBoolNo) {
	// We're done.
	return Status("unsupported");
	}

	Status error = PopulateMemoryRegionCache();
	if (error.Fail()) {
	return error;
	}

	// Only used by the ordering assertion below.
	lldb::addr_t prev_base_address = 0;
	// FIXME start by finding the last region that is <= target address using
	// binary search. Data is sorted.
	// There can be a ton of regions on pthreads apps with lots of threads.
	for (auto it = m_mem_region_cache.begin(); it != m_mem_region_cache.end();
	++it) {
	MemoryRegionInfo &proc_entry_info = it->first;
	// Sanity check assumption that memory map entries are ascending.
	assert((proc_entry_info.GetRange().GetRangeBase() >= prev_base_address) &&
	"descending memory map entries detected, unexpected");
	prev_base_address = proc_entry_info.GetRange().GetRangeBase();
	UNUSED_IF_ASSERT_DISABLED(prev_base_address);
	// If the target address comes before this entry, indicate distance to
	// next region.
	if (load_addr < proc_entry_info.GetRange().GetRangeBase()) {
	range_info.GetRange().SetRangeBase(load_addr);
	range_info.GetRange().SetByteSize(
	proc_entry_info.GetRange().GetRangeBase() - load_addr);
	range_info.SetReadable(MemoryRegionInfo::OptionalBool::eNo);
	range_info.SetWritable(MemoryRegionInfo::OptionalBool::eNo);
	range_info.SetExecutable(MemoryRegionInfo::OptionalBool::eNo);
	range_info.SetMapped(MemoryRegionInfo::OptionalBool::eNo);
	return error;
	} else if (proc_entry_info.GetRange().Contains(load_addr)) {
	// The target address is within the memory region we're processing here.
	range_info = proc_entry_info;
	return error;
	}
	// The target memory address comes somewhere after the region we just
	// parsed.
	}
	// If we made it here, we didn't find an entry that contained the given
	// address. Return the load_addr as start and the amount of bytes between
	// the load address and the end of the memory as size.
	range_info.GetRange().SetRangeBase(load_addr);
	range_info.GetRange().SetRangeEnd(LLDB_INVALID_ADDRESS);
	range_info.SetReadable(MemoryRegionInfo::OptionalBool::eNo);
	range_info.SetWritable(MemoryRegionInfo::OptionalBool::eNo);
	range_info.SetExecutable(MemoryRegionInfo::OptionalBool::eNo);
	range_info.SetMapped(MemoryRegionInfo::OptionalBool::eNo);
	return error;
	}

	// Fills m_mem_region_cache from kinfo_getvmmap() if it is still empty.
	//
	// Each returned entry becomes a MemoryRegionInfo (range, mapped flag,
	// read/write/execute permissions and, when present, the backing path) paired
	// with a FileSpec built from the region name. m_supports_mem_region is set
	// to eLazyBoolYes on success and to eLazyBoolNo when kinfo_getvmmap() fails
	// or yields no entries, in which case a "not supported" error is returned.
	Status NativeProcessNetBSD::PopulateMemoryRegionCache() {
	Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS));
	// If our cache is empty, pull the latest. There should always be at least
	// one memory region if memory region handling is supported.
	if (!m_mem_region_cache.empty()) {
	LLDB_LOG(log, "reusing {0} cached memory region entries",
	m_mem_region_cache.size());
	return Status();
	}

	struct kinfo_vmentry *vm;
	size_t count, i;
	// `count` receives the number of entries in the returned array.
	vm = kinfo_getvmmap(GetID(), &count);
	if (vm == NULL) {
	m_supports_mem_region = LazyBool::eLazyBoolNo;
	Status error;
	error.SetErrorString("not supported");
	return error;
	}
	for (i = 0; i < count; i++) {
	MemoryRegionInfo info;
	info.Clear();
	info.GetRange().SetRangeBase(vm[i].kve_start);
	info.GetRange().SetRangeEnd(vm[i].kve_end);
	info.SetMapped(MemoryRegionInfo::OptionalBool::eYes);

	// Translate the VM_PROT_* protection bits into explicit yes/no flags.
	if (vm[i].kve_protection & VM_PROT_READ)
	info.SetReadable(MemoryRegionInfo::OptionalBool::eYes);
	else
	info.SetReadable(MemoryRegionInfo::OptionalBool::eNo);

	if (vm[i].kve_protection & VM_PROT_WRITE)
	info.SetWritable(MemoryRegionInfo::OptionalBool::eYes);
	else
	info.SetWritable(MemoryRegionInfo::OptionalBool::eNo);

	if (vm[i].kve_protection & VM_PROT_EXECUTE)
	info.SetExecutable(MemoryRegionInfo::OptionalBool::eYes);
	else
	info.SetExecutable(MemoryRegionInfo::OptionalBool::eNo);

	// Only name the region when the entry carries a non-empty path.
	if (vm[i].kve_path[0])
	info.SetName(vm[i].kve_path);

	m_mem_region_cache.emplace_back(
	info, FileSpec(info.GetName().GetCString(), true));
	}
	// The array returned by kinfo_getvmmap() is released here with free().
	free(vm);

	if (m_mem_region_cache.empty()) {
	// No entries after attempting to read them. This shouldn't happen.
	// Assume we don't support map entries.
	LLDB_LOG(log, "failed to find any vmmap entries, assuming no support "
	"for memory region metadata retrieval");
	m_supports_mem_region = LazyBool::eLazyBoolNo;
	Status error;
	error.SetErrorString("not supported");
	return error;
	}
	LLDB_LOG(log, "read {0} memory region entries from process {1}",
	m_mem_region_cache.size(), GetID());
	// We support memory retrieval, remember that.
	m_supports_mem_region = LazyBool::eLazyBoolYes;
	return Status();
	}

	// Not implemented: always returns an "Unimplemented" error and leaves `addr`
	// untouched.
	Status NativeProcessNetBSD::AllocateMemory(size_t size, uint32_t permissions,
	                                           lldb::addr_t &addr) {
	  Status unimplemented("Unimplemented");
	  return unimplemented;
	}

	// Not implemented: always returns an "Unimplemented" error.
	Status NativeProcessNetBSD::DeallocateMemory(lldb::addr_t addr) {
	  Status unimplemented("Unimplemented");
	  return unimplemented;
	}

	// Not supported yet: always reports LLDB_INVALID_ADDRESS.
	lldb::addr_t NativeProcessNetBSD::GetSharedLibraryInfoAddress() {
	  const lldb::addr_t unknown_address = LLDB_INVALID_ADDRESS;
	  return unknown_address;
	}

	// Returns the number of threads currently tracked; the list itself is not
	// refreshed here.
	size_t NativeProcessNetBSD::UpdateThreads() {
	  const size_t thread_count = m_threads.size();
	  return thread_count;
	}

	// Sets a software breakpoint of `size` bytes at `addr`. Requests for a
	// hardware breakpoint are rejected with an error Status.
	Status NativeProcessNetBSD::SetBreakpoint(lldb::addr_t addr, uint32_t size,
	                                          bool hardware) {
	  if (!hardware)
	    return SetSoftwareBreakpoint(addr, size);

	  return Status("NativeProcessNetBSD does not support hardware breakpoints");
	}

	// Provides the trap opcode used for software breakpoints on the process
	// architecture.
	//
	// For x86 and x86_64 `trap_opcode_bytes` points at the one-byte 0xCC opcode
	// and `actual_opcode_size` is its size. Any other architecture asserts in
	// debug builds and returns an error Status. `trap_opcode_size_hint` is not
	// consulted.
	Status NativeProcessNetBSD::GetSoftwareBreakpointTrapOpcode(
	    size_t trap_opcode_size_hint, size_t &actual_opcode_size,
	    const uint8_t *&trap_opcode_bytes) {
	  static const uint8_t g_i386_opcode[] = {0xCC};

	  const auto machine = m_arch.GetMachine();
	  if (machine == llvm::Triple::x86 || machine == llvm::Triple::x86_64) {
	    trap_opcode_bytes = g_i386_opcode;
	    actual_opcode_size = sizeof(g_i386_opcode);
	    return Status();
	  }

	  assert(false && "CPU type not supported!");
	  return Status("CPU type not supported");
	}

	// Not implemented: always returns an "Unimplemented" error and leaves
	// `file_spec` untouched.
	Status NativeProcessNetBSD::GetLoadedModuleFileSpec(const char *module_path,
	                                                    FileSpec &file_spec) {
	  Status unimplemented("Unimplemented");
	  return unimplemented;
	}

	// Load addresses are not resolved here: `load_addr` is always set to
	// LLDB_INVALID_ADDRESS and a success Status is returned.
	Status NativeProcessNetBSD::GetFileLoadAddress(const llvm::StringRef &file_name,
	                                               lldb::addr_t &load_addr) {
	  Status result;
	  load_addr = LLDB_INVALID_ADDRESS;
	  return result;
	}

	// SIGCHLD callback: collects one pending wait status for the inferior
	// without blocking and dispatches it.
	//
	// A result of 0 means nothing is pending. A waitpid() failure is logged and
	// the handler returns, because `status` holds no valid value in that case.
	// Otherwise the status is decoded: an exit, or a terminating signal reported
	// for the inferior itself, goes to MonitorExited(); a stop goes to
	// MonitorCallback().
	void NativeProcessNetBSD::SigchldHandler() {
	  Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_PROCESS));
	  // Process a pending waitpid notification, retrying if interrupted.
	  int status;
	  ::pid_t wait_pid = llvm::sys::RetryAfterSignal(-1, waitpid, GetID(), &status,
	                                                 WALLSIG | WNOHANG);

	  if (wait_pid == 0)
	    return; // We are done.

	  if (wait_pid == -1) {
	    Status error(errno, eErrorTypePOSIX);
	    LLDB_LOG(log, "waitpid ({0}, &status, _) failed: {1}", GetID(), error);
	    // `status` was not written by waitpid(); do not decode it.
	    return;
	  }

	  WaitStatus wait_status = WaitStatus::Decode(status);
	  bool exited = wait_status.type == WaitStatus::Exit ||
	                (wait_status.type == WaitStatus::Signal &&
	                 wait_pid == static_cast<::pid_t>(GetID()));

	  LLDB_LOG(log,
	           "waitpid ({0}, &status, _) => pid = {1}, status = {2}, exited = {3}",
	           GetID(), wait_pid, status, exited);

	  if (exited)
	    MonitorExited(wait_pid, wait_status);
	  else {
	    assert(wait_status.type == WaitStatus::Stop);
	    MonitorCallback(wait_pid, wait_status.status);
	  }
	}

	// Returns true when a tracked thread has the id `thread_id`, false otherwise.
	// Every entry of the thread list is asserted to be non-null.
	bool NativeProcessNetBSD::HasThreadNoLock(lldb::tid_t thread_id) {
	  for (auto it = m_threads.begin(), end = m_threads.end(); it != end; ++it) {
	    assert(*it && "thread list should not contain NULL threads");
	    if ((*it)->GetID() == thread_id)
	      return true; // Found a thread with this id.
	  }

	  // No tracked thread carries this id.
	  return false;
	}

	// Starts tracking the thread `thread_id` and returns the new thread object.
	// The id must not be tracked already (asserted). The first thread added to
	// an empty list also becomes the current thread.
	NativeThreadNetBSD &NativeProcessNetBSD::AddThread(lldb::tid_t thread_id) {
	  Log *log = ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_THREAD);
	  LLDB_LOG(log, "pid {0} adding thread with tid {1}", GetID(), thread_id);

	  assert(!HasThreadNoLock(thread_id) &&
	         "attempted to add a thread by id that already exists");

	  // Remember the very first thread as the current one.
	  const bool is_first_thread = m_threads.empty();
	  if (is_first_thread)
	    SetCurrentThreadID(thread_id);

	  m_threads.push_back(llvm::make_unique<NativeThreadNetBSD>(*this, thread_id));
	  auto &added_thread = static_cast<NativeThreadNetBSD &>(*m_threads.back());
	  return added_thread;
	}

	// Attaches to the process m_pid with PT_ATTACH, waits for it to stop,
	// enumerates its threads, marks them as stopped by SIGSTOP and moves the
	// process to eStateStopped. The first failing step is returned as a Status.
	Status NativeProcessNetBSD::Attach() {
	  // Attach to the requested process.
	  // An attach will cause the thread to stop with a SIGSTOP.
	  Status status = PtraceWrapper(PT_ATTACH, m_pid);
	  if (status.Fail())
	    return status;

	  // Need to use WALLSIG otherwise we receive an error with errno=ECHILD.
	  // At this point we should have a thread stopped if waitpid succeeds.
	  // The wait is retried when interrupted by a signal, as the other waitpid()
	  // calls in this file do, so that EINTR does not abort the attach.
	  const ::pid_t wait_pid =
	      llvm::sys::RetryAfterSignal(-1, ::waitpid, m_pid, nullptr, WALLSIG);
	  if (wait_pid < 0)
	    return Status(errno, eErrorTypePOSIX);

	  /* Initialize threads */
	  status = ReinitializeThreads();
	  if (status.Fail())
	    return status;

	  for (const auto &thread : m_threads)
	    static_cast<NativeThreadNetBSD &>(*thread).SetStoppedBySignal(SIGSTOP);

	  // Let our process instance know the thread has stopped.
	  SetState(StateType::eStateStopped);
	  return Status();
	}

	// Reads up to `size` bytes of inferior memory starting at `addr` into `buf`
	// using PT_IO / PIOD_READ_D.
	//
	// The transfer is repeated until `size` bytes have been read. `bytes_read`
	// accumulates the length reported by each transfer, so it holds the total
	// number of bytes copied so far, also on early return. A ptrace failure
	// returns the error; a transfer that moves zero bytes ends the loop and
	// returns success with the partial count, so the loop cannot spin forever.
	Status NativeProcessNetBSD::ReadMemory(lldb::addr_t addr, void *buf,
	                                       size_t size, size_t &bytes_read) {
	  unsigned char *dst = static_cast<unsigned char *>(buf);
	  struct ptrace_io_desc io;

	  Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_MEMORY));
	  LLDB_LOG(log, "addr = {0}, buf = {1}, size = {2}", addr, buf, size);

	  bytes_read = 0;
	  io.piod_op = PIOD_READ_D;
	  io.piod_len = size;

	  do {
	    io.piod_offs = (void *)(addr + bytes_read);
	    io.piod_addr = dst + bytes_read;

	    Status error = NativeProcessNetBSD::PtraceWrapper(PT_IO, GetID(), &io);
	    // On return io.piod_len is the number of bytes moved by this request.
	    if (error.Fail() || io.piod_len == 0)
	      return error;

	    bytes_read += io.piod_len;
	    io.piod_len = size - bytes_read;
	  } while (bytes_read < size);

	  return Status();
	}

	// Reads inferior memory like ReadMemory() and then lets the breakpoint list
	// remove its trap opcodes from the returned buffer. A failing read is
	// returned as is, without touching the buffer further.
	Status NativeProcessNetBSD::ReadMemoryWithoutTrap(lldb::addr_t addr, void *buf,
	                                                  size_t size,
	                                                  size_t &bytes_read) {
	  const Status read_status = ReadMemory(addr, buf, size, bytes_read);
	  if (!read_status.Fail())
	    return m_breakpoint_list.RemoveTrapsFromBuffer(addr, buf, size);

	  return read_status;
	}

	// Writes up to `size` bytes from `buf` into inferior memory starting at
	// `addr` using PT_IO / PIOD_WRITE_D.
	//
	// The transfer is repeated until `size` bytes have been written.
	// `bytes_written` accumulates the length reported by each transfer, so it
	// holds the total number of bytes copied so far, also on early return. A
	// ptrace failure returns the error; a transfer that moves zero bytes ends
	// the loop and returns success with the partial count.
	Status NativeProcessNetBSD::WriteMemory(lldb::addr_t addr, const void *buf,
	                                        size_t size, size_t &bytes_written) {
	  const unsigned char *src = static_cast<const unsigned char *>(buf);
	  struct ptrace_io_desc io;

	  Log *log(ProcessPOSIXLog::GetLogIfAllCategoriesSet(POSIX_LOG_MEMORY));
	  LLDB_LOG(log, "addr = {0}, buf = {1}, size = {2}", addr, buf, size);

	  bytes_written = 0;
	  io.piod_op = PIOD_WRITE_D;
	  io.piod_len = size;

	  do {
	    // piod_addr is a non-const pointer even though a write only reads from it.
	    io.piod_addr =
	        const_cast<void *>(static_cast<const void *>(src + bytes_written));
	    io.piod_offs = (void *)(addr + bytes_written);

	    Status error = NativeProcessNetBSD::PtraceWrapper(PT_IO, GetID(), &io);
	    // On return io.piod_len is the number of bytes moved by this request.
	    if (error.Fail() || io.piod_len == 0)
	      return error;

	    bytes_written += io.piod_len;
	    io.piod_len = size - bytes_written;
	  } while (bytes_written < size);

	  return Status();
	}

	// Reads the ELF auxiliary vector of the inferior with PT_IO /
	// PIOD_READ_AUXV into a newly allocated memory buffer.
	//
	// Returns the buffer on success, the ptrace error as a generic-category
	// error code on failure, or ECANCELED when fewer than one byte was
	// transferred.
	llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>>
	NativeProcessNetBSD::GetAuxvData() const {
	  /*
	   * ELF_AUX_ENTRIES is currently restricted to kernel
	   * (<sys/exec_elf.h> r. 1.155 specifies 15)
	   *
	   * ptrace(2) returns the whole AUXV including extra fields after AT_NULL;
	   * this information isn't needed.
	   */
	  size_t auxv_size = 100 * sizeof(AuxInfo);

	  ErrorOr<std::unique_ptr<MemoryBuffer>> buf =
	      llvm::MemoryBuffer::getNewMemBuffer(auxv_size);

	  struct ptrace_io_desc io;
	  io.piod_op = PIOD_READ_AUXV;
	  io.piod_offs = 0;
	  // getBufferStart() yields a const pointer; piod_addr needs a mutable one.
	  io.piod_addr = const_cast<void *>(
	      static_cast<const void *>(buf.get()->getBufferStart()));
	  io.piod_len = auxv_size;

	  Status error = NativeProcessNetBSD::PtraceWrapper(PT_IO, GetID(), &io);

	  if (error.Fail())
	    return std::error_code(error.GetError(), std::generic_category());

	  if (io.piod_len < 1)
	    return std::error_code(ECANCELED, std::generic_category());

	  return buf;
	}

	// Rebuilds the thread list from scratch.
	//
	// All tracked threads are dropped, then PT_LWPINFO is issued repeatedly with
	// the same `info` record; each reported LWP id is added as a thread until an
	// id of 0 is reported. A failing ptrace request is returned immediately.
	Status NativeProcessNetBSD::ReinitializeThreads() {
	  // Forget every thread known so far.
	  m_threads.clear();

	  struct ptrace_lwpinfo info = {};
	  for (;;) {
	    Status error = PtraceWrapper(PT_LWPINFO, GetID(), &info, sizeof(info));
	    if (error.Fail())
	      return error;

	    // An LWP id of 0 ends the enumeration.
	    if (info.pl_lwpid == 0)
	      return error;

	    AddThread(info.pl_lwpid);
	  }
	}
	Index: head/contrib/llvm/tools/lldb
	===================================================================
	--- head/contrib/llvm/tools/lldb (revision 329409)
	+++ head/contrib/llvm/tools/lldb (revision 329410)

	Property changes on: head/contrib/llvm/tools/lldb
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/lldb/dist-release_60:r328795-329405
	Index: head/contrib/llvm
	===================================================================
	--- head/contrib/llvm (revision 329409)
	+++ head/contrib/llvm (revision 329410)

	Property changes on: head/contrib/llvm
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/llvm/dist-release_60:r328795-329405
	Index: head/lib/clang/include/clang/Basic/Version.inc
	===================================================================
	--- head/lib/clang/include/clang/Basic/Version.inc (revision 329409)
	+++ head/lib/clang/include/clang/Basic/Version.inc (revision 329410)
	@@ -1,11 +1,11 @@
	/* $FreeBSD$ */

	#define CLANG_VERSION 6.0.0
	#define CLANG_VERSION_STRING "6.0.0"
	#define CLANG_VERSION_MAJOR 6
	#define CLANG_VERSION_MINOR 0
	#define CLANG_VERSION_PATCHLEVEL 0

	#define CLANG_VENDOR "FreeBSD "

	-#define SVN_REVISION "324090"
	+#define SVN_REVISION "325330"
	Index: head/lib/clang/include/lld/Common/Version.inc
	===================================================================
	--- head/lib/clang/include/lld/Common/Version.inc (revision 329409)
	+++ head/lib/clang/include/lld/Common/Version.inc (revision 329410)
	@@ -1,8 +1,8 @@
	// $FreeBSD$

	#define LLD_VERSION 6.0.0
	#define LLD_VERSION_STRING "6.0.0"
	#define LLD_VERSION_MAJOR 6
	#define LLD_VERSION_MINOR 0
	-#define LLD_REVISION_STRING "324090"
	+#define LLD_REVISION_STRING "325330"
	#define LLD_REPOSITORY_STRING "FreeBSD"
	Index: head/lib/clang/include/llvm/Support/VCSRevision.h
	===================================================================
	--- head/lib/clang/include/llvm/Support/VCSRevision.h (revision 329409)
	+++ head/lib/clang/include/llvm/Support/VCSRevision.h (revision 329410)
	@@ -1,2 +1,2 @@
	/* $FreeBSD$ */
	-#define LLVM_REVISION "svn-r324090"
	+#define LLVM_REVISION "svn-r325330"

File Metadata

Mime Type: application/octet-stream
Expires: Thu, Oct 16, 9:45 AM (1 d, 23 h)
Storage Engine: chunks
Storage Format: Chunks
Storage Handle: xtl7xtfQG.ll
Default Alt Text: (6 MB)

No OneTemporary
Actions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions